From e48a678ac76cf43226763b704c6a4a60207863f5 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Mon, 14 Jun 2021 19:00:57 +0300 Subject: [PATCH 1/2] [GPU] Undo depth_and_color_formats_out renaming --- src/xenia/gpu/render_target_cache.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/xenia/gpu/render_target_cache.cc b/src/xenia/gpu/render_target_cache.cc index 2b4ea97ef..ec83cc4fd 100644 --- a/src/xenia/gpu/render_target_cache.cc +++ b/src/xenia/gpu/render_target_cache.cc @@ -801,10 +801,10 @@ bool RenderTargetCache::Update(bool is_rasterization_done, uint32_t RenderTargetCache::GetLastUpdateBoundRenderTargets( bool distinguish_gamma_formats, - uint32_t* depth_and_color_resource_formats_out) const { + uint32_t* depth_and_color_formats_out) const { if (GetPath() != Path::kHostRenderTargets) { - if (depth_and_color_resource_formats_out) { - std::memset(depth_and_color_resource_formats_out, 0, + if (depth_and_color_formats_out) { + std::memset(depth_and_color_formats_out, 0, sizeof(uint32_t) * (1 + xenos::kMaxColorRenderTargets)); } return 0; @@ -814,14 +814,14 @@ uint32_t RenderTargetCache::GetLastUpdateBoundRenderTargets( const RenderTarget* render_target = last_update_accumulated_render_targets_[i]; if (!render_target) { - if (depth_and_color_resource_formats_out) { - depth_and_color_resource_formats_out[i] = 0; + if (depth_and_color_formats_out) { + depth_and_color_formats_out[i] = 0; } continue; } rts_used |= uint32_t(1) << i; - if (depth_and_color_resource_formats_out) { - depth_and_color_resource_formats_out[i] = + if (depth_and_color_formats_out) { + depth_and_color_formats_out[i] = (distinguish_gamma_formats && i && (last_update_accumulated_color_targets_are_gamma_ & (uint32_t(1) << (i - 1)))) From 8e83042bdff0b9727ed7e5b80d864661565354a7 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Mon, 14 Jun 2021 19:12:12 +0300 Subject: [PATCH 2/2] [GPU] Fix host_depth_store_2xmsaa CS after 2x MSAA sample swap --- 
.../gpu/d3d12/d3d12_render_target_cache.cc | 7 +-- .../gpu/d3d12/d3d12_render_target_cache.h | 4 +- .../d3d12_5_1/host_depth_store_2xmsaa_cs.h | 43 ++++++++++++------- src/xenia/gpu/shaders/host_depth_store.hlsli | 4 +- .../shaders/host_depth_store_2xmsaa.cs.hlsl | 5 ++- 5 files changed, 37 insertions(+), 26 deletions(-) diff --git a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc index 234b3f2ae..47cca30c5 100644 --- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc +++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc @@ -4604,11 +4604,8 @@ void D3D12RenderTargetCache::PerformTransfersAndResolveClears( dest_rt_key.pitch_tiles_at_32bpp; host_depth_store_render_target_constant.resolution_scale = resolution_scale_; - host_depth_store_render_target_constant.second_sample_index = - (dest_rt_key.msaa_samples == xenos::MsaaSamples::k2X && - !msaa_2x_supported_) - ? 3 - : 1; + host_depth_store_render_target_constant.msaa_2x_supported = + uint32_t(msaa_2x_supported_); command_list.D3DSetComputeRoot32BitConstants( kHostDepthStoreRootParameterRenderTargetConstant, sizeof(host_depth_store_render_target_constant) / sizeof(uint32_t), diff --git a/src/xenia/gpu/d3d12/d3d12_render_target_cache.h b/src/xenia/gpu/d3d12/d3d12_render_target_cache.h index ea9115251..d07f626c2 100644 --- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.h +++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.h @@ -536,8 +536,8 @@ class D3D12RenderTargetCache final : public RenderTargetCache { uint32_t pitch_tiles : xenos::kEdramPitchTilesBits; // 1 to 3. uint32_t resolution_scale : 2; - // For native 2x MSAA vs. 2x over 4x. - uint32_t second_sample_index : 2; + // Whether 2x MSAA is supported natively rather than through 4x. 
+ uint32_t msaa_2x_supported : 1; }; uint32_t constant = 0; }; diff --git a/src/xenia/gpu/shaders/bytecode/d3d12_5_1/host_depth_store_2xmsaa_cs.h b/src/xenia/gpu/shaders/bytecode/d3d12_5_1/host_depth_store_2xmsaa_cs.h index 07b75e759..64bb46957 100644 --- a/src/xenia/gpu/shaders/bytecode/d3d12_5_1/host_depth_store_2xmsaa_cs.h +++ b/src/xenia/gpu/shaders/bytecode/d3d12_5_1/host_depth_store_2xmsaa_cs.h @@ -77,8 +77,9 @@ imul null, r0.w, r2.z, r2.x imad r0.y, r0.z, r2.x, r0.y imad r0.x, r0.x, r0.w, r0.y ushr r0.x, r0.x, l(2) -ubfe r0.y, l(2), l(12), CB1[1][0].x -movc r0.y, r2.y, r0.y, l(0) +ubfe r0.y, l(1), l(12), CB1[1][0].x +movc r0.zw, r2.yyyy, l(0,0,0,3), l(0,0,1,0) +movc r0.y, r0.y, r0.z, r0.w mov r1.w, l(0) ldms r2.x, r1.xyww, T0[0].xyzw, r0.y iadd r3.xyzw, r1.xyxy, l(2, 0, 1, 0) @@ -106,20 +107,20 @@ mov r1.zw, l(0,0,0,0) ldms r2.w, r1.xyzw, T0[0].yzwx, r0.y store_uav_typed U0[0].xyzw, r0.zzzz, r2.xyzw ret -// Approximately 55 instruction slots used +// Approximately 56 instruction slots used #endif const BYTE host_depth_store_2xmsaa_cs[] = { - 68, 88, 66, 67, 70, 151, - 47, 41, 106, 214, 147, 230, - 77, 220, 169, 203, 166, 115, - 42, 93, 1, 0, 0, 0, - 248, 10, 0, 0, 5, 0, + 68, 88, 66, 67, 15, 231, + 223, 186, 190, 135, 229, 39, + 211, 185, 26, 121, 39, 17, + 25, 229, 1, 0, 0, 0, + 52, 11, 0, 0, 5, 0, 0, 0, 52, 0, 0, 0, 172, 2, 0, 0, 188, 2, 0, 0, 204, 2, 0, 0, - 92, 10, 0, 0, 82, 68, + 152, 10, 0, 0, 82, 68, 69, 70, 112, 2, 0, 0, 2, 0, 0, 0, 92, 1, 0, 0, 4, 0, 0, 0, @@ -231,8 +232,8 @@ const BYTE host_depth_store_2xmsaa_cs[] = 71, 78, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 83, 72, 69, 88, - 136, 7, 0, 0, 81, 0, - 5, 0, 226, 1, 0, 0, + 196, 7, 0, 0, 81, 0, + 5, 0, 241, 1, 0, 0, 106, 8, 0, 1, 89, 0, 0, 7, 70, 142, 48, 0, 0, 0, 0, 0, 0, 0, @@ -395,17 +396,27 @@ const BYTE host_depth_store_2xmsaa_cs[] = 0, 0, 2, 0, 0, 0, 138, 0, 0, 11, 34, 0, 16, 0, 0, 0, 0, 0, - 1, 64, 0, 0, 2, 0, + 1, 64, 0, 0, 1, 0, 0, 0, 1, 64, 0, 0, 12, 0, 0, 0, 10, 128, 48, 0, 1, 0, 0, 0, + 1, 
0, 0, 0, 0, 0, + 0, 0, 55, 0, 0, 15, + 194, 0, 16, 0, 0, 0, + 0, 0, 86, 5, 16, 0, + 2, 0, 0, 0, 2, 64, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 3, 0, 0, 0, + 2, 64, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 55, 0, 0, 9, 34, 0, 16, 0, 0, 0, 0, 0, 26, 0, 16, 0, - 2, 0, 0, 0, 26, 0, + 0, 0, 0, 0, 42, 0, 16, 0, 0, 0, 0, 0, - 1, 64, 0, 0, 0, 0, + 58, 0, 16, 0, 0, 0, 0, 0, 54, 0, 0, 5, 130, 0, 16, 0, 1, 0, 0, 0, 1, 64, 0, 0, @@ -554,7 +565,7 @@ const BYTE host_depth_store_2xmsaa_cs[] = 70, 14, 16, 0, 2, 0, 0, 0, 62, 0, 0, 1, 83, 84, 65, 84, 148, 0, - 0, 0, 55, 0, 0, 0, + 0, 0, 56, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 16, 0, @@ -567,7 +578,7 @@ const BYTE host_depth_store_2xmsaa_cs[] = 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 13, 0, 0, 0, 1, 0, + 13, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, diff --git a/src/xenia/gpu/shaders/host_depth_store.hlsli b/src/xenia/gpu/shaders/host_depth_store.hlsli index 190d8b03e..b55c4bf4c 100644 --- a/src/xenia/gpu/shaders/host_depth_store.hlsli +++ b/src/xenia/gpu/shaders/host_depth_store.hlsli @@ -30,8 +30,8 @@ uint XeHostDepthStoreResolutionScale() { return (xe_host_depth_store_render_target >> 10u) & 0x3u; } -uint XeHostDepthStoreSecondSampleIndex() { - return (xe_host_depth_store_render_target >> 12u) & 0x3u; +bool XeHostDepthStoreMsaa2xSupported() { + return bool((xe_host_depth_store_render_target >> 12u) & 0x1u); } // 40-sample columns are not swapped for addressing simplicity (because this is diff --git a/src/xenia/gpu/shaders/host_depth_store_2xmsaa.cs.hlsl b/src/xenia/gpu/shaders/host_depth_store_2xmsaa.cs.hlsl index f8c31b22e..b118351e5 100644 --- a/src/xenia/gpu/shaders/host_depth_store_2xmsaa.cs.hlsl +++ b/src/xenia/gpu/shaders/host_depth_store_2xmsaa.cs.hlsl @@ -21,8 +21,11 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) { kXenosMsaaSamples_2X, false, 0u, dest_sample_index, resolution_scale) >> 2u; + // Top and bottom to Direct3D 10.1+ 
top 1 and bottom 0 (for 2x) or top-left 0 + // and bottom-right 3 (for 4x). int source_sample_index = - int(dest_sample_index != 0u ? XeHostDepthStoreSecondSampleIndex() : 0u); + XeHostDepthStoreMsaa2xSupported() ? (dest_sample_index ? 0u : 1u) + : (dest_sample_index ? 3u : 0u); xe_host_depth_store_dest[edram_address_int4s] = asuint(float4( xe_host_depth_store_source.Load(int2(pixel_index), source_sample_index), xe_host_depth_store_source.Load(int2(pixel_index) + int2(1, 0),