From c9ffe98d21ddb440959a06d639aabf756df0a8e6 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sun, 16 Sep 2018 15:11:11 +0300 Subject: [PATCH] [D3D12] 64bpp raw resolve and EDRAM refactoring --- src/xenia/gpu/d3d12/render_target_cache.cc | 70 +++++++++++-------- src/xenia/gpu/d3d12/render_target_cache.h | 4 +- .../d3d12/shaders/edram_clear_32bpp.cs.hlsl | 6 +- .../shaders/edram_clear_depth_float.cs.hlsl | 6 +- .../shaders/edram_load_color_32bpp.cs.hlsl | 10 +-- .../shaders/edram_load_color_64bpp.cs.hlsl | 17 ++--- .../shaders/edram_load_color_7e3.cs.hlsl | 20 +++--- .../shaders/edram_load_depth_float.cs.hlsl | 6 +- .../shaders/edram_load_depth_unorm.cs.hlsl | 12 ++-- .../gpu/d3d12/shaders/edram_load_store.hlsli | 18 +++-- .../shaders/edram_store_color_32bpp.cs.hlsl | 8 +-- .../shaders/edram_store_color_64bpp.cs.hlsl | 15 ++-- .../shaders/edram_store_color_7e3.cs.hlsl | 17 ++--- .../shaders/edram_store_depth_float.cs.hlsl | 6 +- .../shaders/edram_store_depth_unorm.cs.hlsl | 10 +-- .../shaders/edram_tile_sample_32bpp.cs.hlsl | 2 +- .../shaders/edram_tile_sample_64bpp.cs.hlsl | 67 ++++++++++++++++++ 17 files changed, 188 insertions(+), 106 deletions(-) create mode 100644 src/xenia/gpu/d3d12/shaders/edram_tile_sample_64bpp.cs.hlsl diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index 2acc6d52c..8d36e1854 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -41,6 +41,7 @@ namespace d3d12 { #include "xenia/gpu/d3d12/shaders/bin/edram_store_depth_float_cs.h" #include "xenia/gpu/d3d12/shaders/bin/edram_store_depth_unorm_cs.h" #include "xenia/gpu/d3d12/shaders/bin/edram_tile_sample_32bpp_cs.h" +#include "xenia/gpu/d3d12/shaders/bin/edram_tile_sample_64bpp_cs.h" #include "xenia/gpu/d3d12/shaders/bin/resolve_ps.h" #include "xenia/gpu/d3d12/shaders/bin/resolve_vs.h" @@ -173,6 +174,16 @@ bool RenderTargetCache::Initialize() { return false; } edram_tile_sample_32bpp_pipeline_->SetName(L"EDRAM Raw Resolve 32bpp"); + // Tile single sample into a texture - 64 bits per pixel. + edram_tile_sample_64bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline( + device, edram_tile_sample_64bpp_cs, sizeof(edram_tile_sample_64bpp_cs), + edram_load_store_root_signature_); + if (edram_tile_sample_64bpp_pipeline_ == nullptr) { + XELOGE("Failed to create the 64bpp EDRAM raw resolve pipeline"); + Shutdown(); + return false; + } + edram_tile_sample_64bpp_pipeline_->SetName(L"EDRAM Raw Resolve 64bpp"); // Clear 32-bit color or unorm depth. edram_clear_32bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline( device, edram_clear_32bpp_cs, sizeof(edram_clear_32bpp_cs), @@ -258,6 +269,7 @@ void RenderTargetCache::Shutdown() { } resolve_pipelines_.clear(); ui::d3d12::util::ReleaseAndNull(resolve_root_signature_); + ui::d3d12::util::ReleaseAndNull(edram_tile_sample_64bpp_pipeline_); ui::d3d12::util::ReleaseAndNull(edram_tile_sample_32bpp_pipeline_); ui::d3d12::util::ReleaseAndNull(edram_clear_depth_float_pipeline_); ui::d3d12::util::ReleaseAndNull(edram_clear_32bpp_pipeline_); @@ -977,9 +989,10 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory, // Validate and clamp the source region, skip parts that don't need to be // copied and calculate the number of threads needed for copying/loading. - uint32_t surface_pitch_tiles, row_tiles, rows; + uint32_t surface_pitch_tiles, row_width_ss_div_80, rows; if (!GetEDRAMLayout(surface_pitch, msaa_samples, src_64bpp, edram_base, - copy_rect, surface_pitch_tiles, row_tiles, rows)) { + copy_rect, surface_pitch_tiles, row_width_ss_div_80, + rows)) { // Nothing to copy. return true; } @@ -1008,10 +1021,6 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory, // Raw copy // ************************************************************************* XELOGGPU("Resolve: Copying using a compute shader"); - if (src_64bpp) { - // TODO(Triang3l): 64bpp sample copy shader. - return false; - } // Make sure we have the memory to write to. if (!shared_memory->MakeTilesResident(dest_address, dest_size)) { @@ -1079,10 +1088,11 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory, command_list->SetComputeRoot32BitConstants( 0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0); command_list->SetComputeRootDescriptorTable(1, descriptor_gpu_start); - // TODO(Triang3l): 64bpp pipeline. - command_processor_->SetComputePipeline(edram_tile_sample_32bpp_pipeline_); - // 1 group per destination 80x16 (32bpp) / 80x8 (64bpp) region. - uint32_t group_count_x = row_tiles, group_count_y = rows; + command_processor_->SetComputePipeline( + src_64bpp ? edram_tile_sample_64bpp_pipeline_ + : edram_tile_sample_32bpp_pipeline_); + // 1 group per destination 80x16 region. + uint32_t group_count_x = row_width_ss_div_80, group_count_y = rows; if (msaa_samples >= MsaaSamples::k2X) { group_count_y = (group_count_y + 1) >> 1; if (msaa_samples >= MsaaSamples::k4X) { @@ -1121,7 +1131,7 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory, return false; } RenderTargetKey render_target_key; - render_target_key.width_ss_div_80 = row_tiles >> (src_64bpp ? 1 : 0); + render_target_key.width_ss_div_80 = row_width_ss_div_80; render_target_key.height_ss_div_16 = rows; render_target_key.is_depth = false; render_target_key.format = src_format; @@ -1190,7 +1200,8 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory, command_processor_->SetComputePipeline( edram_load_pipelines_[size_t(GetLoadStoreMode(false, src_format))]); - command_list->Dispatch(row_tiles, rows, 1); + // 1 group per 80x16 samples. + command_list->Dispatch(row_width_ss_div_80, rows, 1); command_processor_->PushUAVBarrier(copy_buffer); // Go to the next descriptor set. @@ -1405,9 +1416,10 @@ bool RenderTargetCache::ResolveClear(uint32_t edram_base, bool is_64bpp = !is_depth && IsColorFormat64bpp(ColorRenderTargetFormat(format)); D3D12_RECT clear_rect = rect; - uint32_t surface_pitch_tiles, row_tiles, rows; + uint32_t surface_pitch_tiles, row_width_ss_div_80, rows; if (!GetEDRAMLayout(surface_pitch, msaa_samples, is_64bpp, edram_base, - clear_rect, surface_pitch_tiles, row_tiles, rows)) { + clear_rect, surface_pitch_tiles, row_width_ss_div_80, + rows)) { // Nothing to clear. return true; } @@ -1475,7 +1487,8 @@ bool RenderTargetCache::ResolveClear(uint32_t edram_base, ui::d3d12::util::CreateRawBufferUAV(device, descriptor_cpu_start, edram_buffer_, kEDRAMBufferSize); command_list->SetComputeRootDescriptorTable(1, descriptor_gpu_start); - command_list->Dispatch(row_tiles, rows, 1); + // 1 group per 80x16 samples. + command_list->Dispatch(row_width_ss_div_80, rows, 1); command_processor_->PushUAVBarrier(edram_buffer_); return true; @@ -1871,7 +1884,7 @@ RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget( bool RenderTargetCache::GetEDRAMLayout( uint32_t pitch_pixels, MsaaSamples msaa_samples, bool is_64bpp, uint32_t& base_in_out, D3D12_RECT& rect_in_out, uint32_t& pitch_tiles_out, - uint32_t& row_tiles_out, uint32_t& rows_out) { + uint32_t& row_width_ss_div_80_out, uint32_t& rows_out) { if (pitch_pixels == 0 || rect_in_out.right <= 0 || rect_in_out.bottom <= 0 || rect_in_out.top >= rect_in_out.bottom) { return false; @@ -1921,8 +1934,7 @@ bool RenderTargetCache::GetEDRAMLayout( base_in_out = base; rect_in_out = rect; pitch_tiles_out = pitch_tiles; - row_tiles_out = (((rect.right << samples_x_log2) + 79) / 80) - << sample_size_log2; + row_width_ss_div_80_out = ((rect.right << samples_x_log2) + 79) / 80; rows_out = rows; return true; } @@ -2044,14 +2056,6 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() { const RenderTarget* render_target = binding.render_target; bool is_64bpp = false; - // Get the number of X thread groups. - uint32_t rt_pitch_tiles = surface_pitch_tiles; - if (!render_target->key.is_depth && - IsColorFormat64bpp( - ColorRenderTargetFormat(render_target->key.format))) { - rt_pitch_tiles *= 2; - } - // Transition the copy buffer to copy destination. command_processor_->PushTransitionBarrier(copy_buffer, copy_buffer_state, D3D12_RESOURCE_STATE_COPY_DEST); @@ -2084,6 +2088,12 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() { root_constants.rt_stencil_pitch = location_dest.PlacedFootprint.Footprint.RowPitch; } + uint32_t rt_pitch_tiles = surface_pitch_tiles; + if (!render_target->key.is_depth && + IsColorFormat64bpp( + ColorRenderTargetFormat(render_target->key.format))) { + rt_pitch_tiles *= 2; + } root_constants.base_pitch_tiles = binding.edram_base | (rt_pitch_tiles << 11); @@ -2101,7 +2111,8 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() { render_target->key.format); command_processor_->SetComputePipeline( edram_store_pipelines_[size_t(mode)]); - command_list->Dispatch(rt_pitch_tiles, binding.edram_dirty_rows, 1); + // 1 group per 80x16 samples. + command_list->Dispatch(surface_pitch_tiles, binding.edram_dirty_rows, 1); // Commit the UAV write. command_processor_->PushUAVBarrier(edram_buffer_); @@ -2178,7 +2189,7 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM( } const RenderTarget* render_target = render_targets[i]; - // Get the number of X thread groups. + // Get the number of EDRAM tiles per row. uint32_t edram_pitch_tiles = render_target->key.width_ss_div_80; if (!render_target->key.is_depth && IsColorFormat64bpp( @@ -2218,7 +2229,8 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM( EDRAMLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth, render_target->key.format); command_processor_->SetComputePipeline(edram_load_pipelines_[size_t(mode)]); - command_list->Dispatch(edram_pitch_tiles, edram_rows, 1); + // 1 group per 80x16 samples. + command_list->Dispatch(render_target->key.width_ss_div_80, edram_rows, 1); // Commit the UAV write and transition the copy buffer to copy source now. command_processor_->PushUAVBarrier(copy_buffer); diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h index df4c12120..07570fd28 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.h +++ b/src/xenia/gpu/d3d12/render_target_cache.h @@ -381,7 +381,8 @@ class RenderTargetCache { static bool GetEDRAMLayout(uint32_t pitch_pixels, MsaaSamples msaa_samples, bool is_64bpp, uint32_t& base_in_out, D3D12_RECT& rect_in_out, uint32_t& pitch_tiles_out, - uint32_t& row_tiles_out, uint32_t& rows_out); + uint32_t& row_width_ss_div_80_out, + uint32_t& rows_out); static EDRAMLoadStoreMode GetLoadStoreMode(bool is_depth, uint32_t format); @@ -480,6 +481,7 @@ class RenderTargetCache { ID3D12PipelineState* edram_store_pipelines_[size_t(EDRAMLoadStoreMode::kCount)] = {}; ID3D12PipelineState* edram_tile_sample_32bpp_pipeline_ = nullptr; + ID3D12PipelineState* edram_tile_sample_64bpp_pipeline_ = nullptr; ID3D12PipelineState* edram_clear_32bpp_pipeline_ = nullptr; ID3D12PipelineState* edram_clear_depth_float_pipeline_ = nullptr; diff --git a/src/xenia/gpu/d3d12/shaders/edram_clear_32bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_clear_32bpp.cs.hlsl index 1609e1194..f123a7ef9 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_clear_32bpp.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_clear_32bpp.cs.hlsl @@ -15,9 +15,9 @@ void main(uint3 xe_group_id : SV_GroupID, any(sample_index >= clear_rect.zw)) { return; } - uint2 tile_dword_index = xe_group_thread_id.xy; - tile_dword_index.x *= 2u; - uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index); + uint2 tile_sample_index = xe_group_thread_id.xy; + tile_sample_index.x *= 2u; + uint edram_offset = XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index); xe_edram_load_store_dest.Store(edram_offset, xe_edram_clear_color32); if (sample_index.x + 1u < clear_rect.z) { xe_edram_load_store_dest.Store(edram_offset + 4u, xe_edram_clear_color32); diff --git a/src/xenia/gpu/d3d12/shaders/edram_clear_depth_float.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_clear_depth_float.cs.hlsl index 1b5ab59cf..3dc70ed73 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_clear_depth_float.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_clear_depth_float.cs.hlsl @@ -15,11 +15,11 @@ void main(uint3 xe_group_id : SV_GroupID, any(sample_index >= clear_rect.zw)) { return; } - uint2 tile_dword_index = xe_group_thread_id.xy; - tile_dword_index.x *= 2u; + uint2 tile_sample_index = xe_group_thread_id.xy; + tile_sample_index.x *= 2u; bool second_sample_inside = sample_index.x + 1u < clear_rect.z; // 24-bit depth. - uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index); + uint edram_offset = XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index); xe_edram_load_store_dest.Store(edram_offset, xe_edram_clear_depth24); [branch] if (second_sample_inside) { xe_edram_load_store_dest.Store(edram_offset + 4u, xe_edram_clear_depth24); diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl index 0bcdfb8e8..ead98d0cf 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl @@ -4,11 +4,11 @@ void main(uint3 xe_group_id : SV_GroupID, uint3 xe_group_thread_id : SV_GroupThreadID, uint3 xe_thread_id : SV_DispatchThreadID) { - uint2 tile_dword_index = xe_group_thread_id.xy; - tile_dword_index.x *= 4u; - uint4 pixels = xe_edram_load_store_source.Load4( - XeEDRAMOffset(xe_group_id.xy, tile_dword_index)); + uint2 tile_sample_index = xe_group_thread_id.xy; + tile_sample_index.x *= 4u; + uint4 samples = xe_edram_load_store_source.Load4( + XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index)); uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; - xe_edram_load_store_dest.Store4(rt_offset, pixels); + xe_edram_load_store_dest.Store4(rt_offset, samples); } diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl index a65aa4bf2..396462c85 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl @@ -1,19 +1,14 @@ #include "edram_load_store.hlsli" -[numthreads(40, 8, 1)] +[numthreads(40, 16, 1)] void main(uint3 xe_group_id : SV_GroupID, uint3 xe_group_thread_id : SV_GroupThreadID, uint3 xe_thread_id : SV_DispatchThreadID) { - // One tile contains 80x8 texels, and 2 rows within a 80x16 tile contain data - // from 1 render target row rather than 1. Threads with X 0-19 are for the - // first row, with 20-39 are for the second. - uint2 tile_dword_index = xe_group_thread_id.xy * uint2(4u, 2u); - [flatten] if (xe_group_thread_id.x >= 20u) { - tile_dword_index += uint2(uint(-80), 1u); - } - uint4 pixels = xe_edram_load_store_source.Load4( - XeEDRAMOffset(xe_group_id.xy, tile_dword_index)); + uint2 tile_sample_index = xe_group_thread_id.xy; + tile_sample_index.x *= 2u; + uint4 samples = xe_edram_load_store_source.Load4( + XeEDRAMOffset64bpp(xe_group_id.xy, tile_sample_index)); uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; - xe_edram_load_store_dest.Store4(rt_offset, pixels); + xe_edram_load_store_dest.Store4(rt_offset, samples); } diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl index 53c18687e..43b85be6e 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl @@ -5,16 +5,16 @@ void main(uint3 xe_group_id : SV_GroupID, uint3 xe_group_thread_id : SV_GroupThreadID, uint3 xe_thread_id : SV_DispatchThreadID) { - uint2 tile_dword_index = xe_group_thread_id.xy; - tile_dword_index.x *= 2u; - uint2 pixels_7e3_packed = xe_edram_load_store_source.Load2( - XeEDRAMOffset(xe_group_id.xy, tile_dword_index)); - uint4 pixel_0_f16u32 = XeFloat7e3To16(pixels_7e3_packed.x); - uint4 pixel_1_f16u32 = XeFloat7e3To16(pixels_7e3_packed.y); - uint4 pixels_f16u32_packed = - uint4(pixel_0_f16u32.xz, pixel_1_f16u32.xz) | - (uint4(pixel_0_f16u32.yw, pixel_1_f16u32.yw) << 16u); + uint2 tile_sample_index = xe_group_thread_id.xy; + tile_sample_index.x *= 2u; + uint2 samples_7e3_packed = xe_edram_load_store_source.Load2( + XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index)); + uint4 sample_0_f16u32 = XeFloat7e3To16(samples_7e3_packed.x); + uint4 sample_1_f16u32 = XeFloat7e3To16(samples_7e3_packed.y); + uint4 samples_f16u32_packed = + uint4(sample_0_f16u32.xz, sample_1_f16u32.xz) | + (uint4(sample_0_f16u32.yw, sample_1_f16u32.yw) << 16u); uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; - xe_edram_load_store_dest.Store4(rt_offset, pixels_f16u32_packed); + xe_edram_load_store_dest.Store4(rt_offset, samples_f16u32_packed); } diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl index 06eeb0080..b4c00fdd2 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl @@ -5,9 +5,9 @@ void main(uint3 xe_group_id : SV_GroupID, uint3 xe_group_thread_id : SV_GroupThreadID, uint3 xe_thread_id : SV_DispatchThreadID) { - uint2 tile_dword_index = xe_group_thread_id.xy; - tile_dword_index.x *= 4u; - uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index); + uint2 tile_sample_index = xe_group_thread_id.xy; + tile_sample_index.x *= 4u; + uint edram_offset = XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index); uint4 depth24_stencil = xe_edram_load_store_source.Load4(edram_offset); uint4 depth24 = depth24_stencil >> 8u; uint4 depth32 = xe_edram_load_store_source.Load4(10485760u + edram_offset); diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_depth_unorm.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_depth_unorm.cs.hlsl index 0fdbadd2b..d8bcc069f 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_load_depth_unorm.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_load_depth_unorm.cs.hlsl @@ -4,16 +4,16 @@ void main(uint3 xe_group_id : SV_GroupID, uint3 xe_group_thread_id : SV_GroupThreadID, uint3 xe_thread_id : SV_DispatchThreadID) { - uint2 tile_dword_index = xe_group_thread_id.xy; - tile_dword_index.x *= 4u; - uint4 pixels = xe_edram_load_store_source.Load4( - XeEDRAMOffset(xe_group_id.xy, tile_dword_index)); + uint2 tile_sample_index = xe_group_thread_id.xy; + tile_sample_index.x *= 4u; + uint4 samples = xe_edram_load_store_source.Load4( + XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index)); // Depth. uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; - xe_edram_load_store_dest.Store4(rt_offset, pixels >> 8u); + xe_edram_load_store_dest.Store4(rt_offset, samples >> 8u); // Stencil. - uint4 stencil = (pixels & 0xFFu) << uint4(0u, 8u, 16u, 24u); + uint4 stencil = (samples & 0xFFu) << uint4(0u, 8u, 16u, 24u); stencil.xy |= stencil.zw; stencil.x |= stencil.y; rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u + diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli b/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli index e572c2f03..dd17089a4 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli +++ b/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli @@ -25,10 +25,10 @@ cbuffer XeEDRAMLoadStoreConstants : register(b0) { // 16:17 - sample to load (16 - vertical index, 17 - horizontal index). // 18:20 - destination endianness. // 21:31 - BPP-specific info for swapping red/blue, 0 if not swapping. -// For 32 bits per pixel: +// For 32 bits per sample: // 21:25 - red/blue bit depth. // 26:30 - blue offset. -// For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47. +// For 64 bits per sample, it's 1 if need to swap 0:15 and 32:47. #define xe_edram_tile_sample_dest_info (xe_edram_load_store_constants.w) // For clearing. @@ -45,10 +45,20 @@ ByteAddressBuffer xe_edram_load_store_source : register(t0); #endif RWByteAddressBuffer xe_edram_load_store_dest : register(u0); -uint XeEDRAMOffset(uint2 tile_index, uint2 tile_dword_index) { +uint XeEDRAMOffset32bpp(uint2 tile_index, uint2 tile_sample_index) { return ((xe_edram_base_pitch_tiles & 2047u) + tile_index.y * (xe_edram_base_pitch_tiles >> 11u) + tile_index.x) * - 5120u + tile_dword_index.y * 320u + tile_dword_index.x * 4u; + 5120u + tile_sample_index.y * 320u + tile_sample_index.x * 4u; +} + +// Instead of individual tiles, this works on two consecutive tiles, the first +// one containing the top 80x8 samples, and the second one containing the bottom +// 80x8 samples. +uint XeEDRAMOffset64bpp(uint2 tile_pair_index, uint2 tile_pair_sample_index) { + return ((xe_edram_base_pitch_tiles & 2047u) + + tile_pair_index.y * (xe_edram_base_pitch_tiles >> 11u) + + (tile_pair_index.x << 1u)) * 5120u + + tile_pair_sample_index.y * 640u + tile_pair_sample_index.x * 8u; } #endif // XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_ diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_color_32bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_color_32bpp.cs.hlsl index db8038ae6..31c9badbc 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_store_color_32bpp.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_store_color_32bpp.cs.hlsl @@ -6,9 +6,9 @@ void main(uint3 xe_group_id : SV_GroupID, uint3 xe_thread_id : SV_DispatchThreadID) { uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; - uint4 pixels = xe_edram_load_store_source.Load4(rt_offset); - uint2 tile_dword_index = xe_group_thread_id.xy; - tile_dword_index.x *= 4u; + uint4 samples = xe_edram_load_store_source.Load4(rt_offset); + uint2 tile_sample_index = xe_group_thread_id.xy; + tile_sample_index.x *= 4u; xe_edram_load_store_dest.Store4( - XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels); + XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index), samples); } diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_color_64bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_color_64bpp.cs.hlsl index 7a91fe1b3..b6c13b3e3 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_store_color_64bpp.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_store_color_64bpp.cs.hlsl @@ -1,19 +1,14 @@ #include "edram_load_store.hlsli" -[numthreads(40, 8, 1)] +[numthreads(40, 16, 1)] void main(uint3 xe_group_id : SV_GroupID, uint3 xe_group_thread_id : SV_GroupThreadID, uint3 xe_thread_id : SV_DispatchThreadID) { uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; - uint4 pixels = xe_edram_load_store_source.Load4(rt_offset); - // One tile contains 80x8 texels, and 2 rows within a 80x16 tile contain data - // from 1 render target row rather than 1. Threads with X 0-19 are for the - // first row, with 20-39 are for the second. - uint2 tile_dword_index = xe_group_thread_id.xy * uint2(4u, 2u); - [flatten] if (xe_group_thread_id.x >= 20u) { - tile_dword_index += uint2(uint(-80), 1u); - } + uint4 samples = xe_edram_load_store_source.Load4(rt_offset); + uint2 tile_sample_index = xe_group_thread_id.xy; + tile_sample_index.x *= 2u; xe_edram_load_store_dest.Store4( - XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels); + XeEDRAMOffset64bpp(xe_group_id.xy, tile_sample_index), samples); } diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_color_7e3.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_color_7e3.cs.hlsl index 2beef5b4a..71d4e5f36 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_store_color_7e3.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_store_color_7e3.cs.hlsl @@ -7,13 +7,14 @@ void main(uint3 xe_group_id : SV_GroupID, uint3 xe_thread_id : SV_DispatchThreadID) { uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; - uint4 pixels_f16u32_packed = xe_edram_load_store_source.Load4(rt_offset); - uint4 pixel_0_f16u32 = pixels_f16u32_packed.xxyy >> uint4(0u, 16u, 0u, 16u); - uint4 pixel_1_f16u32 = pixels_f16u32_packed.zzww >> uint4(0u, 16u, 0u, 16u); - uint2 pixels_7e3_packed = - uint2(XeFloat16To7e3(pixel_0_f16u32), XeFloat16To7e3(pixel_1_f16u32)); - uint2 tile_dword_index = xe_group_thread_id.xy; - tile_dword_index.x *= 2u; + uint4 samples_f16u32_packed = xe_edram_load_store_source.Load4(rt_offset); + uint4 sample_0_f16u32 = samples_f16u32_packed.xxyy >> uint4(0u, 16u, 0u, 16u); + uint4 sample_1_f16u32 = samples_f16u32_packed.zzww >> uint4(0u, 16u, 0u, 16u); + uint2 samples_7e3_packed = + uint2(XeFloat16To7e3(sample_0_f16u32), XeFloat16To7e3(sample_1_f16u32)); + uint2 tile_sample_index = xe_group_thread_id.xy; + tile_sample_index.x *= 2u; xe_edram_load_store_dest.Store2( - XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels_7e3_packed); + XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index), + samples_7e3_packed); } diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl index 4134240a4..2b7fd6ed4 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl @@ -15,9 +15,9 @@ void main(uint3 xe_group_id : SV_GroupID, xe_edram_rt_stencil_offset; depth24_stencil |= (xe_edram_load_store_source.Load(rt_offset).xxxx >> uint4(0u, 8u, 16u, 24u)) & 0xFFu; - uint2 tile_dword_index = xe_group_thread_id.xy; - tile_dword_index.x *= 4u; - uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index); + uint2 tile_sample_index = xe_group_thread_id.xy; + tile_sample_index.x *= 4u; + uint edram_offset = XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index); // Store 24-bit depth for aliasing and checking if 32-bit depth is up to date. xe_edram_load_store_dest.Store4(edram_offset, depth24_stencil); // Store 32-bit depth so precision isn't lost when doing multipass rendering. diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl index 010cef44b..9e07aa497 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl @@ -7,15 +7,15 @@ void main(uint3 xe_group_id : SV_GroupID, // Depth. uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; - uint4 pixels = + uint4 samples = (xe_edram_load_store_source.Load4(rt_offset) & 0xFFFFFFu) << 8u; // Stencil. rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u + xe_edram_rt_stencil_offset; - pixels |= (xe_edram_load_store_source.Load(rt_offset).xxxx >> + samples |= (xe_edram_load_store_source.Load(rt_offset).xxxx >> uint4(0u, 8u, 16u, 24u)) & 0xFFu; - uint2 tile_dword_index = xe_group_thread_id.xy; - tile_dword_index.x *= 4u; + uint2 tile_sample_index = xe_group_thread_id.xy; + tile_sample_index.x *= 4u; xe_edram_load_store_dest.Store4( - XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels); + XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index), samples); } diff --git a/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl index e4c86f62a..b309ebaeb 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl @@ -23,7 +23,7 @@ void main(uint3 xe_group_id : SV_GroupID, (xe_edram_tile_sample_dest_info.xxxx >> uint4(15u, 14u, 17u, 16u)) & 1u; uint2 edram_tile_quarter = uint2(uint2(10u, 8u) <= xe_group_thread_id.xy) * sample_info.xy; - uint edram_offset = XeEDRAMOffset( + uint edram_offset = XeEDRAMOffset32bpp( (xe_group_id.xy << sample_info.xy) + edram_tile_quarter, (xe_group_thread_id.xy - edram_tile_quarter * uint2(10u, 8u)) << (sample_info.xy + uint2(2u, 0u)) + sample_info.zw); diff --git a/src/xenia/gpu/d3d12/shaders/edram_tile_sample_64bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_tile_sample_64bpp.cs.hlsl new file mode 100644 index 000000000..c2d00c61d --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/edram_tile_sample_64bpp.cs.hlsl @@ -0,0 +1,67 @@ +#include "byte_swap.hlsli" +#include "edram_load_store.hlsli" +#include "texture_address.hlsli" + +[numthreads(20, 16, 1)] +void main(uint3 xe_group_id : SV_GroupID, + uint3 xe_group_thread_id : SV_GroupThreadID, + uint3 xe_thread_id : SV_DispatchThreadID) { + // Check if not outside of the destination texture completely. + uint4 copy_rect; + copy_rect.xz = xe_edram_tile_sample_rect & 0xFFFFu; + copy_rect.yw = xe_edram_tile_sample_rect >> 16u; + uint2 texel_index = xe_thread_id.xy; + texel_index.x *= 4u; + [branch] if (any(texel_index < copy_rect.xy) || + any(texel_index >= copy_rect.zw)) { + return; + } + + // Get the samples from the EDRAM buffer. + // XY - log2(pixel size), ZW - selected sample offset. + uint4 sample_info = + (xe_edram_tile_sample_dest_info.xxxx >> uint4(15u, 14u, 17u, 16u)) & 1u; + uint2 edram_tile_quarter = + uint2(uint2(10u, 8u) <= xe_group_thread_id.xy) * sample_info.xy; + uint edram_offset = XeEDRAMOffset64bpp( + (xe_group_id.xy << sample_info.xy) + edram_tile_quarter, + (xe_group_thread_id.xy - edram_tile_quarter * uint2(10u, 8u)) << + (sample_info.xy + uint2(2u, 0u)) + sample_info.zw); + // Loaded with the first 2 pixels at 1x and 2x, or the first 1 pixel at 4x. + uint4 pixels_01 = xe_edram_load_store_source.Load4(edram_offset); + // Loaded with the second 2 pixels at 1x and 2x, or the second 1 pixel at 4x. + uint4 pixels_23 = xe_edram_load_store_source.Load4(edram_offset + 16u); + [branch] if (sample_info.x != 0u) { + // Rather than 4 pixels, at 4x, we only have 2 - in xy of each variable + // rather than in xyzw of pixels_01. Combine and load 2 more. + pixels_01.zw = pixels_23.xy; + pixels_23.xy = xe_edram_load_store_source.Load2(edram_offset + 32u); + pixels_23.zw = xe_edram_load_store_source.Load2(edram_offset + 48u); + } + + if ((xe_edram_tile_sample_dest_info >> 21u) != 0u) { + // Swap red and blue - all 64bpp formats where this is possible are + // 16:16:16:16. + pixels_01 = (pixels_01 & 0xFFFF0000u) | (pixels_01.yxwz & 0xFFFFu); + pixels_23 = (pixels_23 & 0xFFFF0000u) | (pixels_23.yxwz & 0xFFFFu); + } + + // Tile the pixels to the shared memory. + pixels_01 = XeByteSwap(pixels_01, xe_edram_tile_sample_dest_info >> 18u); + pixels_23 = XeByteSwap(pixels_23, xe_edram_tile_sample_dest_info >> 18u); + uint4 texel_addresses = + xe_edram_tile_sample_dest_base + + XeTextureTiledOffset2D(texel_index - copy_rect.xy, + xe_edram_tile_sample_dest_info & 16383u, 3u); + xe_edram_load_store_dest.Store2(texel_addresses.x, pixels_01.xy); + bool3 texels_in_rect = uint3(1u, 2u, 3u) + texel_index.x < copy_rect.z; + [branch] if (texels_in_rect.x) { + xe_edram_load_store_dest.Store2(texel_addresses.y, pixels_01.zw); + [branch] if (texels_in_rect.y) { + xe_edram_load_store_dest.Store2(texel_addresses.z, pixels_23.xy); + [branch] if (texels_in_rect.z) { + xe_edram_load_store_dest.Store2(texel_addresses.w, pixels_23.zw); + } + } + } +}