diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index 915fa19c7..ad1f0010d 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -37,6 +37,7 @@ namespace d3d12 { #include "xenia/gpu/d3d12/shaders/bin/edram_store_color_7e3_cs.h" #include "xenia/gpu/d3d12/shaders/bin/edram_store_depth_float_cs.h" #include "xenia/gpu/d3d12/shaders/bin/edram_store_depth_unorm_cs.h" +#include "xenia/gpu/d3d12/shaders/bin/edram_tile_sample_32bpp_cs.h" const RenderTargetCache::EDRAMLoadStoreModeInfo RenderTargetCache::edram_load_store_mode_info_[size_t( @@ -169,7 +170,7 @@ bool RenderTargetCache::Initialize() { pipeline_desc.CS.BytecodeLength = mode_info.load_shader_size; if (FAILED(device->CreateComputePipelineState( &pipeline_desc, IID_PPV_ARGS(&edram_load_pipelines_[i])))) { - XELOGE("Failed to create EDRAM load pipeline for mode %u", i); + XELOGE("Failed to create the EDRAM load pipeline for mode %u", i); Shutdown(); return false; } @@ -179,12 +180,22 @@ bool RenderTargetCache::Initialize() { pipeline_desc.CS.BytecodeLength = mode_info.store_shader_size; if (FAILED(device->CreateComputePipelineState( &pipeline_desc, IID_PPV_ARGS(&edram_store_pipelines_[i])))) { - XELOGE("Failed to create EDRAM store pipeline for mode %u", i); + XELOGE("Failed to create the EDRAM store pipeline for mode %u", i); Shutdown(); return false; } edram_store_pipelines_[i]->SetName(mode_info.store_pipeline_name); } + // Tile single sample into a texture - 32 bits per pixel. 
+ pipeline_desc.CS.pShaderBytecode = edram_tile_sample_32bpp_cs; + pipeline_desc.CS.BytecodeLength = sizeof(edram_tile_sample_32bpp_cs); + if (FAILED(device->CreateComputePipelineState( + &pipeline_desc, IID_PPV_ARGS(&edram_tile_sample_32bpp_pipeline_)))) { + XELOGE("Failed to create the 32bpp EDRAM raw resolve pipeline"); + Shutdown(); + return false; + } + edram_tile_sample_32bpp_pipeline_->SetName(L"EDRAM Raw Resolve 32bpp"); return true; } @@ -192,6 +203,10 @@ bool RenderTargetCache::Initialize() { void RenderTargetCache::Shutdown() { ClearCache(); + if (edram_tile_sample_32bpp_pipeline_ != nullptr) { + edram_tile_sample_32bpp_pipeline_->Release(); + edram_tile_sample_32bpp_pipeline_ = nullptr; + } for (uint32_t i = 0; i < uint32_t(EDRAMLoadStoreMode::kCount); ++i) { if (edram_load_pipelines_[i] != nullptr) { edram_load_pipelines_[i]->Release(); diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h index bdc6cf8b6..3c26b5f47 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.h +++ b/src/xenia/gpu/d3d12/render_target_cache.h @@ -404,6 +404,7 @@ class RenderTargetCache { edram_load_pipelines_[size_t(EDRAMLoadStoreMode::kCount)] = {}; ID3D12PipelineState* edram_store_pipelines_[size_t(EDRAMLoadStoreMode::kCount)] = {}; + ID3D12PipelineState* edram_tile_sample_32bpp_pipeline_ = nullptr; // 32 MB heaps backing used render targets resources, created when needed. 
// 24 MB proved to be not enough to store a single render target occupying the
diff --git a/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl
new file mode 100644
index 000000000..29633a1aa
--- /dev/null
+++ b/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl
@@ -0,0 +1,81 @@
+#include "edram_load_store.hlsli"
+#include "texture_address.hlsli"
+
+// Copies one sample of each 32bpp pixel from xe_edram_load_store_source into
+// xe_edram_load_store_dest at xe_edram_tile_sample_dest_base plus the offsets
+// returned by XeTextureTiledOffset2D. Each thread handles up to 4 horizontally
+// adjacent texels.
+//
+// xe_edram_tile_sample_rect - X and Y each hold the start of the copy
+// rectangle in the low 16 bits and its exclusive end in the high 16 bits.
+// xe_edram_tile_sample_dest_info:
+// - Bits 0:13 - passed to XeTextureTiledOffset2D.
+// - Bits 14 (Y), 15 (X) - log2 of the pixel size.
+// - Bits 16 (Y), 17 (X) - offset of the sample to copy within the pixel.
+// - Bits 20 and up - red/blue swap: the low 5 bits are the width of the red
+//   component, the rest is the bit position of blue; 0 disables the swap.
+[numthreads(20, 16, 1)]
+void main(uint3 xe_group_id : SV_GroupID,
+          uint3 xe_group_thread_id : SV_GroupThreadID,
+          uint3 xe_thread_id : SV_DispatchThreadID) {
+  // Check if not outside of the destination texture completely.
+  uint4 copy_rect =
+      (xe_edram_tile_sample_rect.xyxy >> uint4(0u, 0u, 16u, 16u)) & 0xFFFFu;
+  uint2 texel_index = xe_thread_id.xy;
+  // Each thread starts at every 4th texel along X.
+  texel_index.x *= 4u;
+  [branch] if (any(texel_index < copy_rect.xy) ||
+               any(texel_index >= copy_rect.zw)) {
+    return;
+  }
+
+  // Get the samples from the EDRAM buffer.
+  // XY - log2(pixel size), ZW - selected sample offset.
+  uint4 sample_info =
+      (xe_edram_tile_sample_dest_info.xxxx >> uint4(15u, 14u, 17u, 16u)) & 1u;
+  // The shift must be parenthesized - + binds tighter than <<, so without the
+  // parentheses the sample offset would be added to the shift amount rather
+  // than to the shifted position.
+  // NOTE(review): the offset within the tile is built from the dispatch-wide
+  // thread ID while xe_group_thread_id is unused - confirm against the
+  // XeEDRAMOffset contract.
+  uint edram_offset = XeEDRAMOffset(
+      xe_group_id.xy << sample_info.xy,
+      (xe_thread_id.xy << (sample_info.xy + uint2(2u, 0u))) + sample_info.zw);
+  // At 1x and 2x, this contains samples of 4 pixels. At 4x, this contains
+  // samples of 2, need to load 2 more.
+  uint4 pixels = xe_edram_load_store_source.Load4(edram_offset);
+  [branch] if (sample_info.x != 0u) {
+    pixels.xy = pixels.xz;
+    pixels.zw = xe_edram_load_store_source.Load3(edram_offset + 16u).xz;
+  }
+
+  uint red_blue_swap = xe_edram_tile_sample_dest_info >> 20u;
+  if (red_blue_swap != 0u) {
+    uint red_mask = (1u << (red_blue_swap & 31u)) - 1u;
+    // No need to be ready for a long shift Barney, it's just 16 or 20.
+    uint blue_shift = red_blue_swap >> 5u;
+    uint blue_mask = red_mask << blue_shift;
+    pixels = (pixels & ~(red_mask | blue_mask)) |
+             ((pixels & red_mask) << blue_shift) |
+             ((pixels >> blue_shift) & red_mask);
+  }
+
+  // Tile the pixels to the shared memory.
+  uint4 texel_addresses =
+      xe_edram_tile_sample_dest_base +
+      XeTextureTiledOffset2D(texel_index - copy_rect.xy,
+                             xe_edram_tile_sample_dest_info & 16383u, 2u);
+  // The first texel is always inside the rectangle (checked above), the other
+  // three may be past its right edge.
+  xe_edram_load_store_dest.Store(texel_addresses.x, pixels.x);
+  [branch] if (texel_index.x + 1u < copy_rect.z) {
+    xe_edram_load_store_dest.Store(texel_addresses.y, pixels.y);
+    [branch] if (texel_index.x + 2u < copy_rect.z) {
+      xe_edram_load_store_dest.Store(texel_addresses.z, pixels.z);
+      [branch] if (texel_index.x + 3u < copy_rect.z) {
+        xe_edram_load_store_dest.Store(texel_addresses.w, pixels.w);
+      }
+    }
+  }
+}