From 50470d67a881d3169a3aec898038042476f1389d Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sat, 25 Aug 2018 17:03:06 +0300 Subject: [PATCH] [D3D12] 32bpp and 20e4 clearing in resolves --- src/xenia/gpu/d3d12/render_target_cache.cc | 173 +++++++++++++++++- src/xenia/gpu/d3d12/render_target_cache.h | 26 ++- .../d3d12/shaders/edram_clear_32bpp.cs.hlsl | 25 +++ .../shaders/edram_clear_depth_float.cs.hlsl | 34 ++++ .../gpu/d3d12/shaders/edram_load_store.hlsli | 11 ++ 5 files changed, 265 insertions(+), 4 deletions(-) create mode 100644 src/xenia/gpu/d3d12/shaders/edram_clear_32bpp.cs.hlsl create mode 100644 src/xenia/gpu/d3d12/shaders/edram_clear_depth_float.cs.hlsl diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index 2f8683b2e..c9892406c 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -27,6 +27,8 @@ namespace gpu { namespace d3d12 { // Generated with `xb buildhlsl`. +#include "xenia/gpu/d3d12/shaders/bin/edram_clear_32bpp_cs.h" +#include "xenia/gpu/d3d12/shaders/bin/edram_clear_depth_float_cs.h" #include "xenia/gpu/d3d12/shaders/bin/edram_load_color_32bpp_cs.h" #include "xenia/gpu/d3d12/shaders/bin/edram_load_color_64bpp_cs.h" #include "xenia/gpu/d3d12/shaders/bin/edram_load_color_7e3_cs.h" @@ -150,6 +152,7 @@ bool RenderTargetCache::Initialize() { } if (load_store_root_error_blob != nullptr) { load_store_root_error_blob->Release(); + load_store_root_error_blob = nullptr; } if (FAILED(device->CreateRootSignature( 0, load_store_root_blob->GetBufferPointer(), @@ -162,6 +165,36 @@ bool RenderTargetCache::Initialize() { } load_store_root_blob->Release(); + // Create the clear root signature (the same, but with the UAV only). 
+ load_store_root_parameters[1].DescriptorTable.NumDescriptorRanges = 1; + ++load_store_root_parameters[1].DescriptorTable.pDescriptorRanges; + if (FAILED(D3D12SerializeRootSignature( + &load_store_root_desc, D3D_ROOT_SIGNATURE_VERSION_1, + &load_store_root_blob, &load_store_root_error_blob))) { + XELOGE("Failed to serialize the EDRAM buffer clear root signature"); + if (load_store_root_error_blob != nullptr) { + XELOGE("%s", reinterpret_cast( + load_store_root_error_blob->GetBufferPointer())); + load_store_root_error_blob->Release(); + } + Shutdown(); + return false; + } + if (load_store_root_error_blob != nullptr) { + load_store_root_error_blob->Release(); + load_store_root_error_blob = nullptr; + } + if (FAILED(device->CreateRootSignature( + 0, load_store_root_blob->GetBufferPointer(), + load_store_root_blob->GetBufferSize(), + IID_PPV_ARGS(&edram_clear_root_signature_)))) { + XELOGE("Failed to create the EDRAM buffer clear root signature"); + load_store_root_blob->Release(); + Shutdown(); + return false; + } + load_store_root_blob->Release(); + // Create the load/store pipelines. D3D12_COMPUTE_PIPELINE_STATE_DESC pipeline_desc; pipeline_desc.pRootSignature = edram_load_store_root_signature_; @@ -203,6 +236,29 @@ bool RenderTargetCache::Initialize() { } edram_tile_sample_32bpp_pipeline_->SetName(L"EDRAM Raw Resolve 32bpp"); + // Create the clear pipelines. + pipeline_desc.pRootSignature = edram_clear_root_signature_; + // 32-bit color or unorm depth. + pipeline_desc.CS.pShaderBytecode = edram_clear_32bpp_cs; + pipeline_desc.CS.BytecodeLength = sizeof(edram_clear_32bpp_cs); + if (FAILED(device->CreateComputePipelineState( + &pipeline_desc, IID_PPV_ARGS(&edram_clear_32bpp_pipeline_)))) { + XELOGE("Failed to create the EDRAM 32bpp clear pipeline"); + Shutdown(); + return false; + } + edram_clear_32bpp_pipeline_->SetName(L"EDRAM Clear 32bpp"); + // Float depth. 
+ pipeline_desc.CS.pShaderBytecode = edram_clear_depth_float_cs; + pipeline_desc.CS.BytecodeLength = sizeof(edram_clear_depth_float_cs); + if (FAILED(device->CreateComputePipelineState( + &pipeline_desc, IID_PPV_ARGS(&edram_clear_depth_float_pipeline_)))) { + XELOGE("Failed to create the EDRAM float depth clear pipeline"); + Shutdown(); + return false; + } + edram_clear_depth_float_pipeline_->SetName(L"EDRAM Clear Float Depth"); + // Create the converting resolve root signature. D3D12_ROOT_PARAMETER resolve_root_parameters[2]; // Parameter 0 is constants. @@ -295,6 +351,14 @@ void RenderTargetCache::Shutdown() { edram_tile_sample_32bpp_pipeline_->Release(); edram_tile_sample_32bpp_pipeline_ = nullptr; } + if (edram_clear_depth_float_pipeline_ != nullptr) { + edram_clear_depth_float_pipeline_->Release(); + edram_clear_depth_float_pipeline_ = nullptr; + } + if (edram_clear_32bpp_pipeline_ != nullptr) { + edram_clear_32bpp_pipeline_->Release(); + edram_clear_32bpp_pipeline_ = nullptr; + } for (uint32_t i = 0; i < uint32_t(EDRAMLoadStoreMode::kCount); ++i) { if (edram_load_pipelines_[i] != nullptr) { edram_load_pipelines_[i]->Release(); @@ -305,6 +369,10 @@ void RenderTargetCache::Shutdown() { edram_store_pipelines_[i] = nullptr; } } + if (edram_clear_root_signature_ != nullptr) { + edram_clear_root_signature_->Release(); + edram_clear_root_signature_ = nullptr; + } if (edram_load_store_root_signature_ != nullptr) { edram_load_store_root_signature_->Release(); edram_load_store_root_signature_ = nullptr; @@ -924,8 +992,9 @@ bool RenderTargetCache::Resolve(SharedMemory* shared_memory, bool copied = ResolveCopy(shared_memory, texture_cache, surface_edram_base, surface_pitch, msaa_samples, surface_is_depth, surface_format, src_rect); - // TODO(Triang3l): Clear. 
-  return copied;
+  bool cleared = ResolveClear(surface_edram_base, surface_pitch, msaa_samples,
+                              surface_is_depth, surface_format, src_rect);
+  return copied || cleared;
 }

 bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
@@ -1459,6 +1528,114 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
   return true;
 }

+bool RenderTargetCache::ResolveClear(uint32_t edram_base,
+                                     uint32_t surface_pitch,
+                                     MsaaSamples msaa_samples, bool is_depth,
+                                     uint32_t format, const D3D12_RECT& rect) {
+  auto& regs = *register_file_;
+
+  // Check if clearing is enabled - bit 9 of RB_COPY_CONTROL is tested for
+  // depth, bit 8 for color. A disabled clear is not a failure.
+  uint32_t rb_copy_control = regs[XE_GPU_REG_RB_COPY_CONTROL].u32;
+  if (!(rb_copy_control & (is_depth ? (1 << 9) : (1 << 8)))) {
+    return true;
+  }
+
+  // Calculate the layout.
+  bool is_64bpp =
+      !is_depth && IsColorFormat64bpp(ColorRenderTargetFormat(format));
+  D3D12_RECT clear_rect = rect;
+  uint32_t surface_pitch_tiles, row_tiles, rows;
+  if (!GetEDRAMLayout(surface_pitch, msaa_samples, is_64bpp, edram_base,
+                      clear_rect, surface_pitch_tiles, row_tiles, rows)) {
+    // Nothing to clear.
+    return true;
+  }
+  if (is_64bpp) {
+    // TODO(Triang3l): 64bpp color clear.
+    // Failing here rather than while choosing the pipeline, so no descriptor
+    // is requested and no barrier is submitted for a clear that won't happen.
+    return false;
+  }
+  uint32_t samples_x_log2 = msaa_samples >= MsaaSamples::k4X ? 1 : 0;
+  uint32_t samples_y_log2 = msaa_samples >= MsaaSamples::k2X ? 1 : 0;
+
+  // Get everything needed for clearing.
+  auto command_list = command_processor_->GetCurrentCommandList();
+  if (command_list == nullptr) {
+    return false;
+  }
+  auto device =
+      command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice();
+  D3D12_CPU_DESCRIPTOR_HANDLE descriptor_cpu_start;
+  D3D12_GPU_DESCRIPTOR_HANDLE descriptor_gpu_start;
+  if (command_processor_->RequestViewDescriptors(0, 1, 1, descriptor_cpu_start,
+                                                 descriptor_gpu_start) == 0) {
+    return false;
+  }
+
+  // Submit the clear.
+  command_processor_->PushTransitionBarrier(
+      edram_buffer_, edram_buffer_state_,
+      D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
+  edram_buffer_state_ = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+  command_processor_->SubmitBarriers();
+  // Zero-initialized because the whole structure is uploaded, including the
+  // constants not written below (clear_color_low in the 32bpp case).
+  EDRAMLoadStoreRootConstants root_constants = {};
+  // The rectangle is passed in samples, X in the lower 16 bits, Y in the upper.
+  root_constants.clear_rect_lt = (clear_rect.left << samples_x_log2) |
+                                 (clear_rect.top << (16 + samples_y_log2));
+  root_constants.clear_rect_rb = (clear_rect.right << samples_x_log2) |
+                                 (clear_rect.bottom << (16 + samples_y_log2));
+  root_constants.base_pitch_tiles = edram_base | (surface_pitch_tiles << 11);
+  if (is_depth &&
+      DepthRenderTargetFormat(format) == DepthRenderTargetFormat::kD24FS8) {
+    root_constants.clear_depth24 = regs[XE_GPU_REG_RB_DEPTH_CLEAR].u32;
+    // 20e4 [0,2), based on CFloat24 from d3dref9.dll and on 6e4 in DirectXTex.
+    uint32_t depth24 = root_constants.clear_depth24 >> 8;
+    if (depth24 == 0) {
+      root_constants.clear_depth32 = 0;
+    } else {
+      uint32_t mantissa = depth24 & 0xFFFFFu, exponent = depth24 >> 20;
+      if (exponent == 0) {
+        // Normalize the value in the resulting float.
+        // do { Exponent--; Mantissa <<= 1; } while ((Mantissa & 0x100000) == 0)
+        uint32_t mantissa_lzcnt = xe::lzcnt(mantissa) - (32u - 21u);
+        exponent = 1u - mantissa_lzcnt;
+        mantissa = (mantissa << mantissa_lzcnt) & 0xFFFFFu;
+      }
+      root_constants.clear_depth32 =
+          ((exponent + 112u) << 23) | (mantissa << 3);
+    }
+    command_processor_->SetComputePipeline(edram_clear_depth_float_pipeline_);
+  } else {
+    // 32-bit color or unorm depth (64bpp was rejected above).
+    Register reg =
+        is_depth ? XE_GPU_REG_RB_DEPTH_CLEAR : XE_GPU_REG_RB_COLOR_CLEAR;
+    root_constants.clear_color_high = regs[reg].u32;
+    command_processor_->SetComputePipeline(edram_clear_32bpp_pipeline_);
+  }
+  command_list->SetComputeRootSignature(edram_clear_root_signature_);
+  command_list->SetComputeRoot32BitConstants(
+      0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
+  D3D12_UNORDERED_ACCESS_VIEW_DESC uav_desc;
+  uav_desc.Format = DXGI_FORMAT_R32_TYPELESS;
+  uav_desc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
+  uav_desc.Buffer.FirstElement = 0;
+  uav_desc.Buffer.NumElements = 2 * 2048 * 1280;
+  uav_desc.Buffer.StructureByteStride = 0;
+  uav_desc.Buffer.CounterOffsetInBytes = 0;
+  uav_desc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_RAW;
+  device->CreateUnorderedAccessView(edram_buffer_, nullptr, &uav_desc,
+                                    descriptor_cpu_start);
+  command_list->SetComputeRootDescriptorTable(1, descriptor_gpu_start);
+  command_list->Dispatch(row_tiles, rows, 1);
+  command_processor_->PushUAVBarrier(edram_buffer_);
+
+  return true;
+}
+
 ID3D12PipelineState* RenderTargetCache::GetResolvePipeline(
     DXGI_FORMAT dest_format) {
   // Try to find an existing pipeline.
diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h
index dbd3a4b08..acf35bc3e 100644
--- a/src/xenia/gpu/d3d12/render_target_cache.h
+++ b/src/xenia/gpu/d3d12/render_target_cache.h
@@ -399,6 +399,10 @@ class RenderTargetCache {
                    uint32_t edram_base, uint32_t surface_pitch,
                    MsaaSamples msaa_samples, bool is_depth, uint32_t src_format,
                    const D3D12_RECT& src_rect);
+  // Performs the clearing part of a resolve.
+ bool ResolveClear(uint32_t edram_base, uint32_t surface_pitch, + MsaaSamples msaa_samples, bool is_depth, uint32_t format, + const D3D12_RECT& rect); ID3D12PipelineState* GetResolvePipeline(DXGI_FORMAT dest_format); // Returns any available resolve target placed at least at @@ -416,8 +420,9 @@ class RenderTargetCache { D3D12_RESOURCE_STATES edram_buffer_state_; bool edram_buffer_cleared_; - // EDRAM buffer load/store root signature. + // EDRAM root signatures. ID3D12RootSignature* edram_load_store_root_signature_ = nullptr; + ID3D12RootSignature* edram_clear_root_signature_ = nullptr; struct EDRAMLoadStoreRootConstants { union { struct { @@ -443,11 +448,26 @@ class RenderTargetCache { // For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47. uint32_t tile_sample_dest_info; }; + struct { + // 16 bits for X, 16 bits for Y. + uint32_t clear_rect_lt; + uint32_t clear_rect_rb; + union { + struct { + uint32_t clear_color_high; + uint32_t clear_color_low; + }; + struct { + uint32_t clear_depth24; + uint32_t clear_depth32; + }; + }; + }; }; // Base in the lower 11 bits, pitch above. uint32_t base_pitch_tiles; }; - // EDRAM buffer load/store pipelines. + // EDRAM pipelines. static const EDRAMLoadStoreModeInfo edram_load_store_mode_info_[size_t(EDRAMLoadStoreMode::kCount)]; ID3D12PipelineState* @@ -455,6 +475,8 @@ class RenderTargetCache { ID3D12PipelineState* edram_store_pipelines_[size_t(EDRAMLoadStoreMode::kCount)] = {}; ID3D12PipelineState* edram_tile_sample_32bpp_pipeline_ = nullptr; + ID3D12PipelineState* edram_clear_32bpp_pipeline_ = nullptr; + ID3D12PipelineState* edram_clear_depth_float_pipeline_ = nullptr; // 48 MB heaps backing used render targets resources, created when needed. 
// 24 MB proved to be not enough to store a single render target occupying the
diff --git a/src/xenia/gpu/d3d12/shaders/edram_clear_32bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_clear_32bpp.cs.hlsl
new file mode 100644
index 000000000..1609e1194
--- /dev/null
+++ b/src/xenia/gpu/d3d12/shaders/edram_clear_32bpp.cs.hlsl
@@ -0,0 +1,37 @@
+#define XE_EDRAM_WRITE_ONLY
+#include "edram_load_store.hlsli"
+
+// Clears 32bpp samples (color or unorm depth) in the EDRAM buffer.
+// 1 thread group = 1 tile, 1 thread = 2 horizontally adjacent samples.
+// Load4/Store4 aren't needed here, but 80x16 threads is over the limit.
+[numthreads(40, 16, 1)]
+void main(uint3 xe_group_id : SV_GroupID,
+          uint3 xe_group_thread_id : SV_GroupThreadID,
+          uint3 xe_thread_id : SV_DispatchThreadID) {
+  // Unpack to left, top, right, bottom.
+  uint4 clear_rect;
+  clear_rect.xz = xe_edram_clear_rect & 0xFFFFu;
+  clear_rect.yw = xe_edram_clear_rect >> 16u;
+  uint2 sample_index = xe_thread_id.xy;
+  sample_index.x *= 2u;
+  // Test both samples of the pair separately - the rectangle may start at the
+  // second one (odd left) as well as end after the first one (odd right).
+  bool first_sample_inside =
+      sample_index.x >= clear_rect.x && sample_index.x < clear_rect.z;
+  bool second_sample_inside = sample_index.x + 1u >= clear_rect.x &&
+                              sample_index.x + 1u < clear_rect.z;
+  [branch] if (sample_index.y < clear_rect.y ||
+               sample_index.y >= clear_rect.w ||
+               !(first_sample_inside || second_sample_inside)) {
+    return;
+  }
+  uint2 tile_dword_index = xe_group_thread_id.xy;
+  tile_dword_index.x *= 2u;
+  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
+  [branch] if (first_sample_inside) {
+    xe_edram_load_store_dest.Store(edram_offset, xe_edram_clear_color32);
+  }
+  [branch] if (second_sample_inside) {
+    xe_edram_load_store_dest.Store(edram_offset + 4u, xe_edram_clear_color32);
+  }
+}
diff --git a/src/xenia/gpu/d3d12/shaders/edram_clear_depth_float.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_clear_depth_float.cs.hlsl
new file mode 100644
index 000000000..1b5ab59cf
--- /dev/null
+++ b/src/xenia/gpu/d3d12/shaders/edram_clear_depth_float.cs.hlsl
@@ -0,0 +1,48 @@
+#define XE_EDRAM_WRITE_ONLY
+#include "edram_load_store.hlsli"
+
+// Clears float depth samples in the EDRAM buffer: writes the 24-bit value and
+// the 32-bit value (10485760 bytes further in the buffer) for each sample.
+// 1 thread group = 1 tile, 1 thread = 2 horizontally adjacent samples.
+// Load4/Store4 aren't needed here, but 80x16 threads is over the limit.
+[numthreads(40, 16, 1)]
+void main(uint3 xe_group_id : SV_GroupID,
+          uint3 xe_group_thread_id : SV_GroupThreadID,
+          uint3 xe_thread_id : SV_DispatchThreadID) {
+  // Unpack to left, top, right, bottom.
+  uint4 clear_rect;
+  clear_rect.xz = xe_edram_clear_rect & 0xFFFFu;
+  clear_rect.yw = xe_edram_clear_rect >> 16u;
+  uint2 sample_index = xe_thread_id.xy;
+  sample_index.x *= 2u;
+  // Test both samples of the pair separately - the rectangle may start at the
+  // second one (odd left) as well as end after the first one (odd right).
+  bool first_sample_inside =
+      sample_index.x >= clear_rect.x && sample_index.x < clear_rect.z;
+  bool second_sample_inside = sample_index.x + 1u >= clear_rect.x &&
+                              sample_index.x + 1u < clear_rect.z;
+  [branch] if (sample_index.y < clear_rect.y ||
+               sample_index.y >= clear_rect.w ||
+               !(first_sample_inside || second_sample_inside)) {
+    return;
+  }
+  uint2 tile_dword_index = xe_group_thread_id.xy;
+  tile_dword_index.x *= 2u;
+  // 24-bit depth.
+  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
+  [branch] if (first_sample_inside) {
+    xe_edram_load_store_dest.Store(edram_offset, xe_edram_clear_depth24);
+  }
+  [branch] if (second_sample_inside) {
+    xe_edram_load_store_dest.Store(edram_offset + 4u, xe_edram_clear_depth24);
+  }
+  // 32-bit depth (pre-converted on the CPU).
+  [branch] if (first_sample_inside) {
+    xe_edram_load_store_dest.Store(edram_offset + 10485760u,
+                                   xe_edram_clear_depth32);
+  }
+  [branch] if (second_sample_inside) {
+    xe_edram_load_store_dest.Store(edram_offset + 10485764u,
+                                   xe_edram_clear_depth32);
+  }
+}
diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli b/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli
index e55b783a4..e572c2f03 100644
--- a/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli
@@ -31,7 +31,18 @@ cbuffer XeEDRAMLoadStoreConstants : register(b0) {
 // For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47.
 #define xe_edram_tile_sample_dest_info (xe_edram_load_store_constants.w)

+// For clearing.
+// Cleared region (relative to EDRAM base), in samples - left/top in .x,
+// right/bottom in .y, with X in the lower 16 bits and Y in the upper.
+#define xe_edram_clear_rect (xe_edram_load_store_constants.xy) +#define xe_edram_clear_color32 (xe_edram_load_store_constants.z) +#define xe_edram_clear_color64 (xe_edram_load_store_constants.zw) +#define xe_edram_clear_depth24 (xe_edram_load_store_constants.z) +#define xe_edram_clear_depth32 (xe_edram_load_store_constants.w) + +#ifndef XE_EDRAM_WRITE_ONLY ByteAddressBuffer xe_edram_load_store_source : register(t0); +#endif RWByteAddressBuffer xe_edram_load_store_dest : register(u0); uint XeEDRAMOffset(uint2 tile_index, uint2 tile_dword_index) {