From 952bb91c3f757e2048aaa2748d15639699933644 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Fri, 10 Aug 2018 18:06:21 +0300 Subject: [PATCH] [D3D12] Render target binding --- .../gpu/d3d12/d3d12_command_processor.cc | 42 +- src/xenia/gpu/d3d12/d3d12_command_processor.h | 4 +- src/xenia/gpu/d3d12/pipeline_cache.cc | 82 ++-- src/xenia/gpu/d3d12/pipeline_cache.h | 41 +- src/xenia/gpu/d3d12/render_target_cache.cc | 462 ++++++++++++++---- src/xenia/gpu/d3d12/render_target_cache.h | 59 ++- src/xenia/gpu/hlsl_shader_translator.cc | 20 +- src/xenia/gpu/hlsl_shader_translator.h | 3 + 8 files changed, 551 insertions(+), 162 deletions(-) diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index b8c90444e..33d9f3448 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -489,10 +489,6 @@ bool D3D12CommandProcessor::SetupContext() { render_target_cache_ = std::make_unique(this, register_file_); - if (!render_target_cache_->Initialize()) { - XELOGE("Failed to initialize the render target cache"); - return false; - } return true; } @@ -652,9 +648,9 @@ bool D3D12CommandProcessor::IssueDraw(PrimitiveType primitive_type, } if (reset_index != reset_index_expected) { // Only 0xFFFF and 0xFFFFFFFF primitive restart indices are supported by - // Direct3D 12 (endianness doesn't matter for them). However, Direct3D 9 - // uses 0xFFFF as the reset index. With shared memory, it's impossible to - // replace the cut index in the buffer without affecting the game memory. + // Direct3D 12 (endianness doesn't matter for them). With shared memory, + // it's impossible to replace the cut index in the buffer without + // affecting the game memory. XELOGE( "The game uses the primitive restart index 0x%X that isn't 0xFFFF or " "0xFFFFFFFF. 
Report the game to Xenia developers so geometry shaders " @@ -678,14 +674,19 @@ bool D3D12CommandProcessor::IssueDraw(PrimitiveType primitive_type, pixel_shader = nullptr; } else if (!pixel_shader) { // Need a pixel shader in normal color mode. - return true; + return false; } bool new_frame = BeginFrame(); auto command_list = GetCurrentCommandList(); // Set up the render targets - this may bind pipelines. - render_target_cache_->UpdateRenderTargets(); + if (!render_target_cache_->UpdateRenderTargets()) { + // Doesn't actually draw. + return true; + } + const RenderTargetCache::PipelineRenderTarget* pipeline_render_targets = + render_target_cache_->GetCurrentPipelineRenderTargets(); // Set the primitive topology. D3D_PRIMITIVE_TOPOLOGY primitive_topology; @@ -715,8 +716,8 @@ bool D3D12CommandProcessor::IssueDraw(PrimitiveType primitive_type, ID3D12RootSignature* root_signature; auto pipeline_status = pipeline_cache_->ConfigurePipeline( vertex_shader, pixel_shader, primitive_type, - indexed ? index_buffer_info->format : IndexFormat::kInt16, &pipeline, - &root_signature); + indexed ? index_buffer_info->format : IndexFormat::kInt16, + pipeline_render_targets, &pipeline, &root_signature); if (pipeline_status == PipelineCache::UpdateStatus::kError) { return false; } @@ -733,8 +734,9 @@ bool D3D12CommandProcessor::IssueDraw(PrimitiveType primitive_type, SetPipeline(pipeline); // Update system constants before uploading them. - UpdateSystemConstantValues(indexed ? index_buffer_info->endianness - : Endian::kUnspecified); + UpdateSystemConstantValues( + indexed ? index_buffer_info->endianness : Endian::kUnspecified, + pipeline_render_targets); // Update constant buffers, descriptors and root parameters. 
if (!UpdateBindings(command_list, vertex_shader, pixel_shader, @@ -1022,7 +1024,9 @@ void D3D12CommandProcessor::UpdateFixedFunctionState( } } -void D3D12CommandProcessor::UpdateSystemConstantValues(Endian index_endian) { +void D3D12CommandProcessor::UpdateSystemConstantValues( + Endian index_endian, + const RenderTargetCache::PipelineRenderTarget render_targets[4]) { auto& regs = *register_file_; uint32_t vgt_indx_offset = regs[XE_GPU_REG_VGT_INDX_OFFSET].u32; uint32_t pa_cl_vte_cntl = regs[XE_GPU_REG_PA_CL_VTE_CNTL].u32; @@ -1067,7 +1071,7 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(Endian index_endian) { // viewport that is used to emulate unnormalized coordinates. // Z scale/offset is to convert from OpenGL NDC to Direct3D NDC if needed. // Also apply half-pixel offset to reproduce Direct3D 9 rasterization rules. - // TODO(Triang3l): Check if pixel coordinates need to offset depending on a + // TODO(Triang3l): Check if pixel coordinates need to be offset depending on a // different register (and if there's such register at all). bool gl_clip_space_def = !(pa_cl_clip_cntl & (1 << 19)) && (pa_cl_vte_cntl & (1 << 4)); @@ -1127,6 +1131,14 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(Endian index_endian) { system_constants_.ssaa_inv_scale[0] = ssaa_inv_scale_x; system_constants_.ssaa_inv_scale[1] = ssaa_inv_scale_y; + // Color output index mapping. 
+ for (uint32_t i = 0; i < 4; ++i) { + dirty |= system_constants_.color_output_map[i] != + render_targets[i].guest_render_target; + system_constants_.color_output_map[i] = + render_targets[i].guest_render_target; + } + cbuffer_bindings_system_.up_to_date &= dirty; } diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 085ab2a5a..c958f471a 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -148,7 +148,9 @@ class D3D12CommandProcessor : public CommandProcessor { bool EndFrame(); void UpdateFixedFunctionState(ID3D12GraphicsCommandList* command_list); - void UpdateSystemConstantValues(Endian index_endian); + void UpdateSystemConstantValues( + Endian index_endian, + const RenderTargetCache::PipelineRenderTarget render_targets[4]); bool UpdateBindings(ID3D12GraphicsCommandList* command_list, const D3D12Shader* vertex_shader, const D3D12Shader* pixel_shader, diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index 137359d6c..35ea825df 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -9,6 +9,7 @@ #include "xenia/gpu/d3d12/pipeline_cache.h" +#include #include #include #include @@ -17,6 +18,7 @@ #include "xenia/base/logging.h" #include "xenia/base/profiling.h" #include "xenia/gpu/d3d12/d3d12_command_processor.h" +#include "xenia/gpu/d3d12/render_target_cache.h" #include "xenia/gpu/gpu_flags.h" #include "xenia/gpu/hlsl_shader_translator.h" @@ -67,6 +69,7 @@ D3D12Shader* PipelineCache::LoadShader(ShaderType shader_type, PipelineCache::UpdateStatus PipelineCache::ConfigurePipeline( D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, PrimitiveType primitive_type, IndexFormat index_format, + const RenderTargetCache::PipelineRenderTarget render_targets[5], ID3D12PipelineState** pipeline_out, ID3D12RootSignature** root_signature_out) { #if FINE_GRAINED_DRAW_SCOPES @@ 
-77,8 +80,8 @@ PipelineCache::UpdateStatus PipelineCache::ConfigurePipeline( assert_not_null(root_signature_out); Pipeline* pipeline = nullptr; - auto update_status = - UpdateState(vertex_shader, pixel_shader, primitive_type, index_format); + auto update_status = UpdateState(vertex_shader, pixel_shader, primitive_type, + index_format, render_targets); switch (update_status) { case UpdateStatus::kCompatible: // Requested pipeline is compatible with our previous one, so use that. @@ -190,7 +193,8 @@ bool PipelineCache::TranslateShader(D3D12Shader* shader, PipelineCache::UpdateStatus PipelineCache::UpdateState( D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, - PrimitiveType primitive_type, IndexFormat index_format) { + PrimitiveType primitive_type, IndexFormat index_format, + const RenderTargetCache::PipelineRenderTarget render_targets[5]) { bool mismatch = false; // Reset hash so we can build it up. @@ -208,18 +212,15 @@ PipelineCache::UpdateStatus PipelineCache::UpdateState( UpdateStatus status; status = UpdateShaderStages(vertex_shader, pixel_shader, primitive_type); CHECK_UPDATE_STATUS(status, mismatch, "Unable to update shader stages"); - status = UpdateBlendState(pixel_shader); + status = UpdateBlendStateAndRenderTargets(pixel_shader, render_targets); CHECK_UPDATE_STATUS(status, mismatch, "Unable to update blend state"); status = UpdateRasterizerState(primitive_type); CHECK_UPDATE_STATUS(status, mismatch, "Unable to update rasterizer state"); - status = UpdateDepthStencilState(); + status = UpdateDepthStencilState(render_targets[4].format); CHECK_UPDATE_STATUS(status, mismatch, "Unable to update depth/stencil state"); status = UpdateIBStripCutValue(index_format); CHECK_UPDATE_STATUS(status, mismatch, "Unable to update index buffer strip cut value"); - status = UpdateRenderTargetFormats(); - CHECK_UPDATE_STATUS(status, mismatch, - "Unable to update render target formats"); #undef CHECK_UPDATE_STATUS return mismatch ? 
UpdateStatus::kMismatch : UpdateStatus::kCompatible; @@ -303,18 +304,27 @@ PipelineCache::UpdateStatus PipelineCache::UpdateShaderStages( return UpdateStatus::kMismatch; } -PipelineCache::UpdateStatus PipelineCache::UpdateBlendState( - D3D12Shader* pixel_shader) { - auto& regs = update_blend_state_regs_; +PipelineCache::UpdateStatus PipelineCache::UpdateBlendStateAndRenderTargets( + D3D12Shader* pixel_shader, + const RenderTargetCache::PipelineRenderTarget render_targets[4]) { + auto& regs = update_blend_state_and_render_targets_regs_; bool dirty = current_pipeline_ == nullptr; + for (uint32_t i = 0; i < 4; ++i) { + dirty |= regs.render_targets[i].guest_render_target != + render_targets[i].guest_render_target; + regs.render_targets[i].guest_render_target = + render_targets[i].guest_render_target; + dirty |= regs.render_targets[i].format != render_targets[i].format; + regs.render_targets[i].format = render_targets[i].format; + } uint32_t color_mask; if (pixel_shader != nullptr) { color_mask = register_file_->values[XE_GPU_REG_RB_COLOR_MASK].u32 & 0xFFFF; - // If the pixel shader doesn't write to a render target, writing to it is - // disabled in the blend state. Otherwise, in Halo 3, one important render - // target is destroyed by a shader not writing to one of the outputs. for (uint32_t i = 0; i < 4; ++i) { + // If the pixel shader doesn't write to a render target, writing to it is + // disabled in the blend state. Otherwise, in Halo 3, one important render + // target is destroyed by a shader not writing to one of the outputs. 
if (!pixel_shader->writes_color_target(i)) { color_mask &= ~(0xF << (i * 4)); } @@ -372,10 +382,14 @@ PipelineCache::UpdateStatus PipelineCache::UpdateBlendState( /* 3 */ D3D12_BLEND_OP_MAX, /* 4 */ D3D12_BLEND_OP_REV_SUBTRACT, }; + update_desc_.NumRenderTargets = 0; for (uint32_t i = 0; i < 4; ++i) { auto& blend_desc = update_desc_.BlendState.RenderTarget[i]; - if (blend_enable && (color_mask & (0xF << (i * 4)))) { - uint32_t blend_control = regs.blendcontrol[i]; + uint32_t guest_render_target = render_targets[i].guest_render_target; + DXGI_FORMAT format = render_targets[i].format; + if (blend_enable && format != DXGI_FORMAT_UNKNOWN && + (color_mask & (0xF << (guest_render_target * 4)))) { + uint32_t blend_control = regs.blendcontrol[guest_render_target]; // A2XX_RB_BLEND_CONTROL_COLOR_SRCBLEND blend_desc.SrcBlend = kBlendFactorMap[(blend_control & 0x0000001F) >> 0]; // A2XX_RB_BLEND_CONTROL_COLOR_DESTBLEND @@ -399,7 +413,12 @@ PipelineCache::UpdateStatus PipelineCache::UpdateBlendState( blend_desc.DestBlendAlpha = D3D12_BLEND_ZERO; blend_desc.BlendOpAlpha = D3D12_BLEND_OP_ADD; } - blend_desc.RenderTargetWriteMask = (color_mask >> (i * 4)) & 0xF; + blend_desc.RenderTargetWriteMask = + (color_mask >> (guest_render_target * 4)) & 0xF; + update_desc_.RTVFormats[i] = format; + if (format != DXGI_FORMAT_UNKNOWN) { + update_desc_.NumRenderTargets = i + 1; + } } return UpdateStatus::kMismatch; @@ -532,10 +551,13 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRasterizerState( return UpdateStatus::kMismatch; } -PipelineCache::UpdateStatus PipelineCache::UpdateDepthStencilState() { +PipelineCache::UpdateStatus PipelineCache::UpdateDepthStencilState( + DXGI_FORMAT format) { auto& regs = update_depth_stencil_state_regs_; bool dirty = current_pipeline_ == nullptr; + dirty |= regs.format != format; + regs.format = format; dirty |= SetShadowRegister(&regs.rb_depthcontrol, XE_GPU_REG_RB_DEPTHCONTROL); dirty |= SetShadowRegister(&regs.rb_stencilrefmask, 
XE_GPU_REG_RB_STENCILREFMASK); @@ -544,17 +566,18 @@ PipelineCache::UpdateStatus PipelineCache::UpdateDepthStencilState() { return UpdateStatus::kCompatible; } + bool dsv_bound = format != DXGI_FORMAT_UNKNOWN; update_desc_.DepthStencilState.DepthEnable = - (regs.rb_depthcontrol & 0x2) ? TRUE : FALSE; + (dsv_bound && (regs.rb_depthcontrol & 0x2)) ? TRUE : FALSE; update_desc_.DepthStencilState.DepthWriteMask = - (regs.rb_depthcontrol & 0x4) ? D3D12_DEPTH_WRITE_MASK_ALL - : D3D12_DEPTH_WRITE_MASK_ZERO; + (dsv_bound && (regs.rb_depthcontrol & 0x4)) ? D3D12_DEPTH_WRITE_MASK_ALL + : D3D12_DEPTH_WRITE_MASK_ZERO; // Comparison functions are the same in Direct3D 12 but plus one (minus one, // bit 0 for less, bit 1 for equal, bit 2 for greater). update_desc_.DepthStencilState.DepthFunc = D3D12_COMPARISON_FUNC(((regs.rb_depthcontrol >> 4) & 0x7) + 1); update_desc_.DepthStencilState.StencilEnable = - (regs.rb_depthcontrol & 0x1) ? TRUE : FALSE; + (dsv_bound && (regs.rb_depthcontrol & 0x1)) ? TRUE : FALSE; update_desc_.DepthStencilState.StencilReadMask = (regs.rb_stencilrefmask >> 8) & 0xFF; update_desc_.DepthStencilState.StencilWriteMask = @@ -587,6 +610,8 @@ PipelineCache::UpdateStatus PipelineCache::UpdateDepthStencilState() { // test is dynamic - should be enabled anyway if there's no alpha test, // discarding and depth output). + update_desc_.DSVFormat = format; + return UpdateStatus::kMismatch; } @@ -615,19 +640,6 @@ PipelineCache::UpdateStatus PipelineCache::UpdateIBStripCutValue( return UpdateStatus::kMismatch; } -PipelineCache::UpdateStatus PipelineCache::UpdateRenderTargetFormats() { - bool dirty = current_pipeline_ == nullptr; - if (!dirty) { - return UpdateStatus::kCompatible; - } - - // TODO(Triang3l): Set the formats when RT cache is added. 
- update_desc_.NumRenderTargets = 0; - update_desc_.DSVFormat = DXGI_FORMAT_UNKNOWN; - - return UpdateStatus::kMismatch; -} - PipelineCache::Pipeline* PipelineCache::GetPipeline(uint64_t hash_key) { // Lookup the pipeline in the cache. auto it = pipelines_.find(hash_key); diff --git a/src/xenia/gpu/d3d12/pipeline_cache.h b/src/xenia/gpu/d3d12/pipeline_cache.h index 8a3d89e09..8cd5a7877 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.h +++ b/src/xenia/gpu/d3d12/pipeline_cache.h @@ -15,6 +15,7 @@ #include "third_party/xxhash/xxhash.h" #include "xenia/gpu/d3d12/d3d12_shader.h" +#include "xenia/gpu/d3d12/render_target_cache.h" #include "xenia/gpu/hlsl_shader_translator.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/xenos.h" @@ -42,12 +43,12 @@ class PipelineCache { D3D12Shader* LoadShader(ShaderType shader_type, uint32_t guest_address, const uint32_t* host_address, uint32_t dword_count); - UpdateStatus ConfigurePipeline(D3D12Shader* vertex_shader, - D3D12Shader* pixel_shader, - PrimitiveType primitive_type, - IndexFormat index_format, - ID3D12PipelineState** pipeline_out, - ID3D12RootSignature** root_signature_out); + UpdateStatus ConfigurePipeline( + D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, + PrimitiveType primitive_type, IndexFormat index_format, + const RenderTargetCache::PipelineRenderTarget render_targets[5], + ID3D12PipelineState** pipeline_out, + ID3D12RootSignature** root_signature_out); void ClearCache(); @@ -57,25 +58,25 @@ class PipelineCache { bool TranslateShader(D3D12Shader* shader, xenos::xe_gpu_program_cntl_t cntl); - UpdateStatus UpdateState(D3D12Shader* vertex_shader, - D3D12Shader* pixel_shader, - PrimitiveType primitive_type, - IndexFormat index_format); + UpdateStatus UpdateState( + D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, + PrimitiveType primitive_type, IndexFormat index_format, + const RenderTargetCache::PipelineRenderTarget render_targets[5]); // pRootSignature, VS, PS, GS, PrimitiveTopologyType. 
UpdateStatus UpdateShaderStages(D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, PrimitiveType primitive_type); - // BlendState. - UpdateStatus UpdateBlendState(D3D12Shader* pixel_shader); + // BlendState, NumRenderTargets, RTVFormats. + UpdateStatus UpdateBlendStateAndRenderTargets( + D3D12Shader* pixel_shader, + const RenderTargetCache::PipelineRenderTarget render_targets[4]); // RasterizerState. UpdateStatus UpdateRasterizerState(PrimitiveType primitive_type); - // DepthStencilState. - UpdateStatus UpdateDepthStencilState(); + // DepthStencilState, DSVFormat. + UpdateStatus UpdateDepthStencilState(DXGI_FORMAT format); // IBStripCutValue. UpdateStatus UpdateIBStripCutValue(IndexFormat index_format); - // NumRenderTargets, RTVFormats, DSVFormat. - UpdateStatus UpdateRenderTargetFormats(); D3D12CommandProcessor* command_processor_; RegisterFile* register_file_; @@ -117,16 +118,17 @@ class PipelineCache { void Reset() { std::memset(this, 0, sizeof(*this)); } } update_shader_stages_regs_; - struct UpdateBlendStateRegisters { + struct UpdateBlendStateAndRenderTargetsRegisters { + RenderTargetCache::PipelineRenderTarget render_targets[5]; // RB_COLOR_MASK with unused render targets removed. uint32_t color_mask; // Blend control updated only for used render targets. uint32_t blendcontrol[4]; bool colorcontrol_blend_enable; - UpdateBlendStateRegisters() { Reset(); } + UpdateBlendStateAndRenderTargetsRegisters() { Reset(); } void Reset() { std::memset(this, 0, sizeof(*this)); } - } update_blend_state_regs_; + } update_blend_state_and_render_targets_regs_; struct UpdateRasterizerStateRegisters { // Polygon offset is in Xenos units. 
@@ -142,6 +144,7 @@ class PipelineCache { } update_rasterizer_state_regs_; struct UpdateDepthStencilStateRegisters { + DXGI_FORMAT format; uint32_t rb_depthcontrol; uint32_t rb_stencilrefmask; diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index f5a675b9c..8b3cb52af 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -27,8 +27,6 @@ RenderTargetCache::RenderTargetCache(D3D12CommandProcessor* command_processor, RenderTargetCache::~RenderTargetCache() { Shutdown(); } -bool RenderTargetCache::Initialize() { return true; } - void RenderTargetCache::Shutdown() { ClearCache(); } void RenderTargetCache::ClearCache() { @@ -41,6 +39,19 @@ void RenderTargetCache::ClearCache() { } render_targets_.clear(); + while (descriptor_heaps_depth_ != nullptr) { + auto heap = descriptor_heaps_depth_; + heap->heap->Release(); + descriptor_heaps_depth_ = heap->previous; + delete heap; + } + while (descriptor_heaps_color_ != nullptr) { + auto heap = descriptor_heaps_color_; + heap->heap->Release(); + descriptor_heaps_color_ = heap->previous; + delete heap; + } + for (uint32_t i = 0; i < xe::countof(heaps_); ++i) { if (heaps_[i] != nullptr) { heaps_[i]->Release(); @@ -51,7 +62,7 @@ void RenderTargetCache::ClearCache() { void RenderTargetCache::BeginFrame() { ClearBindings(); } -void RenderTargetCache::UpdateRenderTargets() { +bool RenderTargetCache::UpdateRenderTargets() { // There are two kinds of render target binding updates in this implementation // in case something has been changed - full and partial. // @@ -90,16 +101,22 @@ void RenderTargetCache::UpdateRenderTargets() { // made to the lower part of RT0. So, before draws 2 and 3, full updates must // be done. 
// - // Full updates are better for memory usage than partial updates though, as - // the render targets are re-allocated in the heaps, which means that they can - // be allocated more tightly, preventing too many 32 MB heaps from being - // created. + // Direct3D 12 also requires all render targets to have the same size, so the + // height is calculated from the EDRAM space available to the last render + // target available in it. However, to make toggling render targets like in + // the Banjo-Kazooie case possible, the height may be decreased only in full + // updates. + // TODO(Triang3l): Check if it's safe to calculate the smallest EDRAM region + // without aliasing and use it for the height. This won't work if games + // actually alias active render targets for some reason. // // To summarize, a full update happens if: // - Starting a new frame. // - Drawing after resolving. // - Surface pitch changed. // - Sample count changed. + // - Render target is disabled and another render target got more space than + // is currently available in the textures. // - EDRAM base of a currently used RT changed. // - Format of a currently used RT changed. // - Current viewport contains unsaved data from previously used render @@ -112,18 +129,18 @@ void RenderTargetCache::UpdateRenderTargets() { // // A partial update happens if: // - New render target is added, but doesn't overlap unsaved data from other - // currently or previously used render targets. + // currently or previously used render targets, and it doesn't require a + // bigger size. 
auto command_list = command_processor_->GetCurrentCommandList(); if (command_list == nullptr) { - return; + return false; } auto& regs = *register_file_; uint32_t rb_surface_info = regs[XE_GPU_REG_RB_SURFACE_INFO].u32; uint32_t surface_pitch = std::min(rb_surface_info & 0x3FFF, 2560u); if (surface_pitch == 0) { - assert_always(); - return; + return false; } MsaaSamples msaa_samples = MsaaSamples((rb_surface_info >> 16) & 0x3); uint32_t msaa_samples_x = msaa_samples >= MsaaSamples::k4X ? 2 : 1; @@ -178,10 +195,26 @@ void RenderTargetCache::UpdateRenderTargets() { // clamp the dirty region heights. uint32_t edram_row_tiles_32bpp = (surface_pitch * msaa_samples_x + 79) / 80; uint32_t edram_row_tiles[5]; - uint32_t edram_max_rows[5]; + uint32_t edram_max_rows = UINT32_MAX; for (uint32_t i = 0; i < 5; ++i) { edram_row_tiles[i] = edram_row_tiles_32bpp * (formats_are_64bpp[i] ? 2 : 1); - edram_max_rows[i] = (2048 - edram_bases[i]) / edram_row_tiles[i]; + if (enabled[i]) { + // Direct3D 12 doesn't allow render targets with different sizes, so + // calculate the height from the render target closest to the end of + // EDRAM. + edram_max_rows = std::min(edram_max_rows, + (2048 - edram_bases[i]) / edram_row_tiles[i]); + } + } + if (edram_max_rows == 0 || edram_max_rows == UINT32_MAX) { + // Some render target is totally in the end of EDRAM, or nothing is drawn. + return false; + } + // Check the following full update conditions: + // - Render target is disabled and another render target got more space than + // is currently available in the textures. + if (edram_max_rows > current_edram_max_rows_) { + full_update = true; } // Get EDRAM usage of the current draw so dirty regions can be calculated. 
@@ -210,7 +243,8 @@ void RenderTargetCache::UpdateRenderTargets() { } uint32_t dirty_bottom = std::min(std::min(viewport_bottom, scissor_bottom), 2560u); - uint32_t edram_rows = (dirty_bottom * msaa_samples_y + 15) >> 4; + uint32_t edram_dirty_rows = + std::min((dirty_bottom * msaa_samples_y + 15) >> 4, edram_max_rows); // Check the following full update conditions: // - EDRAM base of a currently used RT changed. @@ -257,8 +291,7 @@ void RenderTargetCache::UpdateRenderTargets() { } // Checking if the new render target is overlapping any bound one. // binding_1 is the new render target. - edram_length_1 = - std::min(edram_rows, edram_max_rows[i]) * edram_row_tiles[i]; + edram_length_1 = edram_dirty_rows * edram_row_tiles[i]; } for (uint32_t j = 0; j < 5; ++j) { const RenderTargetBinding& binding_2 = current_bindings_[j]; @@ -272,8 +305,7 @@ void RenderTargetCache::UpdateRenderTargets() { } // Checking if now overlapping a previously used render target. // binding_2 is a currently used render target. - edram_length_2 = - std::min(edram_rows, edram_max_rows[j]) * edram_row_tiles[i]; + edram_length_2 = edram_dirty_rows * edram_row_tiles[i]; } else { // Checking if the new render target is overlapping any bound one. // binding_2 is another bound render target. @@ -295,82 +327,200 @@ void RenderTargetCache::UpdateRenderTargets() { } } - // If no need to attach any new render targets, update dirty regions and exit. - if (!full_update && !render_targets_to_attach) { + // Need to change the bindings. + if (full_update || render_targets_to_attach) { + uint32_t heap_usage[5] = {}; + if (full_update) { + // Export the currently bound render targets before we ruin the bindings. + WriteRenderTargetsToEDRAM(); + + ClearBindings(); + current_surface_pitch_ = surface_pitch; + current_msaa_samples_ = msaa_samples; + current_edram_max_rows_ = edram_max_rows; + + // If updating fully, need to reattach all the render targets and allocate + // from scratch. 
+ for (uint32_t i = 0; i < 5; ++i) { + if (enabled[i]) { + render_targets_to_attach |= 1 << i; + } + } + } else { + // If updating partially, only need to attach new render targets. + for (uint32_t i = 0; i < 5; ++i) { + const RenderTargetBinding& binding = current_bindings_[i]; + if (!binding.is_bound) { + continue; + } + const RenderTarget* render_target = binding.render_target; + if (render_target != nullptr) { + // There are no holes between 4 MB pages in each heap. + heap_usage[render_target->heap_page_first >> 3] += + render_target->heap_page_count; + continue; + } + } + } + XELOGGPU("RT Cache: %s update - pitch %u, samples %u, RTs to attach %u", + full_update ? "Full" : "Partial", surface_pitch, msaa_samples, + render_targets_to_attach); + + auto device = + command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice(); + + D3D12_RESOURCE_BARRIER barriers[5]; + uint32_t barrier_count = 0; + + // Allocate new render targets and add them to the bindings list. for (uint32_t i = 0; i < 5; ++i) { - if (!enabled[i] || (i == 4 && depth_readonly)) { + if (!(render_targets_to_attach & (1 << i))) { continue; } RenderTargetBinding& binding = current_bindings_[i]; - binding.edram_dirty_length = std::max( - binding.edram_dirty_length, - std::min(edram_rows, edram_max_rows[i]) * edram_row_tiles[i]); - } - return; - } - - // From this point, the function MUST NOT FAIL, otherwise bindings will be - // left in an incomplete state. - - uint32_t heap_usage[5] = {}; - if (full_update) { - // Export the currently bound render targets before we ruin the bindings. - WriteRenderTargetsToEDRAM(); - - ClearBindings(); - current_surface_pitch_ = surface_pitch; - current_msaa_samples_ = msaa_samples; - - // If updating fully, need to reattach all the render targets and allocate - // from scratch. - for (uint32_t i = 0; i < 5; ++i) { - if (enabled[i]) { - render_targets_to_attach |= 1 << i; - } - } - } else { - // If updating partially, only need to attach new render targets. 
- for (uint32_t i = 0; i < 5; ++i) { - const RenderTargetBinding& binding = current_bindings_[i]; - if (!binding.is_bound) { - continue; - } - const RenderTarget* render_target = binding.render_target; - if (render_target != nullptr) { - // There are no holes between 4 MB pages in each heap. - heap_usage[render_target->heap_page_first >> 3] += - render_target->heap_page_count; - continue; - } - } - } - XELOGGPU("RT Cache: %s update - pitch %u, samples %u, RTs to attach %u", - full_update ? "Full" : "Partial", surface_pitch, msaa_samples, - render_targets_to_attach); - - // Allocate the new render targets. - // TODO(Triang3l): Actually allocate them. - // TODO(Triang3l): Load the contents from the EDRAM. - // TODO(Triang3l): Bind the render targets to the command list. - - // Write the new bindings and update the dirty regions. - for (uint32_t i = 0; i < 5; ++i) { - if (!enabled[i]) { - continue; - } - RenderTargetBinding& binding = current_bindings_[i]; - if (render_targets_to_attach & (1 << i)) { binding.is_bound = true; binding.edram_base = edram_bases[i]; binding.edram_dirty_length = 0; binding.format = formats[i]; + binding.render_target = nullptr; + + RenderTargetKey key; + key.width_ss_div_80 = edram_row_tiles_32bpp; + key.height_ss_div_16 = current_edram_max_rows_; + key.is_depth = i == 4; + key.format = formats[i]; + D3D12_RESOURCE_DESC resource_desc; + if (!GetResourceDesc(key, resource_desc)) { + // Invalid format. + continue; + } + + // Calculate the number of 4 MB pages of 32 MB heaps this RT will use. + D3D12_RESOURCE_ALLOCATION_INFO allocation_info = + device->GetResourceAllocationInfo(0, 1, &resource_desc); + if (allocation_info.SizeInBytes == 0 || + allocation_info.SizeInBytes > (32 << 20)) { + assert_always(); + continue; + } + uint32_t heap_page_count = + (uint32_t(allocation_info.SizeInBytes) + ((4 << 20) - 1)) >> 22; + + // Find the heap page range for this render target. 
+ uint32_t heap_page_first = UINT32_MAX; + for (uint32_t j = 0; j < 5; ++j) { + if (heap_usage[j] + heap_page_count <= 8) { + heap_page_first = j * 8 + heap_usage[j]; + break; + } + } + if (heap_page_first == UINT32_MAX) { + assert_always(); + continue; + } + + // Get the render target. + binding.render_target = FindOrCreateRenderTarget(key, heap_page_first); + if (binding.render_target == nullptr) { + continue; + } + + // Inform Direct3D that we're reusing the heap for this render target. + D3D12_RESOURCE_BARRIER& barrier = barriers[barrier_count++]; + barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_ALIASING; + barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE; + barrier.Aliasing.pResourceBefore = nullptr; + barrier.Aliasing.pResourceAfter = binding.render_target->resource; } - if (!(i == 4 && depth_readonly)) { - binding.edram_dirty_length = std::max( - binding.edram_dirty_length, - std::min(edram_rows, edram_max_rows[i]) * edram_row_tiles[i]); + + if (barrier_count != 0) { + command_list->ResourceBarrier(barrier_count, barriers); } + + barrier_count = 0; + + // Load the contents of the new render targets from the EDRAM buffer and + // switch their state to RTV/DSV. + for (uint32_t i = 0; i < 5; ++i) { + if (!(render_targets_to_attach & (1 << i))) { + continue; + } + RenderTarget* render_target = current_bindings_[i].render_target; + if (render_target == nullptr) { + continue; + } + + // TODO(Triang3l): Load the contents from the EDRAM buffer. + + // After loading from the EDRAM buffer (which may make this render target + // a copy destination), switch it to RTV/DSV if needed. + D3D12_RESOURCE_STATES state = i == 4 ? 
D3D12_RESOURCE_STATE_DEPTH_WRITE + : D3D12_RESOURCE_STATE_RENDER_TARGET; + if (render_target->state != state) { + D3D12_RESOURCE_BARRIER& barrier = barriers[barrier_count++]; + barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION; + barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE; + barrier.Transition.pResource = render_target->resource; + barrier.Transition.Subresource = + D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES; + barrier.Transition.StateBefore = render_target->state; + barrier.Transition.StateAfter = state; + render_target->state = state; + } + } + + if (barrier_count != 0) { + command_list->ResourceBarrier(barrier_count, barriers); + } + + // Compress the list of render targets because null RTV descriptors are + // broken in Direct3D 12 and bind the render targets to the command list. + D3D12_CPU_DESCRIPTOR_HANDLE rtv_handles[4]; + uint32_t rtv_count = 0; + for (uint32_t i = 0; i < 4; ++i) { + const RenderTargetBinding& binding = current_bindings_[i]; + if (!binding.is_bound || binding.render_target == nullptr) { + continue; + } + rtv_handles[rtv_count] = binding.render_target->handle; + current_pipeline_render_targets_[rtv_count].guest_render_target = i; + current_pipeline_render_targets_[rtv_count].format = + GetColorDXGIFormat(ColorRenderTargetFormat(formats[i])); + ++rtv_count; + } + for (uint32_t i = rtv_count; i < 4; ++i) { + current_pipeline_render_targets_[i].guest_render_target = i; + current_pipeline_render_targets_[i].format = DXGI_FORMAT_UNKNOWN; + } + const D3D12_CPU_DESCRIPTOR_HANDLE* dsv_handle; + const RenderTargetBinding& depth_binding = current_bindings_[4]; + current_pipeline_render_targets_[4].guest_render_target = 4; + if (depth_binding.is_bound && depth_binding.render_target != nullptr) { + dsv_handle = &depth_binding.render_target->handle; + current_pipeline_render_targets_[4].format = + GetDepthDXGIFormat(DepthRenderTargetFormat(formats[4])); + } else { + dsv_handle = nullptr; + current_pipeline_render_targets_[4].format = 
DXGI_FORMAT_UNKNOWN; + } + command_list->OMSetRenderTargets(rtv_count, rtv_handles, FALSE, dsv_handle); } + + // Update the dirty regions. + for (uint32_t i = 0; i < 5; ++i) { + if (!enabled[i] || (i == 4 && depth_readonly)) { + continue; + } + RenderTargetBinding& binding = current_bindings_[i]; + if (binding.render_target == nullptr) { + // Nothing to store to the EDRAM buffer if there was an error. + continue; + } + binding.edram_dirty_length = std::max( + binding.edram_dirty_length, edram_dirty_rows * edram_row_tiles[i]); + } + + return true; } void RenderTargetCache::EndFrame() { @@ -410,9 +560,155 @@ DXGI_FORMAT RenderTargetCache::GetColorDXGIFormat( void RenderTargetCache::ClearBindings() { current_surface_pitch_ = 0; current_msaa_samples_ = MsaaSamples::k1X; + current_edram_max_rows_ = 0; std::memset(current_bindings_, 0, sizeof(current_bindings_)); } +bool RenderTargetCache::GetResourceDesc(RenderTargetKey key, + D3D12_RESOURCE_DESC& desc) { + if (key.width_ss_div_80 == 0 || key.height_ss_div_16 == 0) { + return false; + } + DXGI_FORMAT dxgi_format = + key.is_depth ? GetDepthDXGIFormat(DepthRenderTargetFormat(key.format)) + : GetColorDXGIFormat(ColorRenderTargetFormat(key.format)); + if (dxgi_format == DXGI_FORMAT_UNKNOWN) { + return false; + } + desc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D; + // TODO(Triang3l): If real MSAA is added, alignment must be 4 MB. + desc.Alignment = 0; + desc.Width = key.width_ss_div_80 * 80; + desc.Height = key.height_ss_div_16 * 16; + desc.DepthOrArraySize = 1; + desc.MipLevels = 1; + desc.Format = dxgi_format; + desc.SampleDesc.Count = 1; + desc.SampleDesc.Quality = 0; + desc.Layout = D3D12_TEXTURE_LAYOUT_UNKNOWN; + desc.Flags = key.is_depth ? 
D3D12_RESOURCE_FLAG_ALLOW_DEPTH_STENCIL + : D3D12_RESOURCE_FLAG_ALLOW_RENDER_TARGET; + return true; +} + +RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget( + RenderTargetKey key, uint32_t heap_page_first) { + assert_true(heap_page_first <= 8 * 5); + // TODO(Triang3l): Find an existing render target. + + D3D12_RESOURCE_DESC resource_desc; + if (!GetResourceDesc(key, resource_desc)) { + return nullptr; + } + + auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider(); + auto device = provider->GetDevice(); + + // Get the number of heap pages needed for the render target. + D3D12_RESOURCE_ALLOCATION_INFO allocation_info = + device->GetResourceAllocationInfo(0, 1, &resource_desc); + uint32_t heap_page_count = + (uint32_t(allocation_info.SizeInBytes) + ((4 << 20) - 1)) >> 22; + if (heap_page_count == 0 || (heap_page_first & 7) + heap_page_count > 8) { + assert_always(); + return nullptr; + } + + // Create a new descriptor heap if needed, and get a place for the descriptor. + auto& descriptor_heap = + key.is_depth ? descriptor_heaps_depth_ : descriptor_heaps_color_; + if (descriptor_heap == nullptr || + descriptor_heap->descriptors_used >= kRenderTargetDescriptorHeapSize) { + D3D12_DESCRIPTOR_HEAP_DESC descriptor_heap_desc; + descriptor_heap_desc.Type = key.is_depth ? D3D12_DESCRIPTOR_HEAP_TYPE_DSV + : D3D12_DESCRIPTOR_HEAP_TYPE_RTV; + descriptor_heap_desc.NumDescriptors = kRenderTargetDescriptorHeapSize; + descriptor_heap_desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_NONE; + descriptor_heap_desc.NodeMask = 0; + ID3D12DescriptorHeap* new_d3d_descriptor_heap; + if (FAILED(device->CreateDescriptorHeap( + &descriptor_heap_desc, IID_PPV_ARGS(&new_d3d_descriptor_heap)))) { + XELOGE("Failed to create a heap for %u %s buffer descriptors", + kRenderTargetDescriptorHeapSize, key.is_depth ? 
"depth" : "color"); + return nullptr; + } + RenderTargetDescriptorHeap* new_descriptor_heap = + new RenderTargetDescriptorHeap; + new_descriptor_heap->heap = new_d3d_descriptor_heap; + new_descriptor_heap->start_handle = + new_d3d_descriptor_heap->GetCPUDescriptorHandleForHeapStart(); + new_descriptor_heap->descriptors_used = 0; + new_descriptor_heap->previous = descriptor_heap; + descriptor_heap = new_descriptor_heap; + } + + // Create the memory heap if it doesn't exist yet. + ID3D12Heap* heap = heaps_[heap_page_first >> 3]; + if (heap == nullptr) { + D3D12_HEAP_DESC heap_desc = {}; + heap_desc.SizeInBytes = 32 << 20; + heap_desc.Properties.Type = D3D12_HEAP_TYPE_DEFAULT; + // TODO(Triang3l): If real MSAA is added, alignment must be 4 MB. + heap_desc.Alignment = 0; + heap_desc.Flags = D3D12_HEAP_FLAG_ALLOW_ONLY_RT_DS_TEXTURES; + if (FAILED(device->CreateHeap(&heap_desc, IID_PPV_ARGS(&heap)))) { + XELOGE("Failed to create a 32 MB heap for render targets"); + return nullptr; + } + heaps_[heap_page_first >> 3] = heap; + } + + // The first action likely to be done is EDRAM buffer load. + D3D12_RESOURCE_STATES state = D3D12_RESOURCE_STATE_COPY_DEST; + ID3D12Resource* resource; + if (FAILED(device->CreatePlacedResource(heap, (heap_page_first & 7) << 22, + &resource_desc, state, nullptr, + IID_PPV_ARGS(&resource)))) { + XELOGE( + "Failed to create a placed resource for %ux%u %s render target with " + "format %u at heap 4 MB pages %u:%u", + uint32_t(resource_desc.Width), resource_desc.Height, + key.is_depth ? "depth" : "color", key.format, heap_page_first, + heap_page_first + heap_page_count - 1); + return nullptr; + } + + // Create the descriptor for the render target. 
+ D3D12_CPU_DESCRIPTOR_HANDLE descriptor_handle; + if (key.is_depth) { + descriptor_handle.ptr = + descriptor_heap->start_handle.ptr + + descriptor_heap->descriptors_used * provider->GetDescriptorSizeDSV(); + D3D12_DEPTH_STENCIL_VIEW_DESC dsv_desc; + dsv_desc.Format = resource_desc.Format; + dsv_desc.ViewDimension = D3D12_DSV_DIMENSION_TEXTURE2D; + dsv_desc.Flags = D3D12_DSV_FLAG_NONE; + dsv_desc.Texture2D.MipSlice = 0; + device->CreateDepthStencilView(resource, &dsv_desc, descriptor_handle); + } else { + descriptor_handle.ptr = + descriptor_heap->start_handle.ptr + + descriptor_heap->descriptors_used * provider->GetDescriptorSizeRTV(); + D3D12_RENDER_TARGET_VIEW_DESC rtv_desc; + rtv_desc.Format = resource_desc.Format; + rtv_desc.ViewDimension = D3D12_RTV_DIMENSION_TEXTURE2D; + rtv_desc.Texture2D.MipSlice = 0; + rtv_desc.Texture2D.PlaneSlice = 0; + device->CreateRenderTargetView(resource, &rtv_desc, descriptor_handle); + } + ++descriptor_heap->descriptors_used; + + RenderTarget* render_target = new RenderTarget; + render_target->resource = resource; + render_target->state = state; + render_target->handle = descriptor_handle; + render_target->key = key; + render_target->heap_page_first = heap_page_first; + render_target->heap_page_count = heap_page_count; + render_targets_.insert(std::make_pair(key.value, render_target)); + return render_target; +} + void RenderTargetCache::WriteRenderTargetsToEDRAM() {} } // namespace d3d12 diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h index a9d1b2b1a..b300f3e8d 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.h +++ b/src/xenia/gpu/d3d12/render_target_cache.h @@ -184,17 +184,36 @@ class D3D12CommandProcessor; // in the surface info register is single-sampled. 
class RenderTargetCache { public: + // Direct3D 12 debug layer does some kaschenit-style trolling by giving errors + // that contradict each other when you use null RTV descriptors - if you set + // a valid format in RTVFormats in the pipeline state, it says that null + // descriptors can only be used if the format in the pipeline state is + // DXGI_FORMAT_UNKNOWN, however, if DXGI_FORMAT_UNKNOWN is set, it complains + // that the format in the pipeline doesn't match the RTV format. So we have to + // make render target bindings consecutive and remap the output indices in + // pixel shaders. + struct PipelineRenderTarget { + uint32_t guest_render_target; + DXGI_FORMAT format; + }; + RenderTargetCache(D3D12CommandProcessor* command_processor, RegisterFile* register_file); ~RenderTargetCache(); - bool Initialize(); void Shutdown(); void ClearCache(); void BeginFrame(); // Called in the beginning of a draw call - may bind pipelines. - void UpdateRenderTargets(); + bool UpdateRenderTargets(); + // Returns the host-to-guest mappings and host formats of currently bound + // render targets for pipeline creation and remapping in shaders. They are + // consecutive, and format DXGI_FORMAT_UNKNOWN terminates the list. Depth + // format is in the 5th render target. + const PipelineRenderTarget* GetCurrentPipelineRenderTargets() const { + return current_pipeline_render_targets_; + } void EndFrame(); static inline bool IsColorFormat64bpp(ColorRenderTargetFormat format) { @@ -203,12 +222,22 @@ class RenderTargetCache { format == ColorRenderTargetFormat::k_32_32_FLOAT; } static DXGI_FORMAT GetColorDXGIFormat(ColorRenderTargetFormat format); + // Nvidia may have higher performance with 24-bit depth, AMD should have no + // performance difference, but with EDRAM loads/stores less conversion should + // be performed by the shaders if D24S8 is emulated as D24_UNORM_S8_UINT, and + // it's probably more accurate. 
+ static inline DXGI_FORMAT GetDepthDXGIFormat(DepthRenderTargetFormat format) { + return format == DepthRenderTargetFormat::kD24FS8 + ? DXGI_FORMAT_D32_FLOAT_S8X24_UINT + : DXGI_FORMAT_D24_UNORM_S8_UINT; + } private: union RenderTargetKey { struct { - // Supersampled dimensions. The limit is 2560x2560 without AA, 2560x5120 - // with 2x AA, and 5120x5120 with 4x AA. + // Supersampled (_ss - scaled 2x if needed) dimensions, divided by 80x16. + // The limit is 2560x2560 without AA, 2560x5120 with 2x AA, and 5120x5120 + // with 4x AA. uint32_t width_ss_div_80 : 7; // 7 uint32_t height_ss_div_16 : 9; // 16 uint32_t is_depth : 1; // 17 @@ -259,6 +288,12 @@ class RenderTargetCache { void ClearBindings(); + // Returns true if a render target with such key can be created. + static bool GetResourceDesc(RenderTargetKey key, D3D12_RESOURCE_DESC& desc); + + RenderTarget* FindOrCreateRenderTarget(RenderTargetKey key, + uint32_t heap_page_first); + // Must be in a frame to call. Writes the dirty areas of the currently bound // render targets and marks them as clean. void WriteRenderTargetsToEDRAM(); @@ -271,11 +306,27 @@ class RenderTargetCache { // entire EDRAM - a 32-bit depth/stencil one - at some resolution. ID3D12Heap* heaps_[5] = {}; + static constexpr uint32_t kRenderTargetDescriptorHeapSize = 2048; + // Descriptor heap, for linear allocation of heaps and descriptors. + struct RenderTargetDescriptorHeap { + ID3D12DescriptorHeap* heap; + D3D12_CPU_DESCRIPTOR_HANDLE start_handle; + // When descriptors_used is >= kRenderTargetDescriptorHeapSize, a new heap + // must be allocated and linked to the one that became full now. 
+ uint32_t descriptors_used; + RenderTargetDescriptorHeap* previous; + }; + RenderTargetDescriptorHeap* descriptor_heaps_color_ = nullptr; + RenderTargetDescriptorHeap* descriptor_heaps_depth_ = nullptr; + std::unordered_multimap render_targets_; uint32_t current_surface_pitch_ = 0; MsaaSamples current_msaa_samples_ = MsaaSamples::k1X; + uint32_t current_edram_max_rows_ = 0; RenderTargetBinding current_bindings_[5] = {}; + + PipelineRenderTarget current_pipeline_render_targets_[5]; }; } // namespace d3d12 diff --git a/src/xenia/gpu/hlsl_shader_translator.cc b/src/xenia/gpu/hlsl_shader_translator.cc index dd3039270..ebb056e5a 100644 --- a/src/xenia/gpu/hlsl_shader_translator.cc +++ b/src/xenia/gpu/hlsl_shader_translator.cc @@ -176,6 +176,7 @@ std::vector HlslShaderTranslator::CompleteTranslation() { " float xe_pixel_half_pixel_offset;\n" " float2 xe_ssaa_inv_scale;\n" " uint xe_pixel_pos_reg;\n" + " uint4 xe_color_output_map;\n" "};\n" "\n" "cbuffer xe_loop_bool_constants : register(b1) {\n" @@ -291,10 +292,11 @@ std::vector HlslShaderTranslator::CompleteTranslation() { "XePixelShaderOutput main(XePixelShaderInput xe_input) {\n" " float4 xe_r[%u];\n" " XePixelShaderOutput xe_output;\n" - " xe_output.colors[0] = (0.0).xxxx;\n" - " xe_output.colors[1] = (0.0).xxxx;\n" - " xe_output.colors[2] = (0.0).xxxx;\n" - " xe_output.colors[3] = (0.0).xxxx;\n", + " float4 xe_color_output[4];\n" + " xe_color_output[0] = (0.0).xxxx;\n" + " xe_color_output[1] = (0.0).xxxx;\n" + " xe_color_output[2] = (0.0).xxxx;\n" + " xe_color_output[3] = (0.0).xxxx;\n", kMaxInterpolators, writes_depth_ ? " float depth : SV_Depth;\n" : "", register_count()); // Initialize SV_Depth if using it. 
@@ -370,6 +372,14 @@ std::vector HlslShaderTranslator::CompleteTranslation() { " xe_output.position.xyz =\n" " xe_output.position.xyz * xe_ndc_scale +\n" " xe_ndc_offset * xe_output.position.www;\n"); + } else if (is_pixel_shader()) { + // Remap guest color outputs to host render targets because null render + // target descriptors are broken. + source.Append( + " xe_output.colors[0] = xe_color_output[xe_color_output_map.r];\n" + " xe_output.colors[1] = xe_color_output[xe_color_output_map.g];\n" + " xe_output.colors[2] = xe_color_output[xe_color_output_map.b];\n" + " xe_output.colors[3] = xe_color_output[xe_color_output_map.a];\n"); } // TODO(Triang3l): Window offset, half pixel offset, alpha test, gamma. source.Append( @@ -726,7 +736,7 @@ void HlslShaderTranslator::EmitStoreResult(const InstructionResult& result, EmitSourceDepth("xe_output.point_size"); break; case InstructionStorageTarget::kColorTarget: - EmitSourceDepth("xe_output.colors"); + EmitSourceDepth("xe_color_output"); storage_is_array = true; break; case InstructionStorageTarget::kDepth: diff --git a/src/xenia/gpu/hlsl_shader_translator.h b/src/xenia/gpu/hlsl_shader_translator.h index eaa30c86e..b0cd06c0b 100644 --- a/src/xenia/gpu/hlsl_shader_translator.h +++ b/src/xenia/gpu/hlsl_shader_translator.h @@ -37,6 +37,9 @@ class HlslShaderTranslator : public ShaderTranslator { // vec4 3 float ssaa_inv_scale[2]; uint32_t pixel_pos_reg; + uint32_t padding_3; + // vec4 4 + uint32_t color_output_map[4]; }; struct TextureSRV {