diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index ccab20972..bcf5367bb 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -103,22 +103,6 @@ void D3D12CommandProcessor::RestoreEdramSnapshot(const void* snapshot) { render_target_cache_->RestoreEdramSnapshot(snapshot); } -uint32_t D3D12CommandProcessor::GetCurrentColorMask( - uint32_t shader_writes_color_targets) const { - auto& regs = *register_file_; - if (regs.Get().edram_mode != - xenos::ModeControl::kColorDepth) { - return 0; - } - uint32_t color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32 & 0xFFFF; - for (uint32_t i = 0; i < 4; ++i) { - if (!(shader_writes_color_targets & (1 << i))) { - color_mask &= ~(0xF << (i * 4)); - } - } - return color_mask; -} - void D3D12CommandProcessor::PushTransitionBarrier( ID3D12Resource* resource, D3D12_RESOURCE_STATES old_state, D3D12_RESOURCE_STATES new_state, UINT subresource) { @@ -2152,10 +2136,12 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, : DxbcShaderTranslator::Modification(0); // Set up the render targets - this may perform dispatches and draws. - uint32_t pixel_shader_writes_color_targets = - pixel_shader ? pixel_shader->writes_color_targets() : 0; + uint32_t normalized_color_mask = + pixel_shader ? draw_util::GetNormalizedColorMask( + regs, pixel_shader->writes_color_targets()) + : 0; if (!render_target_cache_->Update(is_rasterization_done, - pixel_shader_writes_color_targets)) { + normalized_color_mask)) { return false; } @@ -2186,7 +2172,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, ID3D12RootSignature* root_signature; if (!pipeline_cache_->ConfigurePipeline( vertex_shader_translation, pixel_shader_translation, - primitive_processing_result, bound_depth_and_color_render_target_bits, + primitive_processing_result, normalized_color_mask, + bound_depth_and_color_render_target_bits, bound_depth_and_color_render_target_formats, &pipeline_handle, &root_signature)) { return false; @@ -2241,9 +2228,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, memexport_used, primitive_polygonal, primitive_processing_result.line_loop_closing_index, primitive_processing_result.host_index_endian, viewport_info, - used_texture_mask, - pixel_shader ? GetCurrentColorMask(pixel_shader->writes_color_targets()) - : 0); + used_texture_mask, normalized_color_mask); // Update constant buffers, descriptors and root parameters. if (!UpdateBindings(vertex_shader, pixel_shader, root_signature)) { @@ -3114,7 +3099,7 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( bool shared_memory_is_uav, bool primitive_polygonal, uint32_t line_loop_closing_index, xenos::Endian index_endian, const draw_util::ViewportInfo& viewport_info, uint32_t used_texture_mask, - uint32_t color_mask) { + uint32_t normalized_color_mask) { #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES @@ -3161,7 +3146,7 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( // Get the mask for keeping previous color's components unmodified, // or two UINT32_MAX if no colors actually existing in the RT are written. DxbcShaderTranslator::ROV_GetColorFormatSystemConstants( - color_info.color_format, (color_mask >> (i * 4)) & 0b1111, + color_info.color_format, (normalized_color_mask >> (i * 4)) & 0b1111, rt_clamp[i][0], rt_clamp[i][1], rt_clamp[i][2], rt_clamp[i][3], rt_keep_masks[i][0], rt_keep_masks[i][1]); } diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index abdd1fbda..032673704 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -83,16 +83,6 @@ class D3D12CommandProcessor : public CommandProcessor { uint64_t GetCurrentFrame() const { return frame_current_; } uint64_t GetCompletedFrame() const { return frame_completed_; } - // Gets the current color write mask, taking the pixel shader's write mask - // into account. If a shader doesn't write to a render target, it shouldn't be - // written to and it shouldn't be even bound - otherwise, in 4D5307E6, one - // render target is being destroyed by a shader not writing anything, and in - // 58410955, the result of clearing the top tile is being ignored because - // there are 4 render targets bound with the same EDRAM base (clearly not - // correct usage), but the shader only clears 1, and then EDRAM buffer stores - // conflict with each other. - uint32_t GetCurrentColorMask(uint32_t shader_writes_color_targets) const; - void PushTransitionBarrier( ID3D12Resource* resource, D3D12_RESOURCE_STATES old_state, D3D12_RESOURCE_STATES new_state, @@ -362,7 +352,7 @@ class D3D12CommandProcessor : public CommandProcessor { xenos::Endian index_endian, const draw_util::ViewportInfo& viewport_info, uint32_t used_texture_mask, - uint32_t color_mask); + uint32_t normalized_color_mask); bool UpdateBindings(const D3D12Shader* vertex_shader, const D3D12Shader* pixel_shader, ID3D12RootSignature* root_signature); diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index 3a0cb3313..ff75f4151 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -2,7 +2,7 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2020 Ben Vanik. All rights reserved. * + * Copyright 2022 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. * ****************************************************************************** */ @@ -934,6 +934,7 @@ bool PipelineCache::ConfigurePipeline( D3D12Shader::D3D12Translation* vertex_shader, D3D12Shader::D3D12Translation* pixel_shader, const PrimitiveProcessor::ProcessingResult& primitive_processing_result, + uint32_t normalized_color_mask, uint32_t bound_depth_and_color_render_target_bits, const uint32_t* bound_depth_and_color_render_target_formats, void** pipeline_handle_out, ID3D12RootSignature** root_signature_out) { @@ -1005,7 +1006,7 @@ bool PipelineCache::ConfigurePipeline( PipelineRuntimeDescription runtime_description; if (!GetCurrentStateDescription( vertex_shader, pixel_shader, primitive_processing_result, - bound_depth_and_color_render_target_bits, + normalized_color_mask, bound_depth_and_color_render_target_bits, bound_depth_and_color_render_target_formats, runtime_description)) { return false; } @@ -1272,6 +1273,7 @@ bool PipelineCache::GetCurrentStateDescription( D3D12Shader::D3D12Translation* vertex_shader, D3D12Shader::D3D12Translation* pixel_shader, const PrimitiveProcessor::ProcessingResult& primitive_processing_result, + uint32_t normalized_color_mask, uint32_t bound_depth_and_color_render_target_bits, const uint32_t* bound_depth_and_color_render_target_formats, PipelineRuntimeDescription& runtime_description_out) { @@ -1547,10 +1549,6 @@ bool PipelineCache::GetCurrentStateDescription( // Render targets and blending state. 32 because of 0x1F mask, for safety // (all unknown to zero). - uint32_t color_mask = - pixel_shader ? command_processor_.GetCurrentColorMask( - pixel_shader->shader().writes_color_targets()) - : 0; static const PipelineBlendFactor kBlendFactorMap[32] = { /* 0 */ PipelineBlendFactor::kZero, /* 1 */ PipelineBlendFactor::kOne, @@ -1622,8 +1620,7 @@ bool PipelineCache::GetCurrentStateDescription( reg::RB_COLOR_INFO::rt_register_indices[i]); rt.format = xenos::ColorRenderTargetFormat( bound_depth_and_color_render_target_formats[1 + i]); - // TODO(Triang3l): Normalize unused bits of the color write mask. - rt.write_mask = (color_mask >> (i * 4)) & 0xF; + rt.write_mask = (normalized_color_mask >> (i * 4)) & 0xF; if (rt.write_mask) { auto blendcontrol = regs.Get( reg::RB_BLENDCONTROL::rt_register_indices[i]); @@ -2017,9 +2014,6 @@ ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline( } D3D12_RENDER_TARGET_BLEND_DESC& blend_desc = state_desc.BlendState.RenderTarget[i]; - // Treat 1 * src + 0 * dest as disabled blending (there are opaque - // surfaces drawn with blending enabled, but it's 1 * src + 0 * dest, in - // 415607E6 - GPU performance is better when not blending. if (rt.src_blend != PipelineBlendFactor::kOne || rt.dest_blend != PipelineBlendFactor::kZero || rt.blend_op != xenos::BlendOp::kAdd || diff --git a/src/xenia/gpu/d3d12/pipeline_cache.h b/src/xenia/gpu/d3d12/pipeline_cache.h index 290c47996..2f8f8b02b 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.h +++ b/src/xenia/gpu/d3d12/pipeline_cache.h @@ -82,6 +82,7 @@ class PipelineCache { D3D12Shader::D3D12Translation* vertex_shader, D3D12Shader::D3D12Translation* pixel_shader, const PrimitiveProcessor::ProcessingResult& primitive_processing_result, + uint32_t normalized_color_mask, uint32_t bound_depth_and_color_render_target_bits, const uint32_t* bound_depth_and_color_render_targets_formats, void** pipeline_handle_out, ID3D12RootSignature** root_signature_out); @@ -247,6 +248,7 @@ class PipelineCache { D3D12Shader::D3D12Translation* vertex_shader, D3D12Shader::D3D12Translation* pixel_shader, const PrimitiveProcessor::ProcessingResult& primitive_processing_result, + uint32_t normalized_color_mask, uint32_t bound_depth_and_color_render_target_bits, const uint32_t* bound_depth_and_color_render_target_formats, PipelineRuntimeDescription& runtime_description_out); diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index 83ea6601d..370ffeb43 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -2,7 +2,7 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2020 Ben Vanik. All rights reserved. * + * Copyright 2022 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. * ****************************************************************************** */ @@ -550,6 +550,49 @@ void GetScissor(const RegisterFile& regs, Scissor& scissor_out, scissor_out.extent[1] = uint32_t(br_y - tl_y); } +uint32_t GetNormalizedColorMask(const RegisterFile& regs, + uint32_t pixel_shader_writes_color_targets) { + if (regs.Get().edram_mode != + xenos::ModeControl::kColorDepth) { + return 0; + } + uint32_t normalized_color_mask = 0; + uint32_t rb_color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32; + for (uint32_t i = 0; i < xenos::kMaxColorRenderTargets; ++i) { + // Exclude the render targets not statically written to by the pixel shader. + // If the shader doesn't write to a render target, it shouldn't be written + // to, and no ownership transfers should happen to it on the host even - + // otherwise, in 4D5307E6, one render target is being destroyed by a shader + // not writing anything, and in 58410955, the result of clearing the top + // tile is being ignored because there are 4 render targets bound with the + // same EDRAM base (clearly not correct usage), but the shader only clears + // 1, and then ownership of EDRAM portions by host render targets is + // conflicting. + if (!(pixel_shader_writes_color_targets & (uint32_t(1) << i))) { + continue; + } + // Check if any existing component is written to. + uint32_t format_component_mask = + (uint32_t(1) << xenos::GetColorRenderTargetFormatComponentCount( + regs.Get( + reg::RB_COLOR_INFO::rt_register_indices[i]) + .color_format)) - + 1; + uint32_t rt_write_mask = (rb_color_mask >> (4 * i)) & format_component_mask; + if (!rt_write_mask) { + continue; + } + // Mark the non-existent components as written so in the host driver, no + // slow path (involving reading and merging components) is taken if the + // driver doesn't perform this check internally, and some components are not + // included in the mask even though they actually don't exist in the format. + rt_write_mask |= 0b1111 & ~format_component_mask; + // Add to the normalized mask. + normalized_color_mask |= rt_write_mask << (4 * i); + } + return normalized_color_mask; +} + xenos::CopySampleSelect SanitizeCopySampleSelect( xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples, bool is_depth) { diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index 39d430676..1474461ec 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -2,7 +2,7 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2020 Ben Vanik. All rights reserved. * + * Copyright 2022 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. * ****************************************************************************** */ @@ -186,6 +186,17 @@ struct Scissor { void GetScissor(const RegisterFile& regs, Scissor& scissor_out, bool clamp_to_surface_pitch = true); +// Returns the color component write mask for the draw command taking into +// account which color targets are written to by the pixel shader, as well as +// components that don't exist in the formats of the render targets (render +// targets with only non-existent components written are skipped, but +// non-existent components are forced to written if some existing components of +// the render target are actually used to make sure the host driver doesn't try +// to take a slow path involving reading and mixing if there are any disabled +// components even if they don't actually exist). +uint32_t GetNormalizedColorMask(const RegisterFile& regs, + uint32_t pixel_shader_writes_color_targets); + // Scales, and shift amounts of the upper 32 bits of the 32x32=64-bit // multiplication result, for fast division and multiplication by // EDRAM-tile-related amounts. diff --git a/src/xenia/gpu/render_target_cache.cc b/src/xenia/gpu/render_target_cache.cc index 9ee219648..9e1fe50ff 100644 --- a/src/xenia/gpu/render_target_cache.cc +++ b/src/xenia/gpu/render_target_cache.cc @@ -366,7 +366,7 @@ void RenderTargetCache::ClearCache() { void RenderTargetCache::BeginFrame() { ResetAccumulatedRenderTargets(); } bool RenderTargetCache::Update(bool is_rasterization_done, - uint32_t shader_writes_color_targets) { + uint32_t normalized_color_mask) { const RegisterFile& regs = register_file(); bool interlock_barrier_only = GetPath() == Path::kPixelShaderInterlock; @@ -419,9 +419,6 @@ bool RenderTargetCache::Update(bool is_rasterization_done, } } - uint32_t rts_remaining; - uint32_t rt_index; - // Get used render targets. // [0] is depth / stencil where relevant, [1...4] is color. // Depth / stencil testing / writing is before color in the pipeline. @@ -432,7 +429,7 @@ bool RenderTargetCache::Update(bool is_rasterization_done, uint32_t rts_are_64bpp = 0; uint32_t color_rts_are_gamma = 0; if (is_rasterization_done) { - auto rb_depthcontrol = regs.Get(); + auto rb_depthcontrol = draw_util::GetDepthControlForCurrentEdramMode(regs); if (rb_depthcontrol.z_enable || rb_depthcontrol.stencil_enable) { depth_and_color_rts_used_bits |= 1; auto rb_depth_info = regs.Get(); @@ -445,50 +442,46 @@ bool RenderTargetCache::Update(bool is_rasterization_done, resource_formats[0] = interlock_barrier_only ? 0 : uint32_t(rb_depth_info.depth_format); } - if (regs.Get().edram_mode == - xenos::ModeControl::kColorDepth) { - uint32_t rb_color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32; - rts_remaining = shader_writes_color_targets; - while (xe::bit_scan_forward(rts_remaining, &rt_index)) { - rts_remaining &= ~(uint32_t(1) << rt_index); - auto color_info = regs.Get( - reg::RB_COLOR_INFO::rt_register_indices[rt_index]); - xenos::ColorRenderTargetFormat color_format = - regs.Get( - reg::RB_COLOR_INFO::rt_register_indices[rt_index]) - .color_format; - if ((rb_color_mask >> (rt_index * 4)) & - ((uint32_t(1) << xenos::GetColorRenderTargetFormatComponentCount( - color_format)) - - 1)) { - uint32_t rt_bit_index = 1 + rt_index; - depth_and_color_rts_used_bits |= uint32_t(1) << rt_bit_index; - edram_bases[rt_bit_index] = - std::min(color_info.color_base, xenos::kEdramTileCount); - bool is_64bpp = xenos::IsColorRenderTargetFormat64bpp(color_format); - if (is_64bpp) { - rts_are_64bpp |= uint32_t(1) << rt_bit_index; - } - if (color_format == xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA) { - color_rts_are_gamma |= uint32_t(1) << rt_index; - } - xenos::ColorRenderTargetFormat color_resource_format; - if (interlock_barrier_only) { - // Only changes in mapping between coordinates and addresses are - // interesting (along with access overlap between draw calls), thus - // only pixel size is relevant. - color_resource_format = - is_64bpp ? xenos::ColorRenderTargetFormat::k_16_16_16_16 - : xenos::ColorRenderTargetFormat::k_8_8_8_8; - } else { - color_resource_format = GetColorResourceFormat( - xenos::GetStorageColorFormat(color_format)); - } - resource_formats[rt_bit_index] = uint32_t(color_resource_format); - } + for (uint32_t i = 0; i < xenos::kMaxColorRenderTargets; ++i) { + if (!(normalized_color_mask & (uint32_t(0b1111) << (4 * i)))) { + continue; } + auto color_info = regs.Get( + reg::RB_COLOR_INFO::rt_register_indices[i]); + uint32_t rt_bit_index = 1 + i; + depth_and_color_rts_used_bits |= uint32_t(1) << rt_bit_index; + edram_bases[rt_bit_index] = + std::min(color_info.color_base, xenos::kEdramTileCount); + xenos::ColorRenderTargetFormat color_format = + regs.Get( + reg::RB_COLOR_INFO::rt_register_indices[i]) + .color_format; + bool is_64bpp = xenos::IsColorRenderTargetFormat64bpp(color_format); + if (is_64bpp) { + rts_are_64bpp |= uint32_t(1) << rt_bit_index; + } + if (color_format == xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA) { + color_rts_are_gamma |= uint32_t(1) << i; + } + xenos::ColorRenderTargetFormat color_resource_format; + if (interlock_barrier_only) { + // Only changes in mapping between coordinates and addresses are + // interesting (along with access overlap between draw calls), thus only + // pixel size is relevant. + color_resource_format = + is_64bpp ? xenos::ColorRenderTargetFormat::k_16_16_16_16 + : xenos::ColorRenderTargetFormat::k_8_8_8_8; + } else { + color_resource_format = + GetColorResourceFormat(xenos::GetStorageColorFormat(color_format)); + } + resource_formats[rt_bit_index] = uint32_t(color_resource_format); } } + + uint32_t rts_remaining; + uint32_t rt_index; + // Eliminate other bound render targets if their EDRAM base conflicts with // another render target - it's an error in most host implementations to bind // the same render target into multiple slots, also the behavior would be diff --git a/src/xenia/gpu/render_target_cache.h b/src/xenia/gpu/render_target_cache.h index 263d4fe60..743946438 100644 --- a/src/xenia/gpu/render_target_cache.h +++ b/src/xenia/gpu/render_target_cache.h @@ -215,7 +215,7 @@ class RenderTargetCache { virtual void BeginFrame(); virtual bool Update(bool is_rasterization_done, - uint32_t shader_writes_color_targets); + uint32_t normalized_color_mask); // Returns bits where 0 is whether a depth render target is currently bound on // the host and 1... are whether the same applies to color render targets, and