diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 1a2e07475..93750b2b3 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -2199,8 +2199,8 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( // Alpha test. int32_t alpha_test; - if (rb_colorcontrol & 0x8) { - uint32_t alpha_test_function = rb_colorcontrol & 0x7; + uint32_t alpha_test_function = rb_colorcontrol & 0x7; + if ((rb_colorcontrol & 0x8) && alpha_test_function != 0x7) { // 0: Never - fail in [-inf, +inf]. // 1: Less - fail in [ref, +inf]. // 2: Equal - pass in [ref, ref]. diff --git a/src/xenia/gpu/d3d12/d3d12_shader.h b/src/xenia/gpu/d3d12/d3d12_shader.h index 8a9a31a54..cbed15f9e 100644 --- a/src/xenia/gpu/d3d12/d3d12_shader.h +++ b/src/xenia/gpu/d3d12/d3d12_shader.h @@ -40,6 +40,17 @@ class D3D12Shader : public Shader { const DxbcShaderTranslator::SamplerBinding* sampler_bindings, uint32_t sampler_binding_count); + void SetForcedEarlyZShaderObject(const std::vector& shader_object) { + forced_early_z_shader_ = shader_object; + } + // Returns the shader with forced early depth/stencil set with + // SetForcedEarlyZShaderObject after translation. If there's none (for + // example, if the shader discards pixels or writes to the depth buffer), an + // empty vector is returned. 
+ const std::vector& GetForcedEarlyZShaderObject() const { + return forced_early_z_shader_; + } + bool DisassembleDxbc(const ui::d3d12::D3D12Provider* provider); static constexpr uint32_t kMaxTextureSRVIndexBits = @@ -78,9 +89,12 @@ class D3D12Shader : public Shader { private: PrimitiveType domain_shader_primitive_type_ = PrimitiveType::kNone; + std::vector texture_srvs_; uint32_t used_texture_mask_ = 0; std::vector sampler_bindings_; + + std::vector forced_early_z_shader_; }; } // namespace d3d12 diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index 51564a16f..695cbc039 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -331,6 +331,15 @@ bool PipelineCache::TranslateShader(D3D12Shader* shader, shader->ucode_disassembly().c_str()); } + // If it may be useful, create a version of the shader with early + // depth/stencil forced. + if (shader->type() == ShaderType::kPixel && !edram_rov_used_ && + shader->early_z_allowed()) { + shader->SetForcedEarlyZShaderObject( + DxbcShaderTranslator::ForceEarlyDepthStencil( + shader->translated_binary().data())); + } + // Disassemble the shader for dumping. if (FLAGS_d3d12_dxbc_disasm) { auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider(); @@ -569,6 +578,8 @@ bool PipelineCache::GetCurrentStateDescription( } if (!edram_rov_used_) { + uint32_t rb_colorcontrol = regs[XE_GPU_REG_RB_COLORCONTROL].u32; + // Depth/stencil. No stencil, always passing depth test and no depth writing // means depth disabled. if (render_targets[4].format != DXGI_FORMAT_UNKNOWN) { @@ -616,6 +627,16 @@ bool PipelineCache::GetCurrentStateDescription( description_out.depth_func = 0b111; } + // Forced early Z if the shader allows that and alpha testing is disabled. + // TODO(Triang3l): For memexporting shaders, possibly choose this according + // to the early Z toggle in RB_DEPTHCONTROL (the correct behavior is still + // unknown). 
+ if (pixel_shader != nullptr && + pixel_shader->GetForcedEarlyZShaderObject().size() != 0 && + (!(rb_colorcontrol & 0x8) || (rb_colorcontrol & 0x7) == 0x7)) { + description_out.force_early_z = 1; + } + // Render targets and blending state. 32 because of 0x1F mask, for safety // (all unknown to zero). uint32_t color_mask = command_processor_->GetCurrentColorMask(pixel_shader); @@ -695,7 +716,7 @@ bool PipelineCache::GetCurrentStateDescription( rt.format = RenderTargetCache::GetBaseColorFormat( ColorRenderTargetFormat((color_info >> 16) & 0xF)); rt.write_mask = (color_mask >> (guest_rt_index * 4)) & 0xF; - if (rt.write_mask) { + if (!(rb_colorcontrol & 0x20) && rt.write_mask) { rt.src_blend = kBlendFactorMap[blendcontrol & 0x1F]; rt.dest_blend = kBlendFactorMap[(blendcontrol >> 8) & 0x1F]; rt.blend_op = BlendOp((blendcontrol >> 5) & 0x7); @@ -874,10 +895,17 @@ ID3D12PipelineState* PipelineCache::CreatePipelineState( assert_always(); return nullptr; } - state_desc.PS.pShaderBytecode = - description.pixel_shader->translated_binary().data(); - state_desc.PS.BytecodeLength = - description.pixel_shader->translated_binary().size(); + const auto& forced_early_z_shader = + description.pixel_shader->GetForcedEarlyZShaderObject(); + if (description.force_early_z && forced_early_z_shader.size() != 0) { + state_desc.PS.pShaderBytecode = forced_early_z_shader.data(); + state_desc.PS.BytecodeLength = forced_early_z_shader.size(); + } else { + state_desc.PS.pShaderBytecode = + description.pixel_shader->translated_binary().data(); + state_desc.PS.BytecodeLength = + description.pixel_shader->translated_binary().size(); + } } else if (edram_rov_used_) { state_desc.PS.pShaderBytecode = depth_only_pixel_shader_.data(); state_desc.PS.BytecodeLength = depth_only_pixel_shader_.size(); diff --git a/src/xenia/gpu/d3d12/pipeline_cache.h b/src/xenia/gpu/d3d12/pipeline_cache.h index 6615400d3..f09b518d1 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.h +++ 
b/src/xenia/gpu/d3d12/pipeline_cache.h @@ -155,6 +155,7 @@ class PipelineCache { uint32_t depth_write : 1; // 21 uint32_t stencil_enable : 1; // 22 uint32_t stencil_read_mask : 8; // 30 + uint32_t force_early_z : 1; // 31 uint32_t stencil_write_mask : 8; // 8 uint32_t stencil_front_fail_op : 3; // 11 diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index b153fae79..5d4204f89 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -89,6 +89,55 @@ DxbcShaderTranslator::DxbcShaderTranslator(uint32_t vendor_id, } DxbcShaderTranslator::~DxbcShaderTranslator() = default; +std::vector DxbcShaderTranslator::ForceEarlyDepthStencil( + const uint8_t* shader) { + const uint32_t* old_shader = reinterpret_cast(shader); + + // To return something anyway even if patching fails. + std::vector new_shader; + uint32_t shader_size_bytes = old_shader[6]; + new_shader.resize(shader_size_bytes); + std::memcpy(new_shader.data(), shader, shader_size_bytes); + + // Find the SHEX chunk. + uint32_t chunk_count = old_shader[7]; + for (uint32_t i = 0; i < chunk_count; ++i) { + uint32_t chunk_offset_bytes = old_shader[8 + i]; + const uint32_t* chunk = old_shader + chunk_offset_bytes / sizeof(uint32_t); + if (chunk[0] != 'XEHS') { + continue; + } + // Find dcl_globalFlags and patch it. + uint32_t code_size_dwords = chunk[3]; + chunk += 4; + for (uint32_t j = 0; j < code_size_dwords;) { + uint32_t opcode_token = chunk[j]; + uint32_t opcode = DECODE_D3D10_SB_OPCODE_TYPE(opcode_token); + if (opcode == D3D10_SB_OPCODE_DCL_GLOBAL_FLAGS) { + opcode_token |= D3D11_SB_GLOBAL_FLAG_FORCE_EARLY_DEPTH_STENCIL; + std::memcpy(new_shader.data() + + (chunk_offset_bytes + (4 + j) * sizeof(uint32_t)), + &opcode_token, sizeof(uint32_t)); + // Recalculate the checksum since the shader was modified. 
+ CalculateDXBCChecksum( + reinterpret_cast(new_shader.data()), + shader_size_bytes, + reinterpret_cast(new_shader.data() + + sizeof(uint32_t))); + break; + } + if (opcode == D3D10_SB_OPCODE_CUSTOMDATA) { + j += chunk[j + 1]; + } else { + j += DECODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(opcode_token); + } + } + break; + } + + return new_shader; +} + std::vector DxbcShaderTranslator::CreateDepthOnlyPixelShader() { Reset(); is_depth_only_pixel_shader_ = true; @@ -4034,7 +4083,8 @@ void DxbcShaderTranslator::WriteShaderCode() { } // Don't allow refactoring when converting to native code to maintain position - // invariance (needed even in pixel shaders for oDepth invariance). + // invariance (needed even in pixel shaders for oDepth invariance). Also this + // dcl will be modified by ForceEarlyDepthStencil. shader_object_.push_back( ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_GLOBAL_FLAGS) | ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1)); diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index 0c893c567..c5d97c1fe 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -491,6 +491,10 @@ class DxbcShaderTranslator : public ShaderTranslator { kEDRAM, }; + // Creates a copy of the shader with early depth/stencil testing forced, + // overriding that alpha testing is used in the shader. + static std::vector ForceEarlyDepthStencil(const uint8_t* shader); + // Returns the bits that need to be added to the RT flags constant - needs to // be done externally, not in SetColorFormatConstants, because the flags // contain other state. diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index 78505e6a1..1ed7ac23e 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -607,6 +607,10 @@ class Shader { // Returns true if the given color target index [0-3]. 
bool writes_color_target(int i) const { return writes_color_targets_[i]; } + // Returns true if the pixel shader can potentially have early depth/stencil + // testing enabled, provided alpha testing is disabled. + bool early_z_allowed() const { return early_z_allowed_; } + // True if the shader was translated and prepared without error. bool is_valid() const { return is_valid_; } @@ -655,6 +659,7 @@ class Shader { std::vector texture_bindings_; ConstantRegisterMap constant_register_map_ = {0}; bool writes_color_targets_[4] = {false, false, false, false}; + bool early_z_allowed_ = true; std::vector memexport_stream_constants_; bool is_valid_ = false; diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index e829e665f..e22a8efea 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -65,6 +65,7 @@ void ShaderTranslator::Reset() { writes_color_targets_[i] = false; } writes_depth_ = false; + early_z_allowed_ = true; memexport_alloc_count_ = 0; memexport_eA_written_ = 0; std::memset(&memexport_eM_written_, 0, sizeof(memexport_eM_written_)); @@ -189,6 +190,7 @@ bool ShaderTranslator::TranslateInternal(Shader* shader) { for (size_t i = 0; i < xe::countof(writes_color_targets_); ++i) { shader->writes_color_targets_[i] = writes_color_targets_[i]; } + shader->early_z_allowed_ = early_z_allowed_; shader->memexport_stream_constants_.clear(); for (uint32_t memexport_stream_constant : memexport_stream_constants_) { shader->memexport_stream_constants_.push_back(memexport_stream_constant); @@ -288,6 +290,7 @@ void ShaderTranslator::GatherInstructionInformation( if (op.has_vector_op()) { const auto& opcode_info = alu_vector_opcode_infos_[static_cast(op.vector_opcode())]; + early_z_allowed_ &= !opcode_info.disable_early_z; for (size_t i = 0; i < opcode_info.argument_count; ++i) { if (op.src_is_temp(i + 1) && (op.src_reg(i + 1) & 0x40)) { uses_register_dynamic_addressing_ = true; @@ -299,6 +302,7 @@ void 
ShaderTranslator::GatherInstructionInformation( writes_color_targets_[op.vector_dest()] = true; } else if (op.vector_dest() == 61) { writes_depth_ = true; + early_z_allowed_ = false; } } if (memexport_alloc_count_ > 0 && @@ -335,6 +339,7 @@ void ShaderTranslator::GatherInstructionInformation( if (op.has_scalar_op()) { const auto& opcode_info = alu_scalar_opcode_infos_[static_cast(op.scalar_opcode())]; + early_z_allowed_ &= !opcode_info.disable_early_z; if (opcode_info.argument_count == 1 && op.src_is_temp(3) && (op.src_reg(3) & 0x40)) { uses_register_dynamic_addressing_ = true; @@ -345,6 +350,7 @@ void ShaderTranslator::GatherInstructionInformation( writes_color_targets_[op.scalar_dest()] = true; } else if (op.scalar_dest() == 61) { writes_depth_ = true; + early_z_allowed_ = false; } } if (memexport_alloc_count_ > 0 && @@ -1030,91 +1036,91 @@ void ShaderTranslator::ParseTextureFetchInstruction( const ShaderTranslator::AluOpcodeInfo ShaderTranslator::alu_vector_opcode_infos_[0x20] = { - {"add", 2, 4}, // 0 - {"mul", 2, 4}, // 1 - {"max", 2, 4}, // 2 - {"min", 2, 4}, // 3 - {"seq", 2, 4}, // 4 - {"sgt", 2, 4}, // 5 - {"sge", 2, 4}, // 6 - {"sne", 2, 4}, // 7 - {"frc", 1, 4}, // 8 - {"trunc", 1, 4}, // 9 - {"floor", 1, 4}, // 10 - {"mad", 3, 4}, // 11 - {"cndeq", 3, 4}, // 12 - {"cndge", 3, 4}, // 13 - {"cndgt", 3, 4}, // 14 - {"dp4", 2, 4}, // 15 - {"dp3", 2, 4}, // 16 - {"dp2add", 3, 4}, // 17 - {"cube", 2, 4}, // 18 - {"max4", 1, 4}, // 19 - {"setp_eq_push", 2, 4}, // 20 - {"setp_ne_push", 2, 4}, // 21 - {"setp_gt_push", 2, 4}, // 22 - {"setp_ge_push", 2, 4}, // 23 - {"kill_eq", 2, 4}, // 24 - {"kill_gt", 2, 4}, // 25 - {"kill_ge", 2, 4}, // 26 - {"kill_ne", 2, 4}, // 27 - {"dst", 2, 4}, // 28 - {"maxa", 2, 4}, // 29 + {"add", 2, 4, false}, // 0 + {"mul", 2, 4, false}, // 1 + {"max", 2, 4, false}, // 2 + {"min", 2, 4, false}, // 3 + {"seq", 2, 4, false}, // 4 + {"sgt", 2, 4, false}, // 5 + {"sge", 2, 4, false}, // 6 + {"sne", 2, 4, false}, // 7 + {"frc", 1, 4, 
false}, // 8 + {"trunc", 1, 4, false}, // 9 + {"floor", 1, 4, false}, // 10 + {"mad", 3, 4, false}, // 11 + {"cndeq", 3, 4, false}, // 12 + {"cndge", 3, 4, false}, // 13 + {"cndgt", 3, 4, false}, // 14 + {"dp4", 2, 4, false}, // 15 + {"dp3", 2, 4, false}, // 16 + {"dp2add", 3, 4, false}, // 17 + {"cube", 2, 4, false}, // 18 + {"max4", 1, 4, false}, // 19 + {"setp_eq_push", 2, 4, false}, // 20 + {"setp_ne_push", 2, 4, false}, // 21 + {"setp_gt_push", 2, 4, false}, // 22 + {"setp_ge_push", 2, 4, false}, // 23 + {"kill_eq", 2, 4, true}, // 24 + {"kill_gt", 2, 4, true}, // 25 + {"kill_ge", 2, 4, true}, // 26 + {"kill_ne", 2, 4, true}, // 27 + {"dst", 2, 4, false}, // 28 + {"maxa", 2, 4, false}, // 29 }; const ShaderTranslator::AluOpcodeInfo ShaderTranslator::alu_scalar_opcode_infos_[0x40] = { - {"adds", 1, 2}, // 0 - {"adds_prev", 1, 1}, // 1 - {"muls", 1, 2}, // 2 - {"muls_prev", 1, 1}, // 3 - {"muls_prev2", 1, 2}, // 4 - {"maxs", 1, 2}, // 5 - {"mins", 1, 2}, // 6 - {"seqs", 1, 1}, // 7 - {"sgts", 1, 1}, // 8 - {"sges", 1, 1}, // 9 - {"snes", 1, 1}, // 10 - {"frcs", 1, 1}, // 11 - {"truncs", 1, 1}, // 12 - {"floors", 1, 1}, // 13 - {"exp", 1, 1}, // 14 - {"logc", 1, 1}, // 15 - {"log", 1, 1}, // 16 - {"rcpc", 1, 1}, // 17 - {"rcpf", 1, 1}, // 18 - {"rcp", 1, 1}, // 19 - {"rsqc", 1, 1}, // 20 - {"rsqf", 1, 1}, // 21 - {"rsq", 1, 1}, // 22 - {"maxas", 1, 2}, // 23 - {"maxasf", 1, 2}, // 24 - {"subs", 1, 2}, // 25 - {"subs_prev", 1, 1}, // 26 - {"setp_eq", 1, 1}, // 27 - {"setp_ne", 1, 1}, // 28 - {"setp_gt", 1, 1}, // 29 - {"setp_ge", 1, 1}, // 30 - {"setp_inv", 1, 1}, // 31 - {"setp_pop", 1, 1}, // 32 - {"setp_clr", 1, 1}, // 33 - {"setp_rstr", 1, 1}, // 34 - {"kills_eq", 1, 1}, // 35 - {"kills_gt", 1, 1}, // 36 - {"kills_ge", 1, 1}, // 37 - {"kills_ne", 1, 1}, // 38 - {"kills_one", 1, 1}, // 39 - {"sqrt", 1, 1}, // 40 - {"UNKNOWN", 0, 0}, // 41 - {"mulsc", 2, 1}, // 42 - {"mulsc", 2, 1}, // 43 - {"addsc", 2, 1}, // 44 - {"addsc", 2, 1}, // 45 - {"subsc", 2, 1}, // 46 
- {"subsc", 2, 1}, // 47 - {"sin", 1, 1}, // 48 - {"cos", 1, 1}, // 49 - {"retain_prev", 1, 1}, // 50 + {"adds", 1, 2, false}, // 0 + {"adds_prev", 1, 1, false}, // 1 + {"muls", 1, 2, false}, // 2 + {"muls_prev", 1, 1, false}, // 3 + {"muls_prev2", 1, 2, false}, // 4 + {"maxs", 1, 2, false}, // 5 + {"mins", 1, 2, false}, // 6 + {"seqs", 1, 1, false}, // 7 + {"sgts", 1, 1, false}, // 8 + {"sges", 1, 1, false}, // 9 + {"snes", 1, 1, false}, // 10 + {"frcs", 1, 1, false}, // 11 + {"truncs", 1, 1, false}, // 12 + {"floors", 1, 1, false}, // 13 + {"exp", 1, 1, false}, // 14 + {"logc", 1, 1, false}, // 15 + {"log", 1, 1, false}, // 16 + {"rcpc", 1, 1, false}, // 17 + {"rcpf", 1, 1, false}, // 18 + {"rcp", 1, 1, false}, // 19 + {"rsqc", 1, 1, false}, // 20 + {"rsqf", 1, 1, false}, // 21 + {"rsq", 1, 1, false}, // 22 + {"maxas", 1, 2, false}, // 23 + {"maxasf", 1, 2, false}, // 24 + {"subs", 1, 2, false}, // 25 + {"subs_prev", 1, 1, false}, // 26 + {"setp_eq", 1, 1, false}, // 27 + {"setp_ne", 1, 1, false}, // 28 + {"setp_gt", 1, 1, false}, // 29 + {"setp_ge", 1, 1, false}, // 30 + {"setp_inv", 1, 1, false}, // 31 + {"setp_pop", 1, 1, false}, // 32 + {"setp_clr", 1, 1, false}, // 33 + {"setp_rstr", 1, 1, false}, // 34 + {"kills_eq", 1, 1, true}, // 35 + {"kills_gt", 1, 1, true}, // 36 + {"kills_ge", 1, 1, true}, // 37 + {"kills_ne", 1, 1, true}, // 38 + {"kills_one", 1, 1, true}, // 39 + {"sqrt", 1, 1, false}, // 40 + {"UNKNOWN", 0, 0, false}, // 41 + {"mulsc", 2, 1, false}, // 42 + {"mulsc", 2, 1, false}, // 43 + {"addsc", 2, 1, false}, // 44 + {"addsc", 2, 1, false}, // 45 + {"subsc", 2, 1, false}, // 46 + {"subsc", 2, 1, false}, // 47 + {"sin", 1, 1, false}, // 48 + {"cos", 1, 1, false}, // 49 + {"retain_prev", 1, 1, false}, // 50 }; void ShaderTranslator::TranslateAluInstruction(const AluInstruction& op) { diff --git a/src/xenia/gpu/shader_translator.h b/src/xenia/gpu/shader_translator.h index 9aacba3bb..48775ade9 100644 --- a/src/xenia/gpu/shader_translator.h +++ 
b/src/xenia/gpu/shader_translator.h @@ -58,6 +58,9 @@ class ShaderTranslator { bool writes_color_target(int i) const { return writes_color_targets_[i]; } // True if the current shader overrides the pixel depth. bool writes_depth() const { return writes_depth_; } + // True if the pixel shader can potentially have early depth/stencil testing + // enabled, provided alpha testing is disabled. + bool early_z_allowed() const { return early_z_allowed_; } // A list of all vertex bindings, populated before translation occurs. const std::vector& vertex_bindings() const { return vertex_bindings_; @@ -160,6 +163,7 @@ class ShaderTranslator { const char* name; size_t argument_count; int src_swizzle_component_count; + bool disable_early_z; }; bool TranslateInternal(Shader* shader); @@ -245,6 +249,7 @@ class ShaderTranslator { bool uses_register_dynamic_addressing_ = false; bool writes_color_targets_[4] = {false, false, false, false}; bool writes_depth_ = false; + bool early_z_allowed_ = true; uint32_t memexport_alloc_count_ = 0; // For register allocation in implementations - what was used after each