From 75d805245d2fcfca69993d3dc01bb43697bd28c4 Mon Sep 17 00:00:00 2001
From: Triang3l
Date: Sun, 9 Apr 2023 20:13:22 +0300
Subject: [PATCH] [DXBC] `discard` pixels from `kill` with ROV instead of returning

Keep the current lane active as it may be needed for derivatives.
---
 src/xenia/gpu/dxbc_shader_translator.h      |  2 +
 src/xenia/gpu/dxbc_shader_translator_alu.cc | 75 +++++++++------------
 2 files changed, 32 insertions(+), 45 deletions(-)

diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h
index a75597011..bcb38a21f 100644
--- a/src/xenia/gpu/dxbc_shader_translator.h
+++ b/src/xenia/gpu/dxbc_shader_translator.h
@@ -917,6 +917,8 @@ class DxbcShaderTranslator : public ShaderTranslator {
         .SelectFromSwizzled(word_index & 1);
   }
 
+  void KillPixel(bool condition, const dxbc::Src& condition_src);
+
   void ProcessVectorAluOperation(const ParsedAluInstruction& instr,
                                  uint32_t& result_swizzle,
                                  bool& predicate_written);
diff --git a/src/xenia/gpu/dxbc_shader_translator_alu.cc b/src/xenia/gpu/dxbc_shader_translator_alu.cc
index 7331a7e2a..948406b90 100644
--- a/src/xenia/gpu/dxbc_shader_translator_alu.cc
+++ b/src/xenia/gpu/dxbc_shader_translator_alu.cc
@@ -19,6 +19,27 @@ namespace xe {
 namespace gpu {
 using namespace ucode;
 
+// Emits a conditional discard of the current pixel. `condition` and
+// `condition_src` are passed unchanged to OpDiscard and, with ROV, to the
+// OpIf guarding the coverage reset.
+void DxbcShaderTranslator::KillPixel(bool condition,
+                                     const dxbc::Src& condition_src) {
+  // Discard the pixel, but continue execution if other lanes in the quad need
+  // this lane for derivatives. The driver may also perform early exiting
+  // internally once all lanes are discarded, if it deems that beneficial.
+  a_.OpDiscard(condition, condition_src);
+  if (edram_rov_used_) {
+    // Even though discarding disables all subsequent UAV/ROV writes, also skip
+    // as much of the Render Backend emulation logic as possible by setting the
+    // coverage and the mask of the written render targets to zero. Do that
+    // only under the same condition as the discard, so that pixels that are
+    // not killed keep their coverage and are still written.
+    a_.OpIf(condition, condition_src);
+    a_.OpMov(dxbc::Dest::R(system_temp_rov_params_, 0b0001), dxbc::Src::LU(0));
+    a_.OpEndIf();
+  }
+}
+
 void DxbcShaderTranslator::ProcessVectorAluOperation(
     const ParsedAluInstruction& instr, uint32_t& result_swizzle,
     bool& predicate_written) {
@@ -492,11 +513,7 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
       a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001),
               dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
               dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY));
-      if (edram_rov_used_) {
-        a_.OpRetC(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
-      } else {
-        a_.OpDiscard(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
-      }
+      KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
       if (used_result_components) {
         a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001),
                  dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
@@ -512,11 +529,7 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
       a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001),
               dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
               dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY));
-      if (edram_rov_used_) {
-        a_.OpRetC(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
-      } else {
-        a_.OpDiscard(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
-      }
+      KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
       if (used_result_components) {
         a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001),
                  dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
@@ -532,11 +545,7 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
       a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001),
               dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
               dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY));
-      if (edram_rov_used_) {
-        a_.OpRetC(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
-      } else {
-        a_.OpDiscard(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
-      }
+      KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
       if (used_result_components) {
         a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001),
                  dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
@@ -552,11 +561,7 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
       a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001),
               dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
               dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY));
-      if (edram_rov_used_) {
-        a_.OpRetC(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
-      } else {
-        a_.OpDiscard(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
-      }
+      KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
       if (used_result_components) {
         a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001),
                  dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
@@ -952,47 +957,27 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(
 
     case AluScalarOpcode::kKillsEq:
       a_.OpEq(ps_dest, operand_0_a, dxbc::Src::LF(0.0f));
-      if (edram_rov_used_) {
-        a_.OpRetC(true, ps_src);
-      } else {
-        a_.OpDiscard(true, ps_src);
-      }
+      KillPixel(true, ps_src);
       a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
       break;
     case AluScalarOpcode::kKillsGt:
       a_.OpLT(ps_dest, dxbc::Src::LF(0.0f), operand_0_a);
-      if (edram_rov_used_) {
-        a_.OpRetC(true, ps_src);
-      } else {
-        a_.OpDiscard(true, ps_src);
-      }
+      KillPixel(true, ps_src);
       a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
       break;
     case AluScalarOpcode::kKillsGe:
       a_.OpGE(ps_dest, operand_0_a, dxbc::Src::LF(0.0f));
-      if (edram_rov_used_) {
-        a_.OpRetC(true, ps_src);
-      } else {
-        a_.OpDiscard(true, ps_src);
-      }
+      KillPixel(true, ps_src);
       a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
       break;
     case AluScalarOpcode::kKillsNe:
       a_.OpNE(ps_dest, operand_0_a, dxbc::Src::LF(0.0f));
-      if (edram_rov_used_) {
-        a_.OpRetC(true, ps_src);
-      } else {
-        a_.OpDiscard(true, ps_src);
-      }
+      KillPixel(true, ps_src);
       a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
       break;
     case AluScalarOpcode::kKillsOne:
       a_.OpEq(ps_dest, operand_0_a, dxbc::Src::LF(1.0f));
-      if (edram_rov_used_) {
-        a_.OpRetC(true, ps_src);
-      } else {
-        a_.OpDiscard(true, ps_src);
-      }
+      KillPixel(true, ps_src);
       a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
       break;
 