From d6188c5d7ebcd1c2ee2a014dcbc6accb83b5ad7a Mon Sep 17 00:00:00 2001
From: Triang3l
Date: Sun, 9 Jan 2022 14:58:38 +0300
Subject: [PATCH] [GPU] Reuse base+index*stride in vfetch_mini instead of
 reloading the index GPR

The wheel shader in 4D530910 does vfetch_full to r0 with the index from
r0.x, and then vfetch_mini. Thanks @Gliniak for the finding :3

Also a small formatting cleanup in commented-out code.
---
 src/xenia/gpu/dxbc_shader_translator.cc       |   4 +-
 src/xenia/gpu/dxbc_shader_translator.h        |   4 +-
 src/xenia/gpu/dxbc_shader_translator_fetch.cc | 124 +++++++++++-------
 src/xenia/gpu/shader.h                        |   7 +
 src/xenia/gpu/shader_translator.cc            |   2 +-
 src/xenia/gpu/shader_translator_disasm.cc     |   9 +-
 src/xenia/gpu/ucode.h                         |  19 ++-
 7 files changed, 103 insertions(+), 66 deletions(-)

diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc
index 9f52bf046..350ea6895 100644
--- a/src/xenia/gpu/dxbc_shader_translator.cc
+++ b/src/xenia/gpu/dxbc_shader_translator.cc
@@ -849,7 +849,7 @@ void DxbcShaderTranslator::StartTranslation() {
   system_temp_aL_ = PushSystemTemp(0b1111);
   system_temp_loop_count_ = PushSystemTemp(0b1111);
   system_temp_grad_h_lod_ = PushSystemTemp(0b1111);
-  system_temp_grad_v_ = PushSystemTemp(0b0111);
+  system_temp_grad_v_vfetch_address_ = PushSystemTemp(0b1111);
 
   // Zero general-purpose registers to prevent crashes when the game
   // references them after only initializing them conditionally.
@@ -1039,7 +1039,7 @@ void DxbcShaderTranslator::CompleteShaderCode() {
   // - system_temp_aL_.
   // - system_temp_loop_count_.
   // - system_temp_grad_h_lod_.
-  // - system_temp_grad_v_.
+  // - system_temp_grad_v_vfetch_address_.
   PopSystemTemp(6);
 
   // Write memexported data to the shared memory UAV.
diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h
index 3bdde19be..0a25cef21 100644
--- a/src/xenia/gpu/dxbc_shader_translator.h
+++ b/src/xenia/gpu/dxbc_shader_translator.h
@@ -1104,7 +1104,9 @@ class DxbcShaderTranslator : public ShaderTranslator {
   uint32_t system_temp_loop_count_;
   // Explicitly set texture gradients and LOD.
   uint32_t system_temp_grad_h_lod_;
-  uint32_t system_temp_grad_v_;
+  // .w stores `base + index * stride` in bytes from the last vfetch_full as
+  // it may be needed by vfetch_mini.
+  uint32_t system_temp_grad_v_vfetch_address_;
 
   // The bool constant number containing the condition for the currently
   // processed exec (or the last - unless a label has reset this), or
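Note on the register repurposing above: the vertical-gradient temp previously used only .xyz, leaving .w dead, which is why the zero-initialization mask grows from 0b0111 to 0b1111 once .w starts carrying the saved vfetch address. A minimal C++ sketch of the new layout, assuming (as the mask change suggests) that PushSystemTemp's argument selects the components to zero-initialize; the struct and constant names here are invented for illustration:

    #include <cstdint>

    // Hypothetical mirror of what the renamed system temp now holds.
    struct GradVVfetchAddressTemp {
      float grad_v[3];          // .xyz: explicitly set vertical texture gradients
      uint32_t vfetch_address;  // .w: base + index * stride from the last
                                //     vfetch_full, in bytes
    };

    constexpr uint32_t kZeroInitMaskOld = 0b0111;  // only .xyz were live
    constexpr uint32_t kZeroInitMaskNew = 0b1111;  // .w is now live as well
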
diff --git a/src/xenia/gpu/dxbc_shader_translator_fetch.cc b/src/xenia/gpu/dxbc_shader_translator_fetch.cc
index 192b29a33..63480b76c 100644
--- a/src/xenia/gpu/dxbc_shader_translator_fetch.cc
+++ b/src/xenia/gpu/dxbc_shader_translator_fetch.cc
@@ -59,47 +59,67 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
   // fetch constants on the CPU when proper bound checks are added - vfetch
   // may be conditional, so fetch constants may also be used conditionally.
 
-  // - Load the byte address in physical memory to system_temp_result_.w (so
-  //   it's not overwritten by data loads until the last one).
-  dxbc::Dest address_dest(dxbc::Dest::R(system_temp_result_, 0b1000));
-  dxbc::Src address_src(dxbc::Src::R(system_temp_result_, dxbc::Src::kWWWW));
-  if (instr.attributes.stride) {
-    // Convert the index to an integer by flooring or by rounding to the
-    // nearest (as floor(index + 0.5) because rounding to the nearest even
-    // makes no sense for addressing, both 1.5 and 2.5 would be 2).
-    // http://web.archive.org/web/20100302145413/http://msdn.microsoft.com:80/en-us/library/bb313960.aspx
-    {
-      bool index_operand_temp_pushed = false;
-      dxbc::Src index_operand(
-          LoadOperand(instr.operands[0], 0b0001, index_operand_temp_pushed)
-              .SelectFromSwizzled(0));
-      if (instr.attributes.is_index_rounded) {
-        a_.OpAdd(address_dest, index_operand, dxbc::Src::LF(0.5f));
-        a_.OpRoundNI(address_dest, address_src);
-      } else {
-        a_.OpRoundNI(address_dest, index_operand);
-      }
-      if (index_operand_temp_pushed) {
-        PopSystemTemp();
-      }
-    }
-    a_.OpFToI(address_dest, address_src);
-    // Extract the byte address from the fetch constant to
-    // system_temp_result_.z.
-    a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0100),
-             fetch_constant_src.SelectFromSwizzled(0),
-             dxbc::Src::LU(~uint32_t(3)));
-    // Merge the index and the base address.
-    a_.OpIMAd(address_dest, address_src,
-              dxbc::Src::LU(instr.attributes.stride * sizeof(uint32_t)),
-              dxbc::Src::R(system_temp_result_, dxbc::Src::kZZZZ));
-  } else {
-    // Fetching from the same location - extract the byte address of the
-    // beginning of the buffer.
-    a_.OpAnd(address_dest, fetch_constant_src.SelectFromSwizzled(0),
-             dxbc::Src::LU(~uint32_t(3)));
-  }
+  // - Load the part of the byte address in the physical memory that is the
+  //   same in a vfetch_full and its vfetch_mini invocations to
+  //   system_temp_grad_v_vfetch_address_.w. The index operand GPR must not
+  //   be reloaded in vfetch_mini because it may have been overwritten since
+  //   the vfetch_full, but that shouldn't have an effect on the address the
+  //   vfetch_mini actually fetches from.
+  dxbc::Src address_src(
+      dxbc::Src::R(system_temp_grad_v_vfetch_address_, dxbc::Src::kWWWW));
+  if (!instr.is_mini_fetch) {
+    dxbc::Dest address_dest(
+        dxbc::Dest::R(system_temp_grad_v_vfetch_address_, 0b1000));
+    if (instr.attributes.stride) {
+      // Convert the index to an integer by flooring or by rounding to the
+      // nearest (as floor(index + 0.5) because rounding to the nearest even
+      // makes no sense for addressing, both 1.5 and 2.5 would be 2).
+      {
+        bool index_operand_temp_pushed = false;
+        dxbc::Src index_operand(
+            LoadOperand(instr.operands[0], 0b0001, index_operand_temp_pushed)
+                .SelectFromSwizzled(0));
+        if (instr.attributes.is_index_rounded) {
+          a_.OpAdd(address_dest, index_operand, dxbc::Src::LF(0.5f));
+          a_.OpRoundNI(address_dest, address_src);
+        } else {
+          a_.OpRoundNI(address_dest, index_operand);
+        }
+        if (index_operand_temp_pushed) {
+          PopSystemTemp();
+        }
+      }
+      a_.OpFToI(address_dest, address_src);
+      // Extract the byte address from the fetch constant to
+      // system_temp_result_.w (which is not used yet).
+      a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b1000),
+               fetch_constant_src.SelectFromSwizzled(0),
+               dxbc::Src::LU(~uint32_t(3)));
+      // Merge the index and the base address.
+      a_.OpIMAd(address_dest, address_src,
+                dxbc::Src::LU(instr.attributes.stride * sizeof(uint32_t)),
+                dxbc::Src::R(system_temp_result_, dxbc::Src::kWWWW));
+    } else {
+      // Fetching from the same location - extract the byte address of the
+      // beginning of the buffer.
+      a_.OpAnd(address_dest, fetch_constant_src.SelectFromSwizzled(0),
+               dxbc::Src::LU(~uint32_t(3)));
+    }
+  }
+
+  dxbc::Dest address_temp_dest(dxbc::Dest::R(system_temp_result_, 0b1000));
+  dxbc::Src address_temp_src(
+      dxbc::Src::R(system_temp_result_, dxbc::Src::kWWWW));
+
+  // - From now on, if any additional offset must be applied to the
+  //   `base + index * stride` part of the address, it must be done by
+  //   writing to system_temp_result_.w (address_temp_dest) instead of
+  //   system_temp_grad_v_vfetch_address_.w (which must stay the same across
+  //   the vfetch_full and all its vfetch_mini invocations), and by changing
+  //   address_src to address_temp_src afterwards. system_temp_result_.w can
+  //   be used for this purpose safely because it won't be overwritten until
+  //   the last dword is loaded (after which the address won't be needed
+  //   anymore).
 
   // Add the word offset from the instruction (signed), plus the offset of the
   // first needed word within the element.
   uint32_t first_word_index;
@@ -108,8 +128,9 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
       instr.attributes.offset + int32_t(first_word_index);
   if (first_word_buffer_offset) {
     // Add the constant word offset.
-    a_.OpIAdd(address_dest, address_src,
+    a_.OpIAdd(address_temp_dest, address_src,
               dxbc::Src::LI(first_word_buffer_offset * sizeof(uint32_t)));
+    address_src = address_temp_src;
   }
 
   // - Load needed words to system_temp_result_, words 0, 1, 2, 3 to X, Y, Z, W
@@ -159,9 +180,10 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
           ~((uint32_t(1) << (word_index + word_count)) - uint32_t(1));
       if (word_index != word_index_previous) {
         // Go to the word in the buffer.
-        a_.OpIAdd(address_dest, address_src,
+        a_.OpIAdd(address_temp_dest, address_src,
                   dxbc::Src::LU((word_index - word_index_previous) * sizeof(uint32_t)));
+        address_src = address_temp_src;
        word_index_previous = word_index;
       }
       // Can ld_raw either to the first multiple components, or to any scalar
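For reference, the value that this hunk saves once per vfetch_full in system_temp_grad_v_vfetch_address_.w is, in scalar terms, the following. This is a minimal C++ sketch of the math the emitted DXBC performs, not Xenia code; the helper name and parameters are invented, the stride is in dwords, the base comes from the first fetch-constant dword with its two low bits masked off, and when the stride is zero only the base extraction runs:

    #include <cmath>
    #include <cstdint>

    uint32_t SharedVfetchByteAddress(float index, bool is_index_rounded,
                                     uint32_t fetch_constant_dword0,
                                     uint32_t stride_dwords) {
      // Round to the nearest as floor(index + 0.5) - rounding to the nearest
      // even would map both 1.5 and 2.5 to 2, which makes no sense for
      // addressing - or simply floor.
      float index_floored = is_index_rounded ? std::floor(index + 0.5f)
                                             : std::floor(index);
      // ftoi, base extraction (and), then merge (imad), as translated above.
      int32_t index_int = int32_t(index_floored);
      uint32_t base_bytes = fetch_constant_dword0 & ~uint32_t(3);
      return base_bytes + uint32_t(index_int) * stride_dwords *
                              uint32_t(sizeof(uint32_t));
    }

A vfetch_mini never reruns this computation; its own dword offset is applied on top of the saved value in the system_temp_result_.w scratch, as the two OpIAdd changes above show.
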
@@ -592,7 +614,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
     case FetchOpcode::kSetTextureGradientsVert: {
       bool grad_operand_temp_pushed = false;
       a_.OpMov(
-          dxbc::Dest::R(system_temp_grad_v_, 0b0111),
+          dxbc::Dest::R(system_temp_grad_v_vfetch_address_, 0b0111),
           LoadOperand(instr.operands[0], 0b0111, grad_operand_temp_pushed));
       if (grad_operand_temp_pushed) {
         PopSystemTemp();
       }
@@ -1521,15 +1543,15 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
         // Extract gradient exponent biases from the fetch constant and merge
         // them with the LOD bias.
         a_.OpIBFE(dxbc::Dest::R(grad_h_lod_temp, 0b0011), dxbc::Src::LU(5),
-                 dxbc::Src::LU(22, 27, 0, 0),
-                 RequestTextureFetchConstantWord(tfetch_index, 4));
+                  dxbc::Src::LU(22, 27, 0, 0),
+                  RequestTextureFetchConstantWord(tfetch_index, 4));
         a_.OpIMAd(dxbc::Dest::R(grad_h_lod_temp, 0b0011),
-                 dxbc::Src::R(grad_h_lod_temp), dxbc::Src::LI(int32_t(1) << 23),
-                 dxbc::Src::LF(1.0f));
+                  dxbc::Src::R(grad_h_lod_temp),
+                  dxbc::Src::LI(int32_t(1) << 23), dxbc::Src::LF(1.0f));
         a_.OpMul(dxbc::Dest::R(grad_v_temp, 0b1000), lod_src,
-                dxbc::Src::R(grad_h_lod_temp, dxbc::Src::kYYYY));
+                 dxbc::Src::R(grad_h_lod_temp, dxbc::Src::kYYYY));
         a_.OpMul(lod_dest, lod_src,
-                dxbc::Src::R(grad_h_lod_temp, dxbc::Src::kXXXX));
+                 dxbc::Src::R(grad_h_lod_temp, dxbc::Src::kXXXX));
 #endif
         // Obtain the gradients and apply biases to them.
         if (instr.attributes.use_register_gradients) {
@@ -1540,11 +1562,11 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
           // done in getCompTexLOD, so don't do it here too.
 #if 0
           a_.OpMul(dxbc::Dest::R(grad_v_temp, grad_mask),
-                  dxbc::Src::R(system_temp_grad_v_),
-                  dxbc::Src::R(grad_v_temp, dxbc::Src::kWWWW));
+                   dxbc::Src::R(system_temp_grad_v_vfetch_address_),
+                   dxbc::Src::R(grad_v_temp, dxbc::Src::kWWWW));
 #else
           a_.OpMul(dxbc::Dest::R(grad_v_temp, grad_mask),
-                  dxbc::Src::R(system_temp_grad_v_), lod_src);
+                   dxbc::Src::R(system_temp_grad_v_vfetch_address_), lod_src);
 #endif
           // TODO(Triang3l): Are cube map register gradients unnormalized if
           // the coordinates themselves are unnormalized?
@@ -1586,8 +1608,8 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
           // done in getCompTexLOD, so don't do it here too.
 #if 0
           a_.OpMul(dxbc::Dest::R(grad_v_temp, grad_mask),
-                  dxbc::Src::R(grad_v_temp),
-                  dxbc::Src::R(grad_v_temp, dxbc::Src::kWWWW));
+                   dxbc::Src::R(grad_v_temp),
+                   dxbc::Src::R(grad_v_temp, dxbc::Src::kWWWW));
 #else
           a_.OpMul(dxbc::Dest::R(grad_v_temp, grad_mask),
                    dxbc::Src::R(grad_v_temp), lod_src);
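The pattern this fixes, reconstructed from the commit message (hypothetical disassembly for illustration, not an actual dump from 4D530910):

    vfetch_full r0.xyzw, r0.x, vf95, RoundIndex=true
    vfetch_mini r1.xyzw

The vfetch_full writes its result to r0, clobbering the r0.x it took the index from; a vfetch_mini that reloaded r0.x would therefore compute a garbage address, while reusing the saved base + index * stride gives the correct one.
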
diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h
index f7c52cab0..2ce81409a 100644
--- a/src/xenia/gpu/shader.h
+++ b/src/xenia/gpu/shader.h
@@ -440,6 +440,13 @@ struct ParsedVertexFetchInstruction {
   // Number of source operands.
   size_t operand_count = 0;
   // Describes each source operand.
+  // Note that for vfetch_mini, which inherits the operands from vfetch_full,
+  // the index operand register may have been overwritten between the
+  // vfetch_full and the vfetch_mini (happens in 4D530910 for wheels), but
+  // that should have no effect on the index actually used for fetching. A
+  // copy of the index therefore must be stored by vfetch_full (the base
+  // address, stride and rounding may be pre-applied to it since they will be
+  // the same in the vfetch_full and all its vfetch_mini instructions).
   InstructionOperand operands[2];
 
   struct Attributes {
diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc
index b1d1a060e..612edbfbf 100644
--- a/src/xenia/gpu/shader_translator.cc
+++ b/src/xenia/gpu/shader_translator.cc
@@ -876,7 +876,7 @@ bool ParseVertexFetchInstruction(const VertexFetchInstruction& op,
   instr.attributes.stride = full_op.stride();
   instr.attributes.exp_adjust = op.exp_adjust();
   instr.attributes.prefetch_count = op.prefetch_count();
-  instr.attributes.is_index_rounded = op.is_index_rounded();
+  instr.attributes.is_index_rounded = full_op.is_index_rounded();
   instr.attributes.is_signed = op.is_signed();
   instr.attributes.is_integer = !op.is_normalized();
   instr.attributes.signed_rf_mode = op.signed_rf_mode();
diff --git a/src/xenia/gpu/shader_translator_disasm.cc b/src/xenia/gpu/shader_translator_disasm.cc
index cf7e94d52..8dd72413a 100644
--- a/src/xenia/gpu/shader_translator_disasm.cc
+++ b/src/xenia/gpu/shader_translator_disasm.cc
@@ -328,13 +328,12 @@ void ParsedVertexFetchInstruction::Disassemble(StringBuffer* out) const {
   if (!is_mini_fetch) {
     out->Append(", ");
     DisassembleSourceOperand(operands[0], out);
-    out->Append(", ");
-    out->AppendFormat("vf{}", 95 - operands[1].storage_index);
+    out->AppendFormat(", vf{}", 95 - operands[1].storage_index);
+    if (attributes.is_index_rounded) {
+      out->Append(", RoundIndex=true");
+    }
   }
 
-  if (attributes.is_index_rounded) {
-    out->Append(", RoundIndex=true");
-  }
   if (attributes.exp_adjust) {
     out->AppendFormat(", ExpAdjust={}", attributes.exp_adjust);
   }
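The disassembler change above means the RoundIndex flag is now printed only for vfetch_full, where the underlying bit is actually meaningful (hypothetical before/after output for a mini fetch whose stale is_index_rounded bit happened to be set):

    before: vfetch_mini r1.xyzw, RoundIndex=true
    after:  vfetch_mini r1.xyzw

This matches the parsing change in shader_translator.cc, where attributes.is_index_rounded is now always taken from the owning vfetch_full (full_op) rather than from the instruction being parsed.
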
diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h
index 798fd5367..e86387535 100644
--- a/src/xenia/gpu/ucode.h
+++ b/src/xenia/gpu/ucode.h
@@ -599,6 +599,8 @@ struct alignas(uint32_t) VertexFetchInstruction {
   // Required condition value of the comparison (true or false).
   bool predicate_condition() const { return data_.pred_condition == 1; }
   // Vertex fetch constant index [0-95].
+  // Applicable only to vfetch_full (the address from vfetch_full is reused
+  // in vfetch_mini).
   uint32_t fetch_constant_index() const {
     return data_.const_index * 3 + data_.const_index_sel;
   }
@@ -606,6 +608,8 @@ struct alignas(uint32_t) VertexFetchInstruction {
   uint32_t dest() const { return data_.dst_reg; }
   uint32_t dest_swizzle() const { return data_.dst_swiz; }
   bool is_dest_relative() const { return data_.dst_reg_am; }
+  // The source is applicable only to vfetch_full (the address from
+  // vfetch_full is reused in vfetch_mini).
   uint32_t src() const { return data_.src_reg; }
   uint32_t src_swizzle() const { return data_.src_swiz; }
   bool is_src_relative() const { return data_.src_reg_am; }
@@ -644,18 +648,21 @@ struct alignas(uint32_t) VertexFetchInstruction {
   xenos::SignedRepeatingFractionMode signed_rf_mode() const {
     return data_.signed_rf_mode_all;
   }
+  // If true, the floating-point index is rounded to the nearest integer
+  // (likely as floor(index + 0.5) because rounding to the nearest even makes
+  // no sense for addressing, both 1.5 and 2.5 would be 2). Otherwise, it's
+  // floored (rounded towards negative infinity).
+  // Applicable only to vfetch_full (the address from vfetch_full is reused
+  // in vfetch_mini).
+  // http://web.archive.org/web/20090914055358/http://msdn.microsoft.com/en-us/library/bb313960.aspx
   bool is_index_rounded() const { return data_.is_index_rounded == 1; }
   // Dword stride, [0, 255].
+  // Applicable only to vfetch_full (the address from vfetch_full is reused
+  // in vfetch_mini).
   uint32_t stride() const { return data_.stride; }
   // Dword offset, [-4194304, 4194303].
   int32_t offset() const { return data_.offset; }
 
-  void AssignFromFull(const VertexFetchInstruction& full) {
-    data_.stride = full.data_.stride;
-    data_.const_index = full.data_.const_index;
-    data_.const_index_sel = full.data_.const_index_sel;
-  }
-
  private:
   struct Data {
     struct {
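Taken together, the parsing convention the patch relies on can be summarized as follows (a simplified C++ sketch of the relevant ParseVertexFetchInstruction assignments, abridged to the address-related attributes; op is the instruction being parsed, full_op the preceding vfetch_full, and for a full fetch the two are the same instruction):

    // Everything feeding the shared `base + index * stride` comes from the
    // vfetch_full; only the dword offset is per-instruction.
    instr.attributes.stride = full_op.stride();
    instr.attributes.is_index_rounded = full_op.is_index_rounded();  // the fix
    instr.attributes.offset = op.offset();

With the address computed once by the vfetch_full and nothing else read from a mini fetch's address fields, copying the stride and fetch constant bits into the mini instruction via AssignFromFull is presumably no longer needed anywhere, which is why the helper is removed.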