[GPU] Reuse base+index*stride in vfetch_mini instead of reloading the index GPR

The wheel shader in 4D530910 does a vfetch_full to r0 with the index taken from r0.x (so the fetch overwrites the very register the index came from), and then a vfetch_mini.
Thanks @Gliniak for the finding :3
Also small formatting cleanup in commented-out code.
Triang3l 2022-01-09 14:58:38 +03:00
parent 600c14b3f0
commit d6188c5d7e
7 changed files with 103 additions and 66 deletions
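
To make the addressing concrete, here is a minimal standalone C++ sketch of the scheme this commit implements (illustrative names, not xenia's actual translator API): vfetch_full converts the index and caches base + index * stride once, and every following vfetch_mini derives its address from that cached value instead of re-reading the index GPR, which the full fetch may already have clobbered.

    #include <cmath>
    #include <cstdint>

    // Illustrative model of the vfetch addressing; all values in bytes.
    struct VfetchState {
      uint32_t full_address;  // base + index * stride, cached by vfetch_full
    };

    // vfetch_full: convert the floating-point index from the GPR to an integer
    // (floor, or floor(index + 0.5) when rounding is requested) and cache the
    // part of the address shared with any following vfetch_mini.
    void VfetchFull(VfetchState& state, uint32_t fetch_constant_base,
                    float index_gpr, uint32_t stride_bytes, bool index_rounded) {
      float index = std::floor(index_rounded ? index_gpr + 0.5f : index_gpr);
      uint32_t base = fetch_constant_base & ~uint32_t(3);  // dword-aligned base
      state.full_address = base + uint32_t(int32_t(index)) * stride_bytes;
      // The fetched data may now overwrite the GPR the index was read from -
      // exactly what the 4D530910 wheel shader does with r0.
    }

    // vfetch_mini: apply only its own word offset to the cached address; the
    // index GPR is never touched again.
    uint32_t VfetchMiniAddress(const VfetchState& state, int32_t offset_words) {
      return state.full_address + uint32_t(offset_words) * 4u;
    }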


@@ -849,7 +849,7 @@ void DxbcShaderTranslator::StartTranslation() {
system_temp_aL_ = PushSystemTemp(0b1111);
system_temp_loop_count_ = PushSystemTemp(0b1111);
system_temp_grad_h_lod_ = PushSystemTemp(0b1111);
system_temp_grad_v_ = PushSystemTemp(0b0111);
system_temp_grad_v_vfetch_address_ = PushSystemTemp(0b1111);
// Zero general-purpose registers to prevent crashes when the game
// references them after only initializing them conditionally.
@@ -1039,7 +1039,7 @@ void DxbcShaderTranslator::CompleteShaderCode() {
// - system_temp_aL_.
// - system_temp_loop_count_.
// - system_temp_grad_h_lod_.
// - system_temp_grad_v_.
// - system_temp_grad_v_vfetch_address_.
PopSystemTemp(6);
// Write memexported data to the shared memory UAV.
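
For orientation, PushSystemTemp and PopSystemTemp act as a stack of r# registers, so the 6 passed to PopSystemTemp must stay in sync with the pushes in StartTranslation; the rename is therefore mirrored in both lists. A minimal sketch of the assumed discipline (illustrative, not the real allocator):

    #include <cstdint>

    // Illustrative stack discipline: every Push in StartTranslation must be
    // balanced by the count given to Pop in CompleteShaderCode, otherwise
    // later allocations would alias registers that are still live.
    class SystemTempStack {
      uint32_t count_ = 0;

     public:
      // Reserve the next r# register; the mask selects components to zero.
      uint32_t Push(uint32_t zero_mask) { (void)zero_mask; return count_++; }
      // Release the `count` most recently reserved registers.
      void Pop(uint32_t count) { count_ -= count; }
    };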


@@ -1104,7 +1104,9 @@ class DxbcShaderTranslator : public ShaderTranslator {
uint32_t system_temp_loop_count_;
// Explicitly set texture gradients and LOD.
uint32_t system_temp_grad_h_lod_;
uint32_t system_temp_grad_v_;
// .w stores `base + index * stride` in bytes from the last vfetch_full as it
// may be needed by vfetch_mini.
uint32_t system_temp_grad_v_vfetch_address_;
// The bool constant number containing the condition for the currently
// processed exec (or the last - unless a label has reset this), or
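
Since the explicitly set gradients only occupy .xyz, .w is free to carry the cached fetch address; conceptually (illustrative layout only):

    // Conceptual contents of system_temp_grad_v_vfetch_address_:
    struct GradVVfetchAddress {
      float grad_v[3];          // .xyz: explicitly set vertical gradients
      uint32_t vfetch_address;  // .w: base + index * stride, in bytes, from
                                //     the last vfetch_full, for vfetch_mini
    };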


@@ -59,47 +59,67 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
// fetch constants on the CPU when proper bound checks are added - vfetch may
// be conditional, so fetch constants may also be used conditionally.
// - Load the byte address in physical memory to system_temp_result_.w (so
// it's not overwritten by data loads until the last one).
// - Load the part of the byte address in the physical memory that is the same
// in vfetch_full and vfetch_mini to system_temp_grad_v_vfetch_address_.w
// (the index operand GPR must not be reloaded in vfetch_mini because it
// might have been overwritten since the vfetch_full, and that must not
// affect the address vfetch_mini fetches from).
dxbc::Dest address_dest(dxbc::Dest::R(system_temp_result_, 0b1000));
dxbc::Src address_src(dxbc::Src::R(system_temp_result_, dxbc::Src::kWWWW));
if (instr.attributes.stride) {
// Convert the index to an integer by flooring or by rounding to the nearest
// (as floor(index + 0.5) because rounding to the nearest even makes no
// sense for addressing, both 1.5 and 2.5 would be 2).
// http://web.archive.org/web/20100302145413/http://msdn.microsoft.com:80/en-us/library/bb313960.aspx
{
bool index_operand_temp_pushed = false;
dxbc::Src index_operand(
LoadOperand(instr.operands[0], 0b0001, index_operand_temp_pushed)
.SelectFromSwizzled(0));
if (instr.attributes.is_index_rounded) {
a_.OpAdd(address_dest, index_operand, dxbc::Src::LF(0.5f));
a_.OpRoundNI(address_dest, address_src);
} else {
a_.OpRoundNI(address_dest, index_operand);
}
if (index_operand_temp_pushed) {
PopSystemTemp();
dxbc::Src address_src(
dxbc::Src::R(system_temp_grad_v_vfetch_address_, dxbc::Src::kWWWW));
if (!instr.is_mini_fetch) {
dxbc::Dest address_dest(
dxbc::Dest::R(system_temp_grad_v_vfetch_address_, 0b1000));
if (instr.attributes.stride) {
// Convert the index to an integer by flooring or by rounding to the
// nearest (as floor(index + 0.5) because rounding to the nearest even
// makes no sense for addressing, both 1.5 and 2.5 would be 2).
{
bool index_operand_temp_pushed = false;
dxbc::Src index_operand(
LoadOperand(instr.operands[0], 0b0001, index_operand_temp_pushed)
.SelectFromSwizzled(0));
if (instr.attributes.is_index_rounded) {
a_.OpAdd(address_dest, index_operand, dxbc::Src::LF(0.5f));
a_.OpRoundNI(address_dest, address_src);
} else {
a_.OpRoundNI(address_dest, index_operand);
}
if (index_operand_temp_pushed) {
PopSystemTemp();
}
}
a_.OpFToI(address_dest, address_src);
// Extract the byte address from the fetch constant to
// system_temp_result_.w (which is not used yet).
a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b1000),
fetch_constant_src.SelectFromSwizzled(0),
dxbc::Src::LU(~uint32_t(3)));
// Merge the index and the base address.
a_.OpIMAd(address_dest, address_src,
dxbc::Src::LU(instr.attributes.stride * sizeof(uint32_t)),
dxbc::Src::R(system_temp_result_, dxbc::Src::kWWWW));
} else {
// Fetching from the same location - extract the byte address of the
// beginning of the buffer.
a_.OpAnd(address_dest, fetch_constant_src.SelectFromSwizzled(0),
dxbc::Src::LU(~uint32_t(3)));
}
a_.OpFToI(address_dest, address_src);
// Extract the byte address from the fetch constant to
// system_temp_result_.z.
a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0100),
fetch_constant_src.SelectFromSwizzled(0),
dxbc::Src::LU(~uint32_t(3)));
// Merge the index and the base address.
a_.OpIMAd(address_dest, address_src,
dxbc::Src::LU(instr.attributes.stride * sizeof(uint32_t)),
dxbc::Src::R(system_temp_result_, dxbc::Src::kZZZZ));
} else {
// Fetching from the same location - extract the byte address of the
// beginning of the buffer.
a_.OpAnd(address_dest, fetch_constant_src.SelectFromSwizzled(0),
dxbc::Src::LU(~uint32_t(3)));
}
dxbc::Dest address_temp_dest(dxbc::Dest::R(system_temp_result_, 0b1000));
dxbc::Src address_temp_src(
dxbc::Src::R(system_temp_result_, dxbc::Src::kWWWW));
// - From now on, if any additional offset must be applied to the
// `base + index * stride` part of the address, it must be done by writing
// to system_temp_result_.w (address_temp_dest) instead of
// system_temp_grad_v_vfetch_address_.w (since it must stay the same for the
// vfetch_full and all its vfetch_mini invocations), and changing
// address_src to address_temp_src afterwards. system_temp_result_.w can be
// used for this purpose safely because it won't be overwritten until the
// last dword is loaded (after which the address won't be needed anymore).
// Add the word offset from the instruction (signed), plus the offset of the
// first needed word within the element.
uint32_t first_word_index;
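
The rounding comment above can be verified directly: round-half-to-even sends both 1.5 and 2.5 to 2, while floor(index + 0.5) keeps every index mapped to a distinct element. A quick standalone check (plain C++, unrelated to the translator itself):

    #include <cfenv>
    #include <cmath>
    #include <cstdio>

    int main() {
      std::fesetround(FE_TONEAREST);  // round half to even
      // rint(1.5) == 2 and rint(2.5) == 2 - useless for addressing.
      std::printf("%.0f %.0f\n", std::rint(1.5), std::rint(2.5));
      // floor(x + 0.5) yields 2 and 3 - distinct elements per index.
      std::printf("%.0f %.0f\n", std::floor(1.5 + 0.5), std::floor(2.5 + 0.5));
    }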
@@ -108,8 +128,9 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
instr.attributes.offset + int32_t(first_word_index);
if (first_word_buffer_offset) {
// Add the constant word offset.
a_.OpIAdd(address_dest, address_src,
a_.OpIAdd(address_temp_dest, address_src,
dxbc::Src::LI(first_word_buffer_offset * sizeof(uint32_t)));
address_src = address_temp_src;
}
// - Load needed words to system_temp_result_, words 0, 1, 2, 3 to X, Y, Z, W
@@ -159,9 +180,10 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
~((uint32_t(1) << (word_index + word_count)) - uint32_t(1));
if (word_index != word_index_previous) {
// Go to the word in the buffer.
a_.OpIAdd(address_dest, address_src,
a_.OpIAdd(address_temp_dest, address_src,
dxbc::Src::LU((word_index - word_index_previous) *
sizeof(uint32_t)));
address_src = address_temp_src;
word_index_previous = word_index;
}
// Can ld_raw either to the first multiple components, or to any scalar
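
Both IAdd sites above follow the same copy-on-adjust pattern: the first offset writes the adjusted address to system_temp_result_.w and redirects address_src there, leaving system_temp_grad_v_vfetch_address_.w intact for the next vfetch_mini, and subsequent steps add only the delta from the previously loaded word. The same idea in plain C++ (hypothetical sketch):

    #include <cstdint>

    // Copy-on-adjust, mirroring the address_src/address_temp_src handover:
    // `saved` is never written after vfetch_full computes it; the first
    // adjustment moves the working address into the scratch slot.
    struct AddressCursor {
      uint32_t saved;        // base + index * stride, survives for vfetch_mini
      uint32_t scratch = 0;  // working copy once any offset is applied
      const uint32_t* current = &saved;

      void AddWords(int32_t delta_words) {
        scratch = *current + uint32_t(delta_words) * uint32_t(sizeof(uint32_t));
        current = &scratch;  // all later reads use the adjusted copy
      }
    };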
@@ -592,7 +614,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
case FetchOpcode::kSetTextureGradientsVert: {
bool grad_operand_temp_pushed = false;
a_.OpMov(
dxbc::Dest::R(system_temp_grad_v_, 0b0111),
dxbc::Dest::R(system_temp_grad_v_vfetch_address_, 0b0111),
LoadOperand(instr.operands[0], 0b0111, grad_operand_temp_pushed));
if (grad_operand_temp_pushed) {
PopSystemTemp();
@@ -1521,15 +1543,15 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
// Extract gradient exponent biases from the fetch constant and merge
// them with the LOD bias.
a_.OpIBFE(dxbc::Dest::R(grad_h_lod_temp, 0b0011), dxbc::Src::LU(5),
dxbc::Src::LU(22, 27, 0, 0),
RequestTextureFetchConstantWord(tfetch_index, 4));
dxbc::Src::LU(22, 27, 0, 0),
RequestTextureFetchConstantWord(tfetch_index, 4));
a_.OpIMAd(dxbc::Dest::R(grad_h_lod_temp, 0b0011),
dxbc::Src::R(grad_h_lod_temp), dxbc::Src::LI(int32_t(1) << 23),
dxbc::Src::LF(1.0f));
dxbc::Src::R(grad_h_lod_temp),
dxbc::Src::LI(int32_t(1) << 23), dxbc::Src::LF(1.0f));
a_.OpMul(dxbc::Dest::R(grad_v_temp, 0b1000), lod_src,
dxbc::Src::R(grad_h_lod_temp, dxbc::Src::kYYYY));
dxbc::Src::R(grad_h_lod_temp, dxbc::Src::kYYYY));
a_.OpMul(lod_dest, lod_src,
dxbc::Src::R(grad_h_lod_temp, dxbc::Src::kXXXX));
dxbc::Src::R(grad_h_lod_temp, dxbc::Src::kXXXX));
#endif
// Obtain the gradients and apply biases to them.
if (instr.attributes.use_register_gradients) {
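
The IBFE + IMAd pair above is a float bit trick worth spelling out: adding a signed exponent bias b, shifted into bit 23, to the bits of 1.0f produces the float 2^b (while the biased exponent stays in range), so one integer multiply-add turns the fetch-constant bias fields into gradient scale factors. A standalone check, assuming IEEE-754 floats:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // 2^bias via integer math on the exponent field: bits(1.0f) + (bias << 23)
    // reinterpreted as float equals exp2(bias) for in-range biased exponents.
    float Exp2ViaBits(int32_t bias) {
      float one = 1.0f;
      int32_t bits;
      std::memcpy(&bits, &one, sizeof(bits));
      bits += bias * (int32_t(1) << 23);  // the same constant as in OpIMAd
      float result;
      std::memcpy(&result, &bits, sizeof(result));
      return result;
    }

    int main() {
      // Expected output: 0.25 1 32
      std::printf("%g %g %g\n", Exp2ViaBits(-2), Exp2ViaBits(0), Exp2ViaBits(5));
    }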
@@ -1540,11 +1562,11 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
// done in getCompTexLOD, so don't do it here too.
#if 0
a_.OpMul(dxbc::Dest::R(grad_v_temp, grad_mask),
dxbc::Src::R(system_temp_grad_v_),
dxbc::Src::R(grad_v_temp, dxbc::Src::kWWWW));
dxbc::Src::R(system_temp_grad_v_vfetch_address_),
dxbc::Src::R(grad_v_temp, dxbc::Src::kWWWW));
#else
a_.OpMul(dxbc::Dest::R(grad_v_temp, grad_mask),
dxbc::Src::R(system_temp_grad_v_), lod_src);
dxbc::Src::R(system_temp_grad_v_vfetch_address_), lod_src);
#endif
// TODO(Triang3l): Are cube map register gradients unnormalized if
// the coordinates themselves are unnormalized?
@@ -1586,8 +1608,8 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
// done in getCompTexLOD, so don't do it here too.
#if 0
a_.OpMul(dxbc::Dest::R(grad_v_temp, grad_mask),
dxbc::Src::R(grad_v_temp),
dxbc::Src::R(grad_v_temp, dxbc::Src::kWWWW));
dxbc::Src::R(grad_v_temp),
dxbc::Src::R(grad_v_temp, dxbc::Src::kWWWW));
#else
a_.OpMul(dxbc::Dest::R(grad_v_temp, grad_mask),
dxbc::Src::R(grad_v_temp), lod_src);


@@ -440,6 +440,13 @@ struct ParsedVertexFetchInstruction {
// Number of source operands.
size_t operand_count = 0;
// Describes each source operand.
// Note that for vfetch_mini, which inherits the operands from vfetch_full,
// the index operand register may have been overwritten between the vfetch_full
// and
// the vfetch_mini (happens in 4D530910 for wheels), but that should have no
// effect on the index actually used for fetching. A copy of the index
// therefore must be stored by vfetch_full (the base address, stride and
// rounding may be pre-applied to it since they will be the same in the
// vfetch_full and all its vfetch_mini instructions).
InstructionOperand operands[2];
struct Attributes {


@@ -876,7 +876,7 @@ bool ParseVertexFetchInstruction(const VertexFetchInstruction& op,
instr.attributes.stride = full_op.stride();
instr.attributes.exp_adjust = op.exp_adjust();
instr.attributes.prefetch_count = op.prefetch_count();
instr.attributes.is_index_rounded = op.is_index_rounded();
instr.attributes.is_index_rounded = full_op.is_index_rounded();
instr.attributes.is_signed = op.is_signed();
instr.attributes.is_integer = !op.is_normalized();
instr.attributes.signed_rf_mode = op.signed_rf_mode();


@@ -328,13 +328,12 @@ void ParsedVertexFetchInstruction::Disassemble(StringBuffer* out) const {
if (!is_mini_fetch) {
out->Append(", ");
DisassembleSourceOperand(operands[0], out);
out->Append(", ");
out->AppendFormat("vf{}", 95 - operands[1].storage_index);
out->AppendFormat(", vf{}", 95 - operands[1].storage_index);
if (attributes.is_index_rounded) {
out->Append(", RoundIndex=true");
}
}
if (attributes.is_index_rounded) {
out->Append(", RoundIndex=true");
}
if (attributes.exp_adjust) {
out->AppendFormat(", ExpAdjust={}", attributes.exp_adjust);
}
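
After the change, RoundIndex=true is printed only inside the !is_mini_fetch branch, since a mini fetch has no index of its own to round. For a hypothetical full fetch whose index operand disassembles to r0.x, with operands[1].storage_index == 0 and rounding enabled, these lines append:

    , r0.x, vf95, RoundIndex=true

(vf95 because the constant number is printed as 95 - storage_index.)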


@@ -599,6 +599,8 @@ struct alignas(uint32_t) VertexFetchInstruction {
// Required condition value of the comparison (true or false).
bool predicate_condition() const { return data_.pred_condition == 1; }
// Vertex fetch constant index [0-95].
// Applicable only to vfetch_full (the address from vfetch_full is reused in
// vfetch_mini).
uint32_t fetch_constant_index() const {
return data_.const_index * 3 + data_.const_index_sel;
}
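
The multiplication packs three fetch constants into each of the 32 constant registers; since the result covers [0, 95], const_index presumably spans [0, 31] and const_index_sel [0, 2]. A worked instance of the formula:

    // const_index == 10, const_index_sel == 2:
    uint32_t fetch_constant = 10 * 3 + 2;  // == 32, i.e. vf32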
@@ -606,6 +608,8 @@ struct alignas(uint32_t) VertexFetchInstruction {
uint32_t dest() const { return data_.dst_reg; }
uint32_t dest_swizzle() const { return data_.dst_swiz; }
bool is_dest_relative() const { return data_.dst_reg_am; }
// The source is applicable only to vfetch_full (the address from vfetch_full
// is reused in vfetch_mini).
uint32_t src() const { return data_.src_reg; }
uint32_t src_swizzle() const { return data_.src_swiz; }
bool is_src_relative() const { return data_.src_reg_am; }
@@ -644,18 +648,21 @@ struct alignas(uint32_t) VertexFetchInstruction {
xenos::SignedRepeatingFractionMode signed_rf_mode() const {
return data_.signed_rf_mode_all;
}
// If true, the floating-point index is rounded to the nearest integer (likely
// as floor(index + 0.5) because rounding to the nearest even makes no sense
// for addressing, both 1.5 and 2.5 would be 2).
// Otherwise, it's floored (rounded towards negative infinity).
// Applicable only to vfetch_full (the address from vfetch_full is reused in
// vfetch_mini).
// http://web.archive.org/web/20090914055358/http://msdn.microsoft.com/en-us/library/bb313960.aspx
bool is_index_rounded() const { return data_.is_index_rounded == 1; }
// Dword stride, [0, 255].
// Applicable only to vfetch_full (the address from vfetch_full is reused in
// vfetch_mini).
uint32_t stride() const { return data_.stride; }
// Dword offset, [-4194304, 4194303].
int32_t offset() const { return data_.offset; }
void AssignFromFull(const VertexFetchInstruction& full) {
data_.stride = full.data_.stride;
data_.const_index = full.data_.const_index;
data_.const_index_sel = full.data_.const_index_sel;
}
private:
struct Data {
struct {
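
AssignFromFull is what makes the reuse work on the ucode side: a parser keeps the last full fetch around and patches each mini fetch with its addressing fields before translation. A rough usage sketch (hypothetical surrounding code, not xenia's actual control flow):

    #include <vector>

    // Hypothetical driver loop: let each vfetch_mini inherit the stride and
    // fetch constant index of the preceding vfetch_full.
    void PatchMiniFetches(std::vector<VertexFetchInstruction>& ops) {
      VertexFetchInstruction* previous_full = nullptr;
      for (VertexFetchInstruction& op : ops) {
        if (op.is_mini_fetch()) {
          if (previous_full) {
            op.AssignFromFull(*previous_full);
          }
        } else {
          previous_full = &op;
        }
      }
    }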