[GPU] Reuse base+index*stride in vfetch_mini instead of reloading the index GPR
The wheel shader in 4D530910 does vfetch_full to r0 with the index from r0.x, and then vfetch_mini. Thanks @Gliniak for the finding :3 Also small formatting cleanup in commented-out code.
commit d6188c5d7e (parent 600c14b3f0)
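For context, a minimal self-contained C++ model of the hazard this commit fixes (the values and the register model are illustrative, not Xenia code): vfetch_full computes `base + index * stride` from the index GPR, the wheel shader then overwrites that GPR (r0) with the fetched data, so a following vfetch_mini must reuse the saved address instead of reloading the index.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Illustrative vertex buffer: byte base 0x1000, 4-dword (16-byte) stride.
  const uint32_t base = 0x1000;
  const uint32_t stride_bytes = 4 * sizeof(uint32_t);

  float r0_x = 2.0f;  // The index GPR before vfetch_full.

  // vfetch_full: compute base + index * stride once and keep it (this commit
  // stores it in system_temp_grad_v_vfetch_address_.w).
  const uint32_t saved_address = base + uint32_t(r0_x) * stride_bytes;

  // The shader fetches into r0 itself, so the index is gone afterwards.
  r0_x = -0.37f;  // Now vertex data, not an index.

  // Reloading r0.x for vfetch_mini (the old behavior) recomputes a wrong
  // address; reusing saved_address (the new behavior) stays correct.
  const uint32_t reloaded_address = base + uint32_t(r0_x) * stride_bytes;
  printf("saved 0x%X, reloaded 0x%X\n", saved_address, reloaded_address);
}
```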
@@ -849,7 +849,7 @@ void DxbcShaderTranslator::StartTranslation() {
   system_temp_aL_ = PushSystemTemp(0b1111);
   system_temp_loop_count_ = PushSystemTemp(0b1111);
   system_temp_grad_h_lod_ = PushSystemTemp(0b1111);
-  system_temp_grad_v_ = PushSystemTemp(0b0111);
+  system_temp_grad_v_vfetch_address_ = PushSystemTemp(0b1111);

   // Zero general-purpose registers to prevent crashes when the game
   // references them after only initializing them conditionally.
@@ -1039,7 +1039,7 @@ void DxbcShaderTranslator::CompleteShaderCode() {
   // - system_temp_aL_.
   // - system_temp_loop_count_.
   // - system_temp_grad_h_lod_.
-  // - system_temp_grad_v_.
+  // - system_temp_grad_v_vfetch_address_.
   PopSystemTemp(6);

   // Write memexported data to the shared memory UAV.
@@ -1104,7 +1104,9 @@ class DxbcShaderTranslator : public ShaderTranslator {
   uint32_t system_temp_loop_count_;
   // Explicitly set texture gradients and LOD.
   uint32_t system_temp_grad_h_lod_;
-  uint32_t system_temp_grad_v_;
+  // .w stores `base + index * stride` in bytes from the last vfetch_full as it
+  // may be needed by vfetch_mini.
+  uint32_t system_temp_grad_v_vfetch_address_;

   // The bool constant number containing the condition for the currently
   // processed exec (or the last - unless a label has reset this), or
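The renamed temp packs two unrelated things into one register: .xyz keeps the vertical gradients, and the now-initialized .w (hence 0b1111 rather than 0b0111 in StartTranslation) holds the saved fetch address. A rough sketch of the masked-write behavior this relies on (plain C++, not the dxbc assembler; component typing is simplified):

```cpp
#include <cstdint>

// A 4-component GPR with DXBC-style masked writes; illustrative only.
struct Reg {
  float v[4];
  // Mask bit i enables component i (0b0111 = .xyz, 0b1000 = .w).
  void Write(uint32_t mask, const float (&src)[4]) {
    for (int i = 0; i < 4; ++i) {
      if (mask & (1u << i)) v[i] = src[i];
    }
  }
};

int main() {
  Reg grad_v_vfetch_address = {};
  float gradients[4] = {0.1f, 0.2f, 0.3f, 0.0f};
  grad_v_vfetch_address.Write(0b0111, gradients);  // kSetTextureGradientsVert
  float address[4] = {0.0f, 0.0f, 0.0f, 4128.0f};  // Byte address as a float
  grad_v_vfetch_address.Write(0b1000, address);    // vfetch_full's address
  // .xyz and .w coexist, which is why all four components are now
  // zero-initialized at shader start.
}
```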
@@ -59,47 +59,67 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
   // fetch constants on the CPU when proper bound checks are added - vfetch may
   // be conditional, so fetch constants may also be used conditionally.

-  // - Load the byte address in physical memory to system_temp_result_.w (so
-  //   it's not overwritten by data loads until the last one).
-  dxbc::Dest address_dest(dxbc::Dest::R(system_temp_result_, 0b1000));
-  dxbc::Src address_src(dxbc::Src::R(system_temp_result_, dxbc::Src::kWWWW));
-  if (instr.attributes.stride) {
-    // Convert the index to an integer by flooring or by rounding to the nearest
-    // (as floor(index + 0.5) because rounding to the nearest even makes no
-    // sense for addressing, both 1.5 and 2.5 would be 2).
-    // http://web.archive.org/web/20100302145413/http://msdn.microsoft.com:80/en-us/library/bb313960.aspx
-    {
-      bool index_operand_temp_pushed = false;
-      dxbc::Src index_operand(
-          LoadOperand(instr.operands[0], 0b0001, index_operand_temp_pushed)
-              .SelectFromSwizzled(0));
-      if (instr.attributes.is_index_rounded) {
-        a_.OpAdd(address_dest, index_operand, dxbc::Src::LF(0.5f));
-        a_.OpRoundNI(address_dest, address_src);
-      } else {
-        a_.OpRoundNI(address_dest, index_operand);
-      }
-      if (index_operand_temp_pushed) {
-        PopSystemTemp();
-      }
-    }
-    a_.OpFToI(address_dest, address_src);
-    // Extract the byte address from the fetch constant to
-    // system_temp_result_.z.
-    a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0100),
-             fetch_constant_src.SelectFromSwizzled(0),
-             dxbc::Src::LU(~uint32_t(3)));
-    // Merge the index and the base address.
-    a_.OpIMAd(address_dest, address_src,
-              dxbc::Src::LU(instr.attributes.stride * sizeof(uint32_t)),
-              dxbc::Src::R(system_temp_result_, dxbc::Src::kZZZZ));
-  } else {
-    // Fetching from the same location - extract the byte address of the
-    // beginning of the buffer.
-    a_.OpAnd(address_dest, fetch_constant_src.SelectFromSwizzled(0),
-             dxbc::Src::LU(~uint32_t(3)));
-  }
+  // - Load the part of the byte address in physical memory that is the same
+  //   in vfetch_full and vfetch_mini to system_temp_grad_v_vfetch_address_.w
+  //   (the index operand GPR must not be reloaded in vfetch_mini because it
+  //   might have been overwritten previously, but that shouldn't have an
+  //   effect on vfetch_mini).
+  dxbc::Src address_src(
+      dxbc::Src::R(system_temp_grad_v_vfetch_address_, dxbc::Src::kWWWW));
+  if (!instr.is_mini_fetch) {
+    dxbc::Dest address_dest(
+        dxbc::Dest::R(system_temp_grad_v_vfetch_address_, 0b1000));
+    if (instr.attributes.stride) {
+      // Convert the index to an integer by flooring or by rounding to the
+      // nearest (as floor(index + 0.5) because rounding to the nearest even
+      // makes no sense for addressing, both 1.5 and 2.5 would be 2).
+      // http://web.archive.org/web/20100302145413/http://msdn.microsoft.com:80/en-us/library/bb313960.aspx
+      {
+        bool index_operand_temp_pushed = false;
+        dxbc::Src index_operand(
+            LoadOperand(instr.operands[0], 0b0001, index_operand_temp_pushed)
+                .SelectFromSwizzled(0));
+        if (instr.attributes.is_index_rounded) {
+          a_.OpAdd(address_dest, index_operand, dxbc::Src::LF(0.5f));
+          a_.OpRoundNI(address_dest, address_src);
+        } else {
+          a_.OpRoundNI(address_dest, index_operand);
+        }
+        if (index_operand_temp_pushed) {
+          PopSystemTemp();
+        }
+      }
+      a_.OpFToI(address_dest, address_src);
+      // Extract the byte address from the fetch constant to
+      // system_temp_result_.w (which is not used yet).
+      a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b1000),
+               fetch_constant_src.SelectFromSwizzled(0),
+               dxbc::Src::LU(~uint32_t(3)));
+      // Merge the index and the base address.
+      a_.OpIMAd(address_dest, address_src,
+                dxbc::Src::LU(instr.attributes.stride * sizeof(uint32_t)),
+                dxbc::Src::R(system_temp_result_, dxbc::Src::kWWWW));
+    } else {
+      // Fetching from the same location - extract the byte address of the
+      // beginning of the buffer.
+      a_.OpAnd(address_dest, fetch_constant_src.SelectFromSwizzled(0),
+               dxbc::Src::LU(~uint32_t(3)));
+    }
+  }
+
+  dxbc::Dest address_temp_dest(dxbc::Dest::R(system_temp_result_, 0b1000));
+  dxbc::Src address_temp_src(
+      dxbc::Src::R(system_temp_result_, dxbc::Src::kWWWW));
+
+  // - From now on, if any additional offset must be applied to the
+  //   `base + index * stride` part of the address, it must be done by writing
+  //   to system_temp_result_.w (address_temp_dest) instead of
+  //   system_temp_grad_v_vfetch_address_.w (since it must stay the same for
+  //   the vfetch_full and all its vfetch_mini invocations), and by changing
+  //   address_src to address_temp_src afterwards. system_temp_result_.w can be
+  //   used for this purpose safely because it won't be overwritten until the
+  //   last dword is loaded (after which the address won't be needed anymore).

   // Add the word offset from the instruction (signed), plus the offset of the
   // first needed word within the element.
   uint32_t first_word_index;
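A standalone sketch of the index-conversion comment above; std::floor here stands in for the OpRoundNI (round toward negative infinity) sequence emitted in the diff:

```cpp
#include <cmath>
#include <cstdio>

// Either plain floor, or round-to-nearest implemented as floor(index + 0.5) -
// NOT round-to-nearest-even, which would map both 1.5 and 2.5 to 2.
float ConvertIndex(float index, bool is_index_rounded) {
  return is_index_rounded ? std::floor(index + 0.5f) : std::floor(index);
}

int main() {
  printf("%g %g\n", ConvertIndex(1.5f, true), ConvertIndex(2.5f, true));
  // 2 3 - floor(x + 0.5) distinguishes 1.5 and 2.5.
  printf("%g %g\n", ConvertIndex(1.7f, false), ConvertIndex(2.2f, false));
  // 1 2 - the flooring mode.
}
```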
@@ -108,8 +128,9 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
       instr.attributes.offset + int32_t(first_word_index);
   if (first_word_buffer_offset) {
     // Add the constant word offset.
-    a_.OpIAdd(address_dest, address_src,
+    a_.OpIAdd(address_temp_dest, address_src,
               dxbc::Src::LI(first_word_buffer_offset * sizeof(uint32_t)));
+    address_src = address_temp_src;
   }

   // - Load needed words to system_temp_result_, words 0, 1, 2, 3 to X, Y, Z, W
@@ -159,9 +180,10 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
         ~((uint32_t(1) << (word_index + word_count)) - uint32_t(1));
     if (word_index != word_index_previous) {
       // Go to the word in the buffer.
-      a_.OpIAdd(address_dest, address_src,
+      a_.OpIAdd(address_temp_dest, address_src,
                 dxbc::Src::LU((word_index - word_index_previous) *
                               sizeof(uint32_t)));
+      address_src = address_temp_src;
       word_index_previous = word_index;
     }
     // Can ld_raw either to the first multiple components, or to any scalar
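Both OpIAdd sites above follow the same copy-on-first-offset pattern: the shared `base + index * stride` never changes in system_temp_grad_v_vfetch_address_.w, and the first extra offset redirects further address reads to system_temp_result_.w. A scalar C++ sketch of the pattern (names illustrative, not Xenia APIs):

```cpp
#include <cstdint>

// 'saved' models system_temp_grad_v_vfetch_address_.w and 'temp' models
// system_temp_result_.w.
struct AddressState {
  uint32_t saved;  // base + index * stride; must survive for vfetch_mini.
  uint32_t temp;   // Scratch; safe until the last dword is loaded.
  const uint32_t* src = &saved;  // address_src starts at the saved value.

  void AddOffset(uint32_t offset_bytes) {
    temp = *src + offset_bytes;  // Write the sum to the scratch component...
    src = &temp;                 // ...and read from the scratch from now on.
  }
};

int main() {
  AddressState a{0x1020, 0};
  a.AddOffset(8);  // Constant word offset from the instruction.
  a.AddOffset(4);  // Advance to the next needed word.
  // a.saved is still 0x1020, ready for a subsequent vfetch_mini.
  return a.saved == 0x1020 && *a.src == 0x102C ? 0 : 1;
}
```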
@@ -592,7 +614,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
     case FetchOpcode::kSetTextureGradientsVert: {
       bool grad_operand_temp_pushed = false;
       a_.OpMov(
-          dxbc::Dest::R(system_temp_grad_v_, 0b0111),
+          dxbc::Dest::R(system_temp_grad_v_vfetch_address_, 0b0111),
           LoadOperand(instr.operands[0], 0b0111, grad_operand_temp_pushed));
       if (grad_operand_temp_pushed) {
         PopSystemTemp();
@@ -1521,15 +1543,15 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
       // Extract gradient exponent biases from the fetch constant and merge
       // them with the LOD bias.
       a_.OpIBFE(dxbc::Dest::R(grad_h_lod_temp, 0b0011), dxbc::Src::LU(5),
-               dxbc::Src::LU(22, 27, 0, 0),
-               RequestTextureFetchConstantWord(tfetch_index, 4));
+                dxbc::Src::LU(22, 27, 0, 0),
+                RequestTextureFetchConstantWord(tfetch_index, 4));
       a_.OpIMAd(dxbc::Dest::R(grad_h_lod_temp, 0b0011),
-                dxbc::Src::R(grad_h_lod_temp), dxbc::Src::LI(int32_t(1) << 23),
-                dxbc::Src::LF(1.0f));
+                dxbc::Src::R(grad_h_lod_temp),
+                dxbc::Src::LI(int32_t(1) << 23), dxbc::Src::LF(1.0f));
       a_.OpMul(dxbc::Dest::R(grad_v_temp, 0b1000), lod_src,
-              dxbc::Src::R(grad_h_lod_temp, dxbc::Src::kYYYY));
+               dxbc::Src::R(grad_h_lod_temp, dxbc::Src::kYYYY));
       a_.OpMul(lod_dest, lod_src,
-              dxbc::Src::R(grad_h_lod_temp, dxbc::Src::kXXXX));
+               dxbc::Src::R(grad_h_lod_temp, dxbc::Src::kXXXX));
 #endif
       // Obtain the gradients and apply biases to them.
       if (instr.attributes.use_register_gradients) {
@@ -1540,11 +1562,11 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
         // done in getCompTexLOD, so don't do it here too.
 #if 0
         a_.OpMul(dxbc::Dest::R(grad_v_temp, grad_mask),
-                dxbc::Src::R(system_temp_grad_v_),
-                dxbc::Src::R(grad_v_temp, dxbc::Src::kWWWW));
+                 dxbc::Src::R(system_temp_grad_v_vfetch_address_),
+                 dxbc::Src::R(grad_v_temp, dxbc::Src::kWWWW));
 #else
         a_.OpMul(dxbc::Dest::R(grad_v_temp, grad_mask),
-                dxbc::Src::R(system_temp_grad_v_), lod_src);
+                 dxbc::Src::R(system_temp_grad_v_vfetch_address_), lod_src);
 #endif
         // TODO(Triang3l): Are cube map register gradients unnormalized if
         // the coordinates themselves are unnormalized?
@@ -1586,8 +1608,8 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
           // done in getCompTexLOD, so don't do it here too.
 #if 0
           a_.OpMul(dxbc::Dest::R(grad_v_temp, grad_mask),
-                  dxbc::Src::R(grad_v_temp),
-                  dxbc::Src::R(grad_v_temp, dxbc::Src::kWWWW));
+                   dxbc::Src::R(grad_v_temp),
+                   dxbc::Src::R(grad_v_temp, dxbc::Src::kWWWW));
 #else
           a_.OpMul(dxbc::Dest::R(grad_v_temp, grad_mask),
                    dxbc::Src::R(grad_v_temp), lod_src);
@@ -440,6 +440,13 @@ struct ParsedVertexFetchInstruction {
   // Number of source operands.
   size_t operand_count = 0;
   // Describes each source operand.
+  // Note that for vfetch_mini, which inherits the operands from vfetch_full,
+  // the index operand register may have been overwritten between the
+  // vfetch_full and the vfetch_mini (happens in 4D530910 for wheels), but that
+  // should have no effect on the index actually used for fetching. A copy of
+  // the index therefore must be stored by vfetch_full (the base address,
+  // stride and rounding may be pre-applied to it since they will be the same
+  // in the vfetch_full and all its vfetch_mini instructions).
   InstructionOperand operands[2];

   struct Attributes {
@@ -876,7 +876,7 @@ bool ParseVertexFetchInstruction(const VertexFetchInstruction& op,
   instr.attributes.stride = full_op.stride();
   instr.attributes.exp_adjust = op.exp_adjust();
   instr.attributes.prefetch_count = op.prefetch_count();
-  instr.attributes.is_index_rounded = op.is_index_rounded();
+  instr.attributes.is_index_rounded = full_op.is_index_rounded();
   instr.attributes.is_signed = op.is_signed();
   instr.attributes.is_integer = !op.is_normalized();
   instr.attributes.signed_rf_mode = op.signed_rf_mode();
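The one-line change above is easy to miss: for a vfetch_mini, op's own is_index_rounded bit is not meaningful, because the rounded index from the full fetch is what gets reused. A condensed, hypothetical view of the parsing flow (only these two attributes shown; as in the diff, full_op is the preceding vfetch_full even when op itself is a mini fetch):

```cpp
// Hypothetical condensed sketch, not the real ParseVertexFetchInstruction.
struct Attributes {
  uint32_t stride;
  bool is_index_rounded;
};

template <typename VertexFetchInstruction>
Attributes ParseSharedAttributes(const VertexFetchInstruction& op,
                                 const VertexFetchInstruction& full_op) {
  Attributes attributes;
  attributes.stride = full_op.stride();  // Already taken from the full fetch.
  // Fixed by this commit: previously read from op, which for a mini fetch
  // doesn't describe the address actually used.
  attributes.is_index_rounded = full_op.is_index_rounded();
  return attributes;
}
```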
@@ -328,13 +328,12 @@ void ParsedVertexFetchInstruction::Disassemble(StringBuffer* out) const {
   if (!is_mini_fetch) {
     out->Append(", ");
     DisassembleSourceOperand(operands[0], out);
-    out->Append(", ");
-    out->AppendFormat("vf{}", 95 - operands[1].storage_index);
+    out->AppendFormat(", vf{}", 95 - operands[1].storage_index);
+    if (attributes.is_index_rounded) {
+      out->Append(", RoundIndex=true");
+    }
   }

-  if (attributes.is_index_rounded) {
-    out->Append(", RoundIndex=true");
-  }
   if (attributes.exp_adjust) {
     out->AppendFormat(", ExpAdjust={}", attributes.exp_adjust);
   }
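After this Disassemble() change, a fetch pair might print along these lines (hypothetical registers and formatting; RoundIndex now appears only on the full fetch, since the mini fetch reuses the full fetch's rounded index):

```
vfetch_full r1.xyz_, r0.x, vf3, RoundIndex=true
vfetch_mini r2.xy__
```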
@@ -599,6 +599,8 @@ struct alignas(uint32_t) VertexFetchInstruction {
   // Required condition value of the comparison (true or false).
   bool predicate_condition() const { return data_.pred_condition == 1; }
   // Vertex fetch constant index [0-95].
+  // Applicable only to vfetch_full (the address from vfetch_full is reused in
+  // vfetch_mini).
   uint32_t fetch_constant_index() const {
     return data_.const_index * 3 + data_.const_index_sel;
   }
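The arithmetic in fetch_constant_index() reflects how vertex fetch constants are commonly described for Xenos: each 6-dword fetch-constant slot holds three 2-dword vertex fetch constants, with const_index selecting the slot and const_index_sel one of the three. A worked example under those layout assumptions (they are background knowledge, not taken from this diff):

```cpp
#include <cstdint>
#include <cstdio>

// const_index picks a 6-dword slot; const_index_sel picks one of the three
// 2-dword vertex fetch constants inside it, giving an index in [0, 95].
uint32_t FetchConstantIndex(uint32_t const_index, uint32_t const_index_sel) {
  return const_index * 3 + const_index_sel;
}

int main() {
  // Slot 2, second constant within the slot -> vf7, which starts at dword
  // 14 of the fetch constant register file (2 dwords per vertex constant).
  uint32_t vf = FetchConstantIndex(2, 1);
  printf("vf%u, dword offset %u\n", vf, vf * 2);
}
```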
@@ -606,6 +608,8 @@ struct alignas(uint32_t) VertexFetchInstruction {
   uint32_t dest() const { return data_.dst_reg; }
   uint32_t dest_swizzle() const { return data_.dst_swiz; }
   bool is_dest_relative() const { return data_.dst_reg_am; }
+  // The source is applicable only to vfetch_full (the address from vfetch_full
+  // is reused in vfetch_mini).
   uint32_t src() const { return data_.src_reg; }
   uint32_t src_swizzle() const { return data_.src_swiz; }
   bool is_src_relative() const { return data_.src_reg_am; }
@@ -644,18 +648,21 @@ struct alignas(uint32_t) VertexFetchInstruction {
   xenos::SignedRepeatingFractionMode signed_rf_mode() const {
     return data_.signed_rf_mode_all;
   }
   // If true, the floating-point index is rounded to the nearest integer
   // (likely as floor(index + 0.5) because rounding to the nearest even makes
   // no sense for addressing, both 1.5 and 2.5 would be 2).
   // Otherwise, it's floored (rounded towards negative infinity).
+  // Applicable only to vfetch_full (the address from vfetch_full is reused in
+  // vfetch_mini).
   // http://web.archive.org/web/20090914055358/http://msdn.microsoft.com/en-us/library/bb313960.aspx
   bool is_index_rounded() const { return data_.is_index_rounded == 1; }
   // Dword stride, [0, 255].
+  // Applicable only to vfetch_full (the address from vfetch_full is reused in
+  // vfetch_mini).
   uint32_t stride() const { return data_.stride; }
   // Dword offset, [-4194304, 4194303].
   int32_t offset() const { return data_.offset; }

   void AssignFromFull(const VertexFetchInstruction& full) {
     data_.stride = full.data_.stride;
     data_.const_index = full.data_.const_index;
     data_.const_index_sel = full.data_.const_index_sel;
   }

  private:
   struct Data {
     struct {
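Tying the last two hunks together: AssignFromFull propagates stride and the constant index through the instruction words themselves, while is_index_rounded is instead resolved at parse time from full_op. A self-contained mock showing what is and isn't inherited (the real struct is in the diff above; this is only an illustration):

```cpp
#include <cstdint>

// Mock of the relevant VertexFetchInstruction surface.
struct MockVfetch {
  uint32_t stride = 0;
  uint32_t const_index = 0, const_index_sel = 0;
  bool mini = false;
  void AssignFromFull(const MockVfetch& full) {
    stride = full.stride;            // Copied through the instruction words.
    const_index = full.const_index;  // Ditto.
    const_index_sel = full.const_index_sel;
    // Deliberately NOT copied: is_index_rounded, which the parser now reads
    // from full_op directly (see the ParseVertexFetchInstruction hunk).
  }
};

int main() {
  MockVfetch full{16, 2, 1, false};
  MockVfetch mini{0, 0, 0, true};
  mini.AssignFromFull(full);
  return mini.stride == 16 && mini.const_index == 2 ? 0 : 1;
}
```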