[DXBC] ALU vector ops to new codegen

This commit is contained in:
Triang3l 2020-05-10 16:33:39 +03:00
parent 3aa0ce3096
commit b79ba69548
6 changed files with 986 additions and 1609 deletions

View File

@ -875,7 +875,7 @@ void DxbcShaderTranslator::StartTranslation() {
// depends on the guest code (thus no guarantees), initialize everything
// now (except for pv, it's an internal temporary variable, not accessible
// by the guest).
system_temp_pv_ = PushSystemTemp();
system_temp_result_ = PushSystemTemp();
system_temp_ps_pc_p0_a0_ = PushSystemTemp(0b1111);
system_temp_aL_ = PushSystemTemp(0b1111);
system_temp_loop_count_ = PushSystemTemp(0b1111);
@ -1089,7 +1089,7 @@ void DxbcShaderTranslator::CompleteShaderCode() {
DxbcOpEndLoop();
// Release the following system temporary values so epilogue can reuse them:
// - system_temp_pv_.
// - system_temp_result_.
// - system_temp_ps_pc_p0_a0_.
// - system_temp_aL_.
// - system_temp_loop_count_.
@ -1306,6 +1306,96 @@ void DxbcShaderTranslator::EmitInstructionDisassembly() {
length_dwords * sizeof(uint32_t) - length - 1);
}
// Builds the DXBC emitter source for a translated guest source operand.
//
// needed_components is a mask of the operand components the caller is going to
// read. If it is zero, a 0.0f literal is returned and nothing is emitted. A
// 0.0f literal is also returned for an unhandled storage source and for a
// statically addressed float constant that has no packed constant index.
//
// temp_pushed_out is always written: it becomes true only when a system temp
// has been pushed here (a register living in the indexable x# array is copied
// to an r# first), and stays false in every other case.
DxbcShaderTranslator::DxbcSrc DxbcShaderTranslator::LoadOperand(
    const InstructionOperand& operand, uint32_t needed_components,
    bool& temp_pushed_out) {
  temp_pushed_out = false;

  uint32_t first_needed_component;
  if (!xe::bit_scan_forward(needed_components, &first_needed_component)) {
    // No component of this operand is going to be read.
    return DxbcSrc::LF(0.0f);
  }

  // Looks up one component of the operand's swizzle and returns its position
  // relative to SwizzleSource::kX (only X...W are expected here).
  auto get_component_index = [&operand](uint32_t operand_component) {
    SwizzleSource source = operand.GetComponent(operand_component);
    assert_true(source >= SwizzleSource::kX && source <= SwizzleSource::kW);
    return uint32_t(source) - uint32_t(SwizzleSource::kX);
  };

  // The storage index is used as is for static addressing, or combined with
  // W of system_temp_ps_pc_p0_a0_ (absolute) or X of system_temp_aL_
  // (relative).
  DxbcIndex index(operand.storage_index);
  if (operand.storage_addressing_mode ==
      InstructionStorageAddressingMode::kAddressAbsolute) {
    index = DxbcIndex(system_temp_ps_pc_p0_a0_, 3, operand.storage_index);
  } else if (operand.storage_addressing_mode ==
             InstructionStorageAddressingMode::kAddressRelative) {
    index = DxbcIndex(system_temp_aL_, 0, operand.storage_index);
  }

  DxbcSrc src(DxbcSrc::LF(0.0f));
  switch (operand.storage_source) {
    case InstructionStorageSource::kRegister: {
      if (!uses_register_dynamic_addressing()) {
        assert_true(operand.storage_addressing_mode ==
                    InstructionStorageAddressingMode::kStatic);
        src = DxbcSrc::R(index.index_);
      } else {
        // Load x#[#] to r# because x#[#] can be used only with mov.
        uint32_t temp = PushSystemTemp();
        temp_pushed_out = true;
        // Gather the register components that the needed operand components
        // actually reference, so only those are copied.
        uint32_t register_components_read = 0;
        const uint32_t component_count = uint32_t(operand.component_count);
        for (uint32_t i = 0; i < component_count; ++i) {
          if (needed_components & (1 << i)) {
            register_components_read |= 1 << get_component_index(i);
          }
        }
        assert_not_zero(register_components_read);
        DxbcOpMov(DxbcDest::R(temp, register_components_read),
                  DxbcSrc::X(0, index));
        src = DxbcSrc::R(temp);
      }
    } break;

    case InstructionStorageSource::kConstantFloat: {
      // Allocate the float constant buffer binding on first use.
      if (cbuffer_index_float_constants_ == kCbufferIndexUnallocated) {
        cbuffer_index_float_constants_ = cbuffer_count_++;
      }
      if (operand.storage_addressing_mode !=
          InstructionStorageAddressingMode::kStatic) {
        assert_true(constant_register_map().float_dynamic_addressing);
      } else {
        // Statically addressed constants are referenced by their packed index.
        uint32_t float_constant_index =
            constant_register_map().GetPackedFloatConstantIndex(
                operand.storage_index);
        assert_true(float_constant_index != UINT32_MAX);
        if (float_constant_index == UINT32_MAX) {
          return DxbcSrc::LF(0.0f);
        }
        index.index_ = float_constant_index;
      }
      src = DxbcSrc::CB(cbuffer_index_float_constants_,
                        uint32_t(CbufferRegister::kFloatConstants), index);
    } break;

    default:
      assert_unhandled_case(operand.storage_source);
      return DxbcSrc::LF(0.0f);
  }

  // Swizzle, skipping unneeded components similar to how FXC skips components,
  // by replacing them with the leftmost used one.
  uint32_t swizzle = 0;
  for (uint32_t i = 0; i < 4; ++i) {
    uint32_t operand_component = first_needed_component;
    if (needed_components & (1 << i)) {
      operand_component = i;
    }
    swizzle |= get_component_index(operand_component) << (i * 2);
  }
  return src.Swizzle(swizzle).WithModifiers(operand.is_absolute_value,
                                            operand.is_negated);
}
void DxbcShaderTranslator::LoadDxbcSourceOperand(
const InstructionOperand& operand, DxbcSourceOperand& dxbc_operand) {
// Initialize the values to their defaults.
@ -1693,306 +1783,151 @@ void DxbcShaderTranslator::UnloadDxbcSourceOperand(
}
void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
uint32_t reg, bool replicate_x,
const DxbcSrc& src,
bool can_store_memexport_address) {
uint32_t used_write_mask = result.GetUsedWriteMask();
if (result.storage_target == InstructionStorageTarget::kNone ||
!result.GetUsedWriteMask()) {
if (!used_write_mask) {
return;
}
// Validate memexport writes (Halo 3 has some weird invalid ones).
if (result.storage_target == InstructionStorageTarget::kExportAddress) {
if (!can_store_memexport_address || memexport_alloc_current_count_ == 0 ||
memexport_alloc_current_count_ > kMaxMemExports ||
system_temps_memexport_address_[memexport_alloc_current_count_ - 1] ==
UINT32_MAX) {
// Get the destination address and type.
DxbcDest dest(DxbcDest::Null());
bool is_clamped = result.is_clamped;
switch (result.storage_target) {
case InstructionStorageTarget::kNone:
return;
}
} else if (result.storage_target == InstructionStorageTarget::kExportData) {
if (memexport_alloc_current_count_ == 0 ||
memexport_alloc_current_count_ > kMaxMemExports ||
system_temps_memexport_data_[memexport_alloc_current_count_ - 1]
[result.storage_index] == UINT32_MAX) {
return;
}
}
uint32_t saturate_bit =
ENCODE_D3D10_SB_INSTRUCTION_SATURATE(result.is_clamped);
// Scalar targets get only one component.
// TODO(Triang3l): It's not replicated, it's X specifically.
if (result.storage_target == InstructionStorageTarget::kDepth) {
assert_not_zero(used_write_mask & 0b0001);
SwizzleSource component = result.components[0];
if (replicate_x && component <= SwizzleSource::kW) {
component = SwizzleSource::kX;
}
// Both r[imm32] and imm32 operands are 2 tokens long.
switch (result.storage_target) {
case InstructionStorageTarget::kDepth:
assert_true(writes_depth());
if (writes_depth()) {
if (edram_rov_used_) {
shader_code_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) |
ENCODE_D3D10_SB_INSTRUCTION_SATURATE(1) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5));
shader_code_.push_back(EncodeVectorMaskedOperand(
D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1));
shader_code_.push_back(system_temp_rov_depth_stencil_);
} else {
shader_code_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) |
ENCODE_D3D10_SB_INSTRUCTION_SATURATE(1) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(4));
shader_code_.push_back(
EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_OUTPUT_DEPTH, 0));
}
case InstructionStorageTarget::kRegister:
if (uses_register_dynamic_addressing()) {
DxbcIndex register_index(result.storage_index);
switch (result.storage_addressing_mode) {
case InstructionStorageAddressingMode::kStatic:
break;
case InstructionStorageAddressingMode::kAddressAbsolute:
register_index =
DxbcIndex(system_temp_ps_pc_p0_a0_, 3, result.storage_index);
break;
case InstructionStorageAddressingMode::kAddressRelative:
register_index =
DxbcIndex(system_temp_aL_, 0, result.storage_index);
break;
}
break;
default:
assert_unhandled_case(result.storage_target);
dest = DxbcDest::X(0, register_index);
} else {
assert_true(result.storage_addressing_mode ==
InstructionStorageAddressingMode::kStatic);
dest = DxbcDest::R(result.storage_index);
}
break;
case InstructionStorageTarget::kInterpolator:
dest = DxbcDest::O(uint32_t(InOutRegister::kVSDSOutInterpolators) +
result.storage_index);
break;
case InstructionStorageTarget::kPosition:
dest = DxbcDest::R(system_temp_position_);
break;
case InstructionStorageTarget::kPointSizeEdgeFlagKillVertex:
assert_zero(used_write_mask & 0b1000);
dest = DxbcDest::R(system_temp_point_size_edge_flag_kill_vertex_);
break;
case InstructionStorageTarget::kExportAddress:
// Validate memexport writes (Halo 3 has some weird invalid ones).
if (!can_store_memexport_address || memexport_alloc_current_count_ == 0 ||
memexport_alloc_current_count_ > kMaxMemExports ||
system_temps_memexport_address_[memexport_alloc_current_count_ - 1] ==
UINT32_MAX) {
return;
}
if (component <= SwizzleSource::kW) {
shader_code_.push_back(EncodeVectorSelectOperand(
D3D10_SB_OPERAND_TYPE_TEMP, uint32_t(component), 1));
shader_code_.push_back(reg);
} else {
shader_code_.push_back(
EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0));
shader_code_.push_back(component == SwizzleSource::k1 ? 0x3F800000 : 0);
}
++stat_.instruction_count;
++stat_.mov_instruction_count;
}
dest = DxbcDest::R(
system_temps_memexport_address_[memexport_alloc_current_count_ - 1]);
break;
case InstructionStorageTarget::kExportData: {
// Validate memexport writes (Halo 3 has some weird invalid ones).
if (memexport_alloc_current_count_ == 0 ||
memexport_alloc_current_count_ > kMaxMemExports ||
system_temps_memexport_data_[memexport_alloc_current_count_ - 1]
[result.storage_index] == UINT32_MAX) {
return;
}
dest = DxbcDest::R(
system_temps_memexport_data_[memexport_alloc_current_count_ - 1]
[result.storage_index]);
// Mark that the eM# has been written to and needs to be exported.
assert_not_zero(used_write_mask);
uint32_t memexport_index = memexport_alloc_current_count_ - 1;
DxbcOpOr(DxbcDest::R(system_temp_memexport_written_,
1 << (memexport_index >> 2)),
DxbcSrc::R(system_temp_memexport_written_)
.Select(memexport_index >> 2),
DxbcSrc::LU(uint32_t(1) << (result.storage_index +
((memexport_index & 3) << 3))));
} break;
case InstructionStorageTarget::kColor:
assert_not_zero(used_write_mask);
assert_true(writes_color_target(result.storage_index));
dest = DxbcDest::R(system_temps_color_[result.storage_index]);
if (edram_rov_used_) {
// For ROV output, mark that the color has been written to.
// According to:
// https://docs.microsoft.com/en-us/windows/desktop/direct3dhlsl/dx9-graphics-reference-asm-ps-registers-output-color
// if a color target hasn't been written to - including due to flow
// control - the render target must not be modified (the unwritten
// components of a written target are undefined, not sure if this
// behavior is respected on the real GPU, but the ROV code currently
// doesn't preserve unmodified components).
DxbcOpOr(DxbcDest::R(system_temp_rov_params_, 0b0001),
DxbcSrc::R(system_temp_rov_params_, DxbcSrc::kXXXX),
DxbcSrc::LU(uint32_t(1) << (8 + result.storage_index)));
}
break;
case InstructionStorageTarget::kDepth:
// Writes X to scalar oDepth or to X of system_temp_rov_depth_stencil_, no
// additional swizzling needed.
assert_true(used_write_mask == 0b0001);
assert_true(writes_depth());
if (edram_rov_used_) {
dest = DxbcDest::R(system_temp_rov_depth_stencil_);
} else {
dest = DxbcDest::ODepth();
}
// Depth outside [0, 1] is not safe for use with the ROV code. Though 20e4
// float depth can store values below 2, it's a very unusual case.
// Direct3D 10+ SV_Depth, however, can accept any values, including
// specials, when the depth buffer is floating-point.
is_clamped = true;
break;
}
if (dest.type_ == DxbcOperandType::kNull) {
return;
}
// Get the write masks and data required for loading of both the swizzled part
// and the constant (zero/one) part. The write mask is treated also as a read
// mask in DXBC, and `mov r0.zw, r1.xyzw` actually means r0.zw = r1.zw, not
// r0.zw = r1.xy.
uint32_t swizzle_mask = 0;
uint32_t swizzle_components = 0;
uint32_t constant_mask = 0;
uint32_t constant_values = 0;
// Write.
uint32_t src_additional_swizzle = 0;
uint32_t constant_mask = 0, constant_1_mask = 0;
for (uint32_t i = 0; i < 4; ++i) {
if (!(used_write_mask & (1 << i))) {
continue;
}
SwizzleSource component = result.components[i];
if (component <= SwizzleSource::kW) {
swizzle_mask |= 1 << i;
// If replicating X, just keep zero swizzle (XXXX).
if (!replicate_x) {
swizzle_components |= uint32_t(component) << (i * 2);
}
if (component >= SwizzleSource::kX && component <= SwizzleSource::kW) {
src_additional_swizzle |=
(uint32_t(component) - uint32_t(SwizzleSource::kX)) << (i * 2);
} else {
constant_mask |= 1 << i;
constant_values |= (component == SwizzleSource::k1 ? 1 : 0) << i;
}
}
bool is_static = result.storage_addressing_mode ==
InstructionStorageAddressingMode::kStatic;
// If the index is dynamic, choose where it's taken from.
uint32_t dynamic_address_register, dynamic_address_component;
if (result.storage_addressing_mode ==
InstructionStorageAddressingMode::kAddressRelative) {
// Addressed by aL.x.
dynamic_address_register = system_temp_aL_;
dynamic_address_component = 0;
} else {
// Addressed by a0.
dynamic_address_register = system_temp_ps_pc_p0_a0_;
dynamic_address_component = 3;
}
// Store both parts of the write (i == 0 - swizzled, i == 1 - constant).
for (uint32_t i = 0; i < 2; ++i) {
uint32_t mask = i == 0 ? swizzle_mask : constant_mask;
if (mask == 0) {
continue;
}
// r# for the swizzled part, 4-component imm32 for the constant part.
uint32_t source_length = i != 0 ? 5 : 2;
switch (result.storage_target) {
case InstructionStorageTarget::kRegister:
if (uses_register_dynamic_addressing()) {
++stat_.instruction_count;
++stat_.array_instruction_count;
shader_code_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH((is_static ? 4 : 6) +
source_length) |
saturate_bit);
shader_code_.push_back(EncodeVectorMaskedOperand(
D3D10_SB_OPERAND_TYPE_INDEXABLE_TEMP, mask, 2,
D3D10_SB_OPERAND_INDEX_IMMEDIATE32,
is_static ? D3D10_SB_OPERAND_INDEX_IMMEDIATE32
: D3D10_SB_OPERAND_INDEX_IMMEDIATE32_PLUS_RELATIVE));
shader_code_.push_back(0);
shader_code_.push_back(result.storage_index);
if (!is_static) {
shader_code_.push_back(EncodeVectorSelectOperand(
D3D10_SB_OPERAND_TYPE_TEMP, dynamic_address_component, 1));
shader_code_.push_back(dynamic_address_register);
}
} else {
assert_true(is_static);
++stat_.instruction_count;
++stat_.mov_instruction_count;
shader_code_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3 + source_length) |
saturate_bit);
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1));
shader_code_.push_back(result.storage_index);
}
break;
case InstructionStorageTarget::kInterpolator:
++stat_.instruction_count;
++stat_.mov_instruction_count;
shader_code_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3 + source_length) |
saturate_bit);
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_OUTPUT, mask, 1));
shader_code_.push_back(uint32_t(InOutRegister::kVSDSOutInterpolators) +
uint32_t(result.storage_index));
break;
case InstructionStorageTarget::kPosition:
++stat_.instruction_count;
++stat_.mov_instruction_count;
shader_code_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3 + source_length) |
saturate_bit);
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1));
shader_code_.push_back(system_temp_position_);
break;
case InstructionStorageTarget::kPointSizeEdgeFlagKillVertex:
++stat_.instruction_count;
++stat_.mov_instruction_count;
shader_code_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3 + source_length) |
saturate_bit);
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1));
shader_code_.push_back(system_temp_point_size_edge_flag_kill_vertex_);
break;
case InstructionStorageTarget::kExportAddress:
++stat_.instruction_count;
++stat_.mov_instruction_count;
shader_code_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3 + source_length) |
saturate_bit);
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1));
shader_code_.push_back(
system_temps_memexport_address_[memexport_alloc_current_count_ -
1]);
break;
case InstructionStorageTarget::kExportData:
++stat_.instruction_count;
++stat_.mov_instruction_count;
shader_code_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3 + source_length) |
saturate_bit);
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1));
shader_code_.push_back(
system_temps_memexport_data_[memexport_alloc_current_count_ - 1]
[uint32_t(result.storage_index)]);
break;
case InstructionStorageTarget::kColor:
++stat_.instruction_count;
++stat_.mov_instruction_count;
shader_code_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3 + source_length) |
saturate_bit);
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1));
shader_code_.push_back(system_temps_color_[result.storage_index]);
break;
default:
continue;
}
if (i == 0) {
// Copy from the source r#.
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, swizzle_components, 1));
shader_code_.push_back(reg);
} else {
// Load constants.
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0));
for (uint32_t j = 0; j < 4; ++j) {
shader_code_.push_back((constant_values & (1 << j)) ? 0x3F800000 : 0);
if (component == SwizzleSource::k1) {
constant_1_mask |= 1 << i;
}
}
}
if (result.storage_target == InstructionStorageTarget::kExportData) {
// Mark that the eM# has been written to and needs to be exported.
uint32_t memexport_index = memexport_alloc_current_count_ - 1;
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_OR) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7));
shader_code_.push_back(EncodeVectorMaskedOperand(
D3D10_SB_OPERAND_TYPE_TEMP, 1 << (memexport_index >> 2), 1));
shader_code_.push_back(system_temp_memexport_written_);
shader_code_.push_back(EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP,
memexport_index >> 2, 1));
shader_code_.push_back(system_temp_memexport_written_);
shader_code_.push_back(
EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0));
shader_code_.push_back(
uint32_t(1) << (result.storage_index + ((memexport_index & 3) << 3)));
++stat_.instruction_count;
++stat_.uint_instruction_count;
if (used_write_mask != constant_mask) {
DxbcOpMov(dest.Mask(used_write_mask & ~constant_mask),
src.SwizzleSwizzled(src_additional_swizzle), is_clamped);
}
if (edram_rov_used_ &&
result.storage_target == InstructionStorageTarget::kColor) {
// For ROV output, mark that the color has been written to.
// According to:
// https://docs.microsoft.com/en-us/windows/desktop/direct3dhlsl/dx9-graphics-reference-asm-ps-registers-output-color
// if a color target has been written to - including due to flow control -
// the render target must not be modified (the unwritten components of a
// written target are undefined, not sure if this behavior is respected on
// the real GPU, but the ROV code currently uses pre-packed masks to keep
// the old values, so preservation of components is not done).
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_OR) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1));
shader_code_.push_back(system_temp_rov_params_);
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1));
shader_code_.push_back(system_temp_rov_params_);
shader_code_.push_back(
EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0));
shader_code_.push_back(1 << (8 + result.storage_index));
++stat_.instruction_count;
++stat_.uint_instruction_count;
if (constant_mask) {
DxbcOpMov(dest.Mask(constant_mask),
DxbcSrc::LF(float(constant_1_mask & 1),
float((constant_1_mask >> 1) & 1),
float((constant_1_mask >> 2) & 1),
float((constant_1_mask >> 3) & 1)));
}
}
@ -2192,8 +2127,8 @@ void DxbcShaderTranslator::ProcessLoopStartInstruction(
EmitInstructionDisassembly();
}
// Count (as uint) in bits 0:7 of the loop constant, initial aL in 8:15.
// Starting from vector 2 because of bool constants.
// Count (unsigned) in bits 0:7 of the loop constant, initial aL (unsigned) in
// 8:15. Starting from vector 2 because of bool constants.
if (cbuffer_index_bool_loop_constants_ == kCbufferIndexUnallocated) {
cbuffer_index_bool_loop_constants_ = cbuffer_count_++;
}
@ -2280,12 +2215,12 @@ void DxbcShaderTranslator::ProcessLoopEndInstruction(
{
// Continue case.
uint32_t aL_add_temp = PushSystemTemp();
// Extract the value to add to aL (in bits 16:23 of the loop constant).
// Starting from vector 2 because of bool constants.
// Extract the value to add to aL (signed, in bits 16:23 of the loop
// constant). Starting from vector 2 because of bool constants.
if (cbuffer_index_bool_loop_constants_ == kCbufferIndexUnallocated) {
cbuffer_index_bool_loop_constants_ = cbuffer_count_++;
}
DxbcOpUBFE(DxbcDest::R(aL_add_temp, 0b0001), DxbcSrc::LU(8),
DxbcOpIBFE(DxbcDest::R(aL_add_temp, 0b0001), DxbcSrc::LU(8),
DxbcSrc::LU(16),
DxbcSrc::CB(cbuffer_index_bool_loop_constants_,
uint32_t(CbufferRegister::kBoolLoopConstants),

View File

@ -764,7 +764,7 @@ class DxbcShaderTranslator : public ShaderTranslator {
if (index_dimension > 1) {
operand_token |= uint32_t(index_2d_.GetRepresentation()) << 25;
if (index_dimension > 2) {
operand_token |= uint32_t(index_2d_.GetRepresentation()) << 28;
operand_token |= uint32_t(index_3d_.GetRepresentation()) << 28;
}
}
}
@ -1084,12 +1084,15 @@ class DxbcShaderTranslator : public ShaderTranslator {
kDefault = 10,
kDiscard = 13,
kDiv = 14,
kDP2 = 15,
kDP3 = 16,
kDP4 = 17,
kElse = 18,
kEndIf = 21,
kEndLoop = 22,
kEndSwitch = 23,
kEq = 24,
kFrc = 26,
kFToI = 27,
kFToU = 28,
kGE = 29,
@ -1118,6 +1121,7 @@ class DxbcShaderTranslator : public ShaderTranslator {
kRet = 62,
kRetC = 63,
kRoundNE = 64,
kRoundNI = 65,
kRoundZ = 67,
kSwitch = 76,
kULT = 79,
@ -1291,6 +1295,32 @@ class DxbcShaderTranslator : public ShaderTranslator {
DxbcEmitAluOp(DxbcOpcode::kDiv, 0b00, dest, src0, src1, saturate);
++stat_.float_instruction_count;
}
// Appends an instruction with the DxbcOpcode::kDP2 opcode to shader_code_:
// the opcode token, then dest, src0 and src1. Both sources are measured and
// written with the 0b0011 mask. Bumps the total and the float instruction
// counters in stat_.
void DxbcOpDP2(const DxbcDest& dest, const DxbcSrc& src0, const DxbcSrc& src1,
               bool saturate = false) {
  const uint32_t source_mask = 0b0011;
  const uint32_t dest_length = dest.GetLength();
  const uint32_t src0_length = src0.GetLength(source_mask);
  const uint32_t src1_length = src1.GetLength(source_mask);
  const uint32_t operands_length = dest_length + src0_length + src1_length;
  // One more token than the operands for the opcode token itself.
  shader_code_.reserve(shader_code_.size() + 1 + operands_length);
  shader_code_.push_back(
      DxbcOpcodeToken(DxbcOpcode::kDP2, operands_length, saturate));
  dest.Write(shader_code_);
  src0.Write(shader_code_, false, source_mask);
  src1.Write(shader_code_, false, source_mask);
  stat_.instruction_count += 1;
  stat_.float_instruction_count += 1;
}
// Appends an instruction with the DxbcOpcode::kDP3 opcode to shader_code_:
// the opcode token, then dest, src0 and src1. Both sources are measured and
// written with the 0b0111 mask. Bumps the total and the float instruction
// counters in stat_.
void DxbcOpDP3(const DxbcDest& dest, const DxbcSrc& src0, const DxbcSrc& src1,
               bool saturate = false) {
  const uint32_t source_mask = 0b0111;
  const uint32_t dest_length = dest.GetLength();
  const uint32_t src0_length = src0.GetLength(source_mask);
  const uint32_t src1_length = src1.GetLength(source_mask);
  const uint32_t operands_length = dest_length + src0_length + src1_length;
  // One more token than the operands for the opcode token itself.
  shader_code_.reserve(shader_code_.size() + 1 + operands_length);
  shader_code_.push_back(
      DxbcOpcodeToken(DxbcOpcode::kDP3, operands_length, saturate));
  dest.Write(shader_code_);
  src0.Write(shader_code_, false, source_mask);
  src1.Write(shader_code_, false, source_mask);
  stat_.instruction_count += 1;
  stat_.float_instruction_count += 1;
}
void DxbcOpDP4(const DxbcDest& dest, const DxbcSrc& src0, const DxbcSrc& src1,
bool saturate = false) {
uint32_t operands_length =
@ -1325,6 +1355,11 @@ class DxbcShaderTranslator : public ShaderTranslator {
DxbcEmitAluOp(DxbcOpcode::kEq, 0b00, dest, src0, src1);
++stat_.float_instruction_count;
}
// Emits a single-source ALU instruction with the DxbcOpcode::kFrc opcode via
// DxbcEmitAluOp, optionally saturated, and counts it as a float instruction.
void DxbcOpFrc(const DxbcDest& dest, const DxbcSrc& src,
               bool saturate = false) {
  DxbcEmitAluOp(DxbcOpcode::kFrc, 0b0, dest, src, saturate);
  stat_.float_instruction_count += 1;
}
void DxbcOpFToI(const DxbcDest& dest, const DxbcSrc& src) {
DxbcEmitAluOp(DxbcOpcode::kFToI, 0b0, dest, src);
++stat_.conversion_instruction_count;
@ -1471,6 +1506,11 @@ class DxbcShaderTranslator : public ShaderTranslator {
DxbcEmitAluOp(DxbcOpcode::kRoundNE, 0b0, dest, src, saturate);
++stat_.float_instruction_count;
}
// Emits a single-source ALU instruction with the DxbcOpcode::kRoundNI opcode
// via DxbcEmitAluOp, optionally saturated, and counts it as a float
// instruction.
void DxbcOpRoundNI(const DxbcDest& dest, const DxbcSrc& src,
                   bool saturate = false) {
  DxbcEmitAluOp(DxbcOpcode::kRoundNI, 0b0, dest, src, saturate);
  stat_.float_instruction_count += 1;
}
void DxbcOpRoundZ(const DxbcDest& dest, const DxbcSrc& src,
bool saturate = false) {
DxbcEmitAluOp(DxbcOpcode::kRoundZ, 0b0, dest, src, saturate);
@ -2027,6 +2067,14 @@ class DxbcShaderTranslator : public ShaderTranslator {
// as shader messages, from instruction_disassembly_buffer_.
void EmitInstructionDisassembly();
// Converts a shader translator source operand to a DXBC emitter operand, or
// returns a zero literal operand if it's not going to be referenced. This may
// allocate a temporary register and emit instructions if the operand can't be
// used directly with most DXBC instructions (for example, if it's an indexable
// GPR). In that case, temp_pushed_out will be set to true, and PopSystemTemp
// must be called when the operand is not needed anymore.
DxbcSrc LoadOperand(const InstructionOperand& operand,
uint32_t needed_components, bool& temp_pushed_out);
// Abstract 4-component vector source operand.
// TODO(Triang3l): Remove after fully moving to the new emitter.
struct DxbcSourceOperand {
@ -2085,11 +2133,12 @@ class DxbcShaderTranslator : public ShaderTranslator {
// TODO(Triang3l): Remove after fully moving to the new emitter.
void UnloadDxbcSourceOperand(const DxbcSourceOperand& operand);
// Writes xyzw or xxxx of the specified r# to the destination.
// can_store_memexport_address is for safety, to allow only proper MADs with
// a stream constant to write to eA.
void StoreResult(const InstructionResult& result, uint32_t reg,
bool replicate_x, bool can_store_memexport_address = false);
// Writes the specified source (src must be usable as a vector `mov` source,
// including to x#) to an instruction storage target.
// can_store_memexport_address is for safety, to allow only proper MADs with a
// stream constant to write to eA.
void StoreResult(const InstructionResult& result, const DxbcSrc& src,
bool can_store_memexport_address = false);
// The nesting of `if` instructions is the following:
// - pc checks (labels).
@ -2150,12 +2199,12 @@ class DxbcShaderTranslator : public ShaderTranslator {
TextureFilter min_filter,
TextureFilter mip_filter,
AnisoFilter aniso_filter);
// Converts (S, T, face index) in the specified temporary register to a 3D
// cubemap coordinate.
void ArrayCoordToCubeDirection(uint32_t reg);
// Converts (array S + 1, array T + 1, face index) in the specified temporary
// register to a 3D cubemap coordinate.
void TfetchCubeCoordToCubeDirection(uint32_t reg);
bool ProcessVectorAluOperation(const ParsedAluInstruction& instr,
bool& replicate_result_x,
void ProcessVectorAluOperation(const ParsedAluInstruction& instr,
uint32_t& result_swizzle,
bool& predicate_written);
bool ProcessScalarAluOperation(const ParsedAluInstruction& instr,
bool& predicate_written);
@ -2334,9 +2383,9 @@ class DxbcShaderTranslator : public ShaderTranslator {
// eM# in each `alloc export`, or UINT32_MAX if not used.
uint32_t system_temps_memexport_data_[kMaxMemExports][5];
// Vector ALU result or fetch scratch (since Xenos write masks can contain
// Vector ALU or fetch result/scratch (since Xenos write masks can contain
// swizzles).
uint32_t system_temp_pv_;
uint32_t system_temp_result_;
// Temporary register ID for previous scalar result, program counter,
// predicate and absolute address register.
uint32_t system_temp_ps_pc_p0_a0_;

File diff suppressed because it is too large Load Diff

View File

@ -42,7 +42,7 @@ void DxbcShaderTranslator::SwapVertexData(uint32_t vfetch_index,
shader_code_.push_back(temp1);
shader_code_.push_back(
EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0));
shader_code_.push_back(8);
@ -74,7 +74,7 @@ void DxbcShaderTranslator::SwapVertexData(uint32_t vfetch_index,
shader_code_.push_back(8);
shader_code_.push_back(
EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(
EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(temp1);
@ -91,7 +91,7 @@ void DxbcShaderTranslator::SwapVertexData(uint32_t vfetch_index,
shader_code_.push_back(temp2);
shader_code_.push_back(
EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0));
shader_code_.push_back(16);
@ -189,7 +189,7 @@ void DxbcShaderTranslator::SwapVertexData(uint32_t vfetch_index,
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, write_mask, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(
EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1));
shader_code_.push_back(temp2);
@ -198,7 +198,7 @@ void DxbcShaderTranslator::SwapVertexData(uint32_t vfetch_index,
shader_code_.push_back(temp1);
shader_code_.push_back(
EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
++stat_.instruction_count;
++stat_.movc_instruction_count;
@ -212,7 +212,7 @@ void DxbcShaderTranslator::SwapVertexData(uint32_t vfetch_index,
shader_code_.push_back(temp1);
shader_code_.push_back(
EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0));
shader_code_.push_back(16);
@ -244,7 +244,7 @@ void DxbcShaderTranslator::SwapVertexData(uint32_t vfetch_index,
shader_code_.push_back(16);
shader_code_.push_back(
EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(
EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(temp1);
@ -257,7 +257,7 @@ void DxbcShaderTranslator::SwapVertexData(uint32_t vfetch_index,
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, write_mask, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(
EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1));
shader_code_.push_back(temp2);
@ -266,7 +266,7 @@ void DxbcShaderTranslator::SwapVertexData(uint32_t vfetch_index,
shader_code_.push_back(temp1);
shader_code_.push_back(
EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
++stat_.instruction_count;
++stat_.movc_instruction_count;
@ -342,7 +342,7 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
5 + DxbcSourceOperandLength(index_operand)));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
UseDxbcSourceOperand(index_operand, kSwizzleXYZW, 0);
shader_code_.push_back(
EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0));
@ -353,10 +353,10 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
++stat_.instruction_count;
++stat_.conversion_instruction_count;
} else {
@ -365,7 +365,7 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
3 + DxbcSourceOperandLength(index_operand)));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
UseDxbcSourceOperand(index_operand, kSwizzleXYZW, 0);
++stat_.instruction_count;
++stat_.conversion_instruction_count;
@ -390,7 +390,7 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0010, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(EncodeVectorSelectOperand(
D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, (vfetch_index & 1) * 2, 3));
shader_code_.push_back(cbuffer_index_fetch_constants_);
@ -407,16 +407,16 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(
EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0));
shader_code_.push_back(instr.attributes.stride * 4);
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
++stat_.instruction_count;
++stat_.uint_instruction_count;
@ -426,10 +426,10 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(
EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0));
shader_code_.push_back(instr.attributes.offset * 4);
@ -444,7 +444,7 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0010, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(EncodeVectorSelectOperand(
D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, kSysConst_Flags_Comp, 3));
shader_code_.push_back(cbuffer_index_system_constants_);
@ -462,7 +462,7 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3));
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
++stat_.instruction_count;
++stat_.dynamic_flow_control_count;
@ -471,10 +471,10 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(8));
shader_code_.push_back(EncodeVectorMaskedOperand(
D3D10_SB_OPERAND_TYPE_TEMP, (1 << load_dword_count) - 1, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D11_SB_OPERAND_TYPE_UNORDERED_ACCESS_VIEW,
kSwizzleXYZW & ((1 << (load_dword_count * 2)) - 1), 2));
@ -492,10 +492,10 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(8));
shader_code_.push_back(EncodeVectorMaskedOperand(
D3D10_SB_OPERAND_TYPE_TEMP, (1 << load_dword_count) - 1, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_RESOURCE,
kSwizzleXYZW & ((1 << (load_dword_count * 2)) - 1), 2));
@ -607,7 +607,7 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(15));
shader_code_.push_back(EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP,
result_write_mask, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0));
shader_code_.push_back(extract_widths[0]);
@ -622,7 +622,7 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
shader_code_.push_back(extract_offsets[3]);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, extract_swizzle, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
++stat_.instruction_count;
if (extract_signed) {
++stat_.int_instruction_count;
@ -639,10 +639,10 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5));
shader_code_.push_back(EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP,
result_write_mask, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
++stat_.instruction_count;
++stat_.conversion_instruction_count;
} else if (normalize_scales[0] != 0.0f) {
@ -655,10 +655,10 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5));
shader_code_.push_back(EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP,
result_write_mask, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
++stat_.instruction_count;
++stat_.conversion_instruction_count;
if (!instr.attributes.is_integer) {
@ -667,10 +667,10 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10));
shader_code_.push_back(EncodeVectorMaskedOperand(
D3D10_SB_OPERAND_TYPE_TEMP, result_write_mask, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0));
for (uint32_t i = 0; i < 4; ++i) {
@ -687,10 +687,10 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10));
shader_code_.push_back(EncodeVectorMaskedOperand(
D3D10_SB_OPERAND_TYPE_TEMP, result_write_mask, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0));
shader_code_.push_back(0xBF800000u);
@ -710,7 +710,7 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(8));
shader_code_.push_back(EncodeVectorMaskedOperand(
D3D10_SB_OPERAND_TYPE_TEMP, 0b1111 & ~result_write_mask, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0));
shader_code_.push_back(0);
@ -727,10 +727,10 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10));
shader_code_.push_back(EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP,
result_write_mask, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0));
uint32_t exp_adjust_scale =
@ -743,7 +743,7 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction(
++stat_.float_instruction_count;
}
StoreResult(instr.result, system_temp_pv_, false);
StoreResult(instr.result, DxbcSrc::R(system_temp_result_));
}
uint32_t DxbcShaderTranslator::FindOrAddTextureSRV(uint32_t fetch_constant,
@ -852,9 +852,9 @@ uint32_t DxbcShaderTranslator::FindOrAddSamplerBinding(
return sampler_register;
}
void DxbcShaderTranslator::ArrayCoordToCubeDirection(uint32_t reg) {
// This does the reverse of what the cube vector ALU instruction does, but
// assuming S and T are normalized.
void DxbcShaderTranslator::TfetchCubeCoordToCubeDirection(uint32_t reg) {
// This does the reverse of what's done by the ALU sequence for cubemap
// coordinate calculation.
//
// The major axis depends on the face index (passed as a float in reg.z):
// +X for 0, -X for 1, +Y for 2, -Y for 3, +Z for 4, -Z for 5.
@ -872,8 +872,8 @@ void DxbcShaderTranslator::ArrayCoordToCubeDirection(uint32_t reg) {
// * Y is -T.
// * Z is 1.0 or -1.0.
// Make 0, not 0.5, the center of S and T.
// mad reg.xy__, reg.xy__, l(2.0, 2.0, _, _), l(-1.0, -1.0, _, _)
// Make 0, not 1.5, the center of S and T.
// mad reg.xy__, reg.xy__, l(2.0, 2.0, _, _), l(-3.0, -3.0, _, _)
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MAD) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(15));
shader_code_.push_back(
@ -890,8 +890,8 @@ void DxbcShaderTranslator::ArrayCoordToCubeDirection(uint32_t reg) {
shader_code_.push_back(0x3F800000u);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0));
shader_code_.push_back(0xBF800000u);
shader_code_.push_back(0xBF800000u);
shader_code_.push_back(0xC0400000u);
shader_code_.push_back(0xC0400000u);
shader_code_.push_back(0);
shader_code_.push_back(0);
++stat_.instruction_count;
@ -1194,7 +1194,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(8));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0));
shader_code_.push_back(0);
@ -2149,7 +2149,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, coord_mask, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(coord_temp);
@ -2157,12 +2157,13 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
++stat_.float_instruction_count;
} else {
if (instr.dimension == TextureDimension::kCube) {
// Convert cubemap coordinates passed as 2D array texture coordinates to
// a 3D direction. We can't use a 2D array to emulate cubemaps because
// at the edges, especially in pixel shader helper invocations, the
// major axis changes, causing S/T to jump between 0 and 1, breaking
// gradient calculation and causing the 1x1 mipmap to be sampled.
ArrayCoordToCubeDirection(coord_temp);
// Convert cubemap coordinates passed as 2D array texture coordinates
// plus 1 in ST to a 3D direction. We can't use a 2D array to emulate
// cubemaps because at the edges, especially in pixel shader helper
// invocations, the major axis changes, causing S/T to jump between 0
// and 1, breaking gradient calculation and causing the 1x1 mipmap to be
// sampled.
TfetchCubeCoordToCubeDirection(coord_temp);
}
// Bias the register LOD if fetching with explicit LOD (so this is not
@ -2237,7 +2238,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(11));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(coord_temp);
@ -2260,10 +2261,10 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7));
shader_code_.push_back(EncodeVectorMaskedOperand(
D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(
EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0));
shader_code_.push_back(
@ -2277,7 +2278,8 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
for (uint32_t j = 0; j < 2; ++j) {
uint32_t srv_index_current =
i ? srv_indices_stacked[j] : srv_indices[j];
uint32_t target_temp_sign = j ? signed_value_temp : system_temp_pv_;
uint32_t target_temp_sign =
j ? signed_value_temp : system_temp_result_;
for (uint32_t k = 0;
k < (vol_filter_lerp_temp != UINT32_MAX ? 2u : 1u); ++k) {
uint32_t target_temp_current =
@ -2564,7 +2566,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1 << i, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1));
shader_code_.push_back(sign_temp);
@ -2573,7 +2575,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
shader_code_.push_back(signed_value_temp);
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, i, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
++stat_.instruction_count;
++stat_.movc_instruction_count;
@ -2603,7 +2605,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
shader_code_.push_back(sign_temp);
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, i, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(
EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0));
shader_code_.push_back(0x40000000u);
@ -2619,7 +2621,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1 << i, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1));
shader_code_.push_back(sign_temp);
@ -2628,7 +2630,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
shader_code_.push_back(sign_temp);
shader_code_.push_back(
EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, i, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
++stat_.instruction_count;
++stat_.movc_instruction_count;
@ -2661,7 +2663,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
++stat_.dynamic_flow_control_count;
// Degamma the channel.
ConvertPWLGamma(false, system_temp_pv_, i, system_temp_pv_, i,
ConvertPWLGamma(false, system_temp_result_, i, system_temp_result_, i,
sign_temp, 0, sign_temp, 1);
// Close the gamma conditional.
@ -2733,10 +2735,10 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(
EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1));
shader_code_.push_back(exp_adjust_temp);
@ -2774,7 +2776,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3 + operand_length));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0101, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
UseDxbcSourceOperand(operand, 0b01010000);
++stat_.instruction_count;
++stat_.float_instruction_count;
@ -2784,7 +2786,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3 + operand_length));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1010, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
UseDxbcSourceOperand(operand, 0b01010000);
++stat_.instruction_count;
++stat_.float_instruction_count;
@ -2857,10 +2859,10 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7));
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1));
shader_code_.push_back(system_temp_pv_);
shader_code_.push_back(system_temp_result_);
shader_code_.push_back(
EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b01000100, 1));
shader_code_.push_back(exp_bias_temp);
@ -2898,7 +2900,9 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
}
if (store_result) {
StoreResult(instr.result, system_temp_pv_, replicate_result);
StoreResult(instr.result,
DxbcSrc::R(system_temp_result_,
replicate_result ? DxbcSrc::kXXXX : DxbcSrc::kXYZW));
}
}

View File

@ -289,7 +289,7 @@ struct ParsedLoopStartInstruction {
uint32_t dword_index = 0;
// Integer constant register that holds the loop parameters.
// Byte-wise: [loop count, start, step [-128, 127], ?]
// 0:7 - uint8 loop count, 8:15 - uint8 start aL, 16:23 - int8 aL step.
uint32_t loop_constant_index = 0;
// Whether to reuse the current aL instead of reset it to loop start.
bool is_repeat = false;
@ -311,7 +311,7 @@ struct ParsedLoopEndInstruction {
bool predicate_condition = false;
// Integer constant register that holds the loop parameters.
// Byte-wise: [loop count, start, step [-128, 127], ?]
// 0:7 - uint8 loop count, 8:15 - uint8 start aL, 16:23 - int8 aL step.
uint32_t loop_constant_index = 0;
// Target address of the start of the loop body.

View File

@ -256,7 +256,7 @@ struct ControlFlowLoopStartInstruction {
// Whether to reuse the current aL instead of reset it to loop start.
bool is_repeat() const { return is_repeat_; }
// Integer constant register that holds the loop parameters.
// Byte-wise: [loop count, start, step [-128, 127], ?]
// 0:7 - uint8 loop count, 8:15 - uint8 start aL, 16:23 - int8 aL step.
uint32_t loop_id() const { return loop_id_; }
private:
@ -281,7 +281,7 @@ struct ControlFlowLoopEndInstruction {
// Target address of the start of the loop body.
uint32_t address() const { return address_; }
// Integer constant register that holds the loop parameters.
// Byte-wise: [loop count, start, step [-128, 127], ?]
// 0:7 - uint8 loop count, 8:15 - uint8 start aL, 16:23 - int8 aL step.
uint32_t loop_id() const { return loop_id_; }
// Break from the loop if the predicate matches the expected value.
bool is_predicated_break() const { return is_predicated_break_; }
@ -667,11 +667,13 @@ static_assert_size(TextureFetchInstruction, 12);
// Both are valid only within the current ALU clause. They are not modified
// when the instruction that would write them fails its predication check.
// - Direct3D 9 rules (like in GCN v_*_legacy_f32 instructions) for
// multiplication (0 * anything = 0) wherever it's present (mul, mad, dp,
// etc.) and for NaN in min/max. It's very important to respect this rule for
// multiplication, as games often rely on it in vector normalization (rcp and
// mul), Infinity * 0 resulting in NaN breaks a lot of things in games -
// causes white screen in Halo 3, white specular on characters in GTA IV.
// multiplication (0 or denormal * anything = 0) wherever it's present (mul,
// mad, dp, etc.) and for NaN in min/max. It's very important to respect this
// rule for multiplication, as games often rely on it in vector normalization
// (rcp and mul), Infinity * 0 resulting in NaN breaks a lot of things in
// games - causes white screen in Halo 3, white specular on characters in GTA
// IV.
// TODO(Triang3l): Investigate signed zero handling in multiplication.
enum class AluScalarOpcode : uint32_t {
// Floating-Point Add
@ -1145,7 +1147,7 @@ enum class AluVectorOpcode : uint32_t {
// cube/CUBEv dest, src0, src1
// dest.x = T cube coordinate;
// dest.y = S cube coordinate;
// dest.z = 2.0 * MajorAxis;
// dest.z = 2.0 * major axis;
// dest.w = FaceID;
// https://developer.amd.com/wordpress/media/2012/12/AMD_Southern_Islands_Instruction_Set_Architecture.pdf
// if (abs(z) >= abs(x) && abs(z) >= abs(y)) {
@ -1167,6 +1169,16 @@ enum class AluVectorOpcode : uint32_t {
// Expects src0.zzxy and src1.yxzz swizzles.
// FaceID is D3DCUBEMAP_FACES:
// https://msdn.microsoft.com/en-us/library/windows/desktop/bb172528(v=vs.85).aspx
// Used like:
// cube r0, source.zzxy, source.yxz
// rcp r0.z, r0_abs.z
// mad r0.xy, r0, r0.zzzw, 1.5f
// tfetchCube r0, r0.yxw, tf0
// http://web.archive.org/web/20100705154143/http://msdn.microsoft.com/en-us/library/bb313921.aspx
// On GCN, the sequence is the same, so GCN documentation can be used as a
// reference (tfetchCube doesn't take its XY inputs directly as 2D array
// texture UV - to get the texture array UV, 1 must be subtracted from the XY
// inputs).
kCube = 18,
// Four-Element Maximum
@ -1293,12 +1305,20 @@ enum class AluVectorOpcode : uint32_t {
// Per-Component Floating-Point Maximum with Copy To Integer in AR
// maxa dest, src0, src1
// This is a combined max + mova/MOVAv.
// int result = (int)floor(src0.w + 0.5);
// a0 = clamp(result, -256, 255);
// a0 = (int)clamp(floor(src0.w + 0.5), -256.0, 255.0);
// dest.x = src0.x >= src1.x ? src0.x : src1.x;
// dest.y = src0.y >= src1.y ? src0.y : src1.y;
// dest.z = src0.z >= src1.z ? src0.z : src1.z;
// dest.w = src0.w >= src1.w ? src0.w : src1.w;
// The MSDN documentation specifies clamp as:
// if (!(SQResultF >= -256.0)) {
// SQResultF = -256.0;
// }
// if (SQResultF > 255.0) {
// SQResultF = 255.0;
// }
// http://web.archive.org/web/20100705151335/http://msdn.microsoft.com:80/en-us/library/bb313931.aspx
// However, using NaN as an address would be unusual.
kMaxA = 29,
};
@ -1329,6 +1349,7 @@ constexpr bool AluVectorOpHasSideEffects(AluVectorOpcode vector_opcode) {
// (doesn't check the operand count though).
constexpr uint32_t GetAluVectorOpUsedSourceComponents(
AluVectorOpcode vector_opcode, uint32_t src_index) {
assert_not_zero(src_index);
switch (vector_opcode) {
case AluVectorOpcode::kDp3:
return 0b0111;
@ -1353,27 +1374,30 @@ constexpr uint32_t GetAluVectorOpUsedSourceComponents(
// components specified in the write mask are needed, but there are instructions
// with special behavior for certain components.
constexpr uint32_t GetAluVectorOpNeededSourceComponents(
AluVectorOpcode vector_opcode, uint32_t src_index, uint32_t write_mask) {
uint32_t components = write_mask;
AluVectorOpcode vector_opcode, uint32_t src_index,
uint32_t used_result_components) {
assert_not_zero(src_index);
uint32_t components = used_result_components;
switch (vector_opcode) {
case AluVectorOpcode::kDp4:
case AluVectorOpcode::kMax4:
components = write_mask ? 0b1111 : 0;
components = used_result_components ? 0b1111 : 0;
break;
case AluVectorOpcode::kDp3:
components = write_mask ? 0b0111 : 0;
components = used_result_components ? 0b0111 : 0;
break;
case AluVectorOpcode::kDp2Add:
components = write_mask ? (src_index == 3 ? 0b0001 : 0b0011) : 0;
components =
used_result_components ? (src_index == 3 ? 0b0001 : 0b0011) : 0;
break;
case AluVectorOpcode::kCube:
components = write_mask ? 0b1111 : 0;
components = used_result_components ? 0b1111 : 0;
break;
case AluVectorOpcode::kSetpEqPush:
case AluVectorOpcode::kSetpNePush:
case AluVectorOpcode::kSetpGtPush:
case AluVectorOpcode::kSetpGePush:
components = write_mask ? 0b1001 : 0b1000;
components = used_result_components ? 0b1001 : 0b1000;
break;
case AluVectorOpcode::kKillEq:
case AluVectorOpcode::kKillGt: