[GPU] Shader translator refactoring (mostly ALU), fixes for disassembly round trip and write masks

This commit is contained in:
Triang3l 2020-05-08 23:57:51 +03:00
parent 8f91e580f4
commit 3aa0ce3096
12 changed files with 959 additions and 765 deletions

View File

@ -2961,6 +2961,14 @@ bool D3D12CommandProcessor::UpdateBindings(
(!samplers_written_pixel_ ||
current_samplers_hash_pixel_ != samplers_hash_pixel);
// These are the constant base addresses/ranges for shaders.
// We have these hardcoded right now because nothing seems to differ on the
// Xbox 360 (however, OpenGL ES on Adreno 200 on Android has different ranges).
assert_true(regs[XE_GPU_REG_SQ_VS_CONST].u32 == 0x000FF000 ||
regs[XE_GPU_REG_SQ_VS_CONST].u32 == 0x00000000);
assert_true(regs[XE_GPU_REG_SQ_PS_CONST].u32 == 0x000FF100 ||
regs[XE_GPU_REG_SQ_PS_CONST].u32 == 0x00000000);
// Check if the float constant layout is still the same and get the counts.
const Shader::ConstantRegisterMap& float_constant_map_vertex =
vertex_shader->constant_register_map();

View File

@ -809,14 +809,6 @@ bool PipelineCache::EnsureShadersTranslated(
D3D12Shader* vertex_shader, D3D12Shader* pixel_shader,
Shader::HostVertexShaderType host_vertex_shader_type) {
auto& regs = *register_file_;
// These are the constant base addresses/ranges for shaders.
// We have these hardcoded right now cause nothing seems to differ.
assert_true(regs[XE_GPU_REG_SQ_VS_CONST].u32 == 0x000FF000 ||
regs[XE_GPU_REG_SQ_VS_CONST].u32 == 0x00000000);
assert_true(regs[XE_GPU_REG_SQ_PS_CONST].u32 == 0x000FF100 ||
regs[XE_GPU_REG_SQ_PS_CONST].u32 == 0x00000000);
auto sq_program_cntl = regs.Get<reg::SQ_PROGRAM_CNTL>();
// Edge flags are not supported yet (because polygon primitives are not).

View File

@ -18,6 +18,7 @@
#include "xenia/base/assert.h"
#include "xenia/base/cvar.h"
#include "xenia/base/math.h"
DEFINE_bool(dxbc_switch, true,
"Use switch rather than if for flow control. Turning this off or "
@ -86,7 +87,6 @@ DxbcShaderTranslator::DxbcShaderTranslator(uint32_t vendor_id,
// Don't allocate again and again for the first shader.
shader_code_.reserve(8192);
shader_object_.reserve(16384);
float_constant_index_offsets_.reserve(512);
}
DxbcShaderTranslator::~DxbcShaderTranslator() = default;
@ -161,8 +161,6 @@ void DxbcShaderTranslator::Reset() {
cbuffer_index_fetch_constants_ = kCbufferIndexUnallocated;
system_constants_used_ = 0;
float_constants_dynamic_indexed_ = false;
float_constant_index_offsets_.clear();
in_control_point_index_used_ = false;
@ -1166,29 +1164,6 @@ void DxbcShaderTranslator::CompleteShaderCode() {
// Release system_temps_subroutine_.
PopSystemTemp(system_temps_subroutine_count_);
// Remap float constant indices if not indexed dynamically.
if (!float_constants_dynamic_indexed_ &&
!float_constant_index_offsets_.empty()) {
uint8_t float_constant_map[256] = {};
uint32_t float_constant_count = 0;
for (uint32_t i = 0; i < 4; ++i) {
uint64_t float_constants_used = constant_register_map().float_bitmap[i];
uint32_t float_constant_index;
while (
xe::bit_scan_forward(float_constants_used, &float_constant_index)) {
float_constants_used &= ~(1ull << float_constant_index);
float_constant_map[i * 64 + float_constant_index] =
float_constant_count++;
}
}
size_t index_count = float_constant_index_offsets_.size();
for (size_t i = 0; i < index_count; ++i) {
uint32_t index_offset = float_constant_index_offsets_[i];
shader_code_[index_offset] =
float_constant_map[shader_code_[index_offset] & 255];
}
}
}
std::vector<uint8_t> DxbcShaderTranslator::CompleteTranslation() {
@ -1420,7 +1395,7 @@ void DxbcShaderTranslator::LoadDxbcSourceOperand(
shader_code_.push_back(EncodeVectorSwizzledOperand(
D3D10_SB_OPERAND_TYPE_INDEXABLE_TEMP, kSwizzleXYZW, 2));
shader_code_.push_back(0);
shader_code_.push_back(uint32_t(operand.storage_index));
shader_code_.push_back(operand.storage_index);
} else {
shader_code_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) |
@ -1433,7 +1408,7 @@ void DxbcShaderTranslator::LoadDxbcSourceOperand(
D3D10_SB_OPERAND_INDEX_IMMEDIATE32,
D3D10_SB_OPERAND_INDEX_IMMEDIATE32_PLUS_RELATIVE));
shader_code_.push_back(0);
shader_code_.push_back(uint32_t(operand.storage_index));
shader_code_.push_back(operand.storage_index);
shader_code_.push_back(EncodeVectorSelectOperand(
D3D10_SB_OPERAND_TYPE_TEMP, dynamic_address_component, 1));
shader_code_.push_back(dynamic_address_register);
@ -1445,7 +1420,7 @@ void DxbcShaderTranslator::LoadDxbcSourceOperand(
assert_true(operand.storage_addressing_mode ==
InstructionStorageAddressingMode::kStatic);
dxbc_operand.type = DxbcSourceOperand::Type::kRegister;
dxbc_operand.index = uint32_t(operand.storage_index);
dxbc_operand.index = operand.storage_index;
}
break;
@ -1457,11 +1432,18 @@ void DxbcShaderTranslator::LoadDxbcSourceOperand(
cbuffer_index_float_constants_ = cbuffer_count_++;
}
dxbc_operand.type = DxbcSourceOperand::Type::kConstantFloat;
dxbc_operand.index = uint32_t(operand.storage_index);
dxbc_operand.addressing_mode = operand.storage_addressing_mode;
if (operand.storage_addressing_mode !=
if (operand.storage_addressing_mode ==
InstructionStorageAddressingMode::kStatic) {
float_constants_dynamic_indexed_ = true;
uint32_t float_constant_index =
constant_register_map().GetPackedFloatConstantIndex(
operand.storage_index);
assert_true(float_constant_index != UINT32_MAX);
dxbc_operand.index =
float_constant_index != UINT32_MAX ? float_constant_index : 0;
} else {
assert_true(constant_register_map().float_dynamic_addressing);
dxbc_operand.index = operand.storage_index;
}
break;
@ -1652,11 +1634,6 @@ void DxbcShaderTranslator::UseDxbcSourceOperand(
}
shader_code_.push_back(cbuffer_index_float_constants_);
shader_code_.push_back(uint32_t(CbufferRegister::kFloatConstants));
if (!float_constants_dynamic_indexed_) {
// If there's no dynamic indexing in the shader, constants are compacted
// and remapped. Store where the index has been written.
float_constant_index_offsets_.push_back(uint32_t(shader_code_.size()));
}
shader_code_.push_back(operand.index);
if (!is_static) {
uint32_t dynamic_address_register, dynamic_address_component;
@ -1718,8 +1695,9 @@ void DxbcShaderTranslator::UnloadDxbcSourceOperand(
void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
uint32_t reg, bool replicate_x,
bool can_store_memexport_address) {
uint32_t used_write_mask = result.GetUsedWriteMask();
if (result.storage_target == InstructionStorageTarget::kNone ||
!result.has_any_writes()) {
!result.GetUsedWriteMask()) {
return;
}
@ -1744,10 +1722,9 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
ENCODE_D3D10_SB_INSTRUCTION_SATURATE(result.is_clamped);
// Scalar targets get only one component.
// TODO(Triang3l): It's not replicated, it's X specifically.
if (result.storage_target == InstructionStorageTarget::kDepth) {
if (!result.write_mask[0]) {
return;
}
assert_not_zero(used_write_mask & 0b0001);
SwizzleSource component = result.components[0];
if (replicate_x && component <= SwizzleSource::kW) {
component = SwizzleSource::kX;
@ -1802,7 +1779,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
uint32_t constant_mask = 0;
uint32_t constant_values = 0;
for (uint32_t i = 0; i < 4; ++i) {
if (!result.write_mask[i]) {
if (!(used_write_mask & (1 << i))) {
continue;
}
SwizzleSource component = result.components[i];
@ -1858,7 +1835,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
is_static ? D3D10_SB_OPERAND_INDEX_IMMEDIATE32
: D3D10_SB_OPERAND_INDEX_IMMEDIATE32_PLUS_RELATIVE));
shader_code_.push_back(0);
shader_code_.push_back(uint32_t(result.storage_index));
shader_code_.push_back(result.storage_index);
if (!is_static) {
shader_code_.push_back(EncodeVectorSelectOperand(
D3D10_SB_OPERAND_TYPE_TEMP, dynamic_address_component, 1));
@ -1874,11 +1851,11 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
saturate_bit);
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1));
shader_code_.push_back(uint32_t(result.storage_index));
shader_code_.push_back(result.storage_index);
}
break;
case InstructionStorageTarget::kInterpolant:
case InstructionStorageTarget::kInterpolator:
++stat_.instruction_count;
++stat_.mov_instruction_count;
shader_code_.push_back(
@ -1943,7 +1920,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
[uint32_t(result.storage_index)]);
break;
case InstructionStorageTarget::kColorTarget:
case InstructionStorageTarget::kColor:
++stat_.instruction_count;
++stat_.mov_instruction_count;
shader_code_.push_back(
@ -1952,8 +1929,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
saturate_bit);
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1));
shader_code_.push_back(
system_temps_color_[uint32_t(result.storage_index)]);
shader_code_.push_back(system_temps_color_[result.storage_index]);
break;
default:
@ -1989,13 +1965,13 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
shader_code_.push_back(
EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0));
shader_code_.push_back(
1u << (uint32_t(result.storage_index) + ((memexport_index & 3) << 3)));
uint32_t(1) << (result.storage_index + ((memexport_index & 3) << 3)));
++stat_.instruction_count;
++stat_.uint_instruction_count;
}
if (edram_rov_used_ &&
result.storage_target == InstructionStorageTarget::kColorTarget) {
result.storage_target == InstructionStorageTarget::kColor) {
// For ROV output, mark that the color has been written to.
// According to:
// https://docs.microsoft.com/en-us/windows/desktop/direct3dhlsl/dx9-graphics-reference-asm-ps-registers-output-color
@ -2014,7 +1990,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
shader_code_.push_back(system_temp_rov_params_);
shader_code_.push_back(
EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0));
shader_code_.push_back(1 << (8 + uint32_t(result.storage_index)));
shader_code_.push_back(1 << (8 + result.storage_index));
++stat_.instruction_count;
++stat_.uint_instruction_count;
}
@ -2479,19 +2455,6 @@ const DxbcShaderTranslator::SystemConstantRdef DxbcShaderTranslator::
};
void DxbcShaderTranslator::WriteResourceDefinitions() {
// ***************************************************************************
// Preparation
// ***************************************************************************
// Float constant count.
uint32_t float_constant_count = 0;
if (cbuffer_index_float_constants_ != kCbufferIndexUnallocated) {
for (uint32_t i = 0; i < 4; ++i) {
float_constant_count +=
xe::bit_count(constant_register_map().float_bitmap[i]);
}
}
uint32_t chunk_position_dwords = uint32_t(shader_object_.size());
uint32_t new_offset;
@ -2583,7 +2546,8 @@ void DxbcShaderTranslator::WriteResourceDefinitions() {
if (RdefTypeIndex(i) == RdefTypeIndex::kFloat4ConstantArray) {
// Declaring a 0-sized array may not be safe, so write something valid
// even if they aren't used.
shader_object_.push_back(std::max(float_constant_count, 1u));
shader_object_.push_back(
std::max(constant_register_map().float_count, uint32_t(1)));
} else {
shader_object_.push_back(type.element_count |
(type.struct_member_count << 16));
@ -2692,8 +2656,9 @@ void DxbcShaderTranslator::WriteResourceDefinitions() {
if (cbuffer_index_float_constants_ != kCbufferIndexUnallocated) {
shader_object_.push_back(constant_name_offset_float);
shader_object_.push_back(0);
shader_object_.push_back(std::max(float_constant_count, 1u) * 4 *
sizeof(float));
shader_object_.push_back(
std::max(constant_register_map().float_count, uint32_t(1)) * 4 *
sizeof(float));
shader_object_.push_back(kDxbcRdefVariableFlagUsed);
shader_object_.push_back(types_offset +
uint32_t(RdefTypeIndex::kFloat4ConstantArray) *
@ -2795,8 +2760,9 @@ void DxbcShaderTranslator::WriteResourceDefinitions() {
shader_object_.push_back(cbuffer_name_offset_float);
shader_object_.push_back(1);
shader_object_.push_back(constant_offset_float);
shader_object_.push_back(std::max(float_constant_count, 1u) * 4 *
sizeof(float));
shader_object_.push_back(
std::max(constant_register_map().float_count, uint32_t(1)) * 4 *
sizeof(float));
shader_object_.push_back(uint32_t(DxbcRdefCbufferType::kCbuffer));
shader_object_.push_back(0);
} else if (i == cbuffer_index_bool_loop_constants_) {
@ -3646,15 +3612,10 @@ void DxbcShaderTranslator::WriteShaderCode() {
// Constant buffers, from most frequently accessed to least frequently accessed
// (the order is a hint to the driver according to the DXBC header).
if (cbuffer_index_float_constants_ != kCbufferIndexUnallocated) {
uint32_t float_constant_count = 0;
for (uint32_t i = 0; i < 4; ++i) {
float_constant_count +=
xe::bit_count(constant_register_map().float_bitmap[i]);
}
shader_object_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_CONSTANT_BUFFER) |
ENCODE_D3D10_SB_D3D10_SB_CONSTANT_BUFFER_ACCESS_PATTERN(
float_constants_dynamic_indexed_
constant_register_map().float_dynamic_addressing
? D3D10_SB_CONSTANT_BUFFER_DYNAMIC_INDEXED
: D3D10_SB_CONSTANT_BUFFER_IMMEDIATE_INDEXED) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7));
@ -3663,7 +3624,7 @@ void DxbcShaderTranslator::WriteShaderCode() {
shader_object_.push_back(cbuffer_index_float_constants_);
shader_object_.push_back(uint32_t(CbufferRegister::kFloatConstants));
shader_object_.push_back(uint32_t(CbufferRegister::kFloatConstants));
shader_object_.push_back(float_constant_count);
shader_object_.push_back(constant_register_map().float_count);
shader_object_.push_back(0);
}
if (cbuffer_index_system_constants_ != kCbufferIndexUnallocated) {

View File

@ -857,10 +857,10 @@ class DxbcShaderTranslator : public ShaderTranslator {
return 0b0000;
}
}
DxbcDest Mask(uint32_t write_mask) const {
[[nodiscard]] DxbcDest Mask(uint32_t write_mask) const {
return DxbcDest(type_, write_mask, index_1d_, index_2d_, index_3d_);
}
DxbcDest MaskMasked(uint32_t write_mask) const {
[[nodiscard]] DxbcDest MaskMasked(uint32_t write_mask) const {
return DxbcDest(type_, write_mask_ & write_mask, index_1d_, index_2d_,
index_3d_);
}
@ -991,26 +991,28 @@ class DxbcShaderTranslator : public ShaderTranslator {
return DxbcSrc(DxbcOperandType::kInputCoverageMask, kXXXX);
}
DxbcSrc WithModifiers(bool absolute, bool negate) const {
[[nodiscard]] DxbcSrc WithModifiers(bool absolute, bool negate) const {
DxbcSrc new_src(*this);
new_src.absolute_ = absolute;
new_src.negate_ = negate;
return new_src;
}
DxbcSrc WithAbs(bool absolute) const {
[[nodiscard]] DxbcSrc WithAbs(bool absolute) const {
return WithModifiers(absolute, negate_);
}
DxbcSrc WithNeg(bool negate) const {
[[nodiscard]] DxbcSrc WithNeg(bool negate) const {
return WithModifiers(absolute_, negate);
}
DxbcSrc Abs() const { return WithModifiers(true, false); }
DxbcSrc operator-() const { return WithModifiers(absolute_, !negate_); }
DxbcSrc Swizzle(uint32_t swizzle) const {
[[nodiscard]] DxbcSrc Abs() const { return WithModifiers(true, false); }
[[nodiscard]] DxbcSrc operator-() const {
return WithModifiers(absolute_, !negate_);
}
[[nodiscard]] DxbcSrc Swizzle(uint32_t swizzle) const {
DxbcSrc new_src(*this);
new_src.swizzle_ = swizzle;
return new_src;
}
DxbcSrc SwizzleSwizzled(uint32_t swizzle) const {
[[nodiscard]] DxbcSrc SwizzleSwizzled(uint32_t swizzle) const {
DxbcSrc new_src(*this);
new_src.swizzle_ = 0;
for (uint32_t i = 0; i < 4; ++i) {
@ -1019,12 +1021,12 @@ class DxbcShaderTranslator : public ShaderTranslator {
}
return new_src;
}
DxbcSrc Select(uint32_t component) const {
[[nodiscard]] DxbcSrc Select(uint32_t component) const {
DxbcSrc new_src(*this);
new_src.swizzle_ = component * 0b01010101;
return new_src;
}
DxbcSrc SelectFromSwizzled(uint32_t component) const {
[[nodiscard]] DxbcSrc SelectFromSwizzled(uint32_t component) const {
DxbcSrc new_src(*this);
new_src.swizzle_ = ((swizzle_ >> (component * 2)) & 3) * 0b01010101;
return new_src;
@ -2026,6 +2028,7 @@ class DxbcShaderTranslator : public ShaderTranslator {
void EmitInstructionDisassembly();
// Abstract 4-component vector source operand.
// TODO(Triang3l): Remove after fully moving to the new emitter.
struct DxbcSourceOperand {
enum class Type {
// GPR number in the index - used only when GPRs are not dynamically
@ -2064,18 +2067,22 @@ class DxbcShaderTranslator : public ShaderTranslator {
};
// Each Load must be followed by Unload, otherwise there may be a temporary
// register leak.
// TODO(Triang3l): Remove after fully moving to the new emitter.
void LoadDxbcSourceOperand(const InstructionOperand& operand,
DxbcSourceOperand& dxbc_operand);
// Number of tokens this operand adds to the instruction length when used.
// TODO(Triang3l): Remove after fully moving to the new emitter.
uint32_t DxbcSourceOperandLength(const DxbcSourceOperand& operand,
bool negate = false,
bool absolute = false) const;
// Writes the operand access tokens to the instruction (either for a scalar if
// select_component is <= 3, or for a vector).
// TODO(Triang3l): Remove after fully moving to the new emitter.
void UseDxbcSourceOperand(const DxbcSourceOperand& operand,
uint32_t additional_swizzle = kSwizzleXYZW,
uint32_t select_component = 4, bool negate = false,
bool absolute = false);
// TODO(Triang3l): Remove after fully moving to the new emitter.
void UnloadDxbcSourceOperand(const DxbcSourceOperand& operand);
// Writes xyzw or xxxx of the specified r# to the destination.
@ -2258,15 +2265,6 @@ class DxbcShaderTranslator : public ShaderTranslator {
// the remaining ones can be marked as unused in RDEF.
uint64_t system_constants_used_;
// Whether constants are dynamically indexed and need to be marked as such in
// dcl_constantBuffer.
bool float_constants_dynamic_indexed_;
// Offsets of float constant indices in shader_code_, for remapping in
// CompleteTranslation (initially, at these offsets, guest float constant
// indices are written).
std::vector<uint32_t> float_constant_index_offsets_;
// Whether InOutRegister::kDSInControlPointIndex has been used in the shader.
bool in_control_point_index_used_;

View File

@ -23,7 +23,8 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation(
replicate_result_x = false;
predicate_written = false;
if (!instr.has_vector_op) {
if (!instr.vector_and_constant_result.GetUsedWriteMask() &&
!AluVectorOpHasSideEffects(instr.vector_opcode)) {
return false;
}
@ -32,7 +33,7 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation(
if (instr.vector_opcode == AluVectorOpcode::kCube) {
operand_count = 1;
} else {
operand_count = uint32_t(instr.vector_operand_count);
operand_count = instr.vector_operand_count;
}
DxbcSourceOperand dxbc_operands[3];
// Whether the operand is the same as any previous operand, and thus is loaded
@ -42,7 +43,7 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation(
for (uint32_t i = 0; i < operand_count; ++i) {
const InstructionOperand& operand = instr.vector_operands[i];
for (uint32_t j = 0; j < i; ++j) {
if (operand == instr.vector_operands[j]) {
if (operand.GetIdenticalComponents(instr.vector_operands[j]) == 0b1111) {
operands_duplicate[i] = true;
dxbc_operands[i] = dxbc_operands[j];
break;
@ -117,7 +118,8 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation(
UseDxbcSourceOperand(dxbc_operands[1]);
++stat_.instruction_count;
++stat_.float_instruction_count;
if (!instr.vector_operands[0].EqualsAbsolute(instr.vector_operands[1])) {
if (instr.vector_operands[0].GetAbsoluteIdenticalComponents(
instr.vector_operands[1]) != 0b1111) {
// Reproduce Shader Model 3 multiplication behavior (0 * anything = 0),
// flushing denormals (must be done using eq - doing bitwise comparison
// doesn't flush denormals).
@ -281,7 +283,8 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation(
UseDxbcSourceOperand(dxbc_operands[2]);
++stat_.instruction_count;
++stat_.float_instruction_count;
if (!instr.vector_operands[0].EqualsAbsolute(instr.vector_operands[1])) {
if (instr.vector_operands[0].GetAbsoluteIdenticalComponents(
instr.vector_operands[1]) != 0b1111) {
// Reproduce Shader Model 3 multiplication behavior (0 * anything = 0).
// If any operand is zero or denormalized, just leave the addition part.
uint32_t is_subnormal_temp = PushSystemTemp();
@ -388,7 +391,8 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation(
case AluVectorOpcode::kDp4:
case AluVectorOpcode::kDp3:
case AluVectorOpcode::kDp2Add: {
if (instr.vector_operands[0].EqualsAbsolute(instr.vector_operands[1])) {
if (instr.vector_operands[0].GetAbsoluteIdenticalComponents(
instr.vector_operands[1]) != 0b1111) {
// NOTE(review): this comment says the operands are the same (calculating a
// vector length), so 0 * anything = 0 emulation is unnecessary — but the
// condition above was changed from EqualsAbsolute(...) to
// GetAbsoluteIdenticalComponents(...) != 0b1111, which is true when the
// operands DIFFER. The sibling hunks negate the old check (!EqualsAbsolute
// became != 0b1111), so this un-negated one should likely be == 0b1111 —
// confirm and fix either the condition or this comment.
shader_code_.push_back(
@ -1092,7 +1096,9 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation(
UseDxbcSourceOperand(dxbc_operands[1], kSwizzleXYZW, 1);
++stat_.instruction_count;
++stat_.float_instruction_count;
if (!instr.vector_operands[0].EqualsAbsolute(instr.vector_operands[1])) {
if (!(instr.vector_operands[0].GetAbsoluteIdenticalComponents(
instr.vector_operands[1]) &
0b0010)) {
// Reproduce Shader Model 3 multiplication behavior (0 * anything = 0).
// This is an attenuation calculation function, so infinity is probably
// not very unlikely.
@ -1294,7 +1300,8 @@ bool DxbcShaderTranslator::ProcessScalarAluOperation(
const ParsedAluInstruction& instr, bool& predicate_written) {
predicate_written = false;
if (!instr.has_scalar_op) {
if (instr.scalar_opcode == ucode::AluScalarOpcode::kRetainPrev &&
!instr.scalar_result.GetUsedWriteMask()) {
return false;
}
@ -1306,7 +1313,7 @@ bool DxbcShaderTranslator::ProcessScalarAluOperation(
for (uint32_t i = 0; i < uint32_t(instr.scalar_operand_count); ++i) {
const InstructionOperand& operand = instr.scalar_operands[i];
for (uint32_t j = 0; j < i; ++j) {
if (operand == instr.scalar_operands[j]) {
if (operand.GetIdenticalComponents(instr.scalar_operands[j]) == 0b1111) {
operands_duplicate[i] = true;
dxbc_operands[i] = dxbc_operands[j];
break;
@ -2303,7 +2310,9 @@ bool DxbcShaderTranslator::ProcessScalarAluOperation(
UseDxbcSourceOperand(dxbc_operands[1], kSwizzleXYZW, 0);
++stat_.instruction_count;
++stat_.float_instruction_count;
if (!instr.scalar_operands[0].EqualsAbsolute(instr.scalar_operands[1])) {
if (!(instr.scalar_operands[0].GetAbsoluteIdenticalComponents(
instr.scalar_operands[1]) &
0b0001)) {
// Reproduce Shader Model 3 multiplication behavior (0 * anything = 0).
uint32_t is_subnormal_temp = PushSystemTemp();
// Get the non-NaN multiplicand closer to zero to check if any of them
@ -2421,7 +2430,7 @@ bool DxbcShaderTranslator::ProcessScalarAluOperation(
void DxbcShaderTranslator::ProcessAluInstruction(
const ParsedAluInstruction& instr) {
if (instr.is_nop()) {
if (instr.IsNop()) {
return;
}
@ -2445,7 +2454,8 @@ void DxbcShaderTranslator::ProcessAluInstruction(
ProcessScalarAluOperation(instr, predicate_written_scalar);
if (store_vector) {
StoreResult(instr.vector_result, system_temp_pv_, replicate_vector_x,
StoreResult(instr.vector_and_constant_result, system_temp_pv_,
replicate_vector_x,
instr.GetMemExportStreamConstant() != UINT32_MAX);
}
if (store_scalar) {

View File

@ -10,10 +10,12 @@
#ifndef XENIA_GPU_SHADER_H_
#define XENIA_GPU_SHADER_H_
#include <algorithm>
#include <filesystem>
#include <string>
#include <vector>
#include "xenia/base/math.h"
#include "xenia/base/string_buffer.h"
#include "xenia/gpu/ucode.h"
#include "xenia/gpu/xenos.h"
@ -21,23 +23,32 @@
namespace xe {
namespace gpu {
// The structures here are used for both translation and disassembly.
//
// Because disassembly uses them too, to make sure "assemble -> disassemble ->
// reassemble" round trip is always successful with the XNA assembler (as it is
// the accuracy benchmark for translation), only generalization - not
// optimization like nop skipping/replacement - must be done while converting
// microcode to these structures (in other words, parsed shader code should be
// enough to accurately reconstruct the microcode for any shader that could be
// written by a human in assembly).
//
// During the "parsed -> host" part of the translation, however, translators are
// free to make any optimizations (as long as they don't affect the result, of
// course) they find appropriate.
enum class InstructionStorageTarget {
// Result is not stored.
kNone,
// Result is stored to a temporary register indexed by storage_index [0-31].
kRegister,
// Result is stored into a vertex shader interpolant export [0-15].
kInterpolant,
// Result is stored into a vertex shader interpolator export [0-15].
kInterpolator,
// Result is stored to the position export (gl_Position).
kPosition,
// Result is stored to the vertex shader misc export register.
// See R6xx/R7xx registers for details (USE_VTX_POINT_SIZE, USE_VTX_EDGE_FLAG,
// USE_VTX_KILL_FLAG).
// X - PSIZE (gl_PointSize).
// Y - EDGEFLAG (glEdgeFlag) for PrimitiveType::kPolygon wireframe/point
// drawing.
// Z - KILLVERTEX flag (used in Banjo-Kazooie: Nuts & Bolts for grass), set
// for killing primitives based on PA_CL_CLIP_CNTL::VTX_KILL_OR condition.
// Result is stored to the vertex shader misc export register, see
// ucode::ExportRegister::kVSPointSizeEdgeFlagKillVertex for description of
// components.
kPointSizeEdgeFlagKillVertex,
// Result is stored as memexport destination address
// (see xenos::xe_gpu_memexport_stream_t).
@ -45,11 +56,29 @@ enum class InstructionStorageTarget {
// Result is stored to memexport destination data.
kExportData,
// Result is stored to a color target export indexed by storage_index [0-3].
kColorTarget,
// Result is stored to the depth export (gl_FragDepth).
kColor,
// X of the result is stored to the depth export (gl_FragDepth).
kDepth,
};
// Must be used only in translation to skip unused components, but not in
// disassembly (because oPts.x000 will be assembled, but oPts.x00_ has both
// skipped components and zeros, which cannot be encoded, and therefore it will
// not).
constexpr uint32_t GetInstructionStorageTargetUsedComponents(
    InstructionStorageTarget target) {
  // kNone exports nothing, oPts exposes only xyz (point size, edge flag,
  // kill-vertex flag), the depth export is scalar (x only) - every other
  // target is a full 4-component vector.
  if (target == InstructionStorageTarget::kNone) {
    return 0b0000;
  }
  if (target == InstructionStorageTarget::kPointSizeEdgeFlagKillVertex) {
    return 0b0111;
  }
  if (target == InstructionStorageTarget::kDepth) {
    return 0b0001;
  }
  return 0b1111;
}
enum class InstructionStorageAddressingMode {
// The storage index is not dynamically addressed.
kStatic,
@ -75,71 +104,63 @@ enum class SwizzleSource {
k1,
};
constexpr SwizzleSource GetSwizzleFromComponentIndex(int i) {
constexpr SwizzleSource GetSwizzleFromComponentIndex(uint32_t i) {
return static_cast<SwizzleSource>(i);
}
inline char GetCharForComponentIndex(int i) {
inline char GetCharForComponentIndex(uint32_t i) {
const static char kChars[] = {'x', 'y', 'z', 'w'};
return kChars[i];
}
inline char GetCharForSwizzle(SwizzleSource swizzle_source) {
const static char kChars[] = {'x', 'y', 'z', 'w', '0', '1'};
return kChars[static_cast<int>(swizzle_source)];
return kChars[static_cast<uint32_t>(swizzle_source)];
}
struct InstructionResult {
// Where the result is going.
InstructionStorageTarget storage_target = InstructionStorageTarget::kNone;
// Index into the storage_target, if it is indexed.
int storage_index = 0;
uint32_t storage_index = 0;
// How the storage index is dynamically addressed, if it is.
InstructionStorageAddressingMode storage_addressing_mode =
InstructionStorageAddressingMode::kStatic;
// True if the result is exporting from the shader.
bool is_export = false;
// True to clamp the result value to [0-1].
bool is_clamped = false;
// Defines whether each output component is written.
bool write_mask[4] = {false, false, false, false};
// Defines whether each output component is written, though this is from the
// original microcode, not taking into account whether such components
// actually exist in the target.
uint32_t original_write_mask = 0b0000;
// Defines the source for each output component xyzw.
SwizzleSource components[4] = {SwizzleSource::kX, SwizzleSource::kY,
SwizzleSource::kZ, SwizzleSource::kW};
// Returns true if any component is written to.
bool has_any_writes() const {
return write_mask[0] || write_mask[1] || write_mask[2] || write_mask[3];
}
// Returns true if all components are written to.
bool has_all_writes() const {
return write_mask[0] && write_mask[1] && write_mask[2] && write_mask[3];
}
// Returns number of components written
uint32_t num_writes() const {
uint32_t total = 0;
for (int i = 0; i < 4; i++) {
if (write_mask[i]) {
total++;
}
}
return total;
}
// Returns true if any non-constant components are written.
bool stores_non_constants() const {
for (int i = 0; i < 4; ++i) {
if (write_mask[i] && components[i] != SwizzleSource::k0 &&
components[i] != SwizzleSource::k1) {
return true;
}
}
return false;
// Returns the write mask containing only components actually present in the
// target.
uint32_t GetUsedWriteMask() const {
  // Drop write mask bits for components that don't actually exist in the
  // target (for instance, yzw of the depth export, or w of oPts).
  uint32_t target_components =
      GetInstructionStorageTargetUsedComponents(storage_target);
  return original_write_mask & target_components;
}
// True if the components are in their 'standard' swizzle arrangement (xyzw).
bool is_standard_swizzle() const {
return has_all_writes() && components[0] == SwizzleSource::kX &&
bool IsStandardSwizzle() const {
return (GetUsedWriteMask() == 0b1111) &&
components[0] == SwizzleSource::kX &&
components[1] == SwizzleSource::kY &&
components[2] == SwizzleSource::kZ &&
components[3] == SwizzleSource::kW;
}
// Returns the components of the result, before swizzling, that won't be
// discarded or replaced with a constant.
uint32_t GetUsedResultComponents() const {
uint32_t used_write_mask = GetUsedWriteMask();
uint32_t used_components = 0b0000;
for (uint32_t i = 0; i < 4; ++i) {
if ((used_write_mask & (1 << i)) && components[i] >= SwizzleSource::kX &&
components[i] <= SwizzleSource::kW) {
used_components |=
1 << (uint32_t(components[i]) - uint32_t(SwizzleSource::kX));
}
}
return used_components;
}
};
enum class InstructionStorageSource {
@ -159,7 +180,7 @@ struct InstructionOperand {
// Where the source comes from.
InstructionStorageSource storage_source = InstructionStorageSource::kRegister;
// Index into the storage_target, if it is indexed.
int storage_index = 0;
uint32_t storage_index = 0;
// How the storage index is dynamically addressed, if it is.
InstructionStorageAddressingMode storage_addressing_mode =
InstructionStorageAddressingMode::kStatic;
@ -168,13 +189,19 @@ struct InstructionOperand {
// True to take the absolute value of the source (before any negation).
bool is_absolute_value = false;
// Number of components taken from the source operand.
int component_count = 0;
uint32_t component_count = 4;
// Defines the source for each component xyzw (up to the given
// component_count).
SwizzleSource components[4] = {SwizzleSource::kX, SwizzleSource::kY,
SwizzleSource::kZ, SwizzleSource::kW};
// Returns the swizzle source for the component, replicating the rightmost
// component if there are less than 4 components (similar to what the Xbox 360
// shader compiler does as a general rule for unspecified components).
SwizzleSource GetComponent(uint32_t index) const {
  // Indices past component_count replicate the rightmost specified
  // component, like the Xbox 360 shader compiler does.
  uint32_t last_component = component_count - 1;
  return components[std::min(index, last_component)];
}
// True if the components are in their 'standard' swizzle arrangement (xyzw).
bool is_standard_swizzle() const {
bool IsStandardSwizzle() const {
switch (component_count) {
case 4:
return components[0] == SwizzleSource::kX &&
@ -185,26 +212,32 @@ struct InstructionOperand {
return false;
}
// Whether absolute values of two operands are identical (useful for emulating
// Shader Model 3 0*anything=0 multiplication behavior).
bool EqualsAbsolute(const InstructionOperand& other) const {
// Returns which components of two operands are identical, but may have
// different signs (for simplicity of usage with GetComponent, treating the
// rightmost component as replicated).
uint32_t GetAbsoluteIdenticalComponents(
const InstructionOperand& other) const {
if (storage_source != other.storage_source ||
storage_index != other.storage_index ||
storage_addressing_mode != other.storage_addressing_mode ||
component_count != other.component_count) {
return false;
storage_addressing_mode != other.storage_addressing_mode) {
return 0;
}
for (int i = 0; i < component_count; ++i) {
if (components[i] != other.components[i]) {
return false;
}
uint32_t identical_components = 0;
for (uint32_t i = 0; i < 4; ++i) {
identical_components |= uint32_t(GetComponent(i) == other.GetComponent(i))
<< i;
}
return true;
return identical_components;
}
bool operator==(const InstructionOperand& other) const {
return EqualsAbsolute(other) && is_negated == other.is_negated &&
is_absolute_value == other.is_absolute_value;
// Returns which components of two operands will always be bitwise equal, but
// may have different signs (disregarding component_count for simplicity of
// usage with GetComponent, treating the rightmost component as replicated).
uint32_t GetIdenticalComponents(const InstructionOperand& other) const {
if (is_negated != other.is_negated ||
is_absolute_value != other.is_absolute_value) {
return 0;
}
return GetAbsoluteIdenticalComponents(other);
}
};
@ -365,9 +398,6 @@ struct ParsedAllocInstruction {
};
struct ParsedVertexFetchInstruction {
// Index into the ucode dword source.
uint32_t dword_index = 0;
// Opcode for the instruction.
ucode::FetchOpcode opcode;
// Friendly name of the instruction.
@ -409,9 +439,6 @@ struct ParsedVertexFetchInstruction {
};
struct ParsedTextureFetchInstruction {
// Index into the ucode dword source.
uint32_t dword_index = 0;
// Opcode for the instruction.
ucode::FetchOpcode opcode;
// Friendly name of the instruction.
@ -462,17 +489,6 @@ struct ParsedTextureFetchInstruction {
};
struct ParsedAluInstruction {
// Index into the ucode dword source.
uint32_t dword_index = 0;
// True if the vector part of the instruction needs to be executed and data
// about it in this structure is valid.
bool has_vector_op = false;
// True if the scalar part of the instruction needs to be executed and data
// about it in this structure is valid.
bool has_scalar_op = false;
bool is_nop() const { return !has_vector_op && !has_scalar_op; }
// Opcode for the vector part of the instruction.
ucode::AluVectorOpcode vector_opcode = ucode::AluVectorOpcode::kAdd;
// Opcode for the scalar part of the instruction.
@ -488,8 +504,20 @@ struct ParsedAluInstruction {
// Expected predication condition value if predicated.
bool predicate_condition = false;
// Describes how the vector operation result is stored.
InstructionResult vector_result;
// Describes how the vector operation result and, for exports, constant 0/1
// are stored. For simplicity of translation and disassembly, treating
// constant 0/1 writes as a part of the vector operation - they need to be
// expressed somehow in the disassembly anyway with a properly disassembled
// instruction even if only constants are being exported. The XNA disassembler
// falls back to displaying the whole vector operation, even if only constant
// components are written, if the scalar operation is a nop or if the vector
// operation has side effects (but if the scalar operation isn't nop, it
// outputs the entire constant mask in the scalar operation destination).
// Normally the XNA disassembler outputs the constant mask in both vector and
// scalar operations, but that's not required by assembler, so it doesn't
// really matter whether it's specified in the vector operation, in the scalar
// operation, or in both.
InstructionResult vector_and_constant_result;
// Describes how the scalar operation result is stored.
InstructionResult scalar_result;
// Both operations must be executed before any result is stored if vector and
@ -499,27 +527,109 @@ struct ParsedAluInstruction {
// operations.
// Number of source operands of the vector operation.
size_t vector_operand_count = 0;
uint32_t vector_operand_count = 0;
// Describes each source operand of the vector operation.
InstructionOperand vector_operands[3];
// Number of source operands of the scalar operation.
size_t scalar_operand_count = 0;
uint32_t scalar_operand_count = 0;
// Describes each source operand of the scalar operation.
InstructionOperand scalar_operands[2];
// If this is a valid eA write (MAD with a stream constant), returns the index
// of the stream float constant, otherwise returns UINT32_MAX.
// Whether the vector part of the instruction is the same as if it was omitted
// in the assembly (if compiled or assembled with the Xbox 360 shader
// compiler), and thus reassembling the shader with this instruction omitted
// will result in the same microcode (since instructions with just an empty
// write mask may have different values in other fields).
// This is for disassembly! Translators should use the write masks and
// AluVectorOpHasSideEffects to skip operations, as this only covers one very
// specific nop format!
bool IsVectorOpDefaultNop() const {
if (vector_opcode != ucode::AluVectorOpcode::kMax ||
vector_and_constant_result.original_write_mask ||
vector_and_constant_result.is_clamped ||
vector_operands[0].storage_source !=
InstructionStorageSource::kRegister ||
vector_operands[0].storage_index != 0 ||
vector_operands[0].storage_addressing_mode !=
InstructionStorageAddressingMode::kStatic ||
vector_operands[0].is_negated || vector_operands[0].is_absolute_value ||
!vector_operands[0].IsStandardSwizzle() ||
vector_operands[1].storage_source !=
InstructionStorageSource::kRegister ||
vector_operands[1].storage_index != 0 ||
vector_operands[1].storage_addressing_mode !=
InstructionStorageAddressingMode::kStatic ||
vector_operands[1].is_negated || vector_operands[1].is_absolute_value ||
!vector_operands[1].IsStandardSwizzle()) {
return false;
}
if (vector_and_constant_result.storage_target ==
InstructionStorageTarget::kRegister) {
if (vector_and_constant_result.storage_index != 0 ||
vector_and_constant_result.storage_addressing_mode !=
InstructionStorageAddressingMode::kStatic) {
return false;
}
} else {
// In case both vector and scalar operations are nop, still need to write
// somewhere that it's an export, not mov r0._, r0 + retain_prev r0._.
// Accurate round trip is possible only if the target is o0 or oC0,
// because if the total write mask is empty, the XNA assembler forces the
// destination to be o0/oC0, but this doesn't really matter in this case.
if (IsScalarOpDefaultNop()) {
return false;
}
}
return true;
}
// Whether the scalar part of the instruction is the same as if it was omitted
// in the assembly (if compiled or assembled with the Xbox 360 shader
// compiler), and thus reassembling the shader with this instruction omitted
// will result in the same microcode (since instructions with just an empty
// write mask may have different values in other fields).
bool IsScalarOpDefaultNop() const {
if (scalar_opcode != ucode::AluScalarOpcode::kRetainPrev ||
scalar_result.original_write_mask || scalar_result.is_clamped) {
return false;
}
if (scalar_result.storage_target == InstructionStorageTarget::kRegister) {
if (scalar_result.storage_index != 0 ||
scalar_result.storage_addressing_mode !=
InstructionStorageAddressingMode::kStatic) {
return false;
}
}
// For exports, if both are nop, the vector operation will be kept to state
// in the microcode that the destination in the microcode is an export.
return true;
}
// For translation (not disassembly) - whether this instruction has totally no
// effect.
bool IsNop() const {
return scalar_opcode == ucode::AluScalarOpcode::kRetainPrev &&
!scalar_result.GetUsedWriteMask() &&
!vector_and_constant_result.GetUsedWriteMask() &&
!ucode::AluVectorOpHasSideEffects(vector_opcode);
}
// If this is a "normal" eA write recognized by Xenia (MAD with a stream
// constant), returns the index of the stream float constant, otherwise
// returns UINT32_MAX.
uint32_t GetMemExportStreamConstant() const {
if (has_vector_op &&
vector_result.storage_target ==
if (vector_and_constant_result.storage_target ==
InstructionStorageTarget::kExportAddress &&
vector_opcode == ucode::AluVectorOpcode::kMad &&
vector_result.has_all_writes() &&
vector_and_constant_result.GetUsedResultComponents() == 0b1111 &&
!vector_and_constant_result.is_clamped &&
vector_operands[2].storage_source ==
InstructionStorageSource::kConstantFloat &&
vector_operands[2].storage_addressing_mode ==
InstructionStorageAddressingMode::kStatic &&
vector_operands[2].is_standard_swizzle()) {
vector_operands[2].IsStandardSwizzle() &&
!vector_operands[2].is_negated &&
!vector_operands[2].is_absolute_value) {
return vector_operands[2].storage_index;
}
return UINT32_MAX;
@ -581,9 +691,8 @@ class Shader {
struct ConstantRegisterMap {
// Bitmap of all kConstantFloat registers read by the shader.
// Any shader can only read up to 256 of the 512, and the base is dependent
// on the shader type. Each bit corresponds to a storage index from the type
// base, so bit 0 in a vertex shader is register 0, and bit 0 in a fragment
// shader is register 256.
// on the shader type and SQ_VS/PS_CONST registers. Each bit corresponds to
// a storage index from the type base.
uint64_t float_bitmap[256 / 64];
// Bitmap of all loop constants read by the shader.
// Each bit corresponds to a storage index [0-31].
@ -595,8 +704,33 @@ class Shader {
// Total number of kConstantFloat registers read by the shader.
uint32_t float_count;
// Computed byte count of all registers required when packed.
uint32_t packed_byte_length;
// Whether kConstantFloat registers are indexed dynamically - in this case,
// float_bitmap must be set to all 1, and tight packing must not be done.
bool float_dynamic_addressing;
// Returns the index of the float4 constant as if all float4 constant
// registers actually referenced were tightly packed in a buffer, or
// UINT32_MAX if not found.
uint32_t GetPackedFloatConstantIndex(uint32_t float_constant) const {
if (float_constant >= 256) {
return UINT32_MAX;
}
if (float_dynamic_addressing) {
// Any can potentially be read - not packing.
return float_constant;
}
uint32_t block_index = float_constant / 64;
uint32_t bit_index = float_constant % 64;
if (!(float_bitmap[block_index] & (uint64_t(1) << bit_index))) {
return UINT32_MAX;
}
uint32_t offset = 0;
for (uint32_t i = 0; i < block_index; ++i) {
offset += xe::bit_count(float_bitmap[i]);
}
return offset + xe::bit_count(float_bitmap[block_index] &
((uint64_t(1) << bit_index) - 1));
}
};
Shader(ShaderType shader_type, uint64_t ucode_data_hash,
@ -642,7 +776,9 @@ class Shader {
}
// Returns true if the given color target index [0-3].
bool writes_color_target(int i) const { return writes_color_targets_[i]; }
bool writes_color_target(uint32_t i) const {
return writes_color_targets_[i];
}
// True if the shader overrides the pixel depth.
bool writes_depth() const { return writes_depth_; }

File diff suppressed because it is too large Load Diff

View File

@ -57,15 +57,19 @@ class ShaderTranslator {
}
// True if the current shader is a pixel shader.
bool is_pixel_shader() const { return shader_type_ == ShaderType::kPixel; }
// Used constant register info, populated before translation.
const Shader::ConstantRegisterMap& constant_register_map() const {
return constant_register_map_;
}
// True if the current shader addresses general-purpose registers with dynamic
// indices.
// indices, set before translation. Doesn't include writes to r[#+a#] with an
// empty used write mask.
bool uses_register_dynamic_addressing() const {
return uses_register_dynamic_addressing_;
}
// True if the current shader writes to a color target on any execution path.
// True if the current shader writes to a color target on any execution path,
// set before translation. Doesn't include writes with an empty used write
// mask.
bool writes_color_target(int i) const { return writes_color_targets_[i]; }
bool writes_any_color_target() const {
for (size_t i = 0; i < xe::countof(writes_color_targets_); ++i) {
@ -75,7 +79,8 @@ class ShaderTranslator {
}
return false;
}
// True if the current shader overrides the pixel depth.
// True if the current shader overrides the pixel depth, set before
// translation. Doesn't include writes with an empty used write mask.
bool writes_depth() const { return writes_depth_; }
// True if Xenia can automatically enable early depth/stencil for the pixel
// shader when RB_DEPTHCONTROL EARLY_Z_ENABLE is not set, provided alpha
@ -181,8 +186,8 @@ class ShaderTranslator {
private:
struct AluOpcodeInfo {
const char* name;
size_t argument_count;
int src_swizzle_component_count;
uint32_t argument_count;
uint32_t src_swizzle_component_count;
bool disable_implicit_early_z;
};
@ -229,10 +234,16 @@ class ShaderTranslator {
ParsedTextureFetchInstruction* out_instr);
void TranslateAluInstruction(const ucode::AluInstruction& op);
void ParseAluVectorOperation(const ucode::AluInstruction& op,
ParsedAluInstruction& instr);
void ParseAluScalarOperation(const ucode::AluInstruction& op,
ParsedAluInstruction& instr);
void ParseAluInstruction(const ucode::AluInstruction& op,
ParsedAluInstruction& out_instr) const;
static void ParseAluInstructionOperand(const ucode::AluInstruction& op,
uint32_t i,
uint32_t swizzle_component_count,
InstructionOperand& out_op);
static void ParseAluInstructionOperandSpecial(
const ucode::AluInstruction& op, InstructionStorageSource storage_source,
uint32_t reg, bool negate, int const_slot, uint32_t component_index,
InstructionOperand& out_op);
// Input shader metadata and microcode.
ShaderType shader_type_;
@ -265,12 +276,16 @@ class ShaderTranslator {
uint32_t unique_vertex_bindings_ = 0;
uint32_t unique_texture_bindings_ = 0;
// These all are gathered before translation.
// uses_register_dynamic_addressing_ for writes, writes_color_targets_,
// writes_depth_ don't include empty used write masks.
Shader::ConstantRegisterMap constant_register_map_ = {0};
bool uses_register_dynamic_addressing_ = false;
bool writes_color_targets_[4] = {false, false, false, false};
bool writes_depth_ = false;
bool implicit_early_z_allowed_ = true;
// Memexport info is gathered before translation.
uint32_t memexport_alloc_count_ = 0;
// For register allocation in implementations - what was used after each
// `alloc export`.

View File

@ -28,7 +28,7 @@ void DisassembleResultOperand(const InstructionResult& result,
out->Append('r');
uses_storage_index = true;
break;
case InstructionStorageTarget::kInterpolant:
case InstructionStorageTarget::kInterpolator:
out->Append('o');
uses_storage_index = true;
break;
@ -45,7 +45,7 @@ void DisassembleResultOperand(const InstructionResult& result,
out->Append("eM");
uses_storage_index = true;
break;
case InstructionStorageTarget::kColorTarget:
case InstructionStorageTarget::kColor:
out->Append("oC");
uses_storage_index = true;
break;
@ -68,12 +68,19 @@ void DisassembleResultOperand(const InstructionResult& result,
break;
}
}
if (!result.has_any_writes()) {
// Not using GetUsedWriteMask/IsStandardSwizzle because they filter out
// components not having any runtime effect, but those components are still
// present in the microcode.
if (!result.original_write_mask) {
out->Append("._");
} else if (!result.is_standard_swizzle()) {
} else if (result.original_write_mask != 0b1111 ||
result.components[0] != SwizzleSource::kX ||
result.components[1] != SwizzleSource::kY ||
result.components[2] != SwizzleSource::kZ ||
result.components[3] != SwizzleSource::kW) {
out->Append('.');
for (int i = 0; i < 4; ++i) {
if (result.write_mask[i]) {
if (result.original_write_mask & (1 << i)) {
out->Append(GetCharForSwizzle(result.components[i]));
} else {
out->Append('_');
@ -116,7 +123,7 @@ void DisassembleSourceOperand(const InstructionOperand& op, StringBuffer* out) {
out->AppendFormat("[{}+aL]", op.storage_index);
break;
}
if (!op.is_standard_swizzle()) {
if (!op.IsStandardSwizzle()) {
out->Append('.');
if (op.component_count == 1) {
out->Append(GetCharForSwizzle(op.components[0]));
@ -124,7 +131,7 @@ void DisassembleSourceOperand(const InstructionOperand& op, StringBuffer* out) {
out->Append(GetCharForSwizzle(op.components[0]));
out->Append(GetCharForSwizzle(op.components[1]));
} else {
for (int j = 0; j < op.component_count; ++j) {
for (uint32_t j = 0; j < op.component_count; ++j) {
out->Append(GetCharForSwizzle(op.components[j]));
}
}
@ -454,11 +461,19 @@ void ParsedTextureFetchInstruction::Disassemble(StringBuffer* out) const {
}
void ParsedAluInstruction::Disassemble(StringBuffer* out) const {
if (is_nop()) {
out->Append(" nop\n");
bool is_vector_op_default_nop = IsVectorOpDefaultNop();
bool is_scalar_op_default_nop = IsScalarOpDefaultNop();
if (is_vector_op_default_nop && is_scalar_op_default_nop) {
out->Append(" ");
if (is_predicated) {
out->Append(predicate_condition ? " (p0) " : "(!p0) ");
} else {
out->Append(" ");
}
out->Append("nop\n");
return;
}
if (has_vector_op) {
if (!is_vector_op_default_nop) {
out->Append(" ");
if (is_predicated) {
out->Append(predicate_condition ? " (p0) " : "(!p0) ");
@ -466,19 +481,19 @@ void ParsedAluInstruction::Disassemble(StringBuffer* out) const {
out->Append(" ");
}
out->Append(vector_opcode_name);
if (vector_result.is_clamped) {
if (vector_and_constant_result.is_clamped) {
out->Append("_sat");
}
out->Append(' ');
DisassembleResultOperand(vector_result, out);
for (int i = 0; i < vector_operand_count; ++i) {
DisassembleResultOperand(vector_and_constant_result, out);
for (uint32_t i = 0; i < vector_operand_count; ++i) {
out->Append(", ");
DisassembleSourceOperand(vector_operands[i], out);
}
out->Append('\n');
}
if (has_scalar_op) {
out->Append(has_vector_op ? " + " : " ");
if (!is_scalar_op_default_nop) {
out->Append(is_vector_op_default_nop ? " " : " + ");
if (is_predicated) {
out->Append(predicate_condition ? " (p0) " : "(!p0) ");
} else {
@ -490,7 +505,7 @@ void ParsedAluInstruction::Disassemble(StringBuffer* out) const {
}
out->Append(' ');
DisassembleResultOperand(scalar_result, out);
for (int i = 0; i < scalar_operand_count; ++i) {
for (uint32_t i = 0; i < scalar_operand_count; ++i) {
out->Append(", ");
DisassembleSourceOperand(scalar_operands[i], out);
}

View File

@ -2003,7 +2003,7 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction(
void SpirvShaderTranslator::ProcessAluInstruction(
const ParsedAluInstruction& instr) {
if (instr.is_nop()) {
if (instr.IsNop()) {
return;
}
@ -2044,7 +2044,7 @@ void SpirvShaderTranslator::ProcessAluInstruction(
ProcessScalarAluOperation(instr, close_predicated_block_scalar);
if (store_vector) {
StoreToResult(b.createLoad(pv_), instr.vector_result);
StoreToResult(b.createLoad(pv_), instr.vector_and_constant_result);
}
if (store_scalar) {
StoreToResult(b.createLoad(ps_), instr.scalar_result);
@ -2252,7 +2252,8 @@ bool SpirvShaderTranslator::ProcessVectorAluOperation(
const ParsedAluInstruction& instr, bool& close_predicated_block) {
close_predicated_block = false;
if (!instr.has_vector_op) {
if (!instr.vector_and_constant_result.GetUsedWriteMask() &&
!AluVectorOpHasSideEffects(instr.vector_opcode)) {
return false;
}
@ -2261,7 +2262,7 @@ bool SpirvShaderTranslator::ProcessVectorAluOperation(
// TODO: If we have identical operands, reuse previous one.
Id sources[3] = {0};
Id dest = vec4_float_zero_;
for (size_t i = 0; i < instr.vector_operand_count; i++) {
for (uint32_t i = 0; i < instr.vector_operand_count; i++) {
sources[i] = LoadFromOperand(instr.vector_operands[i]);
}
@ -2636,7 +2637,8 @@ bool SpirvShaderTranslator::ProcessScalarAluOperation(
const ParsedAluInstruction& instr, bool& close_predicated_block) {
close_predicated_block = false;
if (!instr.has_scalar_op) {
if (instr.scalar_opcode == ucode::AluScalarOpcode::kRetainPrev &&
!instr.scalar_result.GetUsedWriteMask()) {
return false;
}
@ -2645,12 +2647,12 @@ bool SpirvShaderTranslator::ProcessScalarAluOperation(
// TODO: If we have identical operands, reuse previous one.
Id sources[3] = {0};
Id dest = b.makeFloatConstant(0);
for (size_t i = 0, x = 0; i < instr.scalar_operand_count; i++) {
for (uint32_t i = 0, x = 0; i < instr.scalar_operand_count; i++) {
auto src = LoadFromOperand(instr.scalar_operands[i]);
// Pull components out of the vector operands and use them as sources.
if (instr.scalar_operands[i].component_count > 1) {
for (int j = 0; j < instr.scalar_operands[i].component_count; j++) {
for (uint32_t j = 0; j < instr.scalar_operands[i].component_count; j++) {
sources[x++] = b.createCompositeExtract(src, float_type_, j);
}
} else {
@ -3191,7 +3193,7 @@ Id SpirvShaderTranslator::LoadFromOperand(const InstructionOperand& op) {
}
// swizzle
if (op.component_count > 1 && !op.is_standard_swizzle()) {
if (op.component_count > 1 && !op.IsStandardSwizzle()) {
std::vector<uint32_t> operands;
operands.push_back(storage_value);
operands.push_back(b.makeCompositeConstant(
@ -3200,7 +3202,7 @@ Id SpirvShaderTranslator::LoadFromOperand(const InstructionOperand& op) {
// Components start from left and are duplicated rightwards
// e.g. count = 1, xxxx / count = 2, xyyy ...
for (int i = 0; i < 4; i++) {
for (uint32_t i = 0; i < 4; i++) {
auto swiz = op.components[i];
if (i > op.component_count - 1) {
swiz = op.components[op.component_count - 1];
@ -3244,7 +3246,8 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id,
return;
}
if (!result.has_any_writes()) {
uint32_t used_write_mask = result.GetUsedWriteMask();
if (!used_write_mask) {
return;
}
@ -3285,7 +3288,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id,
storage_array = true;
assert_true(uint32_t(result.storage_index) < register_count());
break;
case InstructionStorageTarget::kInterpolant:
case InstructionStorageTarget::kInterpolator:
assert_true(is_vertex_shader());
storage_pointer = interpolators_;
storage_class = spv::StorageClass::StorageClassOutput;
@ -3310,7 +3313,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id,
storage_offsets.push_back(0);
storage_array = false;
break;
case InstructionStorageTarget::kColorTarget:
case InstructionStorageTarget::kColor:
assert_true(is_pixel_shader());
assert_not_zero(frag_outputs_);
storage_pointer = frag_outputs_;
@ -3351,7 +3354,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id,
// Only load from storage if we need it later.
Id storage_value = 0;
if ((source_is_scalar && !storage_is_scalar) || !result.has_all_writes()) {
if ((source_is_scalar && !storage_is_scalar) || used_write_mask != 0b1111) {
storage_value = b.createLoad(storage_pointer);
}
@ -3366,7 +3369,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id,
}
// destination swizzle
if (!result.is_standard_swizzle() && !source_is_scalar) {
if (!result.IsStandardSwizzle() && !source_is_scalar) {
std::vector<uint32_t> operands;
operands.push_back(source_value_id);
operands.push_back(b.makeCompositeConstant(
@ -3377,7 +3380,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id,
// e.g. count = 1, xxxx / count = 2, xyyy ...
uint32_t source_components = b.getNumComponents(source_value_id);
for (int i = 0; i < 4; i++) {
if (!result.write_mask[i]) {
if (!(used_write_mask & (1 << i))) {
// Undefined / don't care.
operands.push_back(0);
continue;
@ -3411,29 +3414,30 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id,
}
// write mask
if (!result.has_all_writes() && !source_is_scalar && !storage_is_scalar) {
if (used_write_mask != 0b1111 && !source_is_scalar && !storage_is_scalar) {
std::vector<uint32_t> operands;
operands.push_back(source_value_id);
operands.push_back(storage_value);
for (int i = 0; i < b.getNumTypeComponents(storage_type); i++) {
operands.push_back(
result.write_mask[i] ? i : b.getNumComponents(source_value_id) + i);
operands.push_back((used_write_mask & (1 << i))
? i
: b.getNumComponents(source_value_id) + i);
}
source_value_id =
b.createOp(spv::Op::OpVectorShuffle, storage_type, operands);
} else if (source_is_scalar && !storage_is_scalar) {
assert_true(result.num_writes() >= 1);
assert_not_zero(used_write_mask);
if (result.has_all_writes()) {
if (used_write_mask == 0b1111) {
source_value_id =
b.smearScalar(spv::NoPrecision, source_value_id, storage_type);
} else {
// Find first enabled component
uint32_t index = 0;
for (uint32_t i = 0; i < 4; i++) {
if (result.write_mask[i]) {
if (used_write_mask & (1 << i)) {
index = i;
break;
}
@ -3443,10 +3447,10 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id,
}
} else if (!source_is_scalar && storage_is_scalar) {
// Num writes /needs/ to be 1, and let's assume it's the first element.
assert_true(result.num_writes() == 1);
assert_true(xe::bit_count(used_write_mask) == 1);
for (uint32_t i = 0; i < 4; i++) {
if (result.write_mask[i]) {
if (used_write_mask & (1 << i)) {
source_value_id =
b.createCompositeExtract(source_value_id, storage_type, 0);
break;

View File

@ -667,7 +667,11 @@ static_assert_size(TextureFetchInstruction, 12);
// Both are valid only within the current ALU clause. They are not modified
// when the instruction that would write them fails its predication check.
// - Direct3D 9 rules (like in GCN v_*_legacy_f32 instructions) for
// multiplication (0 * anything = 0) and for NaN in min/max.
// multiplication (0 * anything = 0) wherever it's present (mul, mad, dp,
// etc.) and for NaN in min/max. It's very important to respect this rule for
// multiplication, as games often rely on it in vector normalization (rcp and
// mul), Infinity * 0 resulting in NaN breaks a lot of things in games -
// causes white screen in Halo 3, white specular on characters in GTA IV.
enum class AluScalarOpcode : uint32_t {
// Floating-Point Add
@ -1300,8 +1304,10 @@ enum class AluVectorOpcode : uint32_t {
// Whether the vector instruction has side effects such as discarding a pixel or
// setting the predicate and can't be ignored even if it doesn't write to
// anywhere.
inline bool AluVectorOpcodeHasSideEffects(AluVectorOpcode vector_opcode) {
// anywhere. Note that all scalar operations except for retain_prev have a side
// effect of modifying the previous scalar result register, so they must always
// be executed even if not writing.
constexpr bool AluVectorOpHasSideEffects(AluVectorOpcode vector_opcode) {
switch (vector_opcode) {
case AluVectorOpcode::kSetpEqPush:
case AluVectorOpcode::kSetpNePush:
@ -1319,7 +1325,126 @@ inline bool AluVectorOpcodeHasSideEffects(AluVectorOpcode vector_opcode) {
return false;
}
// Whether each component of a source operand is used at all in the instruction
// (doesn't check the operand count though).
constexpr uint32_t GetAluVectorOpUsedSourceComponents(
AluVectorOpcode vector_opcode, uint32_t src_index) {
switch (vector_opcode) {
case AluVectorOpcode::kDp3:
return 0b0111;
case AluVectorOpcode::kDp2Add:
return src_index == 3 ? 0b0001 : 0b0011;
case AluVectorOpcode::kSetpEqPush:
case AluVectorOpcode::kSetpNePush:
case AluVectorOpcode::kSetpGtPush:
case AluVectorOpcode::kSetpGePush:
return 0b1001;
case AluVectorOpcode::kDst:
return src_index == 2 ? 0b1010 : 0b0110;
default:
break;
}
return 0b1111;
}
// Whether each component of a source operand is needed for the instruction if
// executed with the specified write mask, and thus can't be thrown away or be
// undefined in translation. For per-component operations, for example, only the
// components specified in the write mask are needed, but there are instructions
// with special behavior for certain components.
constexpr uint32_t GetAluVectorOpNeededSourceComponents(
AluVectorOpcode vector_opcode, uint32_t src_index, uint32_t write_mask) {
uint32_t components = write_mask;
switch (vector_opcode) {
case AluVectorOpcode::kDp4:
case AluVectorOpcode::kMax4:
components = write_mask ? 0b1111 : 0;
break;
case AluVectorOpcode::kDp3:
components = write_mask ? 0b0111 : 0;
break;
case AluVectorOpcode::kDp2Add:
components = write_mask ? (src_index == 3 ? 0b0001 : 0b0011) : 0;
break;
case AluVectorOpcode::kCube:
components = write_mask ? 0b1111 : 0;
break;
case AluVectorOpcode::kSetpEqPush:
case AluVectorOpcode::kSetpNePush:
case AluVectorOpcode::kSetpGtPush:
case AluVectorOpcode::kSetpGePush:
components = write_mask ? 0b1001 : 0b1000;
break;
case AluVectorOpcode::kKillEq:
case AluVectorOpcode::kKillGt:
case AluVectorOpcode::kKillGe:
case AluVectorOpcode::kKillNe:
components = 0b1111;
break;
// kDst is per-component, but not all components are used -
// GetAluVectorOpUsedSourceComponents will filter out the unused ones.
case AluVectorOpcode::kMaxA:
if (src_index == 1) {
components |= 0b1000;
}
break;
default:
break;
}
return components &
GetAluVectorOpUsedSourceComponents(vector_opcode, src_index);
}
enum class ExportRegister : uint32_t {
kVSInterpolator0 = 0,
kVSInterpolator1,
kVSInterpolator2,
kVSInterpolator3,
kVSInterpolator4,
kVSInterpolator5,
kVSInterpolator6,
kVSInterpolator7,
kVSInterpolator8,
kVSInterpolator9,
kVSInterpolator10,
kVSInterpolator11,
kVSInterpolator12,
kVSInterpolator13,
kVSInterpolator14,
kVSInterpolator15,
kVSPosition = 62,
// See R6xx/R7xx registers for details (USE_VTX_POINT_SIZE, USE_VTX_EDGE_FLAG,
// USE_VTX_KILL_FLAG).
// X - PSIZE (gl_PointSize).
// Y - EDGEFLAG (glEdgeFlag) for PrimitiveType::kPolygon wireframe/point
// drawing.
// Z - KILLVERTEX flag (used in Banjo-Kazooie: Nuts & Bolts for grass), set
// for killing primitives based on PA_CL_CLIP_CNTL::VTX_KILL_OR condition.
kVSPointSizeEdgeFlagKillVertex = 63,
kPSColor0 = 0,
kPSColor1,
kPSColor2,
kPSColor3,
// In X.
kPSDepth = 61,
// Memory export: index.?y?? * 0100 + xe_gpu_memexport_stream_t.xyzw.
kExportAddress = 32,
// Memory export: values for texels [index+0], [index+1], ..., [index+4].
kExportData0 = 33,
kExportData1,
kExportData2,
kExportData3,
kExportData4,
};
struct AluInstruction {
// Raw accessors.
// Whether data is being exported (or written to local registers).
bool is_export() const { return data_.export_data == 1; }
bool export_write_mask() const { return data_.scalar_dest_rel == 1; }
@ -1334,20 +1459,12 @@ struct AluInstruction {
bool is_const_1_addressed() const { return data_.const_1_rel_abs == 1; }
bool is_address_relative() const { return data_.address_absolute == 1; }
bool has_vector_op() const {
return vector_write_mask() || is_export() ||
AluVectorOpcodeHasSideEffects(vector_opcode());
}
AluVectorOpcode vector_opcode() const { return data_.vector_opc; }
uint32_t vector_write_mask() const { return data_.vector_write_mask; }
uint32_t vector_dest() const { return data_.vector_dest; }
bool is_vector_dest_relative() const { return data_.vector_dest_rel == 1; }
bool vector_clamp() const { return data_.vector_clamp == 1; }
bool has_scalar_op() const {
return scalar_opcode() != AluScalarOpcode::kRetainPrev ||
(!is_export() && scalar_write_mask() != 0);
}
AluScalarOpcode scalar_opcode() const { return data_.scalar_opc; }
uint32_t scalar_write_mask() const { return data_.scalar_write_mask; }
uint32_t scalar_dest() const { return data_.scalar_dest; }
@ -1407,14 +1524,62 @@ struct AluInstruction {
}
}
// Helpers.
// Note that even if the export component is unused (like W of the vertex
// shader misc register, YZW of pixel shader depth), it must still not be
// excluded - that may make disassembly not reassemblable if there are
// constant 0 writes in the export, like, oPts.x000 will be assembled, but
// oPts.x00_ will not, even though W has no effect on anything.
uint32_t GetVectorOpResultWriteMask() const {
uint32_t mask = vector_write_mask();
if (is_export()) {
mask &= ~scalar_write_mask();
}
return mask;
}
uint32_t GetScalarOpResultWriteMask() const {
uint32_t mask = scalar_write_mask();
if (is_export()) {
mask &= ~vector_write_mask();
}
return mask;
}
uint32_t GetConstant0WriteMask() const {
if (!is_export() || !is_scalar_dest_relative()) {
return 0b0000;
}
return 0b1111 & ~(vector_write_mask() | scalar_write_mask());
}
uint32_t GetConstant1WriteMask() const {
if (!is_export()) {
return 0b0000;
}
return vector_write_mask() & scalar_write_mask();
}
private:
XEPACKEDSTRUCT(Data, {
XEPACKEDSTRUCTANONYMOUS({
// If exporting, both vector and scalar operations use the vector
// destination (which can't be relative in this case).
// Not very important note: If both scalar and vector operations exporting
// something have empty write mask, the XNA assembler forces vector_dest
// to 0 (interpolator 0 or color 0) directly in the microcode.
uint32_t vector_dest : 6;
uint32_t vector_dest_rel : 1;
uint32_t abs_constants : 1;
uint32_t scalar_dest : 6;
uint32_t scalar_dest_rel : 1;
// Exports have different write masking (export is done to vector_dest by
// both the vector and the scalar operation, and exports can write
// constant 0 and 1). For each component:
// - vector_write_mask 0, scalar_write_mask 0:
// - scalar_dest_rel 0 - unchanged.
// - scalar_dest_rel 1 - constant 0 (all components must be written).
// - vector_write_mask 1, scalar_write_mask 0 - from vector operation.
// - vector_write_mask 0, scalar_write_mask 1 - from scalar operation.
// - vector_write_mask 1, scalar_write_mask 1 - constant 1.
uint32_t export_data : 1;
uint32_t vector_write_mask : 4;
uint32_t scalar_write_mask : 4;

View File

@ -267,6 +267,7 @@ namespace shader_playground {
"--shader_output=" + translatedDisasmPath,
"--shader_output_type=" + outputType,
"--vertex_shader_output_type=" + vertexShaderType,
"--dxbc_source_map=true",
};
if (translationComboBox.SelectedIndex == 1) {
startArguments.Add("--shader_output_dxbc_rov=true");