[D3D12] DXBC: eA and eM registers

This commit is contained in:
Triang3l 2018-12-22 19:51:12 +03:00
parent e803ee84d5
commit bd9aae016f
3 changed files with 161 additions and 17 deletions

View File

@ -126,6 +126,8 @@ void DxbcShaderTranslator::Reset() {
texture_srvs_.clear();
sampler_bindings_.clear();
memexport_alloc_current_count_ = 0;
std::memset(&stat_, 0, sizeof(stat_));
}
@ -967,6 +969,33 @@ void DxbcShaderTranslator::StartTranslation() {
}
if (!is_depth_only_pixel_shader_) {
// Allocate temporary registers for memexport addresses and data.
std::memset(system_temps_memexport_address_, 0xFF,
sizeof(system_temps_memexport_address_));
std::memset(system_temps_memexport_data_, 0xFF,
sizeof(system_temps_memexport_data_));
system_temp_memexport_written_ = UINT32_MAX;
const uint8_t* memexports_written = memexport_eM_written();
for (uint32_t i = 0; i < kMaxMemExports; ++i) {
uint32_t memexport_alloc_written = memexports_written[i];
if (memexport_alloc_written == 0) {
continue;
}
// If memexport is used at all, allocate a register containing whether eM#
// have actually been written to.
if (system_temp_memexport_written_ == UINT32_MAX) {
system_temp_memexport_written_ = PushSystemTemp(true);
}
system_temps_memexport_address_[i] = PushSystemTemp(true);
uint32_t memexport_data_index;
while (xe::bit_scan_forward(memexport_alloc_written,
&memexport_data_index)) {
memexport_alloc_written &= ~(1u << memexport_data_index);
system_temps_memexport_data_[i][memexport_data_index] =
PushSystemTemp();
}
}
// Allocate system temporary variables for the translated code.
system_temp_pv_ = PushSystemTemp(true);
system_temp_ps_pc_p0_a0_ = PushSystemTemp(true);
@ -1266,6 +1295,26 @@ void DxbcShaderTranslator::CompleteShaderCode() {
// - system_temp_grad_h_lod_.
// - system_temp_grad_v_.
PopSystemTemp(6);
// TODO(Triang3l): Do memexport.
// Release memexport temporary registers.
for (int i = kMaxMemExports - 1; i >= 0; --i) {
if (system_temps_memexport_address_[i] == UINT32_MAX) {
continue;
}
// Release exported data registers.
for (int j = 4; j >= 0; --j) {
if (system_temps_memexport_data_[i][j] != UINT32_MAX) {
PopSystemTemp();
}
}
// Release the address register.
PopSystemTemp();
}
if (system_temp_memexport_written_ != UINT32_MAX) {
PopSystemTemp();
}
}
// Write stage-specific epilogue.
@ -2009,12 +2058,30 @@ void DxbcShaderTranslator::UnloadDxbcSourceOperand(
}
void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
uint32_t reg, bool replicate_x) {
uint32_t reg, bool replicate_x,
bool can_store_memexport_address) {
if (result.storage_target == InstructionStorageTarget::kNone ||
!result.has_any_writes()) {
return;
}
// Validate memexport writes (Halo 3 has some weird invalid ones).
if (result.storage_target == InstructionStorageTarget::kExportAddress) {
if (!can_store_memexport_address || memexport_alloc_current_count_ == 0 ||
memexport_alloc_current_count_ > kMaxMemExports ||
system_temps_memexport_address_[memexport_alloc_current_count_ - 1] ==
UINT32_MAX) {
return;
}
} else if (result.storage_target == InstructionStorageTarget::kExportData) {
if (memexport_alloc_current_count_ == 0 ||
memexport_alloc_current_count_ > kMaxMemExports ||
system_temps_memexport_data_[memexport_alloc_current_count_ - 1]
[result.storage_index] == UINT32_MAX) {
return;
}
}
uint32_t saturate_bit =
ENCODE_D3D10_SB_INSTRUCTION_SATURATE(result.is_clamped);
@ -2187,6 +2254,34 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
shader_code_.push_back(system_temp_position_);
break;
case InstructionStorageTarget::kExportAddress:
++stat_.instruction_count;
++stat_.mov_instruction_count;
shader_code_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3 + source_length) |
saturate_bit);
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1));
shader_code_.push_back(
system_temps_memexport_address_[memexport_alloc_current_count_ -
1]);
break;
case InstructionStorageTarget::kExportData:
++stat_.instruction_count;
++stat_.mov_instruction_count;
shader_code_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3 + source_length) |
saturate_bit);
shader_code_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1));
shader_code_.push_back(
system_temps_memexport_data_[memexport_alloc_current_count_ - 1]
[uint32_t(result.storage_index)]);
break;
case InstructionStorageTarget::kColorTarget:
++stat_.instruction_count;
++stat_.mov_instruction_count;
@ -2219,6 +2314,25 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
}
}
if (result.storage_target == InstructionStorageTarget::kExportData) {
// Mark that the eM# has been written to and needs to be exported.
uint32_t memexport_index = memexport_alloc_current_count_ - 1;
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_OR) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7));
shader_code_.push_back(EncodeVectorMaskedOperand(
D3D10_SB_OPERAND_TYPE_TEMP, 1 << (memexport_index >> 2), 1));
shader_code_.push_back(system_temp_memexport_written_);
shader_code_.push_back(EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP,
memexport_index >> 2, 1));
shader_code_.push_back(system_temp_memexport_written_);
shader_code_.push_back(
EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0));
shader_code_.push_back(
1u << (uint32_t(result.storage_index) + ((memexport_index & 3) << 3)));
++stat_.instruction_count;
++stat_.uint_instruction_count;
}
if (edram_rov_used_ &&
result.storage_target == InstructionStorageTarget::kColorTarget) {
// For ROV output, mark that the color has been written to.
@ -2862,6 +2976,19 @@ void DxbcShaderTranslator::ProcessJumpInstruction(
JumpToLabel(instr.target_address);
}
// Handles an `alloc` instruction: optionally emits its disassembly, then
// counts it if it is a memory allocation. The running count is what selects the
// current eA/eM# temporary register set (index = count - 1) when results are
// stored.
void DxbcShaderTranslator::ProcessAllocInstruction(
    const ParsedAllocInstruction& instr) {
  // With source mapping enabled, write the disassembled instruction first.
  if (FLAGS_dxbc_source_map) {
    instruction_disassembly_buffer_.Reset();
    instr.Disassemble(&instruction_disassembly_buffer_);
    EmitInstructionDisassembly();
  }

  // Only memory allocations advance the memexport counter; every other
  // allocation type leaves the translator state untouched.
  if (instr.type != AllocType::kMemory) {
    return;
  }
  ++memexport_alloc_current_count_;
}
uint32_t DxbcShaderTranslator::AppendString(std::vector<uint32_t>& dest,
const char* source) {
size_t size = std::strlen(source) + 1;

View File

@ -521,6 +521,7 @@ class DxbcShaderTranslator : public ShaderTranslator {
void ProcessLoopEndInstruction(
const ParsedLoopEndInstruction& instr) override;
void ProcessJumpInstruction(const ParsedJumpInstruction& instr) override;
void ProcessAllocInstruction(const ParsedAllocInstruction& instr) override;
void ProcessVertexFetchInstruction(
const ParsedVertexFetchInstruction& instr) override;
@ -965,8 +966,10 @@ class DxbcShaderTranslator : public ShaderTranslator {
void UnloadDxbcSourceOperand(const DxbcSourceOperand& operand);
// Writes xyzw or xxxx of the specified r# to the destination.
// can_store_memexport_address is for safety, to allow only proper MADs with
// a stream constant to write to eA.
void StoreResult(const InstructionResult& result, uint32_t reg,
bool replicate_x);
bool replicate_x, bool can_store_memexport_address = false);
// The nesting of `if` instructions is the following:
// - pc checks (labels).
@ -1149,20 +1152,6 @@ class DxbcShaderTranslator : public ShaderTranslator {
// translation (for the declaration).
uint32_t system_temp_count_max_;
// Vector ALU result/scratch (since Xenos write masks can contain swizzles).
uint32_t system_temp_pv_;
// Temporary register ID for previous scalar result, program counter,
// predicate and absolute address register.
uint32_t system_temp_ps_pc_p0_a0_;
// Loop index stack - .x is the active loop, shifted right to .yzw on push.
uint32_t system_temp_aL_;
// Loop counter stack, .x is the active loop. Represents number of times
// remaining to loop.
uint32_t system_temp_loop_count_;
// Explicitly set texture gradients and LOD.
uint32_t system_temp_grad_h_lod_;
uint32_t system_temp_grad_v_;
// Position in vertex shaders (because viewport and W transformations can be
// applied in the end of the shader).
uint32_t system_temp_position_;
@ -1182,6 +1171,29 @@ class DxbcShaderTranslator : public ShaderTranslator {
// - Z - depth Y derivative.
uint32_t system_temp_depth_;
// Bits containing whether each eM# has been written, for up to 16 streams, or
// UINT32_MAX if memexport is not used. 8 bits (5 used) for each stream, with
// 4 `alloc export`s per component.
uint32_t system_temp_memexport_written_;
// eA in each `alloc export`, or UINT32_MAX if not used.
uint32_t system_temps_memexport_address_[kMaxMemExports];
// eM# in each `alloc export`, or UINT32_MAX if not used.
uint32_t system_temps_memexport_data_[kMaxMemExports][5];
// Vector ALU result/scratch (since Xenos write masks can contain swizzles).
uint32_t system_temp_pv_;
// Temporary register ID for previous scalar result, program counter,
// predicate and absolute address register.
uint32_t system_temp_ps_pc_p0_a0_;
// Loop index stack - .x is the active loop, shifted right to .yzw on push.
uint32_t system_temp_aL_;
// Loop counter stack, .x is the active loop. Represents number of times
// remaining to loop.
uint32_t system_temp_loop_count_;
// Explicitly set texture gradients and LOD.
uint32_t system_temp_grad_h_lod_;
uint32_t system_temp_grad_v_;
// The bool constant number containing the condition for the currently
// processed exec (or the last - unless a label has reset this), or
// kCfExecBoolConstantNone if it's not checked.
@ -1209,6 +1221,10 @@ class DxbcShaderTranslator : public ShaderTranslator {
std::vector<TextureSRV> texture_srvs_;
std::vector<SamplerBinding> sampler_bindings_;
// Number of `alloc export`s encountered so far in the translation. The index
// of the current eA/eM# temp register set is this minus 1, if it's not 0.
uint32_t memexport_alloc_current_count_;
// The STAT chunk (based on Wine d3dcompiler_parse_stat).
struct Statistics {
uint32_t instruction_count;

View File

@ -1289,7 +1289,8 @@ void DxbcShaderTranslator::ProcessVectorAluInstruction(
}
}
StoreResult(instr.result, system_temp_pv_, replicate_result);
StoreResult(instr.result, system_temp_pv_, replicate_result,
instr.GetMemExportStreamConstant() != UINT32_MAX);
if (predicate_written) {
cf_exec_predicate_written_ = true;