[D3D12] Force early Z with DSV, fix blend disabled flag in rb_colorcontrol ignored

This commit is contained in:
Triang3l 2019-01-11 17:07:33 +03:00
parent d7ed044be1
commit ef523823d5
9 changed files with 202 additions and 89 deletions

View File

@ -2199,8 +2199,8 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
// Alpha test.
int32_t alpha_test;
if (rb_colorcontrol & 0x8) {
uint32_t alpha_test_function = rb_colorcontrol & 0x7;
uint32_t alpha_test_function = rb_colorcontrol & 0x7;
if ((rb_colorcontrol & 0x8) && alpha_test_function != 0x7) {
// 0: Never - fail in [-inf, +inf].
// 1: Less - fail in [ref, +inf].
// 2: Equal - pass in [ref, ref].

View File

@ -40,6 +40,17 @@ class D3D12Shader : public Shader {
const DxbcShaderTranslator::SamplerBinding* sampler_bindings,
uint32_t sampler_binding_count);
// Stores a copy of the given shader object as the forced early depth/stencil
// version of this shader, which GetForcedEarlyZShaderObject returns afterwards.
void SetForcedEarlyZShaderObject(const std::vector<uint8_t>& shader_object) {
forced_early_z_shader_ = shader_object;
}
// Returns the shader with forced early depth/stencil set with
// SetForcedEarlyZShaderObject after translation. If there's none (for example,
// if the shader discards pixels or writes to the depth buffer), an empty
// vector is returned.
const std::vector<uint8_t>& GetForcedEarlyZShaderObject() const {
return forced_early_z_shader_;
}
bool DisassembleDxbc(const ui::d3d12::D3D12Provider* provider);
static constexpr uint32_t kMaxTextureSRVIndexBits =
@ -78,9 +89,12 @@ class D3D12Shader : public Shader {
private:
PrimitiveType domain_shader_primitive_type_ = PrimitiveType::kNone;
std::vector<TextureSRV> texture_srvs_;
uint32_t used_texture_mask_ = 0;
std::vector<SamplerBinding> sampler_bindings_;
std::vector<uint8_t> forced_early_z_shader_;
};
} // namespace d3d12

View File

@ -331,6 +331,15 @@ bool PipelineCache::TranslateShader(D3D12Shader* shader,
shader->ucode_disassembly().c_str());
}
// If it may be useful, create a version of the shader with early depth/stencil
// forced.
if (shader->type() == ShaderType::kPixel && !edram_rov_used_ &&
shader->early_z_allowed()) {
shader->SetForcedEarlyZShaderObject(
std::move(DxbcShaderTranslator::ForceEarlyDepthStencil(
shader->translated_binary().data())));
}
// Disassemble the shader for dumping.
if (FLAGS_d3d12_dxbc_disasm) {
auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider();
@ -569,6 +578,8 @@ bool PipelineCache::GetCurrentStateDescription(
}
if (!edram_rov_used_) {
uint32_t rb_colorcontrol = regs[XE_GPU_REG_RB_COLORCONTROL].u32;
// Depth/stencil. No stencil, always passing depth test and no depth writing
// means depth disabled.
if (render_targets[4].format != DXGI_FORMAT_UNKNOWN) {
@ -616,6 +627,16 @@ bool PipelineCache::GetCurrentStateDescription(
description_out.depth_func = 0b111;
}
// Forced early Z if the shader allows that and alpha testing is disabled.
// TODO(Triang3l): For memexporting shaders, possibly choose this according
// to the early Z toggle in RB_DEPTHCONTROL (the correct behavior is still
// unknown).
if (pixel_shader != nullptr &&
pixel_shader->GetForcedEarlyZShaderObject().size() != 0 &&
(!(rb_colorcontrol & 0x8) || (rb_colorcontrol & 0x7) == 0x7)) {
description_out.force_early_z = 1;
}
// Render targets and blending state. 32 because of 0x1F mask, for safety
// (all unknown to zero).
uint32_t color_mask = command_processor_->GetCurrentColorMask(pixel_shader);
@ -695,7 +716,7 @@ bool PipelineCache::GetCurrentStateDescription(
rt.format = RenderTargetCache::GetBaseColorFormat(
ColorRenderTargetFormat((color_info >> 16) & 0xF));
rt.write_mask = (color_mask >> (guest_rt_index * 4)) & 0xF;
if (rt.write_mask) {
if (!(rb_colorcontrol & 0x20) && rt.write_mask) {
rt.src_blend = kBlendFactorMap[blendcontrol & 0x1F];
rt.dest_blend = kBlendFactorMap[(blendcontrol >> 8) & 0x1F];
rt.blend_op = BlendOp((blendcontrol >> 5) & 0x7);
@ -874,10 +895,17 @@ ID3D12PipelineState* PipelineCache::CreatePipelineState(
assert_always();
return nullptr;
}
state_desc.PS.pShaderBytecode =
description.pixel_shader->translated_binary().data();
state_desc.PS.BytecodeLength =
description.pixel_shader->translated_binary().size();
const auto& forced_early_z_shader =
description.pixel_shader->GetForcedEarlyZShaderObject();
if (description.force_early_z && forced_early_z_shader.size() != 0) {
state_desc.PS.pShaderBytecode = forced_early_z_shader.data();
state_desc.PS.BytecodeLength = forced_early_z_shader.size();
} else {
state_desc.PS.pShaderBytecode =
description.pixel_shader->translated_binary().data();
state_desc.PS.BytecodeLength =
description.pixel_shader->translated_binary().size();
}
} else if (edram_rov_used_) {
state_desc.PS.pShaderBytecode = depth_only_pixel_shader_.data();
state_desc.PS.BytecodeLength = depth_only_pixel_shader_.size();

View File

@ -155,6 +155,7 @@ class PipelineCache {
uint32_t depth_write : 1; // 21
uint32_t stencil_enable : 1; // 22
uint32_t stencil_read_mask : 8; // 30
uint32_t force_early_z : 1; // 31
uint32_t stencil_write_mask : 8; // 8
uint32_t stencil_front_fail_op : 3; // 11

View File

@ -89,6 +89,55 @@ DxbcShaderTranslator::DxbcShaderTranslator(uint32_t vendor_id,
}
DxbcShaderTranslator::~DxbcShaderTranslator() = default;
// Creates a copy of a DXBC shader container with the
// FORCE_EARLY_DEPTH_STENCIL flag set in the dcl_globalFlags token of its SHEX
// chunk, and recalculates the container checksum. If there is no SHEX chunk,
// or no dcl_globalFlags is found in it, the copy is returned unmodified.
//
// shader points to the beginning of the DXBC container. The size of the
// container is read from the container itself (dword 6), so no length is
// passed separately.
std::vector<uint8_t> DxbcShaderTranslator::ForceEarlyDepthStencil(
    const uint8_t* shader) {
  const uint32_t* old_shader = reinterpret_cast<const uint32_t*>(shader);

  // Copy the whole container up front, to return something anyway even if
  // patching fails.
  const uint32_t shader_size_bytes = old_shader[6];
  std::vector<uint8_t> new_shader(shader, shader + shader_size_bytes);

  // Find the SHEX chunk. Dword 7 is the chunk count, and it is followed by one
  // byte offset (from the start of the container) per chunk.
  const uint32_t chunk_count = old_shader[7];
  for (uint32_t i = 0; i < chunk_count; ++i) {
    const uint32_t chunk_offset_bytes = old_shader[8 + i];
    const uint32_t* chunk = old_shader + chunk_offset_bytes / sizeof(uint32_t);
    if (chunk[0] != 'XEHS') {
      continue;
    }

    // Find dcl_globalFlags and patch it. The instruction tokens are walked
    // starting 4 dwords into the chunk.
    // NOTE(review): chunk[3] is used as the number of dwords to walk - if the
    // stored length also counts the version and length tokens, this bound is 2
    // dwords too large. Harmless as long as dcl_globalFlags is always present;
    // confirm against the code that writes the chunk.
    const uint32_t code_size_dwords = chunk[3];
    const uint32_t* code = chunk + 4;
    for (uint32_t j = 0; j < code_size_dwords;) {
      uint32_t opcode_token = code[j];
      const uint32_t opcode = DECODE_D3D10_SB_OPCODE_TYPE(opcode_token);
      if (opcode == D3D10_SB_OPCODE_DCL_GLOBAL_FLAGS) {
        opcode_token |= D3D11_SB_GLOBAL_FLAG_FORCE_EARLY_DEPTH_STENCIL;
        std::memcpy(new_shader.data() +
                        (chunk_offset_bytes + (4 + j) * sizeof(uint32_t)),
                    &opcode_token, sizeof(uint32_t));
        // Recalculate the checksum since the shader was modified. The checksum
        // is written starting at the second dword of the container.
        CalculateDXBCChecksum(
            reinterpret_cast<unsigned char*>(new_shader.data()),
            shader_size_bytes,
            reinterpret_cast<unsigned int*>(new_shader.data() +
                                            sizeof(uint32_t)));
        break;
      }

      // Custom data blocks store their length in the second dword rather than
      // in the opcode token.
      uint32_t length_dwords;
      if (opcode == D3D10_SB_OPCODE_CUSTOMDATA) {
        length_dwords = code[j + 1];
      } else {
        length_dwords =
            DECODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(opcode_token);
      }
      if (length_dwords == 0) {
        // Malformed token stream - a zero length would never advance j and
        // would loop forever. Give up and return the unpatched copy.
        break;
      }
      j += length_dwords;
    }
    break;
  }

  // Returned by name (no std::move) so copy elision can apply.
  return new_shader;
}
std::vector<uint8_t> DxbcShaderTranslator::CreateDepthOnlyPixelShader() {
Reset();
is_depth_only_pixel_shader_ = true;
@ -4034,7 +4083,8 @@ void DxbcShaderTranslator::WriteShaderCode() {
}
// Don't allow refactoring when converting to native code to maintain position
// invariance (needed even in pixel shaders for oDepth invariance).
// invariance (needed even in pixel shaders for oDepth invariance). Also this
// dcl will be modified by ForceEarlyDepthStencil.
shader_object_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_GLOBAL_FLAGS) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1));

View File

@ -491,6 +491,10 @@ class DxbcShaderTranslator : public ShaderTranslator {
kEDRAM,
};
// Creates a copy of the shader with early depth/stencil testing forced,
// overriding the fact that alpha testing is used in the shader. The flag is
// set in the dcl_globalFlags token of the SHEX chunk and the DXBC checksum is
// recalculated. If no dcl_globalFlags is found, an unmodified copy is
// returned.
static std::vector<uint8_t> ForceEarlyDepthStencil(const uint8_t* shader);
// Returns the bits that need to be added to the RT flags constant - needs to
// be done externally, not in SetColorFormatConstants, because the flags
// contain other state.

View File

@ -607,6 +607,10 @@ class Shader {
// Returns true if the shader writes to the given color target index [0-3].
bool writes_color_target(int i) const { return writes_color_targets_[i]; }
// Returns true if the pixel shader can potentially have early depth/stencil
// testing enabled, provided alpha testing is disabled. The value is copied
// from the translator, which clears it for shaders that use kill instructions
// or write depth.
bool early_z_allowed() const { return early_z_allowed_; }
// True if the shader was translated and prepared without error.
bool is_valid() const { return is_valid_; }
@ -655,6 +659,7 @@ class Shader {
std::vector<TextureBinding> texture_bindings_;
ConstantRegisterMap constant_register_map_ = {0};
bool writes_color_targets_[4] = {false, false, false, false};
bool early_z_allowed_ = true;
std::vector<uint32_t> memexport_stream_constants_;
bool is_valid_ = false;

View File

@ -65,6 +65,7 @@ void ShaderTranslator::Reset() {
writes_color_targets_[i] = false;
}
writes_depth_ = false;
early_z_allowed_ = true;
memexport_alloc_count_ = 0;
memexport_eA_written_ = 0;
std::memset(&memexport_eM_written_, 0, sizeof(memexport_eM_written_));
@ -189,6 +190,7 @@ bool ShaderTranslator::TranslateInternal(Shader* shader) {
for (size_t i = 0; i < xe::countof(writes_color_targets_); ++i) {
shader->writes_color_targets_[i] = writes_color_targets_[i];
}
shader->early_z_allowed_ = early_z_allowed_;
shader->memexport_stream_constants_.clear();
for (uint32_t memexport_stream_constant : memexport_stream_constants_) {
shader->memexport_stream_constants_.push_back(memexport_stream_constant);
@ -288,6 +290,7 @@ void ShaderTranslator::GatherInstructionInformation(
if (op.has_vector_op()) {
const auto& opcode_info =
alu_vector_opcode_infos_[static_cast<int>(op.vector_opcode())];
early_z_allowed_ &= !opcode_info.disable_early_z;
for (size_t i = 0; i < opcode_info.argument_count; ++i) {
if (op.src_is_temp(i + 1) && (op.src_reg(i + 1) & 0x40)) {
uses_register_dynamic_addressing_ = true;
@ -299,6 +302,7 @@ void ShaderTranslator::GatherInstructionInformation(
writes_color_targets_[op.vector_dest()] = true;
} else if (op.vector_dest() == 61) {
writes_depth_ = true;
early_z_allowed_ = false;
}
}
if (memexport_alloc_count_ > 0 &&
@ -335,6 +339,7 @@ void ShaderTranslator::GatherInstructionInformation(
if (op.has_scalar_op()) {
const auto& opcode_info =
alu_scalar_opcode_infos_[static_cast<int>(op.scalar_opcode())];
early_z_allowed_ &= !opcode_info.disable_early_z;
if (opcode_info.argument_count == 1 && op.src_is_temp(3) &&
(op.src_reg(3) & 0x40)) {
uses_register_dynamic_addressing_ = true;
@ -345,6 +350,7 @@ void ShaderTranslator::GatherInstructionInformation(
writes_color_targets_[op.scalar_dest()] = true;
} else if (op.scalar_dest() == 61) {
writes_depth_ = true;
early_z_allowed_ = false;
}
}
if (memexport_alloc_count_ > 0 &&
@ -1030,91 +1036,91 @@ void ShaderTranslator::ParseTextureFetchInstruction(
const ShaderTranslator::AluOpcodeInfo
ShaderTranslator::alu_vector_opcode_infos_[0x20] = {
{"add", 2, 4}, // 0
{"mul", 2, 4}, // 1
{"max", 2, 4}, // 2
{"min", 2, 4}, // 3
{"seq", 2, 4}, // 4
{"sgt", 2, 4}, // 5
{"sge", 2, 4}, // 6
{"sne", 2, 4}, // 7
{"frc", 1, 4}, // 8
{"trunc", 1, 4}, // 9
{"floor", 1, 4}, // 10
{"mad", 3, 4}, // 11
{"cndeq", 3, 4}, // 12
{"cndge", 3, 4}, // 13
{"cndgt", 3, 4}, // 14
{"dp4", 2, 4}, // 15
{"dp3", 2, 4}, // 16
{"dp2add", 3, 4}, // 17
{"cube", 2, 4}, // 18
{"max4", 1, 4}, // 19
{"setp_eq_push", 2, 4}, // 20
{"setp_ne_push", 2, 4}, // 21
{"setp_gt_push", 2, 4}, // 22
{"setp_ge_push", 2, 4}, // 23
{"kill_eq", 2, 4}, // 24
{"kill_gt", 2, 4}, // 25
{"kill_ge", 2, 4}, // 26
{"kill_ne", 2, 4}, // 27
{"dst", 2, 4}, // 28
{"maxa", 2, 4}, // 29
{"add", 2, 4, false}, // 0
{"mul", 2, 4, false}, // 1
{"max", 2, 4, false}, // 2
{"min", 2, 4, false}, // 3
{"seq", 2, 4, false}, // 4
{"sgt", 2, 4, false}, // 5
{"sge", 2, 4, false}, // 6
{"sne", 2, 4, false}, // 7
{"frc", 1, 4, false}, // 8
{"trunc", 1, 4, false}, // 9
{"floor", 1, 4, false}, // 10
{"mad", 3, 4, false}, // 11
{"cndeq", 3, 4, false}, // 12
{"cndge", 3, 4, false}, // 13
{"cndgt", 3, 4, false}, // 14
{"dp4", 2, 4, false}, // 15
{"dp3", 2, 4, false}, // 16
{"dp2add", 3, 4, false}, // 17
{"cube", 2, 4, false}, // 18
{"max4", 1, 4, false}, // 19
{"setp_eq_push", 2, 4, false}, // 20
{"setp_ne_push", 2, 4, false}, // 21
{"setp_gt_push", 2, 4, false}, // 22
{"setp_ge_push", 2, 4, false}, // 23
{"kill_eq", 2, 4, true}, // 24
{"kill_gt", 2, 4, true}, // 25
{"kill_ge", 2, 4, true}, // 26
{"kill_ne", 2, 4, true}, // 27
{"dst", 2, 4, false}, // 28
{"maxa", 2, 4, false}, // 29
};
const ShaderTranslator::AluOpcodeInfo
ShaderTranslator::alu_scalar_opcode_infos_[0x40] = {
{"adds", 1, 2}, // 0
{"adds_prev", 1, 1}, // 1
{"muls", 1, 2}, // 2
{"muls_prev", 1, 1}, // 3
{"muls_prev2", 1, 2}, // 4
{"maxs", 1, 2}, // 5
{"mins", 1, 2}, // 6
{"seqs", 1, 1}, // 7
{"sgts", 1, 1}, // 8
{"sges", 1, 1}, // 9
{"snes", 1, 1}, // 10
{"frcs", 1, 1}, // 11
{"truncs", 1, 1}, // 12
{"floors", 1, 1}, // 13
{"exp", 1, 1}, // 14
{"logc", 1, 1}, // 15
{"log", 1, 1}, // 16
{"rcpc", 1, 1}, // 17
{"rcpf", 1, 1}, // 18
{"rcp", 1, 1}, // 19
{"rsqc", 1, 1}, // 20
{"rsqf", 1, 1}, // 21
{"rsq", 1, 1}, // 22
{"maxas", 1, 2}, // 23
{"maxasf", 1, 2}, // 24
{"subs", 1, 2}, // 25
{"subs_prev", 1, 1}, // 26
{"setp_eq", 1, 1}, // 27
{"setp_ne", 1, 1}, // 28
{"setp_gt", 1, 1}, // 29
{"setp_ge", 1, 1}, // 30
{"setp_inv", 1, 1}, // 31
{"setp_pop", 1, 1}, // 32
{"setp_clr", 1, 1}, // 33
{"setp_rstr", 1, 1}, // 34
{"kills_eq", 1, 1}, // 35
{"kills_gt", 1, 1}, // 36
{"kills_ge", 1, 1}, // 37
{"kills_ne", 1, 1}, // 38
{"kills_one", 1, 1}, // 39
{"sqrt", 1, 1}, // 40
{"UNKNOWN", 0, 0}, // 41
{"mulsc", 2, 1}, // 42
{"mulsc", 2, 1}, // 43
{"addsc", 2, 1}, // 44
{"addsc", 2, 1}, // 45
{"subsc", 2, 1}, // 46
{"subsc", 2, 1}, // 47
{"sin", 1, 1}, // 48
{"cos", 1, 1}, // 49
{"retain_prev", 1, 1}, // 50
{"adds", 1, 2, false}, // 0
{"adds_prev", 1, 1, false}, // 1
{"muls", 1, 2, false}, // 2
{"muls_prev", 1, 1, false}, // 3
{"muls_prev2", 1, 2, false}, // 4
{"maxs", 1, 2, false}, // 5
{"mins", 1, 2, false}, // 6
{"seqs", 1, 1, false}, // 7
{"sgts", 1, 1, false}, // 8
{"sges", 1, 1, false}, // 9
{"snes", 1, 1, false}, // 10
{"frcs", 1, 1, false}, // 11
{"truncs", 1, 1, false}, // 12
{"floors", 1, 1, false}, // 13
{"exp", 1, 1, false}, // 14
{"logc", 1, 1, false}, // 15
{"log", 1, 1, false}, // 16
{"rcpc", 1, 1, false}, // 17
{"rcpf", 1, 1, false}, // 18
{"rcp", 1, 1, false}, // 19
{"rsqc", 1, 1, false}, // 20
{"rsqf", 1, 1, false}, // 21
{"rsq", 1, 1, false}, // 22
{"maxas", 1, 2, false}, // 23
{"maxasf", 1, 2, false}, // 24
{"subs", 1, 2, false}, // 25
{"subs_prev", 1, 1, false}, // 26
{"setp_eq", 1, 1, false}, // 27
{"setp_ne", 1, 1, false}, // 28
{"setp_gt", 1, 1, false}, // 29
{"setp_ge", 1, 1, false}, // 30
{"setp_inv", 1, 1, false}, // 31
{"setp_pop", 1, 1, false}, // 32
{"setp_clr", 1, 1, false}, // 33
{"setp_rstr", 1, 1, false}, // 34
{"kills_eq", 1, 1, true}, // 35
{"kills_gt", 1, 1, true}, // 36
{"kills_ge", 1, 1, true}, // 37
{"kills_ne", 1, 1, true}, // 38
{"kills_one", 1, 1, true}, // 39
{"sqrt", 1, 1, false}, // 40
{"UNKNOWN", 0, 0, false}, // 41
{"mulsc", 2, 1, false}, // 42
{"mulsc", 2, 1, false}, // 43
{"addsc", 2, 1, false}, // 44
{"addsc", 2, 1, false}, // 45
{"subsc", 2, 1, false}, // 46
{"subsc", 2, 1, false}, // 47
{"sin", 1, 1, false}, // 48
{"cos", 1, 1, false}, // 49
{"retain_prev", 1, 1, false}, // 50
};
void ShaderTranslator::TranslateAluInstruction(const AluInstruction& op) {

View File

@ -58,6 +58,9 @@ class ShaderTranslator {
bool writes_color_target(int i) const { return writes_color_targets_[i]; }
// True if the current shader overrides the pixel depth.
bool writes_depth() const { return writes_depth_; }
// True if the pixel shader can potentially have early depth/stencil testing
// enabled, provided alpha testing is disabled. Cleared while gathering
// instruction information if the shader uses a kill instruction or writes
// depth.
bool early_z_allowed() const { return early_z_allowed_; }
// A list of all vertex bindings, populated before translation occurs.
const std::vector<Shader::VertexBinding>& vertex_bindings() const {
return vertex_bindings_;
@ -160,6 +163,7 @@ class ShaderTranslator {
const char* name;
size_t argument_count;
int src_swizzle_component_count;
bool disable_early_z;
};
bool TranslateInternal(Shader* shader);
@ -245,6 +249,7 @@ class ShaderTranslator {
bool uses_register_dynamic_addressing_ = false;
bool writes_color_targets_[4] = {false, false, false, false};
bool writes_depth_ = false;
bool early_z_allowed_ = true;
uint32_t memexport_alloc_count_ = 0;
// For register allocation in implementations - what was used after each