[DXBC] Multiplication signed zero handling

This commit is contained in:
Triang3l 2020-10-30 22:14:38 +03:00
parent adebaba799
commit feb8258a5e
3 changed files with 124 additions and 144 deletions

View File

@ -68,32 +68,34 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
break; break;
case AluVectorOpcode::kMul: case AluVectorOpcode::kMul:
case AluVectorOpcode::kMad: { case AluVectorOpcode::kMad: {
bool is_mad = instr.vector_opcode == AluVectorOpcode::kMad; // Not using DXBC mad to prevent fused multiply-add (mul followed by add
if (is_mad) { // may be optimized into non-fused mad by the driver in the identical
DxbcOpMAd(per_component_dest, operands[0], operands[1], operands[2]); // operands case also).
} else {
DxbcOpMul(per_component_dest, operands[0], operands[1]); DxbcOpMul(per_component_dest, operands[0], operands[1]);
} uint32_t multiplicands_different =
// Shader Model 3: 0 or denormal * anything = 0.
// FIXME(Triang3l): Signed zero needs research and handling.
uint32_t absolute_different =
used_result_components & used_result_components &
~instr.vector_operands[0].GetAbsoluteIdenticalComponents( ~instr.vector_operands[0].GetIdenticalMultiplicandComponents(
instr.vector_operands[1]); instr.vector_operands[1]);
if (absolute_different) { if (multiplicands_different) {
// Shader Model 3: +-0 or denormal * anything = +0.
uint32_t is_zero_temp = PushSystemTemp(); uint32_t is_zero_temp = PushSystemTemp();
DxbcOpMin(DxbcDest::R(is_zero_temp, absolute_different), DxbcOpMin(DxbcDest::R(is_zero_temp, multiplicands_different),
operands[0].Abs(), operands[1].Abs()); operands[0].Abs(), operands[1].Abs());
// min isn't required to flush denormals, eq is. // min isn't required to flush denormals, eq is.
DxbcOpEq(DxbcDest::R(is_zero_temp, absolute_different), DxbcOpEq(DxbcDest::R(is_zero_temp, multiplicands_different),
DxbcSrc::R(is_zero_temp), DxbcSrc::LF(0.0f)); DxbcSrc::R(is_zero_temp), DxbcSrc::LF(0.0f));
DxbcOpMovC(DxbcDest::R(system_temp_result_, absolute_different), // Not replacing true `0 + term` with movc of the term because +0 + -0
DxbcSrc::R(is_zero_temp), // should result in +0, not -0.
is_mad ? operands[2] : DxbcSrc::LF(0.0f), DxbcOpMovC(DxbcDest::R(system_temp_result_, multiplicands_different),
DxbcSrc::R(is_zero_temp), DxbcSrc::LF(0.0f),
DxbcSrc::R(system_temp_result_)); DxbcSrc::R(system_temp_result_));
// Release is_zero_temp. // Release is_zero_temp.
PopSystemTemp(); PopSystemTemp();
} }
if (instr.vector_opcode == AluVectorOpcode::kMad) {
DxbcOpAdd(per_component_dest, DxbcSrc::R(system_temp_result_),
operands[2]);
}
} break; } break;
case AluVectorOpcode::kMax: case AluVectorOpcode::kMax:
@ -179,69 +181,41 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
component_count = 4; component_count = 4;
} }
result_swizzle = DxbcSrc::kXXXX; result_swizzle = DxbcSrc::kXXXX;
uint32_t absolute_different = uint32_t multiplicands_different =
uint32_t((1 << component_count) - 1) & uint32_t((1 << component_count) - 1) &
~instr.vector_operands[0].GetAbsoluteIdenticalComponents( ~instr.vector_operands[0].GetIdenticalMultiplicandComponents(
instr.vector_operands[1]); instr.vector_operands[1]);
if (absolute_different) { for (uint32_t i = 0; i < component_count; ++i) {
// Shader Model 3: 0 or denormal * anything = 0. DxbcOpMul(DxbcDest::R(system_temp_result_, i ? 0b0010 : 0b0001),
// FIXME(Triang3l): Signed zero needs research and handling.
// Add component products only if non-zero. For dp4, 16 scalar
// operations in the worst case (as opposed to always 20 for
// eq/movc/eq/movc/dp4 or min/eq/movc/movc/dp4 for preparing operands
// for dp4).
DxbcOpMul(DxbcDest::R(system_temp_result_, 0b0001),
operands[0].SelectFromSwizzled(0),
operands[1].SelectFromSwizzled(0));
if (absolute_different & 0b0001) {
DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0010),
operands[0].SelectFromSwizzled(0).Abs(),
operands[1].SelectFromSwizzled(0).Abs());
DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0010),
DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY),
DxbcSrc::LF(0.0f));
DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0001),
DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY),
DxbcSrc::LF(0.0f),
DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX));
}
for (uint32_t i = 1; i < component_count; ++i) {
bool component_different = (absolute_different & (1 << i)) != 0;
DxbcOpMAd(DxbcDest::R(system_temp_result_,
component_different ? 0b0010 : 0b0001),
operands[0].SelectFromSwizzled(i), operands[0].SelectFromSwizzled(i),
operands[1].SelectFromSwizzled(i), operands[1].SelectFromSwizzled(i));
DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX)); if ((multiplicands_different & (1 << i)) != 0) {
if (component_different) { // Shader Model 3: +-0 or denormal * anything = +0 (also not replacing
// true `0 + term` with movc of the term because +0 + -0 should result
// in +0, not -0).
DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100), DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100),
operands[0].SelectFromSwizzled(i).Abs(), operands[0].SelectFromSwizzled(i).Abs(),
operands[1].SelectFromSwizzled(i).Abs()); operands[1].SelectFromSwizzled(i).Abs());
DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0100), DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0100),
DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ), DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ),
DxbcSrc::LF(0.0f)); DxbcSrc::LF(0.0f));
DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0001), DxbcOpMovC(DxbcDest::R(system_temp_result_, i ? 0b0010 : 0b0001),
DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ), DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ),
DxbcSrc::LF(0.0f),
DxbcSrc::R(system_temp_result_,
i ? DxbcSrc::kYYYY : DxbcSrc::kXXXX));
}
if (i) {
// Not using DXBC dp# to avoid fused multiply-add, PC GPUs are scalar
// as of 2020 anyway, and not using mad for the same reason (mul
// followed by add may be optimized into non-fused mad by the driver
// in the identical operands case also).
DxbcOpAdd(DxbcDest::R(system_temp_result_, 0b0001),
DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX),
DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY)); DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY));
} }
} }
} else {
if (component_count == 2) { if (component_count == 2) {
DxbcOpDP2(DxbcDest::R(system_temp_result_, 0b0001), operands[0],
operands[1]);
} else if (component_count == 3) {
DxbcOpDP3(DxbcDest::R(system_temp_result_, 0b0001), operands[0],
operands[1]);
} else {
assert_true(component_count == 4);
DxbcOpDP4(DxbcDest::R(system_temp_result_, 0b0001), operands[0],
operands[1]);
}
}
if (component_count == 2) {
// Add the third operand. Since floating-point addition isn't
// associative, even though adding this in multiply-add for the first
// component would be faster, it's safer to add here, in the end.
DxbcOpAdd(DxbcDest::R(system_temp_result_, 0b0001), DxbcOpAdd(DxbcDest::R(system_temp_result_, 0b0001),
DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX),
operands[2].SelectFromSwizzled(0)); operands[2].SelectFromSwizzled(0));
@ -592,14 +566,13 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
DxbcOpMov(DxbcDest::R(system_temp_result_, 0b0001), DxbcSrc::LF(1.0f)); DxbcOpMov(DxbcDest::R(system_temp_result_, 0b0001), DxbcSrc::LF(1.0f));
} }
if (used_result_components & 0b0010) { if (used_result_components & 0b0010) {
// Shader Model 3: 0 or denormal * anything = 0.
// FIXME(Triang3l): Signed zero needs research and handling.
DxbcOpMul(DxbcDest::R(system_temp_result_, 0b0010), DxbcOpMul(DxbcDest::R(system_temp_result_, 0b0010),
operands[0].SelectFromSwizzled(1), operands[0].SelectFromSwizzled(1),
operands[1].SelectFromSwizzled(1)); operands[1].SelectFromSwizzled(1));
if (!(instr.vector_operands[0].GetAbsoluteIdenticalComponents( if (!(instr.vector_operands[0].GetIdenticalMultiplicandComponents(
instr.vector_operands[1]) & instr.vector_operands[1]) &
0b0010)) { 0b0010)) {
// Shader Model 3: +-0 or denormal * anything = +0.
DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100), DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100),
operands[0].SelectFromSwizzled(1).Abs(), operands[0].SelectFromSwizzled(1).Abs(),
operands[1].SelectFromSwizzled(1).Abs()); operands[1].SelectFromSwizzled(1).Abs());
@ -700,8 +673,7 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(
DxbcOpMul(ps_dest, operand_0_a, operand_0_b); DxbcOpMul(ps_dest, operand_0_a, operand_0_b);
if (instr.scalar_operands[0].components[0] != if (instr.scalar_operands[0].components[0] !=
instr.scalar_operands[0].components[1]) { instr.scalar_operands[0].components[1]) {
// Shader Model 3: 0 or denormal * anything = 0. // Shader Model 3: +-0 or denormal * anything = +0.
// FIXME(Triang3l): Signed zero needs research and handling.
uint32_t is_zero_temp = PushSystemTemp(); uint32_t is_zero_temp = PushSystemTemp();
DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(), DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(),
operand_0_b.Abs()); operand_0_b.Abs());
@ -714,28 +686,16 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(
PopSystemTemp(); PopSystemTemp();
} }
break; break;
case AluScalarOpcode::kMulsPrev: { case AluScalarOpcode::kMulsPrev:
// Shader Model 3: 0 or denormal * anything = 0.
// FIXME(Triang3l): Signed zero needs research and handling.
uint32_t is_zero_temp = PushSystemTemp();
DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(),
ps_src.Abs());
// min isn't required to flush denormals, eq is.
DxbcOpEq(DxbcDest::R(is_zero_temp, 0b0001),
DxbcSrc::R(is_zero_temp, DxbcSrc::kXXXX), DxbcSrc::LF(0.0f));
DxbcOpMul(ps_dest, operand_0_a, ps_src);
DxbcOpMovC(ps_dest, DxbcSrc::R(is_zero_temp, DxbcSrc::kXXXX),
DxbcSrc::LF(0.0f), ps_src);
// Release is_zero_temp.
PopSystemTemp();
} break;
case AluScalarOpcode::kMulsPrev2: { case AluScalarOpcode::kMulsPrev2: {
uint32_t test_temp = PushSystemTemp(); uint32_t test_temp = PushSystemTemp();
if (instr.scalar_opcode == AluScalarOpcode::kMulsPrev2) {
// Check if need to select the src0.a * ps case. // Check if need to select the src0.a * ps case.
// ps != -FLT_MAX. // ps != -FLT_MAX.
DxbcOpNE(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(-FLT_MAX)); DxbcOpNE(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(-FLT_MAX));
// isfinite(ps), or |ps| <= FLT_MAX, or -|ps| >= -FLT_MAX, since -FLT_MAX // isfinite(ps), or |ps| <= FLT_MAX, or -|ps| >= -FLT_MAX, since
// is already loaded to an SGPR, this is also false if it's NaN. // -FLT_MAX is already loaded to an SGPR, this is also false if it's
// NaN.
DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -ps_src.Abs(), DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -ps_src.Abs(),
DxbcSrc::LF(-FLT_MAX)); DxbcSrc::LF(-FLT_MAX));
DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), DxbcOpAnd(DxbcDest::R(test_temp, 0b0001),
@ -749,23 +709,27 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(
DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); DxbcSrc::R(test_temp, DxbcSrc::kYYYY));
// src0.b > 0 (need !(src0.b <= 0), but src0.b has already been checked // src0.b > 0 (need !(src0.b <= 0), but src0.b has already been checked
// for NaN). // for NaN).
DxbcOpLT(DxbcDest::R(test_temp, 0b0010), DxbcSrc::LF(0.0f), operand_0_b); DxbcOpLT(DxbcDest::R(test_temp, 0b0010), DxbcSrc::LF(0.0f),
operand_0_b);
DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), DxbcOpAnd(DxbcDest::R(test_temp, 0b0001),
DxbcSrc::R(test_temp, DxbcSrc::kXXXX), DxbcSrc::R(test_temp, DxbcSrc::kXXXX),
DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); DxbcSrc::R(test_temp, DxbcSrc::kYYYY));
DxbcOpIf(true, DxbcSrc::R(test_temp, DxbcSrc::kXXXX)); DxbcOpIf(true, DxbcSrc::R(test_temp, DxbcSrc::kXXXX));
// Shader Model 3: 0 or denormal * anything = 0. }
// ps is already known to be not NaN or Infinity, so multiplying it by 0 // Shader Model 3: +-0 or denormal * anything = +0.
// will result in 0. However, src0.a can be anything, so the result should DxbcOpMin(DxbcDest::R(test_temp, 0b0001), operand_0_a.Abs(),
// be zero if ps is zero. ps_src.Abs());
// FIXME(Triang3l): Signed zero needs research and handling. // min isn't required to flush denormals, eq is.
DxbcOpEq(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(0.0f)); DxbcOpEq(DxbcDest::R(test_temp, 0b0001),
DxbcSrc::R(test_temp, DxbcSrc::kXXXX), DxbcSrc::LF(0.0f));
DxbcOpMul(ps_dest, operand_0_a, ps_src); DxbcOpMul(ps_dest, operand_0_a, ps_src);
DxbcOpMovC(ps_dest, DxbcSrc::R(test_temp, DxbcSrc::kXXXX), DxbcOpMovC(ps_dest, DxbcSrc::R(test_temp, DxbcSrc::kXXXX),
DxbcSrc::LF(0.0f), ps_src); DxbcSrc::LF(0.0f), ps_src);
if (instr.scalar_opcode == AluScalarOpcode::kMulsPrev2) {
DxbcOpElse(); DxbcOpElse();
DxbcOpMov(ps_dest, DxbcSrc::LF(-FLT_MAX)); DxbcOpMov(ps_dest, DxbcSrc::LF(-FLT_MAX));
DxbcOpEndIf(); DxbcOpEndIf();
}
// Release test_temp. // Release test_temp.
PopSystemTemp(); PopSystemTemp();
} break; } break;
@ -1023,11 +987,10 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(
case AluScalarOpcode::kMulsc0: case AluScalarOpcode::kMulsc0:
case AluScalarOpcode::kMulsc1: case AluScalarOpcode::kMulsc1:
DxbcOpMul(ps_dest, operand_0_a, operand_1); DxbcOpMul(ps_dest, operand_0_a, operand_1);
if (!(instr.scalar_operands[0].GetAbsoluteIdenticalComponents( if (!(instr.scalar_operands[0].GetIdenticalMultiplicandComponents(
instr.scalar_operands[1]) & instr.scalar_operands[1]) &
0b0001)) { 0b0001)) {
// Shader Model 3: 0 or denormal * anything = 0. // Shader Model 3: +-0 or denormal * anything = +0.
// FIXME(Triang3l): Signed zero needs research and handling.
uint32_t is_zero_temp = PushSystemTemp(); uint32_t is_zero_temp = PushSystemTemp();
DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(), DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(),
operand_1.Abs()); operand_1.Abs());

View File

@ -212,14 +212,19 @@ struct InstructionOperand {
return false; return false;
} }
// Returns which components of two operands are identical, but may have // Returns which components of two operands are identical, so that
// different signs (for simplicity of usage with GetComponent, treating the // multiplication of them would result in pow2 with + sign, including in case
// rightmost component as replicated). // they're zero (because -0 * |-0|, or -0 * +0, is -0), for providing a fast
uint32_t GetAbsoluteIdenticalComponents( // path in emulation of the Shader Model 3 +-0 * x = +0 multiplication
// behavior (disregarding component_count for simplicity of usage with
// GetComponent, treating the rightmost component as replicated).
uint32_t GetIdenticalMultiplicandComponents(
const InstructionOperand& other) const { const InstructionOperand& other) const {
if (storage_source != other.storage_source || if (storage_source != other.storage_source ||
storage_index != other.storage_index || storage_index != other.storage_index ||
storage_addressing_mode != other.storage_addressing_mode) { storage_addressing_mode != other.storage_addressing_mode ||
is_absolute_value != other.is_absolute_value ||
(!is_absolute_value && is_negated != other.is_negated)) {
return 0; return 0;
} }
uint32_t identical_components = 0; uint32_t identical_components = 0;
@ -229,15 +234,14 @@ struct InstructionOperand {
} }
return identical_components; return identical_components;
} }
// Returns which components of two operands will always be bitwise equal, but // Returns which components of two operands will always be bitwise equal
// may have different signs (disregarding component_count for simplicity of // (disregarding component_count for simplicity of usage with GetComponent,
// usage with GetComponent, treating the rightmost component as replicated). // treating the rightmost component as replicated).
uint32_t GetIdenticalComponents(const InstructionOperand& other) const { uint32_t GetIdenticalComponents(const InstructionOperand& other) const {
if (is_negated != other.is_negated || if (is_negated != other.is_negated) {
is_absolute_value != other.is_absolute_value) {
return 0; return 0;
} }
return GetAbsoluteIdenticalComponents(other); return GetIdenticalMultiplicandComponents(other);
} }
}; };

View File

@ -800,13 +800,26 @@ static_assert_size(TextureFetchInstruction, 12);
// Both are valid only within the current ALU clause. They are not modified // Both are valid only within the current ALU clause. They are not modified
// when the instruction that would write them fails its predication check. // when the instruction that would write them fails its predication check.
// - Direct3D 9 rules (like in GCN v_*_legacy_f32 instructions) for // - Direct3D 9 rules (like in GCN v_*_legacy_f32 instructions) for
// multiplication (0 or denormal * anything = 0) wherever it's present (mul, // multiplication (+-0 or denormal * anything = +0) wherever it's present
// mad, dp, etc.) and for NaN in min/max. It's very important to respect this // (mul, mad, dp, etc.) and for NaN in min/max. It's very important to respect
// rule for multiplication, as games often rely on it in vector normalization // this rule for multiplication, as games often rely on it in vector
// (rcp and mul), Infinity * 0 resulting in NaN breaks a lot of things in // normalization (rcp and mul), Infinity * 0 resulting in NaN breaks a lot of
// games - causes white screen in Halo 3, white specular on characters in GTA // things in games - causes white screen in Halo 3, white specular on
// IV. // characters in GTA IV. The result is always positive zero in this case, no
// TODO(Triang3l): Investigate signed zero handling in multiplication. // matter what the signs of the other operands are, according to R5xx
// Acceleration section 8.7.5 "Legacy multiply behavior" and testing on
// Adreno 200. This means that the following need to be taken into account
// (according to 8.7.2 "ALU Non-Transcendental Floating Point"):
// - +0 * -0 is -0 with IEEE conformance, however, with this legacy SM3
// handling, it should result in +0.
// - +0 + -0 is +0, so multiply-add should not be replaced with conditional
// move of the third operand in case of zero multiplicands, because the term
// may be -0, while the result should be +0 in this case.
// http://developer.amd.com/wordpress/media/2013/10/R5xx_Acceleration_v1.5.pdf
// Multiply-add also appears to be not fused (the SM3 behavior instruction on
// GCN is called v_mad_legacy_f32, not v_fma_legacy_f32) - shader translators
// should not use instructions that may be interpreted by the host GPU as
// fused multiply-add.
enum class AluScalarOpcode : uint32_t { enum class AluScalarOpcode : uint32_t {
// Floating-Point Add // Floating-Point Add