diff --git a/src/xenia/gpu/dxbc_shader_translator_alu.cc b/src/xenia/gpu/dxbc_shader_translator_alu.cc index 74faf6e13..5fef220b0 100644 --- a/src/xenia/gpu/dxbc_shader_translator_alu.cc +++ b/src/xenia/gpu/dxbc_shader_translator_alu.cc @@ -68,32 +68,34 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( break; case AluVectorOpcode::kMul: case AluVectorOpcode::kMad: { - bool is_mad = instr.vector_opcode == AluVectorOpcode::kMad; - if (is_mad) { - DxbcOpMAd(per_component_dest, operands[0], operands[1], operands[2]); - } else { - DxbcOpMul(per_component_dest, operands[0], operands[1]); - } - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. - uint32_t absolute_different = + // Not using DXBC mad to prevent fused multiply-add (mul followed by add + // may be optimized into non-fused mad by the driver in the identical + // operands case also). + DxbcOpMul(per_component_dest, operands[0], operands[1]); + uint32_t multiplicands_different = used_result_components & - ~instr.vector_operands[0].GetAbsoluteIdenticalComponents( + ~instr.vector_operands[0].GetIdenticalMultiplicandComponents( instr.vector_operands[1]); - if (absolute_different) { + if (multiplicands_different) { + // Shader Model 3: +-0 or denormal * anything = +0. uint32_t is_zero_temp = PushSystemTemp(); - DxbcOpMin(DxbcDest::R(is_zero_temp, absolute_different), + DxbcOpMin(DxbcDest::R(is_zero_temp, multiplicands_different), operands[0].Abs(), operands[1].Abs()); // min isn't required to flush denormals, eq is. - DxbcOpEq(DxbcDest::R(is_zero_temp, absolute_different), + DxbcOpEq(DxbcDest::R(is_zero_temp, multiplicands_different), DxbcSrc::R(is_zero_temp), DxbcSrc::LF(0.0f)); - DxbcOpMovC(DxbcDest::R(system_temp_result_, absolute_different), - DxbcSrc::R(is_zero_temp), - is_mad ? operands[2] : DxbcSrc::LF(0.0f), + // Not replacing true `0 + term` with movc of the term because +0 + -0 + // should result in +0, not -0. + DxbcOpMovC(DxbcDest::R(system_temp_result_, multiplicands_different), + DxbcSrc::R(is_zero_temp), DxbcSrc::LF(0.0f), DxbcSrc::R(system_temp_result_)); // Release is_zero_temp. PopSystemTemp(); } + if (instr.vector_opcode == AluVectorOpcode::kMad) { + DxbcOpAdd(per_component_dest, DxbcSrc::R(system_temp_result_), + operands[2]); + } } break; case AluVectorOpcode::kMax: @@ -179,69 +181,41 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( component_count = 4; } result_swizzle = DxbcSrc::kXXXX; - uint32_t absolute_different = + uint32_t multiplicands_different = uint32_t((1 << component_count) - 1) & - ~instr.vector_operands[0].GetAbsoluteIdenticalComponents( + ~instr.vector_operands[0].GetIdenticalMultiplicandComponents( instr.vector_operands[1]); - if (absolute_different) { - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. - // Add component products only if non-zero. For dp4, 16 scalar - // operations in the worst case (as opposed to always 20 for - // eq/movc/eq/movc/dp4 or min/eq/movc/movc/dp4 for preparing operands - // for dp4). - DxbcOpMul(DxbcDest::R(system_temp_result_, 0b0001), - operands[0].SelectFromSwizzled(0), - operands[1].SelectFromSwizzled(0)); - if (absolute_different & 0b0001) { - DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0010), - operands[0].SelectFromSwizzled(0).Abs(), - operands[1].SelectFromSwizzled(0).Abs()); - DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0010), - DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY), + for (uint32_t i = 0; i < component_count; ++i) { + DxbcOpMul(DxbcDest::R(system_temp_result_, i ? 0b0010 : 0b0001), + operands[0].SelectFromSwizzled(i), + operands[1].SelectFromSwizzled(i)); + if ((multiplicands_different & (1 << i)) != 0) { + // Shader Model 3: +-0 or denormal * anything = +0 (also not replacing + // true `0 + term` with movc of the term because +0 + -0 should result + // in +0, not -0). + DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100), + operands[0].SelectFromSwizzled(i).Abs(), + operands[1].SelectFromSwizzled(i).Abs()); + DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0100), + DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ), DxbcSrc::LF(0.0f)); - DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0001), - DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY), - DxbcSrc::LF(0.0f), - DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX)); - } - for (uint32_t i = 1; i < component_count; ++i) { - bool component_different = (absolute_different & (1 << i)) != 0; - DxbcOpMAd(DxbcDest::R(system_temp_result_, - component_different ? 0b0010 : 0b0001), - operands[0].SelectFromSwizzled(i), - operands[1].SelectFromSwizzled(i), - DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX)); - if (component_different) { - DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100), - operands[0].SelectFromSwizzled(i).Abs(), - operands[1].SelectFromSwizzled(i).Abs()); - DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0100), + DxbcOpMovC(DxbcDest::R(system_temp_result_, i ? 0b0010 : 0b0001), DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ), - DxbcSrc::LF(0.0f)); - DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0001), - DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ), - DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), - DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY)); - } + DxbcSrc::LF(0.0f), + DxbcSrc::R(system_temp_result_, + i ? DxbcSrc::kYYYY : DxbcSrc::kXXXX)); } - } else { - if (component_count == 2) { - DxbcOpDP2(DxbcDest::R(system_temp_result_, 0b0001), operands[0], - operands[1]); - } else if (component_count == 3) { - DxbcOpDP3(DxbcDest::R(system_temp_result_, 0b0001), operands[0], - operands[1]); - } else { - assert_true(component_count == 4); - DxbcOpDP4(DxbcDest::R(system_temp_result_, 0b0001), operands[0], - operands[1]); + if (i) { + // Not using DXBC dp# to avoid fused multiply-add, PC GPUs are scalar + // as of 2020 anyway, and not using mad for the same reason (mul + // followed by add may be optimized into non-fused mad by the driver + // in the identical operands case also). + DxbcOpAdd(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY)); } } if (component_count == 2) { - // Add the third operand. Since floating-point addition isn't - // associative, even though adding this in multiply-add for the first - // component would be faster, it's safer to add here, in the end. DxbcOpAdd(DxbcDest::R(system_temp_result_, 0b0001), DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), operands[2].SelectFromSwizzled(0)); @@ -592,14 +566,13 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( DxbcOpMov(DxbcDest::R(system_temp_result_, 0b0001), DxbcSrc::LF(1.0f)); } if (used_result_components & 0b0010) { - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. DxbcOpMul(DxbcDest::R(system_temp_result_, 0b0010), operands[0].SelectFromSwizzled(1), operands[1].SelectFromSwizzled(1)); - if (!(instr.vector_operands[0].GetAbsoluteIdenticalComponents( + if (!(instr.vector_operands[0].GetIdenticalMultiplicandComponents( instr.vector_operands[1]) & 0b0010)) { + // Shader Model 3: +-0 or denormal * anything = +0. DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100), operands[0].SelectFromSwizzled(1).Abs(), operands[1].SelectFromSwizzled(1).Abs()); @@ -700,8 +673,7 @@ void DxbcShaderTranslator::ProcessScalarAluOperation( DxbcOpMul(ps_dest, operand_0_a, operand_0_b); if (instr.scalar_operands[0].components[0] != instr.scalar_operands[0].components[1]) { - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. + // Shader Model 3: +-0 or denormal * anything = +0. uint32_t is_zero_temp = PushSystemTemp(); DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(), operand_0_b.Abs()); @@ -714,58 +686,50 @@ void DxbcShaderTranslator::ProcessScalarAluOperation( PopSystemTemp(); } break; - case AluScalarOpcode::kMulsPrev: { - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. - uint32_t is_zero_temp = PushSystemTemp(); - DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(), - ps_src.Abs()); - // min isn't required to flush denormals, eq is. - DxbcOpEq(DxbcDest::R(is_zero_temp, 0b0001), - DxbcSrc::R(is_zero_temp, DxbcSrc::kXXXX), DxbcSrc::LF(0.0f)); - DxbcOpMul(ps_dest, operand_0_a, ps_src); - DxbcOpMovC(ps_dest, DxbcSrc::R(is_zero_temp, DxbcSrc::kXXXX), - DxbcSrc::LF(0.0f), ps_src); - // Release is_zero_temp. - PopSystemTemp(); - } break; + case AluScalarOpcode::kMulsPrev: case AluScalarOpcode::kMulsPrev2: { uint32_t test_temp = PushSystemTemp(); - // Check if need to select the src0.a * ps case. - // ps != -FLT_MAX. - DxbcOpNE(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(-FLT_MAX)); - // isfinite(ps), or |ps| <= FLT_MAX, or -|ps| >= -FLT_MAX, since -FLT_MAX - // is already loaded to an SGPR, this is also false if it's NaN. - DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -ps_src.Abs(), - DxbcSrc::LF(-FLT_MAX)); - DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), - DxbcSrc::R(test_temp, DxbcSrc::kXXXX), - DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); - // isfinite(src0.b). - DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -operand_0_b.Abs(), - DxbcSrc::LF(-FLT_MAX)); - DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), - DxbcSrc::R(test_temp, DxbcSrc::kXXXX), - DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); - // src0.b > 0 (need !(src0.b <= 0), but src0.b has already been checked - // for NaN). - DxbcOpLT(DxbcDest::R(test_temp, 0b0010), DxbcSrc::LF(0.0f), operand_0_b); - DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), - DxbcSrc::R(test_temp, DxbcSrc::kXXXX), - DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); - DxbcOpIf(true, DxbcSrc::R(test_temp, DxbcSrc::kXXXX)); - // Shader Model 3: 0 or denormal * anything = 0. - // ps is already known to be not NaN or Infinity, so multiplying it by 0 - // will result in 0. However, src0.a can be anything, so the result should - // be zero if ps is zero. - // FIXME(Triang3l): Signed zero needs research and handling. - DxbcOpEq(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(0.0f)); + if (instr.scalar_opcode == AluScalarOpcode::kMulsPrev2) { + // Check if need to select the src0.a * ps case. + // ps != -FLT_MAX. + DxbcOpNE(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(-FLT_MAX)); + // isfinite(ps), or |ps| <= FLT_MAX, or -|ps| >= -FLT_MAX, since + // -FLT_MAX is already loaded to an SGPR, this is also false if it's + // NaN. + DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -ps_src.Abs(), + DxbcSrc::LF(-FLT_MAX)); + DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), + DxbcSrc::R(test_temp, DxbcSrc::kXXXX), + DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); + // isfinite(src0.b). + DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -operand_0_b.Abs(), + DxbcSrc::LF(-FLT_MAX)); + DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), + DxbcSrc::R(test_temp, DxbcSrc::kXXXX), + DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); + // src0.b > 0 (need !(src0.b <= 0), but src0.b has already been checked + // for NaN). + DxbcOpLT(DxbcDest::R(test_temp, 0b0010), DxbcSrc::LF(0.0f), + operand_0_b); + DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), + DxbcSrc::R(test_temp, DxbcSrc::kXXXX), + DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); + DxbcOpIf(true, DxbcSrc::R(test_temp, DxbcSrc::kXXXX)); + } + // Shader Model 3: +-0 or denormal * anything = +0. + DxbcOpMin(DxbcDest::R(test_temp, 0b0001), operand_0_a.Abs(), + ps_src.Abs()); + // min isn't required to flush denormals, eq is. + DxbcOpEq(DxbcDest::R(test_temp, 0b0001), + DxbcSrc::R(test_temp, DxbcSrc::kXXXX), DxbcSrc::LF(0.0f)); DxbcOpMul(ps_dest, operand_0_a, ps_src); DxbcOpMovC(ps_dest, DxbcSrc::R(test_temp, DxbcSrc::kXXXX), DxbcSrc::LF(0.0f), ps_src); - DxbcOpElse(); - DxbcOpMov(ps_dest, DxbcSrc::LF(-FLT_MAX)); - DxbcOpEndIf(); + if (instr.scalar_opcode == AluScalarOpcode::kMulsPrev2) { + DxbcOpElse(); + DxbcOpMov(ps_dest, DxbcSrc::LF(-FLT_MAX)); + DxbcOpEndIf(); + } // Release test_temp. PopSystemTemp(); } break; @@ -1023,11 +987,10 @@ void DxbcShaderTranslator::ProcessScalarAluOperation( case AluScalarOpcode::kMulsc0: case AluScalarOpcode::kMulsc1: DxbcOpMul(ps_dest, operand_0_a, operand_1); - if (!(instr.scalar_operands[0].GetAbsoluteIdenticalComponents( + if (!(instr.scalar_operands[0].GetIdenticalMultiplicandComponents( instr.scalar_operands[1]) & 0b0001)) { - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. + // Shader Model 3: +-0 or denormal * anything = +0. uint32_t is_zero_temp = PushSystemTemp(); DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(), operand_1.Abs()); diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index 2c25e682d..0f3220601 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -212,14 +212,19 @@ struct InstructionOperand { return false; } - // Returns which components of two operands are identical, but may have - // different signs (for simplicity of usage with GetComponent, treating the - // rightmost component as replicated). - uint32_t GetAbsoluteIdenticalComponents( + // Returns which components of two operands are identical, so that + // multiplication of them would result in pow2 with + sign, including in case + // they're zero (because -0 * |-0|, or -0 * +0, is -0), for providing a fast + // path in emulation of the Shader Model 3 +-0 * x = +0 multiplication + // behavior (disregarding component_count for simplicity of usage with + // GetComponent, treating the rightmost component as replicated). + uint32_t GetIdenticalMultiplicandComponents( const InstructionOperand& other) const { if (storage_source != other.storage_source || storage_index != other.storage_index || - storage_addressing_mode != other.storage_addressing_mode) { + storage_addressing_mode != other.storage_addressing_mode || + is_absolute_value != other.is_absolute_value || + (!is_absolute_value && is_negated != other.is_negated)) { return 0; } uint32_t identical_components = 0; @@ -229,15 +234,14 @@ struct InstructionOperand { } return identical_components; } - // Returns which components of two operands will always be bitwise equal, but - // may have different signs (disregarding component_count for simplicity of - // usage with GetComponent, treating the rightmost component as replicated). + // Returns which components of two operands will always be bitwise equal + // (disregarding component_count for simplicity of usage with GetComponent, + // treating the rightmost component as replicated). uint32_t GetIdenticalComponents(const InstructionOperand& other) const { - if (is_negated != other.is_negated || - is_absolute_value != other.is_absolute_value) { + if (is_negated != other.is_negated) { return 0; } - return GetAbsoluteIdenticalComponents(other); + return GetIdenticalMultiplicandComponents(other); } }; diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h index c0c035167..21ccbaff9 100644 --- a/src/xenia/gpu/ucode.h +++ b/src/xenia/gpu/ucode.h @@ -800,13 +800,26 @@ static_assert_size(TextureFetchInstruction, 12); // Both are valid only within the current ALU clause. They are not modified // when the instruction that would write them fails its predication check. // - Direct3D 9 rules (like in GCN v_*_legacy_f32 instructions) for -// multiplication (0 or denormal * anything = 0) wherever it's present (mul, -// mad, dp, etc.) and for NaN in min/max. It's very important to respect this -// rule for multiplication, as games often rely on it in vector normalization -// (rcp and mul), Infinity * 0 resulting in NaN breaks a lot of things in -// games - causes white screen in Halo 3, white specular on characters in GTA -// IV. -// TODO(Triang3l): Investigate signed zero handling in multiplication. +// multiplication (+-0 or denormal * anything = +0) wherever it's present +// (mul, mad, dp, etc.) and for NaN in min/max. It's very important to respect +// this rule for multiplication, as games often rely on it in vector +// normalization (rcp and mul), Infinity * 0 resulting in NaN breaks a lot of +// things in games - causes white screen in Halo 3, white specular on +// characters in GTA IV. The result is always positive zero in this case, no +// matter what the signs of the other operands are, according to R5xx +// Acceleration section 8.7.5 "Legacy multiply behavior" and testing on +// Adreno 200. This means that the following need to be taken into account +// (according to 8.7.2 "ALU Non-Transcendental Floating Point"): +// - +0 * -0 is -0 with IEEE conformance, however, with this legacy SM3 +// handling, it should result in +0. +// - +0 + -0 is +0, so multiply-add should not be replaced with conditional +// move of the third operand in case of zero multiplicands, because the term +// may be -0, while the result should be +0 in this case. +// http://developer.amd.com/wordpress/media/2013/10/R5xx_Acceleration_v1.5.pdf +// Multiply-add also appears to be not fused (the SM3 behavior instruction on +// GCN is called v_mad_legacy_f32, not v_fma_legacy_f32) - shader translators +// should not use instructions that may be interpreted by the host GPU as +// fused multiply-add. enum class AluScalarOpcode : uint32_t { // Floating-Point Add