From adebaba7997cc7534dfc31cac83b4a619448f79a Mon Sep 17 00:00:00 2001 From: Joel Linn Date: Wed, 28 Oct 2020 23:18:37 +0100 Subject: [PATCH 1/3] Allow building without git. --- tools/build/premake | 51 ++++---------------- xenia-build | 111 +++++++++++++++++++++++--------------------- 2 files changed, 68 insertions(+), 94 deletions(-) diff --git a/tools/build/premake b/tools/build/premake index 14e3d5ebc..9113958a5 100644 --- a/tools/build/premake +++ b/tools/build/premake @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python3 # Copyright 2015 Ben Vanik. All Rights Reserved. @@ -107,13 +107,14 @@ def has_bin(bin): return None -def shell_call(command, throw_on_error=True, stdout_path=None): +def shell_call(command, throw_on_error=True, stdout_path=None, stderr_path=None, shell=False): """Executes a shell command. Args: command: Command to execute, as a list of parameters. throw_on_error: Whether to throw an error or return the status code. stdout_path: File path to write stdout output to. + stderr_path: File path to write stderr output to. Returns: If throw_on_error is False the status code of the call will be returned. @@ -121,17 +122,22 @@ def shell_call(command, throw_on_error=True, stdout_path=None): stdout_file = None if stdout_path: stdout_file = open(stdout_path, 'w') + stderr_file = None + if stderr_path: + stderr_file = open(stderr_path, 'w') result = 0 try: if throw_on_error: result = 1 - subprocess.check_call(command, shell=False, stdout=stdout_file) + subprocess.check_call(command, shell=shell, stdout=stdout_file, stderr=stderr_file) result = 0 else: - result = subprocess.call(command, shell=False, stdout=stdout_file) + result = subprocess.call(command, shell=shell, stdout=stdout_file, stderr=stderr_file) finally: if stdout_file: stdout_file.close() + if stderr_file: + stderr_file.close() return result @@ -196,42 +202,5 @@ def import_subprocess_environment(args): os.environ[var.upper()] = setting break -def git_submodule_update(): - """Runs a full recursive git submodule init and update. - - Older versions of git do not support 'update --init --recursive'. We could - check and run it on versions that do support it and speed things up a bit. - """ - if True: - shell_call([ - 'git', - 'submodule', - 'update', - '--init', - '--recursive', - ]) - else: - shell_call([ - 'git', - 'submodule', - 'init', - ]) - shell_call([ - 'git', - 'submodule', - 'foreach', - '--recursive', - 'git', - 'submodule', - 'init', - ]) - shell_call([ - 'git', - 'submodule', - 'update', - '--recursive', - ]) - - if __name__ == '__main__': main() diff --git a/xenia-build b/xenia-build index 0fafa738d..081f36481 100755 --- a/xenia-build +++ b/xenia-build @@ -34,8 +34,11 @@ def main(): # Check git exists. if not has_bin('git'): - print('ERROR: git must be installed and on PATH.') - sys.exit(1) + print('WARNING: Git should be installed and on PATH. Version info will be omitted from all binaries!') + print('') + elif not git_is_repository(): + print('WARNING: The source tree is unversioned. Version info will be omitted from all binaries!') + print('') # Check python version. if not sys.version_info[:2] >= (3, 6): @@ -185,13 +188,14 @@ def get_bin(binary): return None -def shell_call(command, throw_on_error=True, stdout_path=None, shell=False): +def shell_call(command, throw_on_error=True, stdout_path=None, stderr_path=None, shell=False): """Executes a shell command. Args: command: Command to execute, as a list of parameters. throw_on_error: Whether to throw an error or return the status code. stdout_path: File path to write stdout output to. + stderr_path: File path to write stderr output to. Returns: If throw_on_error is False the status code of the call will be returned. @@ -199,21 +203,49 @@ def shell_call(command, throw_on_error=True, stdout_path=None, shell=False): stdout_file = None if stdout_path: stdout_file = open(stdout_path, 'w') + stderr_file = None + if stderr_path: + stderr_file = open(stderr_path, 'w') result = 0 try: if throw_on_error: result = 1 - subprocess.check_call(command, shell=shell, stdout=stdout_file) + subprocess.check_call(command, shell=shell, stdout=stdout_file, stderr=stderr_file) result = 0 else: - result = subprocess.call(command, shell=shell, stdout=stdout_file) + result = subprocess.call(command, shell=shell, stdout=stdout_file, stderr=stderr_file) finally: if stdout_file: stdout_file.close() + if stderr_file: + stderr_file.close() return result -def get_git_head_info(): +def generate_version_h(): + """Generates a build/version.h file that contains current git info. + """ + if git_is_repository(): + (branch_name, commit, commit_short) = git_get_head_info() + else: + branch_name = 'tarball' + commit = ':(-dont-do-this' + commit_short = ':(' + + contents = '''// Autogenerated by `xb premake`. + #ifndef GENERATED_VERSION_H_ + #define GENERATED_VERSION_H_ + #define XE_BUILD_BRANCH "%s" + #define XE_BUILD_COMMIT "%s" + #define XE_BUILD_COMMIT_SHORT "%s" + #define XE_BUILD_DATE __DATE__ + #endif // GENERATED_VERSION_H_ + ''' % (branch_name, commit, commit_short) + with open('build/version.h', 'w') as f: + f.write(contents) + + +def git_get_head_info(): """Queries the current branch and commit checksum from git. Returns: @@ -247,58 +279,28 @@ def get_git_head_info(): return branch_name, commit, commit_short -def generate_version_h(): - """Generates a build/version.h file that contains current git info. +def git_is_repository(): + """Checks if git is available and this source tree is versioned. """ - (branch_name, commit, commit_short) = get_git_head_info() - contents = '''// Autogenerated by `xb premake`. - #ifndef GENERATED_VERSION_H_ - #define GENERATED_VERSION_H_ - #define XE_BUILD_BRANCH "%s" - #define XE_BUILD_COMMIT "%s" - #define XE_BUILD_COMMIT_SHORT "%s" - #define XE_BUILD_DATE __DATE__ - #endif // GENERATED_VERSION_H_ - ''' % (branch_name, commit, commit_short) - with open('build/version.h', 'w') as f: - f.write(contents) + if not has_bin('git'): + return False + return shell_call([ + 'git', + 'rev-parse', + '--is-inside-work-tree', + ], throw_on_error=False, stdout_path=os.devnull, stderr_path=os.devnull) == 0 def git_submodule_update(): """Runs a full recursive git submodule init and update. - - Older versions of git do not support 'update --init --recursive'. We could - check and run it on versions that do support it and speed things up a bit. """ - if True: - shell_call([ - 'git', - 'submodule', - 'update', - '--init', - '--recursive', - ]) - else: - shell_call([ - 'git', - 'submodule', - 'init', - ]) - shell_call([ - 'git', - 'submodule', - 'foreach', - '--recursive', - 'git', - 'submodule', - 'init', - ]) - shell_call([ - 'git', - 'submodule', - 'update', - '--recursive', - ]) + shell_call([ + 'git', + 'submodule', + 'update', + '--init', + '--recursive', + ]) def get_clang_format_binary(): @@ -491,7 +493,10 @@ class SetupCommand(Command): # Setup submodules. print('- git submodule init / update...') - git_submodule_update() + if git_is_repository(): + git_submodule_update() + else: + print('WARNING: Git not available or not a repository. Dependencies may be missing.') print('') print('- running premake...') From feb8258a5e9b6481d8181af95016881f4b559d9a Mon Sep 17 00:00:00 2001 From: Triang3l Date: Fri, 30 Oct 2020 22:14:38 +0300 Subject: [PATCH 2/3] [DXBC] Multiplication signed zero handling --- src/xenia/gpu/dxbc_shader_translator_alu.cc | 215 ++++++++------------ src/xenia/gpu/shader.h | 26 ++- src/xenia/gpu/ucode.h | 27 ++- 3 files changed, 124 insertions(+), 144 deletions(-) diff --git a/src/xenia/gpu/dxbc_shader_translator_alu.cc b/src/xenia/gpu/dxbc_shader_translator_alu.cc index 74faf6e13..5fef220b0 100644 --- a/src/xenia/gpu/dxbc_shader_translator_alu.cc +++ b/src/xenia/gpu/dxbc_shader_translator_alu.cc @@ -68,32 +68,34 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( break; case AluVectorOpcode::kMul: case AluVectorOpcode::kMad: { - bool is_mad = instr.vector_opcode == AluVectorOpcode::kMad; - if (is_mad) { - DxbcOpMAd(per_component_dest, operands[0], operands[1], operands[2]); - } else { - DxbcOpMul(per_component_dest, operands[0], operands[1]); - } - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. - uint32_t absolute_different = + // Not using DXBC mad to prevent fused multiply-add (mul followed by add + // may be optimized into non-fused mad by the driver in the identical + // operands case also). + DxbcOpMul(per_component_dest, operands[0], operands[1]); + uint32_t multiplicands_different = used_result_components & - ~instr.vector_operands[0].GetAbsoluteIdenticalComponents( + ~instr.vector_operands[0].GetIdenticalMultiplicandComponents( instr.vector_operands[1]); - if (absolute_different) { + if (multiplicands_different) { + // Shader Model 3: +-0 or denormal * anything = +0. uint32_t is_zero_temp = PushSystemTemp(); - DxbcOpMin(DxbcDest::R(is_zero_temp, absolute_different), + DxbcOpMin(DxbcDest::R(is_zero_temp, multiplicands_different), operands[0].Abs(), operands[1].Abs()); // min isn't required to flush denormals, eq is. - DxbcOpEq(DxbcDest::R(is_zero_temp, absolute_different), + DxbcOpEq(DxbcDest::R(is_zero_temp, multiplicands_different), DxbcSrc::R(is_zero_temp), DxbcSrc::LF(0.0f)); - DxbcOpMovC(DxbcDest::R(system_temp_result_, absolute_different), - DxbcSrc::R(is_zero_temp), - is_mad ? operands[2] : DxbcSrc::LF(0.0f), + // Not replacing true `0 + term` with movc of the term because +0 + -0 + // should result in +0, not -0. + DxbcOpMovC(DxbcDest::R(system_temp_result_, multiplicands_different), + DxbcSrc::R(is_zero_temp), DxbcSrc::LF(0.0f), DxbcSrc::R(system_temp_result_)); // Release is_zero_temp. PopSystemTemp(); } + if (instr.vector_opcode == AluVectorOpcode::kMad) { + DxbcOpAdd(per_component_dest, DxbcSrc::R(system_temp_result_), + operands[2]); + } } break; case AluVectorOpcode::kMax: @@ -179,69 +181,41 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( component_count = 4; } result_swizzle = DxbcSrc::kXXXX; - uint32_t absolute_different = + uint32_t multiplicands_different = uint32_t((1 << component_count) - 1) & - ~instr.vector_operands[0].GetAbsoluteIdenticalComponents( + ~instr.vector_operands[0].GetIdenticalMultiplicandComponents( instr.vector_operands[1]); - if (absolute_different) { - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. - // Add component products only if non-zero. For dp4, 16 scalar - // operations in the worst case (as opposed to always 20 for - // eq/movc/eq/movc/dp4 or min/eq/movc/movc/dp4 for preparing operands - // for dp4). - DxbcOpMul(DxbcDest::R(system_temp_result_, 0b0001), - operands[0].SelectFromSwizzled(0), - operands[1].SelectFromSwizzled(0)); - if (absolute_different & 0b0001) { - DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0010), - operands[0].SelectFromSwizzled(0).Abs(), - operands[1].SelectFromSwizzled(0).Abs()); - DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0010), - DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY), + for (uint32_t i = 0; i < component_count; ++i) { + DxbcOpMul(DxbcDest::R(system_temp_result_, i ? 0b0010 : 0b0001), + operands[0].SelectFromSwizzled(i), + operands[1].SelectFromSwizzled(i)); + if ((multiplicands_different & (1 << i)) != 0) { + // Shader Model 3: +-0 or denormal * anything = +0 (also not replacing + // true `0 + term` with movc of the term because +0 + -0 should result + // in +0, not -0). + DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100), + operands[0].SelectFromSwizzled(i).Abs(), + operands[1].SelectFromSwizzled(i).Abs()); + DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0100), + DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ), DxbcSrc::LF(0.0f)); - DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0001), - DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY), - DxbcSrc::LF(0.0f), - DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX)); - } - for (uint32_t i = 1; i < component_count; ++i) { - bool component_different = (absolute_different & (1 << i)) != 0; - DxbcOpMAd(DxbcDest::R(system_temp_result_, - component_different ? 0b0010 : 0b0001), - operands[0].SelectFromSwizzled(i), - operands[1].SelectFromSwizzled(i), - DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX)); - if (component_different) { - DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100), - operands[0].SelectFromSwizzled(i).Abs(), - operands[1].SelectFromSwizzled(i).Abs()); - DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0100), + DxbcOpMovC(DxbcDest::R(system_temp_result_, i ? 0b0010 : 0b0001), DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ), - DxbcSrc::LF(0.0f)); - DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0001), - DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ), - DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), - DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY)); - } + DxbcSrc::LF(0.0f), + DxbcSrc::R(system_temp_result_, + i ? DxbcSrc::kYYYY : DxbcSrc::kXXXX)); } - } else { - if (component_count == 2) { - DxbcOpDP2(DxbcDest::R(system_temp_result_, 0b0001), operands[0], - operands[1]); - } else if (component_count == 3) { - DxbcOpDP3(DxbcDest::R(system_temp_result_, 0b0001), operands[0], - operands[1]); - } else { - assert_true(component_count == 4); - DxbcOpDP4(DxbcDest::R(system_temp_result_, 0b0001), operands[0], - operands[1]); + if (i) { + // Not using DXBC dp# to avoid fused multiply-add, PC GPUs are scalar + // as of 2020 anyway, and not using mad for the same reason (mul + // followed by add may be optimized into non-fused mad by the driver + // in the identical operands case also). + DxbcOpAdd(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY)); } } if (component_count == 2) { - // Add the third operand. Since floating-point addition isn't - // associative, even though adding this in multiply-add for the first - // component would be faster, it's safer to add here, in the end. DxbcOpAdd(DxbcDest::R(system_temp_result_, 0b0001), DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), operands[2].SelectFromSwizzled(0)); @@ -592,14 +566,13 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( DxbcOpMov(DxbcDest::R(system_temp_result_, 0b0001), DxbcSrc::LF(1.0f)); } if (used_result_components & 0b0010) { - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. DxbcOpMul(DxbcDest::R(system_temp_result_, 0b0010), operands[0].SelectFromSwizzled(1), operands[1].SelectFromSwizzled(1)); - if (!(instr.vector_operands[0].GetAbsoluteIdenticalComponents( + if (!(instr.vector_operands[0].GetIdenticalMultiplicandComponents( instr.vector_operands[1]) & 0b0010)) { + // Shader Model 3: +-0 or denormal * anything = +0. DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100), operands[0].SelectFromSwizzled(1).Abs(), operands[1].SelectFromSwizzled(1).Abs()); @@ -700,8 +673,7 @@ void DxbcShaderTranslator::ProcessScalarAluOperation( DxbcOpMul(ps_dest, operand_0_a, operand_0_b); if (instr.scalar_operands[0].components[0] != instr.scalar_operands[0].components[1]) { - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. + // Shader Model 3: +-0 or denormal * anything = +0. uint32_t is_zero_temp = PushSystemTemp(); DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(), operand_0_b.Abs()); @@ -714,58 +686,50 @@ void DxbcShaderTranslator::ProcessScalarAluOperation( PopSystemTemp(); } break; - case AluScalarOpcode::kMulsPrev: { - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. - uint32_t is_zero_temp = PushSystemTemp(); - DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(), - ps_src.Abs()); - // min isn't required to flush denormals, eq is. - DxbcOpEq(DxbcDest::R(is_zero_temp, 0b0001), - DxbcSrc::R(is_zero_temp, DxbcSrc::kXXXX), DxbcSrc::LF(0.0f)); - DxbcOpMul(ps_dest, operand_0_a, ps_src); - DxbcOpMovC(ps_dest, DxbcSrc::R(is_zero_temp, DxbcSrc::kXXXX), - DxbcSrc::LF(0.0f), ps_src); - // Release is_zero_temp. - PopSystemTemp(); - } break; + case AluScalarOpcode::kMulsPrev: case AluScalarOpcode::kMulsPrev2: { uint32_t test_temp = PushSystemTemp(); - // Check if need to select the src0.a * ps case. - // ps != -FLT_MAX. - DxbcOpNE(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(-FLT_MAX)); - // isfinite(ps), or |ps| <= FLT_MAX, or -|ps| >= -FLT_MAX, since -FLT_MAX - // is already loaded to an SGPR, this is also false if it's NaN. - DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -ps_src.Abs(), - DxbcSrc::LF(-FLT_MAX)); - DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), - DxbcSrc::R(test_temp, DxbcSrc::kXXXX), - DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); - // isfinite(src0.b). - DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -operand_0_b.Abs(), - DxbcSrc::LF(-FLT_MAX)); - DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), - DxbcSrc::R(test_temp, DxbcSrc::kXXXX), - DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); - // src0.b > 0 (need !(src0.b <= 0), but src0.b has already been checked - // for NaN). - DxbcOpLT(DxbcDest::R(test_temp, 0b0010), DxbcSrc::LF(0.0f), operand_0_b); - DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), - DxbcSrc::R(test_temp, DxbcSrc::kXXXX), - DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); - DxbcOpIf(true, DxbcSrc::R(test_temp, DxbcSrc::kXXXX)); - // Shader Model 3: 0 or denormal * anything = 0. - // ps is already known to be not NaN or Infinity, so multiplying it by 0 - // will result in 0. However, src0.a can be anything, so the result should - // be zero if ps is zero. - // FIXME(Triang3l): Signed zero needs research and handling. - DxbcOpEq(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(0.0f)); + if (instr.scalar_opcode == AluScalarOpcode::kMulsPrev2) { + // Check if need to select the src0.a * ps case. + // ps != -FLT_MAX. + DxbcOpNE(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(-FLT_MAX)); + // isfinite(ps), or |ps| <= FLT_MAX, or -|ps| >= -FLT_MAX, since + // -FLT_MAX is already loaded to an SGPR, this is also false if it's + // NaN. + DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -ps_src.Abs(), + DxbcSrc::LF(-FLT_MAX)); + DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), + DxbcSrc::R(test_temp, DxbcSrc::kXXXX), + DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); + // isfinite(src0.b). + DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -operand_0_b.Abs(), + DxbcSrc::LF(-FLT_MAX)); + DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), + DxbcSrc::R(test_temp, DxbcSrc::kXXXX), + DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); + // src0.b > 0 (need !(src0.b <= 0), but src0.b has already been checked + // for NaN). + DxbcOpLT(DxbcDest::R(test_temp, 0b0010), DxbcSrc::LF(0.0f), + operand_0_b); + DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), + DxbcSrc::R(test_temp, DxbcSrc::kXXXX), + DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); + DxbcOpIf(true, DxbcSrc::R(test_temp, DxbcSrc::kXXXX)); + } + // Shader Model 3: +-0 or denormal * anything = +0. + DxbcOpMin(DxbcDest::R(test_temp, 0b0001), operand_0_a.Abs(), + ps_src.Abs()); + // min isn't required to flush denormals, eq is. + DxbcOpEq(DxbcDest::R(test_temp, 0b0001), + DxbcSrc::R(test_temp, DxbcSrc::kXXXX), DxbcSrc::LF(0.0f)); DxbcOpMul(ps_dest, operand_0_a, ps_src); DxbcOpMovC(ps_dest, DxbcSrc::R(test_temp, DxbcSrc::kXXXX), DxbcSrc::LF(0.0f), ps_src); - DxbcOpElse(); - DxbcOpMov(ps_dest, DxbcSrc::LF(-FLT_MAX)); - DxbcOpEndIf(); + if (instr.scalar_opcode == AluScalarOpcode::kMulsPrev2) { + DxbcOpElse(); + DxbcOpMov(ps_dest, DxbcSrc::LF(-FLT_MAX)); + DxbcOpEndIf(); + } // Release test_temp. PopSystemTemp(); } break; @@ -1023,11 +987,10 @@ void DxbcShaderTranslator::ProcessScalarAluOperation( case AluScalarOpcode::kMulsc0: case AluScalarOpcode::kMulsc1: DxbcOpMul(ps_dest, operand_0_a, operand_1); - if (!(instr.scalar_operands[0].GetAbsoluteIdenticalComponents( + if (!(instr.scalar_operands[0].GetIdenticalMultiplicandComponents( instr.scalar_operands[1]) & 0b0001)) { - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. + // Shader Model 3: +-0 or denormal * anything = +0. uint32_t is_zero_temp = PushSystemTemp(); DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(), operand_1.Abs()); diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index 2c25e682d..0f3220601 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -212,14 +212,19 @@ struct InstructionOperand { return false; } - // Returns which components of two operands are identical, but may have - // different signs (for simplicity of usage with GetComponent, treating the - // rightmost component as replicated). - uint32_t GetAbsoluteIdenticalComponents( + // Returns which components of two operands are identical, so that + // multiplication of them would result in pow2 with + sign, including in case + // they're zero (because -0 * |-0|, or -0 * +0, is -0), for providing a fast + // path in emulation of the Shader Model 3 +-0 * x = +0 multiplication + // behavior (disregarding component_count for simplicity of usage with + // GetComponent, treating the rightmost component as replicated). + uint32_t GetIdenticalMultiplicandComponents( const InstructionOperand& other) const { if (storage_source != other.storage_source || storage_index != other.storage_index || - storage_addressing_mode != other.storage_addressing_mode) { + storage_addressing_mode != other.storage_addressing_mode || + is_absolute_value != other.is_absolute_value || + (!is_absolute_value && is_negated != other.is_negated)) { return 0; } uint32_t identical_components = 0; @@ -229,15 +234,14 @@ struct InstructionOperand { } return identical_components; } - // Returns which components of two operands will always be bitwise equal, but - // may have different signs (disregarding component_count for simplicity of - // usage with GetComponent, treating the rightmost component as replicated). + // Returns which components of two operands will always be bitwise equal + // (disregarding component_count for simplicity of usage with GetComponent, + // treating the rightmost component as replicated). uint32_t GetIdenticalComponents(const InstructionOperand& other) const { - if (is_negated != other.is_negated || - is_absolute_value != other.is_absolute_value) { + if (is_negated != other.is_negated) { return 0; } - return GetAbsoluteIdenticalComponents(other); + return GetIdenticalMultiplicandComponents(other); } }; diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h index c0c035167..21ccbaff9 100644 --- a/src/xenia/gpu/ucode.h +++ b/src/xenia/gpu/ucode.h @@ -800,13 +800,26 @@ static_assert_size(TextureFetchInstruction, 12); // Both are valid only within the current ALU clause. They are not modified // when the instruction that would write them fails its predication check. // - Direct3D 9 rules (like in GCN v_*_legacy_f32 instructions) for -// multiplication (0 or denormal * anything = 0) wherever it's present (mul, -// mad, dp, etc.) and for NaN in min/max. It's very important to respect this -// rule for multiplication, as games often rely on it in vector normalization -// (rcp and mul), Infinity * 0 resulting in NaN breaks a lot of things in -// games - causes white screen in Halo 3, white specular on characters in GTA -// IV. -// TODO(Triang3l): Investigate signed zero handling in multiplication. +// multiplication (+-0 or denormal * anything = +0) wherever it's present +// (mul, mad, dp, etc.) and for NaN in min/max. It's very important to respect +// this rule for multiplication, as games often rely on it in vector +// normalization (rcp and mul), Infinity * 0 resulting in NaN breaks a lot of +// things in games - causes white screen in Halo 3, white specular on +// characters in GTA IV. The result is always positive zero in this case, no +// matter what the signs of the other operands are, according to R5xx +// Acceleration section 8.7.5 "Legacy multiply behavior" and testing on +// Adreno 200. This means that the following need to be taken into account +// (according to 8.7.2 "ALU Non-Transcendental Floating Point"): +// - +0 * -0 is -0 with IEEE conformance, however, with this legacy SM3 +// handling, it should result in +0. +// - +0 + -0 is +0, so multiply-add should not be replaced with conditional +// move of the third operand in case of zero multiplicands, because the term +// may be -0, while the result should be +0 in this case. +// http://developer.amd.com/wordpress/media/2013/10/R5xx_Acceleration_v1.5.pdf +// Multiply-add also appears to be not fused (the SM3 behavior instruction on +// GCN is called v_mad_legacy_f32, not v_fma_legacy_f32) - shader translators +// should not use instructions that may be interpreted by the host GPU as +// fused multiply-add. enum class AluScalarOpcode : uint32_t { // Floating-Point Add From ae3b68c7b602b34412db5de16e20bd933e0ab1bd Mon Sep 17 00:00:00 2001 From: Triang3l Date: Fri, 30 Oct 2020 22:31:30 +0300 Subject: [PATCH 3/3] [DXBC] Fast mul path only for fully identical components because neg is post-abs --- src/xenia/gpu/dxbc_shader_translator_alu.cc | 15 ++++++----- src/xenia/gpu/shader.h | 28 +++++++-------------- 2 files changed, 16 insertions(+), 27 deletions(-) diff --git a/src/xenia/gpu/dxbc_shader_translator_alu.cc b/src/xenia/gpu/dxbc_shader_translator_alu.cc index 5fef220b0..b2d24f89b 100644 --- a/src/xenia/gpu/dxbc_shader_translator_alu.cc +++ b/src/xenia/gpu/dxbc_shader_translator_alu.cc @@ -74,7 +74,7 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( DxbcOpMul(per_component_dest, operands[0], operands[1]); uint32_t multiplicands_different = used_result_components & - ~instr.vector_operands[0].GetIdenticalMultiplicandComponents( + ~instr.vector_operands[0].GetIdenticalComponents( instr.vector_operands[1]); if (multiplicands_different) { // Shader Model 3: +-0 or denormal * anything = +0. @@ -181,15 +181,14 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( component_count = 4; } result_swizzle = DxbcSrc::kXXXX; - uint32_t multiplicands_different = - uint32_t((1 << component_count) - 1) & - ~instr.vector_operands[0].GetIdenticalMultiplicandComponents( - instr.vector_operands[1]); + uint32_t different = uint32_t((1 << component_count) - 1) & + ~instr.vector_operands[0].GetIdenticalComponents( + instr.vector_operands[1]); for (uint32_t i = 0; i < component_count; ++i) { DxbcOpMul(DxbcDest::R(system_temp_result_, i ? 0b0010 : 0b0001), operands[0].SelectFromSwizzled(i), operands[1].SelectFromSwizzled(i)); - if ((multiplicands_different & (1 << i)) != 0) { + if ((different & (1 << i)) != 0) { // Shader Model 3: +-0 or denormal * anything = +0 (also not replacing // true `0 + term` with movc of the term because +0 + -0 should result // in +0, not -0). @@ -569,7 +568,7 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( DxbcOpMul(DxbcDest::R(system_temp_result_, 0b0010), operands[0].SelectFromSwizzled(1), operands[1].SelectFromSwizzled(1)); - if (!(instr.vector_operands[0].GetIdenticalMultiplicandComponents( + if (!(instr.vector_operands[0].GetIdenticalComponents( instr.vector_operands[1]) & 0b0010)) { // Shader Model 3: +-0 or denormal * anything = +0. @@ -987,7 +986,7 @@ void DxbcShaderTranslator::ProcessScalarAluOperation( case AluScalarOpcode::kMulsc0: case AluScalarOpcode::kMulsc1: DxbcOpMul(ps_dest, operand_0_a, operand_1); - if (!(instr.scalar_operands[0].GetIdenticalMultiplicandComponents( + if (!(instr.scalar_operands[0].GetIdenticalComponents( instr.scalar_operands[1]) & 0b0001)) { // Shader Model 3: +-0 or denormal * anything = +0. diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index 0f3220601..d253bdad0 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -212,19 +212,18 @@ struct InstructionOperand { return false; } - // Returns which components of two operands are identical, so that - // multiplication of them would result in pow2 with + sign, including in case - // they're zero (because -0 * |-0|, or -0 * +0, is -0), for providing a fast - // path in emulation of the Shader Model 3 +-0 * x = +0 multiplication - // behavior (disregarding component_count for simplicity of usage with - // GetComponent, treating the rightmost component as replicated). - uint32_t GetIdenticalMultiplicandComponents( - const InstructionOperand& other) const { + // Returns which components of two operands will always be bitwise equal + // (disregarding component_count for simplicity of usage with GetComponent, + // treating the rightmost component as replicated). This, strictly with all + // conditions, must be used when emulating Shader Model 3 +-0 * x = +0 + // multiplication behavior with IEEE-compliant multiplication (because + // -0 * |-0|, or -0 * +0, is -0, while the result must be +0). + uint32_t GetIdenticalComponents(const InstructionOperand& other) const { if (storage_source != other.storage_source || storage_index != other.storage_index || storage_addressing_mode != other.storage_addressing_mode || - is_absolute_value != other.is_absolute_value || - (!is_absolute_value && is_negated != other.is_negated)) { + is_negated != other.is_negated || + is_absolute_value != other.is_absolute_value) { return 0; } uint32_t identical_components = 0; @@ -234,15 +233,6 @@ struct InstructionOperand { } return identical_components; } - // Returns which components of two operands will always be bitwise equal - // (disregarding component_count for simplicity of usage with GetComponent, - // treating the rightmost component as replicated). - uint32_t GetIdenticalComponents(const InstructionOperand& other) const { - if (is_negated != other.is_negated) { - return 0; - } - return GetIdenticalMultiplicandComponents(other); - } }; struct ParsedExecInstruction {