Jit64: Make DoubleToSingle a common asm routine

2019-05-21 23:37:22 -04:00 · 2019-05-21 23:37:22 -04:00 · c4799e5977
parent 9136abf07e
commit c4799e5977
6 changed files with 88 additions and 81 deletions
--- a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
@ -236,6 +236,8 @@ void Jit64AsmRoutineManager::GenerateCommon()
  GenFres();
  mfcr = AlignCode4();
  GenMfcr();
+  cdts = AlignCode4();
+  GenConvertDoubleToSingle();

  GenQuantizedLoads();
  GenQuantizedSingleLoads();
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
@ -115,7 +115,8 @@ void Jit64::stfXXX(UGeckoInstruction inst)
    {
      RCX64Reg Rs = fpr.Bind(s, RCMode::Read);
      RegCache::Realize(Rs);
-      ConvertDoubleToSingle(XMM0, Rs);
+      MOVAPD(XMM0, Rs);
+      CALL(asm_routines.cdts);
    }
    MOVD_xmm(R(RSCRATCH), XMM0);
  }
--- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
+++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
@ -868,89 +868,9 @@ void EmuCodeBlock::Force25BitPrecision(X64Reg output, const OpArg& input, X64Reg
  }
 }

-// Since the following float conversion functions are used in non-arithmetic PPC float
-// instructions, they must convert floats bitexact and never flush denormals to zero or turn SNaNs
-// into QNaNs. This means we can't use CVTSS2SD/CVTSD2SS. The x87 FPU doesn't even support
-// flush-to-zero so we can use FLD+FSTP even on denormals.
-// If the number is a NaN, make sure to set the QNaN bit back to its original value.
-
-// Another problem is that officially, converting doubles to single format results in undefined
-// behavior.  Relying on undefined behavior is a bug so no software should ever do this.
-// Super Mario 64 (on Wii VC) accidentally relies on this behavior.  See issue #11173
-
-alignas(16) static const __m128i double_exponent = _mm_set_epi64x(0, 0x7ff0000000000000);
-alignas(16) static const __m128i double_fraction = _mm_set_epi64x(0, 0x000fffffffffffff);
-alignas(16) static const __m128i double_sign_bit = _mm_set_epi64x(0, 0x8000000000000000);
-alignas(16) static const __m128i double_explicit_top_bit = _mm_set_epi64x(0, 0x0010000000000000);
-alignas(16) static const __m128i double_top_two_bits = _mm_set_epi64x(0, 0xc000000000000000);
-alignas(16) static const __m128i double_bottom_bits = _mm_set_epi64x(0, 0x07ffffffe0000000);
 alignas(16) static const __m128i double_qnan_bit = _mm_set_epi64x(0xffffffffffffffff,
                                                                  0xfff7ffffffffffff);

-// This is the same algorithm used in the interpreter (and actual hardware)
-// The documentation states that the conversion of a double with an outside the
-// valid range for a single (or a single denormal) is undefined.
-// But testing on actual hardware shows it always picks bits 0..1 and 5..34
-// unless the exponent is in the range of 874 to 896.
-void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
-{
-  MOVAPD(XMM1, R(src));
-
-  // Grab Exponent
-  PAND(XMM1, MConst(double_exponent));
-  PSRLQ(XMM1, 52);
-  MOVD_xmm(R(RSCRATCH), XMM1);
-
-  // Check if the double is in the range of valid single subnormal
-  SUB(16, R(RSCRATCH), Imm16(874));
-  CMP(16, R(RSCRATCH), Imm16(896 - 874));
-  FixupBranch NoDenormalize = J_CC(CC_A);
-
-  // Denormalise
-
-  // shift = (905 - Exponent) plus the 21 bit double to single shift
-  MOV(16, R(RSCRATCH), Imm16(905 + 21));
-  MOVD_xmm(XMM0, R(RSCRATCH));
-  PSUBQ(XMM0, R(XMM1));
-
-  // xmm1 = fraction | 0x0010000000000000
-  MOVAPD(XMM1, R(src));
-  PAND(XMM1, MConst(double_fraction));
-  POR(XMM1, MConst(double_explicit_top_bit));
-
-  // fraction >> shift
-  PSRLQ(XMM1, R(XMM0));
-
-  // OR the sign bit in.
-  MOVAPD(XMM0, R(src));
-  PAND(XMM0, MConst(double_sign_bit));
-  PSRLQ(XMM0, 32);
-  POR(XMM1, R(XMM0));
-
-  FixupBranch end = J(false);  // Goto end
-
-  SetJumpTarget(NoDenormalize);
-
-  // Don't Denormalize
-
-  // We want bits 0, 1
-  MOVAPD(XMM1, R(src));
-  PAND(XMM1, MConst(double_top_two_bits));
-  PSRLQ(XMM1, 32);
-
-  // And 5 through to 34
-  MOVAPD(XMM0, R(src));
-  PAND(XMM0, MConst(double_bottom_bits));
-  PSRLQ(XMM0, 29);
-
-  // OR them togther
-  POR(XMM1, R(XMM0));
-
-  // End
-  SetJumpTarget(end);
-  MOVDDUP(dst, R(XMM1));
-}
-
 // Converting single->double is a bit easier because all single denormals are double normals.
 void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr)
 {
--- a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp
+++ b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp
@ -9,6 +9,7 @@
 #include "Common/CPUDetect.h"
 #include "Common/CommonTypes.h"
 #include "Common/FloatUtils.h"
+#include "Common/Intrinsics.h"
 #include "Common/JitRegister.h"
 #include "Common/x64ABI.h"
 #include "Common/x64Emitter.h"
@ -25,6 +26,87 @@

 using namespace Gen;

+alignas(16) static const __m128i double_fraction = _mm_set_epi64x(0, 0x000fffffffffffff);
+alignas(16) static const __m128i double_sign_bit = _mm_set_epi64x(0, 0x8000000000000000);
+alignas(16) static const __m128i double_explicit_top_bit = _mm_set_epi64x(0, 0x0010000000000000);
+alignas(16) static const __m128i double_top_two_bits = _mm_set_epi64x(0, 0xc000000000000000);
+alignas(16) static const __m128i double_bottom_bits = _mm_set_epi64x(0, 0x07ffffffe0000000);
+
+// Since the following float conversion functions are used in non-arithmetic PPC float
+// instructions, they must convert floats bitexact and never flush denormals to zero or turn SNaNs
+// into QNaNs. This means we can't use CVTSS2SD/CVTSD2SS. The x87 FPU doesn't even support
+// flush-to-zero so we can use FLD+FSTP even on denormals.
+// If the number is a NaN, make sure to set the QNaN bit back to its original value.
+
+// Another problem is that officially, converting doubles to single format results in undefined
+// behavior.  Relying on undefined behavior is a bug so no software should ever do this.
+// Super Mario 64 (on Wii VC) accidentally relies on this behavior.  See issue #11173
+
+// This is the same algorithm used in the interpreter (and actual hardware)
+// The documentation states that the conversion of a double with an exponent outside the
+// valid range for a single (or a single denormal) is undefined.
+// But testing on actual hardware shows it always picks bits 0..1 and 5..34
+// unless the exponent is in the range of 874 to 896.
+
+void CommonAsmRoutines::GenConvertDoubleToSingle()
+{
+  // Input: double in XMM0. Output: single-precision bit pattern in the low 32 bits of XMM0.
+  // Clobbers RSCRATCH/RSCRATCH2/XMM1 and the flags. Both paths of the routine end in RET.
+
+  const void* start = GetCodePtr();  // first byte of the routine, used for registration below
+
+  // Grab Exponent: RSCRATCH = 11-bit biased exponent, RSCRATCH2 = copy of the raw 64 bits
+  MOVQ_xmm(R(RSCRATCH), XMM0);
+  MOV(64, R(RSCRATCH2), R(RSCRATCH));
+  SHR(64, R(RSCRATCH), Imm8(52));      // sign + exponent are now the low 12 bits
+  AND(16, R(RSCRATCH), Imm16(0x7ff));  // drop the sign bit, keep the exponent
+
+  // Check if the double is in the range of valid single subnormal (exponent 874..896)
+  SUB(16, R(RSCRATCH), Imm16(874));  // exponents below 874 wrap to large unsigned values
+  CMP(16, R(RSCRATCH), Imm16(896 - 874));
+  FixupBranch Denormalize = J_CC(CC_NA);  // taken when (Exponent - 874) <= 22, unsigned
+
+  // Don't Denormalize: repack the selected bits of the double directly
+
+  // We want bits 0, 1 (the two most significant bits, see double_top_two_bits)
+  MOVAPD(XMM1, R(XMM0));
+  PAND(XMM1, MConst(double_top_two_bits));
+  PSRLQ(XMM1, 32);  // -> bits 31..30 of the result
+
+  // And 5 through to 34 (the 30 bits selected by double_bottom_bits)
+  PAND(XMM0, MConst(double_bottom_bits));
+  PSRLQ(XMM0, 29);  // -> bits 29..0 of the result
+
+  // OR them together
+  POR(XMM0, R(XMM1));
+  RET();
+
+  // Denormalise: build a single subnormal by shifting the fraction right
+  SetJumpTarget(Denormalize);
+
+  // shift = (905 - Exponent) plus the 21 bit double to single shift
+  NEG(16, R(RSCRATCH));                           // RSCRATCH held (Exponent - 874) here
+  ADD(16, R(RSCRATCH), Imm16((905 + 21) - 874));  // = (905 + 21) - Exponent
+  MOVQ_xmm(XMM1, R(RSCRATCH));                    // variable shift count is read from XMM1
+
+  // XMM0 = fraction | 0x0010000000000000 (sets bit 52, just above the 52 fraction bits)
+  PAND(XMM0, MConst(double_fraction));
+  POR(XMM0, MConst(double_explicit_top_bit));
+
+  // fraction >> shift
+  PSRLQ(XMM0, R(XMM1));
+
+  // OR the sign bit in: bit 63 of the original double moves down to bit 31.
+  SHR(64, R(RSCRATCH2), Imm8(32));
+  AND(32, R(RSCRATCH2), Imm32(0x80000000));
+  MOVQ_xmm(XMM1, R(RSCRATCH2));
+
+  POR(XMM0, R(XMM1));
+  RET();
+
+  JitRegister::Register(start, GetCodePtr(), "JIT_cdts");  // label the range [start, end)
+}
+
 void CommonAsmRoutines::GenFrsqrte()
 {
  const void* start = GetCodePtr();
--- a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h
+++ b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h
@ -31,6 +31,7 @@ public:
  void GenMfcr();

 protected:
+  void GenConvertDoubleToSingle();
  const u8* GenQuantizedLoadRuntime(bool single, EQuantizeType type);
  const u8* GenQuantizedStoreRuntime(bool single, EQuantizeType type);
  void GenQuantizedLoads();
--- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
@ -25,6 +25,7 @@ struct CommonAsmRoutinesBase
  const u8* frsqrte;
  const u8* fres;
  const u8* mfcr;
+  const u8* cdts;

  // In: array index: GQR to use.
  // In: ECX: Address to read from.