From 466a7afde336d381e250d51e4c268c769e072a02 Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Thu, 24 Oct 2013 13:52:22 +0200 Subject: [PATCH 1/4] Interpreter: support non-IEEE mode emulation v2: fix fxsave on visual studio, thx @ rodolfo for this patch --- Source/Core/Common/Src/CPUDetect.h | 3 ++ Source/Core/Common/Src/FPURoundMode.h | 2 +- .../Core/Common/Src/GenericFPURoundMode.cpp | 2 +- Source/Core/Common/Src/x64CPUDetect.cpp | 22 +++++++++++++ Source/Core/Common/Src/x64FPURoundMode.cpp | 32 ++++++++++++++----- .../Interpreter_SystemRegisters.cpp | 11 ++----- 6 files changed, 53 insertions(+), 19 deletions(-) diff --git a/Source/Core/Common/Src/CPUDetect.h b/Source/Core/Common/Src/CPUDetect.h index e93a902d63..eab62d3d19 100644 --- a/Source/Core/Common/Src/CPUDetect.h +++ b/Source/Core/Common/Src/CPUDetect.h @@ -43,6 +43,9 @@ struct CPUInfo bool bAVX; bool bFMA; bool bAES; + // FXSAVE/FXRSTOR + bool bFXSR; + bool bDAZ; bool bLAHFSAHF64; bool bLongMode; diff --git a/Source/Core/Common/Src/FPURoundMode.h b/Source/Core/Common/Src/FPURoundMode.h index fad4d5d6aa..c552ad7ff0 100644 --- a/Source/Core/Common/Src/FPURoundMode.h +++ b/Source/Core/Common/Src/FPURoundMode.h @@ -36,7 +36,7 @@ namespace FPURoundMode void SetPrecisionMode(u32 mode); - void SetSIMDMode(u32 mode); + void SetSIMDMode(u32 roundingMode, u32 nonIEEEMode); /* * There are two different flavors of float to int conversion: diff --git a/Source/Core/Common/Src/GenericFPURoundMode.cpp b/Source/Core/Common/Src/GenericFPURoundMode.cpp index cc878291a1..c8e70a4990 100644 --- a/Source/Core/Common/Src/GenericFPURoundMode.cpp +++ b/Source/Core/Common/Src/GenericFPURoundMode.cpp @@ -26,7 +26,7 @@ namespace FPURoundMode void SetPrecisionMode(u32 mode) { } - void SetSIMDMode(u32 mode) + void SetSIMDMode(u32 mode, u32 nonIEEEMode) { } void SaveSIMDState() diff --git a/Source/Core/Common/Src/x64CPUDetect.cpp b/Source/Core/Common/Src/x64CPUDetect.cpp index 2b434ad2b6..182cca5224 100644 --- a/Source/Core/Common/Src/x64CPUDetect.cpp +++ b/Source/Core/Common/Src/x64CPUDetect.cpp @@ -162,6 +162,28 @@ void CPUInfo::Detect() if ((cpu_id[2] >> 20) & 1) bSSE4_2 = true; if ((cpu_id[2] >> 25) & 1) bAES = true; + if ((cpu_id[3] >> 24) & 1) + { + // We can use FXSAVE. + bFXSR = true; + + GC_ALIGNED16(u8 fx_state[512]); + memset(fx_state, 0, sizeof(fx_state)); +#ifdef _WIN32 +#ifdef _M_IX86 + _fxsave(fx_state); +#elif defined (_M_X64) + _fxsave64(fx_state); +#endif +#else + __asm__("fxsave %0" : "=m" (fx_state)); +#endif + + // lowest byte of MXCSR_MASK + if ((fx_state[0x1C] >> 6) & 1) + bDAZ = true; + } + // AVX support requires 3 separate checks: // - Is the AVX bit set in CPUID? // - Is the XSAVE bit set in CPUID? diff --git a/Source/Core/Common/Src/x64FPURoundMode.cpp b/Source/Core/Common/Src/x64FPURoundMode.cpp index 2c950ade96..a8b0d16809 100644 --- a/Source/Core/Common/Src/x64FPURoundMode.cpp +++ b/Source/Core/Common/Src/x64FPURoundMode.cpp @@ -4,6 +4,7 @@ #include "Common.h" #include "FPURoundMode.h" +#include "CPUDetect.h" #ifndef _WIN32 static const unsigned short FPU_ROUND_NEAR = 0 << 10; @@ -14,8 +15,11 @@ static const unsigned short FPU_ROUND_MASK = 3 << 10; #include #endif -const u32 MASKS = 0x1F80; // mask away the interrupts. +// OR-mask for disabling FPU exceptions (bits 7-12 in the MXCSR register) +const u32 EXCEPTION_MASK = 0x1F80; +// Denormals-Are-Zero (non-IEEE mode: denormal inputs are set to +/- 0) const u32 DAZ = 0x40; +// Flush-To-Zero (non-IEEE mode: denormal outputs are set to +/- 0) const u32 FTZ = 0x8000; namespace FPURoundMode @@ -79,16 +83,28 @@ namespace FPURoundMode //but still - set any useful sse options here #endif } - void SetSIMDMode(u32 mode) + + void SetSIMDMode(u32 roundingMode, u32 nonIEEEMode) { - static const u32 ssetable[4] = + // lookup table for FPSCR.RN-to-MXCSR.RC translation + static const u32 roundingModeLUT[4] = { - (0 << 13) | MASKS, - (3 << 13) | MASKS, - (2 << 13) | MASKS, - (1 << 13) | MASKS, + (0 << 13) | EXCEPTION_MASK, // nearest + (3 << 13) | EXCEPTION_MASK, // -inf + (2 << 13) | EXCEPTION_MASK, // +inf + (1 << 13) | EXCEPTION_MASK, // zero }; - u32 csr = ssetable[mode]; + u32 csr = roundingModeLUT[roundingMode]; + + static const u32 denormalLUT[2] = + { + FTZ, // flush-to-zero only + FTZ | DAZ, // flush-to-zero and denormals-are-zero (may not be supported) + }; + if (nonIEEEMode) + { + csr |= denormalLUT[cpu_info.bDAZ]; + } _mm_setcsr(csr); } diff --git a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp index 688d166608..475f7591ce 100644 --- a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp +++ b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp @@ -48,15 +48,8 @@ static void FPSCRtoFPUSettings(UReg_FPSCR fp) // Pokemon Colosseum does this. Gah. } - // Also corresponding SSE rounding mode setting - if (FPSCR.NI) - { - // Either one of these two breaks Beyond Good & Evil. - // if (cpu_info.bSSSE3) - // csr |= DAZ; - // csr |= FTZ; - } - FPURoundMode::SetSIMDMode(FPSCR.RN); + // Set SSE rounding mode and denormal handling + FPURoundMode::SetSIMDMode(FPSCR.RN, FPSCR.NI); } void Interpreter::mtfsb0x(UGeckoInstruction _inst) From cd069fdce1bd7a30f99f3bed9dfe0af60cd562d8 Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Thu, 24 Oct 2013 22:05:53 +0200 Subject: [PATCH 2/4] Interpreter: software-based flush-to-zero bDAZ is now called bFlushToZero to better reflect what it's actually used for. I decided not to support any hardware-based flush-to-zero on systems that don't support this for both inputs _and_ outputs. It makes the code cleaner and the intersection of CPUs that support SSE2 but not DAZ should be very small. --- Source/Core/Common/Src/CPUDetect.h | 5 +++- Source/Core/Common/Src/MathUtil.h | 4 +-- Source/Core/Common/Src/x64CPUDetect.cpp | 8 ++++- Source/Core/Common/Src/x64FPURoundMode.cpp | 2 +- .../PowerPC/Interpreter/Interpreter_FPUtils.h | 29 ++++++++----------- 5 files changed, 26 insertions(+), 22 deletions(-) diff --git a/Source/Core/Common/Src/CPUDetect.h b/Source/Core/Common/Src/CPUDetect.h index eab62d3d19..967be0949b 100644 --- a/Source/Core/Common/Src/CPUDetect.h +++ b/Source/Core/Common/Src/CPUDetect.h @@ -45,7 +45,10 @@ struct CPUInfo bool bAES; // FXSAVE/FXRSTOR bool bFXSR; - bool bDAZ; + // This flag indicates that the hardware supports some mode + // in which denormal inputs _and_ outputs are automatically set to (signed) zero. + // TODO: ARM + bool bFlushToZero; bool bLAHFSAHF64; bool bLongMode; diff --git a/Source/Core/Common/Src/MathUtil.h b/Source/Core/Common/Src/MathUtil.h index 31772c3c60..f085c6ed2b 100644 --- a/Source/Core/Common/Src/MathUtil.h +++ b/Source/Core/Common/Src/MathUtil.h @@ -64,10 +64,10 @@ inline float FlushToZero(float f) return x.f; } -inline double FlushToZeroAsFloat(double d) +inline double FlushToZero(double d) { IntDouble x; x.d = d; - if ((x.i & DOUBLE_EXP) < 0x3800000000000000ULL) + if ((x.i & DOUBLE_EXP) == 0) x.i &= DOUBLE_SIGN; // turn into signed zero return x.d; } diff --git a/Source/Core/Common/Src/x64CPUDetect.cpp b/Source/Core/Common/Src/x64CPUDetect.cpp index 182cca5224..d6f36eb142 100644 --- a/Source/Core/Common/Src/x64CPUDetect.cpp +++ b/Source/Core/Common/Src/x64CPUDetect.cpp @@ -162,6 +162,7 @@ void CPUInfo::Detect() if ((cpu_id[2] >> 20) & 1) bSSE4_2 = true; if ((cpu_id[2] >> 25) & 1) bAES = true; + // To check DAZ support, we first need to check FXSAVE support. if ((cpu_id[3] >> 24) & 1) { // We can use FXSAVE. @@ -181,7 +182,12 @@ void CPUInfo::Detect() // lowest byte of MXCSR_MASK if ((fx_state[0x1C] >> 6) & 1) - bDAZ = true; + { + // On x86, the FTZ field (supported since SSE1) only flushes denormal _outputs_ to zero, + // now that we checked DAZ support (flushing denormal _inputs_ to zero), + // we can set our generic flag. + bFlushToZero = true; + } } // AVX support requires 3 separate checks: diff --git a/Source/Core/Common/Src/x64FPURoundMode.cpp b/Source/Core/Common/Src/x64FPURoundMode.cpp index a8b0d16809..f46c6000eb 100644 --- a/Source/Core/Common/Src/x64FPURoundMode.cpp +++ b/Source/Core/Common/Src/x64FPURoundMode.cpp @@ -103,7 +103,7 @@ namespace FPURoundMode }; if (nonIEEEMode) { - csr |= denormalLUT[cpu_info.bDAZ]; + csr |= denormalLUT[cpu_info.bFlushToZero]; } _mm_setcsr(csr); } diff --git a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_FPUtils.h b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_FPUtils.h index d379bf7049..9190a18ed7 100644 --- a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_FPUtils.h +++ b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_FPUtils.h @@ -5,6 +5,7 @@ #ifndef _INTERPRETER_FPUTILS_H #define _INTERPRETER_FPUTILS_H +#include "CPUDetect.h" #include "Interpreter.h" #include "MathUtil.h" @@ -69,28 +70,22 @@ inline void UpdateFPSCR() inline double ForceSingle(double _x) { - //if (FPSCR.RN != 0) - // PanicAlert("RN = %d at %x", (int)FPSCR.RN, PC); - if (FPSCR.NI) - _x = FlushToZeroAsFloat(_x); - - double x = static_cast(_x); - + // convert to float... + float x = _x; + if (!cpu_info.bFlushToZero && FPSCR.NI) + { + x = FlushToZero(x); + } + // ...and back to double: return x; } inline double ForceDouble(double d) { - //if (FPSCR.RN != 0) - // PanicAlert("RN = %d at %x", (int)FPSCR.RN, PC); - - //if (FPSCR.NI) - //{ - // IntDouble x; x.d = d; - //if ((x.i & DOUBLE_EXP) == 0) - // x.i &= DOUBLE_SIGN; // turn into signed zero - // return x.d; - //} + if (!cpu_info.bFlushToZero && FPSCR.NI) + { + d = FlushToZero(d); + } return d; } From d78c8c21a21a58f6bcd0fd8c01648c5c0bf2cc8b Mon Sep 17 00:00:00 2001 From: degasus Date: Thu, 24 Oct 2013 22:50:20 +0200 Subject: [PATCH 3/4] jit64: enable fp instructions faddx fsubx fdivx fdivsx all of them are now accurate enough as we handle non-ieee floats correctly --- Source/Core/Core/Src/PowerPC/Jit64/Jit.h | 2 +- .../Core/Core/Src/PowerPC/Jit64/Jit64_Tables.cpp | 16 ++++++++-------- .../Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit.h b/Source/Core/Core/Src/PowerPC/Jit64/Jit.h index 71a556cb5f..139414a103 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit.h @@ -182,7 +182,7 @@ public: void ps_sum(UGeckoInstruction inst); void ps_muls(UGeckoInstruction inst); - void fp_arith_s(UGeckoInstruction inst); + void fp_arith(UGeckoInstruction inst); void frsqrtex(UGeckoInstruction inst); void fcmpx(UGeckoInstruction inst); diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit64_Tables.cpp index 0ff2bf5d7f..dc81015573 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit64_Tables.cpp @@ -320,12 +320,12 @@ static GekkoOPTemplate table31_2[] = static GekkoOPTemplate table59[] = { - {18, &Jit64::Default}, //{"fdivsx", OPTYPE_FPU, FL_RC_BIT_F, 16}}, - {20, &Jit64::fp_arith_s}, //"fsubsx", OPTYPE_FPU, FL_RC_BIT_F}}, - {21, &Jit64::fp_arith_s}, //"faddsx", OPTYPE_FPU, FL_RC_BIT_F}}, + {18, &Jit64::fp_arith}, //{"fdivsx", OPTYPE_FPU, FL_RC_BIT_F, 16}}, + {20, &Jit64::fp_arith}, //"fsubsx", OPTYPE_FPU, FL_RC_BIT_F}}, + {21, &Jit64::fp_arith}, //"faddsx", OPTYPE_FPU, FL_RC_BIT_F}}, // {22, &Jit64::Default}, //"fsqrtsx", OPTYPE_FPU, FL_RC_BIT_F}}, // Not implemented on gekko {24, &Jit64::Default}, //"fresx", OPTYPE_FPU, FL_RC_BIT_F}}, - {25, &Jit64::fp_arith_s}, //"fmulsx", OPTYPE_FPU, FL_RC_BIT_F}}, + {25, &Jit64::fp_arith}, //"fmulsx", OPTYPE_FPU, FL_RC_BIT_F}}, {28, &Jit64::fmaddXX}, //"fmsubsx", OPTYPE_FPU, FL_RC_BIT_F}}, {29, &Jit64::fmaddXX}, //"fmaddsx", OPTYPE_FPU, FL_RC_BIT_F}}, {30, &Jit64::fmaddXX}, //"fnmsubsx", OPTYPE_FPU, FL_RC_BIT_F}}, @@ -354,12 +354,12 @@ static GekkoOPTemplate table63[] = static GekkoOPTemplate table63_2[] = { - {18, &Jit64::Default}, //"fdivx", OPTYPE_FPU, FL_RC_BIT_F, 30}}, - {20, &Jit64::Default}, //"fsubx", OPTYPE_FPU, FL_RC_BIT_F}}, - {21, &Jit64::Default}, //"faddx", OPTYPE_FPU, FL_RC_BIT_F}}, + {18, &Jit64::fp_arith}, //"fdivx", OPTYPE_FPU, FL_RC_BIT_F, 30}}, + {20, &Jit64::fp_arith}, //"fsubx", OPTYPE_FPU, FL_RC_BIT_F}}, + {21, &Jit64::fp_arith}, //"faddx", OPTYPE_FPU, FL_RC_BIT_F}}, {22, &Jit64::Default}, //"fsqrtx", OPTYPE_FPU, FL_RC_BIT_F}}, {23, &Jit64::Default}, //"fselx", OPTYPE_FPU, FL_RC_BIT_F}}, - {25, &Jit64::fp_arith_s}, //"fmulx", OPTYPE_FPU, FL_RC_BIT_F}}, + {25, &Jit64::fp_arith}, //"fmulx", OPTYPE_FPU, FL_RC_BIT_F}}, {26, &Jit64::frsqrtex}, //"frsqrtex", OPTYPE_FPU, FL_RC_BIT_F}}, {28, &Jit64::fmaddXX}, //"fmsubx", OPTYPE_FPU, FL_RC_BIT_F}}, {29, &Jit64::fmaddXX}, //"fmaddx", OPTYPE_FPU, FL_RC_BIT_F}}, diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp index c4699ecc62..b0d0ab4853 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -85,7 +85,7 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, fpr.UnlockAll(); } -void Jit64::fp_arith_s(UGeckoInstruction inst) +void Jit64::fp_arith(UGeckoInstruction inst) { INSTRUCTION_START JITDISABLE(bJITFloatingPointOff) @@ -106,7 +106,7 @@ void Jit64::fp_arith_s(UGeckoInstruction inst) case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::ADDSD, &XEmitter::VADDSD); break; //add case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD, &XEmitter::VMULSD); break; //mul default: - _assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!"); + _assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!"); } } From 288bef280784e02cc80f0aee9360e45548c06e9a Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Sun, 3 Nov 2013 23:56:30 +0100 Subject: [PATCH 4/4] x64: add small warning if CPU has SSE2 but not DAZ --- Source/Core/Common/Src/x64CPUDetect.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Source/Core/Common/Src/x64CPUDetect.cpp b/Source/Core/Common/Src/x64CPUDetect.cpp index d6f36eb142..2fa25e8074 100644 --- a/Source/Core/Common/Src/x64CPUDetect.cpp +++ b/Source/Core/Common/Src/x64CPUDetect.cpp @@ -250,7 +250,12 @@ std::string CPUInfo::Summarize() { std::string sum(cpu_string); if (bSSE) sum += ", SSE"; - if (bSSE2) sum += ", SSE2"; + if (bSSE2) + { + sum += ", SSE2"; + if (!bFlushToZero) + sum += " (but not DAZ!)"; + } if (bSSE3) sum += ", SSE3"; if (bSSSE3) sum += ", SSSE3"; if (bSSE4_1) sum += ", SSE4.1";