From 466a7afde336d381e250d51e4c268c769e072a02 Mon Sep 17 00:00:00 2001
From: Tillmann Karras <tilkax@gmail.com>
Date: Thu, 24 Oct 2013 13:52:22 +0200
Subject: [PATCH 1/4] Interpreter: support non-IEEE mode emulation

v2: fix FXSAVE on Visual Studio (thanks to rodolfo for this patch)
---
 Source/Core/Common/Src/CPUDetect.h            |  3 ++
 Source/Core/Common/Src/FPURoundMode.h         |  2 +-
 .../Core/Common/Src/GenericFPURoundMode.cpp   |  2 +-
 Source/Core/Common/Src/x64CPUDetect.cpp       | 22 +++++++++++++
 Source/Core/Common/Src/x64FPURoundMode.cpp    | 32 ++++++++++++++-----
 .../Interpreter_SystemRegisters.cpp           | 11 ++-----
 6 files changed, 53 insertions(+), 19 deletions(-)

diff --git a/Source/Core/Common/Src/CPUDetect.h b/Source/Core/Common/Src/CPUDetect.h
index e93a902d63..eab62d3d19 100644
--- a/Source/Core/Common/Src/CPUDetect.h
+++ b/Source/Core/Common/Src/CPUDetect.h
@@ -43,6 +43,9 @@ struct CPUInfo
 	bool bAVX;
 	bool bFMA;
 	bool bAES;
+	// FXSAVE/FXRSTOR
+	bool bFXSR;
+	bool bDAZ;
 	bool bLAHFSAHF64;
 	bool bLongMode;
 
diff --git a/Source/Core/Common/Src/FPURoundMode.h b/Source/Core/Common/Src/FPURoundMode.h
index fad4d5d6aa..c552ad7ff0 100644
--- a/Source/Core/Common/Src/FPURoundMode.h
+++ b/Source/Core/Common/Src/FPURoundMode.h
@@ -36,7 +36,7 @@ namespace FPURoundMode
 
 	void SetPrecisionMode(u32 mode);
 
-	void SetSIMDMode(u32 mode);
+	void SetSIMDMode(u32 roundingMode, u32 nonIEEEMode);
 
 /*
  * There are two different flavors of float to int conversion:
diff --git a/Source/Core/Common/Src/GenericFPURoundMode.cpp b/Source/Core/Common/Src/GenericFPURoundMode.cpp
index cc878291a1..c8e70a4990 100644
--- a/Source/Core/Common/Src/GenericFPURoundMode.cpp
+++ b/Source/Core/Common/Src/GenericFPURoundMode.cpp
@@ -26,7 +26,7 @@ namespace FPURoundMode
 	void SetPrecisionMode(u32 mode)
 	{
 	}
-	void SetSIMDMode(u32 mode)
+	void SetSIMDMode(u32 mode, u32 nonIEEEMode)
 	{
 	}
 	void SaveSIMDState()
diff --git a/Source/Core/Common/Src/x64CPUDetect.cpp b/Source/Core/Common/Src/x64CPUDetect.cpp
index 2b434ad2b6..182cca5224 100644
--- a/Source/Core/Common/Src/x64CPUDetect.cpp
+++ b/Source/Core/Common/Src/x64CPUDetect.cpp
@@ -162,6 +162,28 @@ void CPUInfo::Detect()
 		if ((cpu_id[2] >> 20) & 1) bSSE4_2 = true;
 		if ((cpu_id[2] >> 25) & 1) bAES = true;
 
+		if ((cpu_id[3] >> 24) & 1)
+		{
+			// We can use FXSAVE.
+			bFXSR = true;
+
+			GC_ALIGNED16(u8 fx_state[512]);
+			memset(fx_state, 0, sizeof(fx_state));
+#ifdef _WIN32
+#ifdef _M_IX86
+			_fxsave(fx_state);
+#elif defined (_M_X64)
+			_fxsave64(fx_state);
+#endif
+#else
+			__asm__("fxsave %0" : "=m" (fx_state));
+#endif
+
+			// lowest byte of MXCSR_MASK
+			if ((fx_state[0x1C] >> 6) & 1)
+				bDAZ = true;
+		}
+
 		// AVX support requires 3 separate checks:
 		//  - Is the AVX bit set in CPUID?
 		//  - Is the XSAVE bit set in CPUID?
diff --git a/Source/Core/Common/Src/x64FPURoundMode.cpp b/Source/Core/Common/Src/x64FPURoundMode.cpp
index 2c950ade96..a8b0d16809 100644
--- a/Source/Core/Common/Src/x64FPURoundMode.cpp
+++ b/Source/Core/Common/Src/x64FPURoundMode.cpp
@@ -4,6 +4,7 @@
 
 #include "Common.h"
 #include "FPURoundMode.h"
+#include "CPUDetect.h"
 
 #ifndef _WIN32
 static const unsigned short FPU_ROUND_NEAR = 0 << 10;
@@ -14,8 +15,11 @@ static const unsigned short FPU_ROUND_MASK = 3 << 10;
 #include <xmmintrin.h>
 #endif
 
-const u32 MASKS = 0x1F80;  // mask away the interrupts.
+// OR-mask for disabling FPU exceptions (bits 7-12 in the MXCSR register)
+const u32 EXCEPTION_MASK = 0x1F80;
+// Denormals-Are-Zero (non-IEEE mode: denormal inputs are set to +/- 0)
 const u32 DAZ = 0x40;
+// Flush-To-Zero (non-IEEE mode: denormal outputs are set to +/- 0)
 const u32 FTZ = 0x8000;
 
 namespace FPURoundMode
@@ -79,16 +83,28 @@ namespace FPURoundMode
 			//but still - set any useful sse options here
 		#endif
 	}
-	void SetSIMDMode(u32 mode)
+
+	void SetSIMDMode(u32 roundingMode, u32 nonIEEEMode)
 	{
-		static const u32 ssetable[4] =
+		// lookup table for FPSCR.RN-to-MXCSR.RC translation
+		static const u32 roundingModeLUT[4] =
 		{
-			(0 << 13) | MASKS,
-			(3 << 13) | MASKS,
-			(2 << 13) | MASKS,
-			(1 << 13) | MASKS,
+			(0 << 13) | EXCEPTION_MASK, // nearest
+			(3 << 13) | EXCEPTION_MASK, // -inf
+			(2 << 13) | EXCEPTION_MASK, // +inf
+			(1 << 13) | EXCEPTION_MASK, // zero
 		};
-		u32 csr = ssetable[mode];
+		u32 csr = roundingModeLUT[roundingMode];
+
+		static const u32 denormalLUT[2] =
+		{
+			FTZ,       // flush-to-zero only
+			FTZ | DAZ, // flush-to-zero and denormals-are-zero (may not be supported)
+		};
+		if (nonIEEEMode)
+		{
+			csr |= denormalLUT[cpu_info.bDAZ];
+		}
 		_mm_setcsr(csr);
 	}
 
diff --git a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp
index 688d166608..475f7591ce 100644
--- a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp
+++ b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp
@@ -48,15 +48,8 @@ static void FPSCRtoFPUSettings(UReg_FPSCR fp)
 		// Pokemon Colosseum does this. Gah.
 	}
 
-	// Also corresponding SSE rounding mode setting
-	if (FPSCR.NI)
-	{
-		// Either one of these two breaks Beyond Good & Evil.
-		// if (cpu_info.bSSSE3)
-		//     csr |= DAZ;
-		// csr |= FTZ;
-	}
-	FPURoundMode::SetSIMDMode(FPSCR.RN);
+	// Set SSE rounding mode and denormal handling
+	FPURoundMode::SetSIMDMode(FPSCR.RN, FPSCR.NI);
 }
 
 void Interpreter::mtfsb0x(UGeckoInstruction _inst)

From cd069fdce1bd7a30f99f3bed9dfe0af60cd562d8 Mon Sep 17 00:00:00 2001
From: Tillmann Karras <tilkax@gmail.com>
Date: Thu, 24 Oct 2013 22:05:53 +0200
Subject: [PATCH 2/4] Interpreter: software-based flush-to-zero

bDAZ is now called bFlushToZero to better reflect what it's actually
used for.

I decided not to support any hardware-based flush-to-zero on systems
that don't support it for both inputs _and_ outputs. This makes the code
cleaner, and the set of CPUs that support SSE2 but not DAZ should be
very small.
---
 Source/Core/Common/Src/CPUDetect.h            |  5 +++-
 Source/Core/Common/Src/MathUtil.h             |  4 +--
 Source/Core/Common/Src/x64CPUDetect.cpp       |  8 ++++-
 Source/Core/Common/Src/x64FPURoundMode.cpp    |  2 +-
 .../PowerPC/Interpreter/Interpreter_FPUtils.h | 29 ++++++++-----------
 5 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/Source/Core/Common/Src/CPUDetect.h b/Source/Core/Common/Src/CPUDetect.h
index eab62d3d19..967be0949b 100644
--- a/Source/Core/Common/Src/CPUDetect.h
+++ b/Source/Core/Common/Src/CPUDetect.h
@@ -45,7 +45,10 @@ struct CPUInfo
 	bool bAES;
 	// FXSAVE/FXRSTOR
 	bool bFXSR;
-	bool bDAZ;
+	// This flag indicates that the hardware supports some mode
+	// in which denormal inputs _and_ outputs are automatically set to (signed) zero.
+	// TODO: ARM
+	bool bFlushToZero;
 	bool bLAHFSAHF64;
 	bool bLongMode;
 
diff --git a/Source/Core/Common/Src/MathUtil.h b/Source/Core/Common/Src/MathUtil.h
index 31772c3c60..f085c6ed2b 100644
--- a/Source/Core/Common/Src/MathUtil.h
+++ b/Source/Core/Common/Src/MathUtil.h
@@ -64,10 +64,10 @@ inline float FlushToZero(float f)
 	return x.f;
 }
 
-inline double FlushToZeroAsFloat(double d)
+inline double FlushToZero(double d)
 {
 	IntDouble x; x.d = d;
-	if ((x.i & DOUBLE_EXP) < 0x3800000000000000ULL)
+	if ((x.i & DOUBLE_EXP) == 0)
 		x.i &= DOUBLE_SIGN;  // turn into signed zero
 	return x.d;
 }
diff --git a/Source/Core/Common/Src/x64CPUDetect.cpp b/Source/Core/Common/Src/x64CPUDetect.cpp
index 182cca5224..d6f36eb142 100644
--- a/Source/Core/Common/Src/x64CPUDetect.cpp
+++ b/Source/Core/Common/Src/x64CPUDetect.cpp
@@ -162,6 +162,7 @@ void CPUInfo::Detect()
 		if ((cpu_id[2] >> 20) & 1) bSSE4_2 = true;
 		if ((cpu_id[2] >> 25) & 1) bAES = true;
 
+		// To check DAZ support, we first need to check FXSAVE support.
 		if ((cpu_id[3] >> 24) & 1)
 		{
 			// We can use FXSAVE.
@@ -181,7 +182,12 @@ void CPUInfo::Detect()
 
 			// lowest byte of MXCSR_MASK
 			if ((fx_state[0x1C] >> 6) & 1)
-				bDAZ = true;
+			{
+				// On x86, the FTZ field (supported since SSE1) only flushes denormal _outputs_ to zero.
+				// Now that we have also confirmed DAZ support (flushing denormal _inputs_ to zero),
+				// we can set our generic flag.
+				bFlushToZero = true;
+			}
 		}
 
 		// AVX support requires 3 separate checks:
diff --git a/Source/Core/Common/Src/x64FPURoundMode.cpp b/Source/Core/Common/Src/x64FPURoundMode.cpp
index a8b0d16809..f46c6000eb 100644
--- a/Source/Core/Common/Src/x64FPURoundMode.cpp
+++ b/Source/Core/Common/Src/x64FPURoundMode.cpp
@@ -103,7 +103,7 @@ namespace FPURoundMode
 		};
 		if (nonIEEEMode)
 		{
-			csr |= denormalLUT[cpu_info.bDAZ];
+			csr |= denormalLUT[cpu_info.bFlushToZero];
 		}
 		_mm_setcsr(csr);
 	}
diff --git a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_FPUtils.h b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_FPUtils.h
index d379bf7049..9190a18ed7 100644
--- a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_FPUtils.h
+++ b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_FPUtils.h
@@ -5,6 +5,7 @@
 #ifndef _INTERPRETER_FPUTILS_H
 #define _INTERPRETER_FPUTILS_H
 
+#include "CPUDetect.h"
 #include "Interpreter.h"
 #include "MathUtil.h"
 
@@ -69,28 +70,22 @@ inline void UpdateFPSCR()
 
 inline double ForceSingle(double _x)
 {
-	//if (FPSCR.RN != 0)
-	//	PanicAlert("RN = %d at %x", (int)FPSCR.RN, PC);
-	if (FPSCR.NI)
-		_x = FlushToZeroAsFloat(_x);
-
-	double x = static_cast<float>(_x);
-
+	// convert to float...
+	float x = _x;
+	if (!cpu_info.bFlushToZero && FPSCR.NI)
+	{
+		x = FlushToZero(x);
+	}
+	// ...and back to double:
 	return x;
 }
 
 inline double ForceDouble(double d)
 {
-	//if (FPSCR.RN != 0)
-	//	PanicAlert("RN = %d at %x", (int)FPSCR.RN, PC);
-
-	//if (FPSCR.NI)
-	//{
-	//	IntDouble x; x.d = d;
-		//if ((x.i & DOUBLE_EXP) == 0)
-		//	x.i &= DOUBLE_SIGN;  // turn into signed zero
-	//	return x.d;
-	//}
+	if (!cpu_info.bFlushToZero && FPSCR.NI)
+	{
+		d = FlushToZero(d);
+	}
 	return d;
 }
 

From d78c8c21a21a58f6bcd0fd8c01648c5c0bf2cc8b Mon Sep 17 00:00:00 2001
From: degasus <wickmarkus@web.de>
Date: Thu, 24 Oct 2013 22:50:20 +0200
Subject: [PATCH 3/4] jit64: enable fp instructions faddx fsubx fdivx fdivsx

All of them are now accurate enough, as we handle non-IEEE floats correctly.
---
 Source/Core/Core/Src/PowerPC/Jit64/Jit.h         |  2 +-
 .../Core/Core/Src/PowerPC/Jit64/Jit64_Tables.cpp | 16 ++++++++--------
 .../Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp |  4 ++--
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit.h b/Source/Core/Core/Src/PowerPC/Jit64/Jit.h
index 71a556cb5f..139414a103 100644
--- a/Source/Core/Core/Src/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit.h
@@ -182,7 +182,7 @@ public:
 	void ps_sum(UGeckoInstruction inst);
 	void ps_muls(UGeckoInstruction inst);
 
-	void fp_arith_s(UGeckoInstruction inst);
+	void fp_arith(UGeckoInstruction inst);
 	void frsqrtex(UGeckoInstruction inst);
 
 	void fcmpx(UGeckoInstruction inst);
diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit64_Tables.cpp
index 0ff2bf5d7f..dc81015573 100644
--- a/Source/Core/Core/Src/PowerPC/Jit64/Jit64_Tables.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit64_Tables.cpp
@@ -320,12 +320,12 @@ static GekkoOPTemplate table31_2[] =
 
 static GekkoOPTemplate table59[] =
 {
-	{18, &Jit64::Default},       //{"fdivsx",   OPTYPE_FPU, FL_RC_BIT_F, 16}},
-	{20, &Jit64::fp_arith_s}, //"fsubsx",   OPTYPE_FPU, FL_RC_BIT_F}},
-	{21, &Jit64::fp_arith_s}, //"faddsx",   OPTYPE_FPU, FL_RC_BIT_F}},
+	{18, &Jit64::fp_arith}, //{"fdivsx",   OPTYPE_FPU, FL_RC_BIT_F, 16}},
+	{20, &Jit64::fp_arith}, //"fsubsx",   OPTYPE_FPU, FL_RC_BIT_F}},
+	{21, &Jit64::fp_arith}, //"faddsx",   OPTYPE_FPU, FL_RC_BIT_F}},
 //	{22, &Jit64::Default}, //"fsqrtsx",  OPTYPE_FPU, FL_RC_BIT_F}}, // Not implemented on gekko
 	{24, &Jit64::Default}, //"fresx",    OPTYPE_FPU, FL_RC_BIT_F}},
-	{25, &Jit64::fp_arith_s}, //"fmulsx",   OPTYPE_FPU, FL_RC_BIT_F}},
+	{25, &Jit64::fp_arith}, //"fmulsx",   OPTYPE_FPU, FL_RC_BIT_F}},
 	{28, &Jit64::fmaddXX}, //"fmsubsx",  OPTYPE_FPU, FL_RC_BIT_F}},
 	{29, &Jit64::fmaddXX}, //"fmaddsx",  OPTYPE_FPU, FL_RC_BIT_F}},
 	{30, &Jit64::fmaddXX}, //"fnmsubsx", OPTYPE_FPU, FL_RC_BIT_F}},
@@ -354,12 +354,12 @@ static GekkoOPTemplate table63[] =
 
 static GekkoOPTemplate table63_2[] =
 {
-	{18, &Jit64::Default}, //"fdivx",    OPTYPE_FPU, FL_RC_BIT_F, 30}},
-	{20, &Jit64::Default}, //"fsubx",    OPTYPE_FPU, FL_RC_BIT_F}},
-	{21, &Jit64::Default}, //"faddx",    OPTYPE_FPU, FL_RC_BIT_F}},
+	{18, &Jit64::fp_arith}, //"fdivx",    OPTYPE_FPU, FL_RC_BIT_F, 30}},
+	{20, &Jit64::fp_arith}, //"fsubx",    OPTYPE_FPU, FL_RC_BIT_F}},
+	{21, &Jit64::fp_arith}, //"faddx",    OPTYPE_FPU, FL_RC_BIT_F}},
 	{22, &Jit64::Default}, //"fsqrtx",   OPTYPE_FPU, FL_RC_BIT_F}},
 	{23, &Jit64::Default}, //"fselx",    OPTYPE_FPU, FL_RC_BIT_F}},
-	{25, &Jit64::fp_arith_s}, //"fmulx",    OPTYPE_FPU, FL_RC_BIT_F}},
+	{25, &Jit64::fp_arith}, //"fmulx",    OPTYPE_FPU, FL_RC_BIT_F}},
 	{26, &Jit64::frsqrtex}, //"frsqrtex", OPTYPE_FPU, FL_RC_BIT_F}},
 	{28, &Jit64::fmaddXX}, //"fmsubx",   OPTYPE_FPU, FL_RC_BIT_F}},
 	{29, &Jit64::fmaddXX}, //"fmaddx",   OPTYPE_FPU, FL_RC_BIT_F}},
diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp
index c4699ecc62..b0d0ab4853 100644
--- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -85,7 +85,7 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single,
 	fpr.UnlockAll();
 }
 
-void Jit64::fp_arith_s(UGeckoInstruction inst)
+void Jit64::fp_arith(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
 	JITDISABLE(bJITFloatingPointOff)
@@ -106,7 +106,7 @@ void Jit64::fp_arith_s(UGeckoInstruction inst)
 	case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true,  single, &XEmitter::ADDSD, &XEmitter::VADDSD); break; //add
 	case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true,  single, &XEmitter::MULSD, &XEmitter::VMULSD); break; //mul
 	default:
-		_assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!");
+		_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
 	}
 }
 

From 288bef280784e02cc80f0aee9360e45548c06e9a Mon Sep 17 00:00:00 2001
From: Tillmann Karras <tilkax@gmail.com>
Date: Sun, 3 Nov 2013 23:56:30 +0100
Subject: [PATCH 4/4] x64: add small warning if CPU has SSE2 but not DAZ

---
 Source/Core/Common/Src/x64CPUDetect.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/Source/Core/Common/Src/x64CPUDetect.cpp b/Source/Core/Common/Src/x64CPUDetect.cpp
index d6f36eb142..2fa25e8074 100644
--- a/Source/Core/Common/Src/x64CPUDetect.cpp
+++ b/Source/Core/Common/Src/x64CPUDetect.cpp
@@ -250,7 +250,12 @@ std::string CPUInfo::Summarize()
 {
 	std::string sum(cpu_string);
 	if (bSSE) sum += ", SSE";
-	if (bSSE2) sum += ", SSE2";
+	if (bSSE2)
+	{
+		sum += ", SSE2";
+		if (!bFlushToZero)
+			sum += " (but not DAZ!)";
+	}
 	if (bSSE3) sum += ", SSE3";
 	if (bSSSE3) sum += ", SSSE3";
 	if (bSSE4_1) sum += ", SSE4.1";