From ad4ad7c1eded30d60f0cff81317c3a69061289a1 Mon Sep 17 00:00:00 2001
From: magumagu <magumagu9@gmail.com>
Date: Wed, 21 May 2014 21:29:43 -0700
Subject: [PATCH 1/7] Use accurate frsqrte in Interpreter.

The implementation of frsqrte exposed by this change isn't completely
correct; that will be fixed in a later commit.
---
 .../Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp  | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp
index e87472c87b..fbdaf7646a 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp
@@ -419,7 +419,6 @@ void Interpreter::frsqrtex(UGeckoInstruction _inst)
 	}
 	else
 	{
-#ifdef VERY_ACCURATE_FP
 		if (b == 0.0)
 		{
 			SetFPException(FPSCR_ZX);
@@ -442,11 +441,6 @@ void Interpreter::frsqrtex(UGeckoInstruction _inst)
 			outa |= frsqrtex_lut[idx] >> 12;
 			riPS0(_inst.FD) = ((u64)outa << 32) + (u64)outb;
 		}
-#else
-		if (b == 0.0)
-			SetFPException(FPSCR_ZX);
-		rPS0(_inst.FD) = ForceDouble(1.0 / sqrt(b));
-#endif
 	}
 	UpdateFPRF(rPS0(_inst.FD));
 	if (_inst.Rc) Helper_UpdateCR1();

From 2f8a147eda29297e7166962f8b5eea71a87787ad Mon Sep 17 00:00:00 2001
From: magumagu <magumagu9@gmail.com>
Date: Thu, 22 May 2014 01:06:30 -0700
Subject: [PATCH 2/7] Interpreter: make fres match hardware.

New table-based implementation written based on actual hardware behavior.
(hwtest coming soon).
---
 .../PowerPC/Interpreter/Interpreter_FPUtils.h | 58 +++++++++++++++++++
 .../Interpreter/Interpreter_FloatingPoint.cpp | 14 +----
 .../Interpreter/Interpreter_Paired.cpp        | 18 +-----
 3 files changed, 61 insertions(+), 29 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
index 5131e526b3..3971999fa8 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
@@ -271,3 +271,61 @@ inline u64 ConvertToDouble(u32 _x)
 		return ((x & 0xc0000000) << 32) | z | ((x & 0x3fffffff) << 29);
 	}
 }
+
+// Used by fres and ps_res.
+inline double ApproximateReciprocal(double val)
+{
+	static const int expected_base[] = {
+		0x7ff800, 0x783800, 0x70ea00, 0x6a0800,
+		0x638800, 0x5d6200, 0x579000, 0x520800,
+		0x4cc800, 0x47ca00, 0x430800, 0x3e8000,
+		0x3a2c00, 0x360800, 0x321400, 0x2e4a00,
+		0x2aa800, 0x272c00, 0x23d600, 0x209e00,
+		0x1d8800, 0x1a9000, 0x17ae00, 0x14f800,
+		0x124400, 0x0fbe00, 0x0d3800, 0x0ade00,
+		0x088400, 0x065000, 0x041c00, 0x020c00,
+	};
+	static const int expected_dec[] = {
+		0x3e1, 0x3a7, 0x371, 0x340,
+		0x313, 0x2ea, 0x2c4, 0x2a0,
+		0x27f, 0x261, 0x245, 0x22a,
+		0x212, 0x1fb, 0x1e5, 0x1d1,
+		0x1be, 0x1ac, 0x19b, 0x18b,
+		0x17c, 0x16e, 0x15b, 0x15b,
+		0x143, 0x143, 0x12d, 0x12d,
+		0x11a, 0x11a, 0x108, 0x106,
+	};
+
+	union {
+		double valf;
+		long long vali;
+	};
+	valf = val;
+	long long mantissa = vali & ((1LL << 52) - 1);
+	long long sign = vali & (1ULL << 63);
+	long long exponent = vali & (0x7FFLL << 52);
+
+	// Special case 0
+	if (mantissa == 0 && exponent == 0)
+		return sign ? -INFINITY : INFINITY;
+	// Special case NaN-ish numbers
+	if (exponent == (0x7FFLL << 52))
+	{
+		if (mantissa == 0)
+			return sign ? -0.0 : 0.0;
+		return 0.0 + valf;
+	}
+	// Special case small inputs
+	if (exponent < (895LL << 52))
+		return sign ? -FLT_MAX : FLT_MAX;
+	// Special case large inputs
+	if (exponent >= (1149LL << 52))
+		return sign ? -0.0f : 0.0f;
+
+	exponent = (0x7FDLL << 52) - exponent;
+
+	int i = (int)(mantissa >> 37);
+	vali = sign | exponent;
+	vali |= (long long)(expected_base[i / 1024] - (expected_dec[i / 1024] * (i % 1024) + 1) / 2) << 29;
+	return valf;
+}
diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp
index fbdaf7646a..980e0e7078 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp
@@ -388,19 +388,7 @@ void Interpreter::fdivsx(UGeckoInstruction _inst)
 void Interpreter::fresx(UGeckoInstruction _inst)
 {
 	double b = rPS0(_inst.FB);
-	double one_over = ForceSingle(1.0 / b);
-	// this is based on the real hardware tests
-	if (b != 0.0 && IsINF(one_over))
-	{
-		if (one_over > 0)
-			riPS0(_inst.FD) = riPS1(_inst.FD) = MAX_SINGLE;
-		else
-			riPS0(_inst.FD) = riPS1(_inst.FD) = MIN_SINGLE;
-	}
-	else
-	{
-		rPS0(_inst.FD) = rPS1(_inst.FD) = one_over;
-	}
+	rPS0(_inst.FD) = rPS1(_inst.FD) = ApproximateReciprocal(b);
 	if (b == 0.0)
 	{
 		SetFPException(FPSCR_ZX);
diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp
index 387d46ea96..babf80db09 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp
@@ -178,22 +178,8 @@ void Interpreter::ps_res(UGeckoInstruction _inst)
 	{
 		SetFPException(FPSCR_ZX);
 	}
-	rPS0(_inst.FD) = ForceSingle(1.0 / a);
-	if (a != 0.0 && IsINF(rPS0(_inst.FD)))
-	{
-		if (rPS0(_inst.FD) > 0)
-			riPS0(_inst.FD) = MAX_SINGLE; // largest finite single
-		else
-			riPS0(_inst.FD) = MIN_SINGLE; // most negative finite single
-	}
-	rPS1(_inst.FD) = ForceSingle(1.0 / b);
-	if (b != 0.0 && IsINF(rPS1(_inst.FD)))
-	{
-		if (rPS1(_inst.FD) > 0)
-			riPS1(_inst.FD) = MAX_SINGLE;
-		else
-			riPS1(_inst.FD) = MIN_SINGLE;
-	}
+	rPS0(_inst.FD) = ApproximateReciprocal(a);
+	rPS1(_inst.FD) = ApproximateReciprocal(b);
 	UpdateFPRF(rPS0(_inst.FD));
 	if (_inst.Rc) Helper_UpdateCR1();
 }

From 567724b2f8027b4b99e8101af705ff5ceec9c9e7 Mon Sep 17 00:00:00 2001
From: magumagu <magumagu9@gmail.com>
Date: Fri, 23 May 2014 14:57:34 -0700
Subject: [PATCH 3/7] Jit: get rid of incorrect implementations of fres and
 frsqrte.

The existing implementations produce answers which aren't consistent with
the hardware, and games care about correct floating point math.

These can be reimplemented at some point in the future, if someone cares
enough, but the general case is probably too much code to inline.

(I'm leaving the ARM implementations in place by request, even though they
suffer the same issues.)
---
 Source/Core/Core/PowerPC/Jit64/Jit.h          |  2 --
 .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp  |  6 ++--
 .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp  | 15 --------
 Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp | 35 -------------------
 Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp   |  9 -----
 .../Core/PowerPC/Jit64IL/JitIL_Tables.cpp     |  2 +-
 Source/Core/Core/PowerPC/JitArmIL/IR_Arm.cpp  |  1 -
 Source/Core/Core/PowerPC/JitILCommon/IR.cpp   |  2 +-
 Source/Core/Core/PowerPC/JitILCommon/IR.h     |  4 ---
 .../JitILCommon/JitILBase_FloatingPoint.cpp   |  9 +----
 10 files changed, 6 insertions(+), 79 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h
index 0125eeae86..d273d32aaf 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@@ -168,12 +168,10 @@ public:
 	void ps_arith(UGeckoInstruction inst); //aggregate
 	void ps_mergeXX(UGeckoInstruction inst);
 	void ps_maddXX(UGeckoInstruction inst);
-	void ps_recip(UGeckoInstruction inst);
 	void ps_sum(UGeckoInstruction inst);
 	void ps_muls(UGeckoInstruction inst);
 
 	void fp_arith(UGeckoInstruction inst);
-	void frsqrtex(UGeckoInstruction inst);
 
 	void fcmpx(UGeckoInstruction inst);
 	void fmrx(UGeckoInstruction inst);
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
index 1375d926c0..2a8c3d6072 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
@@ -138,9 +138,9 @@ static GekkoOPTemplate table4_2[] =
 	{20, &Jit64::ps_arith},  //"ps_sub",    OPTYPE_PS, 0}},
 	{21, &Jit64::ps_arith},  //"ps_add",    OPTYPE_PS, 0}},
 	{23, &Jit64::ps_sel},    //"ps_sel",    OPTYPE_PS, 0}},
-	{24, &Jit64::ps_recip},  //"ps_res",    OPTYPE_PS, 0}},
+	{24, &Jit64::FallBackToInterpreter},    //"ps_res",    OPTYPE_PS, 0}},
 	{25, &Jit64::ps_arith},  //"ps_mul",    OPTYPE_PS, 0}},
-	{26, &Jit64::ps_recip},  //"ps_rsqrte", OPTYPE_PS, 0, 1}},
+	{26, &Jit64::FallBackToInterpreter},    //"ps_rsqrte", OPTYPE_PS, 0, 1}},
 	{28, &Jit64::ps_maddXX}, //"ps_msub",   OPTYPE_PS, 0}},
 	{29, &Jit64::ps_maddXX}, //"ps_madd",   OPTYPE_PS, 0}},
 	{30, &Jit64::ps_maddXX}, //"ps_nmsub",  OPTYPE_PS, 0}},
@@ -360,7 +360,7 @@ static GekkoOPTemplate table63_2[] =
 	{22, &Jit64::FallBackToInterpreter}, //"fsqrtx",   OPTYPE_FPU, FL_RC_BIT_F}},
 	{23, &Jit64::FallBackToInterpreter}, //"fselx",    OPTYPE_FPU, FL_RC_BIT_F}},
 	{25, &Jit64::fp_arith},              //"fmulx",    OPTYPE_FPU, FL_RC_BIT_F}},
-	{26, &Jit64::frsqrtex},              //"frsqrtex", OPTYPE_FPU, FL_RC_BIT_F}},
+	{26, &Jit64::FallBackToInterpreter}, //"frsqrtex", OPTYPE_FPU, FL_RC_BIT_F}},
 	{28, &Jit64::fmaddXX},               //"fmsubx",   OPTYPE_FPU, FL_RC_BIT_F}},
 	{29, &Jit64::fmaddXX},               //"fmaddx",   OPTYPE_FPU, FL_RC_BIT_F}},
 	{30, &Jit64::fmaddXX},               //"fnmsubx",  OPTYPE_FPU, FL_RC_BIT_F}},
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index e733c3f56a..b5cd8833a6 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -101,21 +101,6 @@ void Jit64::fp_arith(UGeckoInstruction inst)
 	}
 }
 
-void Jit64::frsqrtex(UGeckoInstruction inst)
-{
-	INSTRUCTION_START
-	JITDISABLE(bJITFloatingPointOff)
-	int d = inst.FD;
-	int b = inst.FB;
-	fpr.Lock(b, d);
-	fpr.BindToRegister(d, true, true);
-	MOVSD(XMM0, M((void *)&one_const));
-	SQRTSD(XMM1, fpr.R(b));
-	DIVSD(XMM0, R(XMM1));
-	MOVSD(fpr.R(d), XMM0);
-	fpr.UnlockAll();
-}
-
 void Jit64::fmaddXX(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
index 58d73a845e..cd71dbbd9e 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
@@ -112,41 +112,6 @@ void Jit64::ps_sign(UGeckoInstruction inst)
 	fpr.UnlockAll();
 }
 
-// ps_res and ps_rsqrte
-void Jit64::ps_recip(UGeckoInstruction inst)
-{
-	INSTRUCTION_START
-	JITDISABLE(bJITPairedOff)
-
-	if (inst.Rc)
-	{
-		FallBackToInterpreter(inst);
-		return;
-	}
-
-	OpArg divisor;
-	int d = inst.FD;
-	int b = inst.FB;
-	fpr.Lock(d, b);
-	fpr.BindToRegister(d, (d == b));
-	switch (inst.SUBOP5)
-	{
-	case 24:
-		// ps_res
-		divisor = fpr.R(b);
-		break;
-	case 26:
-		// ps_rsqrte
-		SQRTPD(XMM0, fpr.R(b));
-		divisor = R(XMM0);
-		break;
-	}
-	MOVAPD(XMM1, M((void*)&psOneOne));
-	DIVPD(XMM1, divisor);
-	MOVAPD(fpr.R(d), XMM1);
-	fpr.UnlockAll();
-}
-
 //add a, b, c
 
 //mov a, b
diff --git a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
index 53d0323c71..cd31cedb2d 100644
--- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
+++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
@@ -674,7 +674,6 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress) {
 		case FSMul:
 		case FSAdd:
 		case FSSub:
-		case FSRSqrt:
 		case FDMul:
 		case FDAdd:
 		case FDSub:
@@ -1435,14 +1434,6 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress) {
 			fregEmitBinInst(RI, I, &JitIL::SUBSS);
 			break;
 		}
-		case FSRSqrt: {
-			if (!thisUsed) break;
-			X64Reg reg = fregURegWithoutMov(RI, I);
-			Jit->RSQRTSS(reg, fregLocForInst(RI, getOp1(I)));
-			RI.fregs[reg] = I;
-			fregNormalRegClear(RI, I);
-			break;
-		}
 		case FDMul: {
 			if (!thisUsed) break;
 			fregEmitBinInst(RI, I, &JitIL::MULSD);
diff --git a/Source/Core/Core/PowerPC/Jit64IL/JitIL_Tables.cpp b/Source/Core/Core/PowerPC/Jit64IL/JitIL_Tables.cpp
index 59d5a3e581..c3214f6e18 100644
--- a/Source/Core/Core/PowerPC/Jit64IL/JitIL_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Jit64IL/JitIL_Tables.cpp
@@ -361,7 +361,7 @@ static GekkoOPTemplate table63_2[] =
 	{22, &JitIL::FallBackToInterpreter}, //"fsqrtx",   OPTYPE_FPU, FL_RC_BIT_F}},
 	{23, &JitIL::FallBackToInterpreter}, //"fselx",    OPTYPE_FPU, FL_RC_BIT_F}},
 	{25, &JitIL::fp_arith_s},            //"fmulx",    OPTYPE_FPU, FL_RC_BIT_F}},
-	{26, &JitIL::fp_arith_s},            //"frsqrtex", OPTYPE_FPU, FL_RC_BIT_F}},
+	{26, &JitIL::FallBackToInterpreter}, //"frsqrtex", OPTYPE_FPU, FL_RC_BIT_F}},
 	{28, &JitIL::fmaddXX},               //"fmsubx",   OPTYPE_FPU, FL_RC_BIT_F}},
 	{29, &JitIL::fmaddXX},               //"fmaddx",   OPTYPE_FPU, FL_RC_BIT_F}},
 	{30, &JitIL::fmaddXX},               //"fnmsubx",  OPTYPE_FPU, FL_RC_BIT_F}},
diff --git a/Source/Core/Core/PowerPC/JitArmIL/IR_Arm.cpp b/Source/Core/Core/PowerPC/JitArmIL/IR_Arm.cpp
index d85260d15a..931056f592 100644
--- a/Source/Core/Core/PowerPC/JitArmIL/IR_Arm.cpp
+++ b/Source/Core/Core/PowerPC/JitArmIL/IR_Arm.cpp
@@ -394,7 +394,6 @@ static void DoWriteCode(IRBuilder* ibuild, JitArmIL* Jit, u32 exitAddress) {
 		case FSMul:
 		case FSAdd:
 		case FSSub:
-		case FSRSqrt:
 		case FDMul:
 		case FDAdd:
 		case FDSub:
diff --git a/Source/Core/Core/PowerPC/JitILCommon/IR.cpp b/Source/Core/Core/PowerPC/JitILCommon/IR.cpp
index 3684e4e514..f9bd2a5820 100644
--- a/Source/Core/Core/PowerPC/JitILCommon/IR.cpp
+++ b/Source/Core/Core/PowerPC/JitILCommon/IR.cpp
@@ -1128,7 +1128,7 @@ unsigned IRBuilder::getNumberOfOperands(InstLoc I) const {
 		numberOfOperands[CInt32] = 0;
 
 		static unsigned ZeroOp[] = {LoadCR, LoadLink, LoadMSR, LoadGReg, LoadCTR, InterpreterBranch, LoadCarry, RFIExit, LoadFReg, LoadFRegDENToZero, LoadGQR, Int3, };
-		static unsigned UOp[] = {StoreLink, BranchUncond, StoreCR, StoreMSR, StoreFPRF, StoreGReg, StoreCTR, Load8, Load16, Load32, SExt16, SExt8, Cntlzw, Not, StoreCarry, SystemCall, ShortIdleLoop, LoadSingle, LoadDouble, LoadPaired, StoreFReg, DupSingleToMReg, DupSingleToPacked, ExpandPackedToMReg, CompactMRegToPacked, FSNeg, FSRSqrt, FDNeg, FPDup0, FPDup1, FPNeg, DoubleToSingle, StoreGQR, StoreSRR, };
+		static unsigned UOp[] = {StoreLink, BranchUncond, StoreCR, StoreMSR, StoreFPRF, StoreGReg, StoreCTR, Load8, Load16, Load32, SExt16, SExt8, Cntlzw, Not, StoreCarry, SystemCall, ShortIdleLoop, LoadSingle, LoadDouble, LoadPaired, StoreFReg, DupSingleToMReg, DupSingleToPacked, ExpandPackedToMReg, CompactMRegToPacked, FSNeg, FDNeg, FPDup0, FPDup1, FPNeg, DoubleToSingle, StoreGQR, StoreSRR, };
 		static unsigned BiOp[] = {BranchCond, IdleBranch, And, Xor, Sub, Or, Add, Mul, Rol, Shl, Shrl, Sarl, ICmpEq, ICmpNe, ICmpUgt, ICmpUlt, ICmpSgt, ICmpSlt, ICmpSge, ICmpSle, Store8, Store16, Store32, ICmpCRSigned, ICmpCRUnsigned, FallBackToInterpreter, StoreSingle, StoreDouble, StorePaired, InsertDoubleInMReg, FSMul, FSAdd, FSSub, FDMul, FDAdd, FDSub, FPAdd, FPMul, FPSub, FPMerge00, FPMerge01, FPMerge10, FPMerge11, FDCmpCR, };
 		for (auto& op : ZeroOp) {
 			numberOfOperands[op] = 0;
diff --git a/Source/Core/Core/PowerPC/JitILCommon/IR.h b/Source/Core/Core/PowerPC/JitILCommon/IR.h
index 062738edc6..f84f29fe02 100644
--- a/Source/Core/Core/PowerPC/JitILCommon/IR.h
+++ b/Source/Core/Core/PowerPC/JitILCommon/IR.h
@@ -113,7 +113,6 @@ enum Opcode {
 	FSAdd,
 	FSSub,
 	FSNeg,
-	FSRSqrt,
 	FPAdd,
 	FPMul,
 	FPSub,
@@ -464,9 +463,6 @@ public:
 	InstLoc EmitFSNeg(InstLoc op1) {
 		return FoldUOp(FSNeg, op1);
 	}
-	InstLoc EmitFSRSqrt(InstLoc op1) {
-		return FoldUOp(FSRSqrt, op1);
-	}
 	InstLoc EmitFDMul(InstLoc op1, InstLoc op2) {
 		return FoldBiOp(FDMul, op1, op2);
 	}
diff --git a/Source/Core/Core/PowerPC/JitILCommon/JitILBase_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitILCommon/JitILBase_FloatingPoint.cpp
index 464e133c20..ec3d01e663 100644
--- a/Source/Core/Core/PowerPC/JitILCommon/JitILBase_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitILCommon/JitILBase_FloatingPoint.cpp
@@ -9,8 +9,7 @@ void JitILBase::fp_arith_s(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
 	JITDISABLE(bJITFloatingPointOff)
-	if (inst.Rc || (inst.SUBOP5 != 25 && inst.SUBOP5 != 20 &&
-	                inst.SUBOP5 != 21 && inst.SUBOP5 != 26))
+	if (inst.Rc || (inst.SUBOP5 != 25 && inst.SUBOP5 != 20 && inst.SUBOP5 != 21))
 	{
 		FallBackToInterpreter(inst);
 		return;
@@ -35,12 +34,6 @@ void JitILBase::fp_arith_s(UGeckoInstruction inst)
 	case 25: //mul
 		val = ibuild.EmitFDMul(val, ibuild.EmitLoadFReg(inst.FC));
 		break;
-	case 26: //rsqrte
-		val = ibuild.EmitLoadFReg(inst.FB);
-		val = ibuild.EmitDoubleToSingle(val);
-		val = ibuild.EmitFSRSqrt(val);
-		val = ibuild.EmitDupSingleToMReg(val);
-		break;
 	default:
 		_assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!");
 	}

From 129e76e60d125295ea1dfe536779958fbc6c59fb Mon Sep 17 00:00:00 2001
From: magumagu <magumagu9@gmail.com>
Date: Thu, 22 May 2014 01:53:22 -0700
Subject: [PATCH 4/7] Interpreter: refactor the rsqrte code, and use it for
 ps_rsqrte.

---
 .../PowerPC/Interpreter/Interpreter_FPUtils.h | 32 +++++++++++++++++++
 .../Interpreter/Interpreter_FloatingPoint.cpp | 28 ++--------------
 .../Interpreter/Interpreter_Paired.cpp        | 27 +++-------------
 3 files changed, 40 insertions(+), 47 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
index 3971999fa8..6c112eac66 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
@@ -6,6 +6,7 @@
 
 #include "Common/CPUDetect.h"
 #include "Common/MathUtil.h"
+#include "Core/PowerPC/LUT_frsqrtex.h"
 #include "Core/PowerPC/Interpreter/Interpreter.h"
 
 using namespace MathUtil;
@@ -329,3 +330,34 @@ inline double ApproximateReciprocal(double val)
 	vali |= (long long)(expected_base[i / 1024] - (expected_dec[i / 1024] * (i % 1024) + 1) / 2) << 29;
 	return valf;
 }
+
+inline double ApproximateReciprocalSquareRoot(double val)
+{
+	if (val < 0)
+		return PPC_NAN;
+	if (val == 0.0)
+		return INFINITY;
+
+	union
+	{
+		double valf;
+		u64 vali;
+	};
+	valf = val;
+
+	u32 fsa = vali >> 32;
+	u32 idx = (fsa >> 5) % (sizeof(frsqrtex_lut) / sizeof(frsqrtex_lut[0]));
+
+	s32 e = fsa >> (32 - 12);
+	e &= 2047;
+	e -= 1023;
+	s32 oe = -((e + 1) / 2);
+	oe -= ((e + 1) & 1);
+
+	u32 outb = frsqrtex_lut[idx] << 20;
+	u32 outa = ((oe + 1023) & 2047) << 20;
+	outa |= frsqrtex_lut[idx] >> 12;
+
+	vali = ((u64)outa << 32) + (u64)outb;
+	return valf;
+}
diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp
index 980e0e7078..f214a53447 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp
@@ -18,7 +18,6 @@
 #endif
 
 #include "Common/MathUtil.h"
-#include "Core/PowerPC/LUT_frsqrtex.h"
 #include "Core/PowerPC/Interpreter/Interpreter.h"
 #include "Core/PowerPC/Interpreter/Interpreter_FPUtils.h"
 
@@ -403,33 +402,12 @@ void Interpreter::frsqrtex(UGeckoInstruction _inst)
 	if (b < 0.0)
 	{
 		SetFPException(FPSCR_VXSQRT);
-		rPS0(_inst.FD) = PPC_NAN;
 	}
-	else
+	else if (b == 0.0)
 	{
-		if (b == 0.0)
-		{
-			SetFPException(FPSCR_ZX);
-			riPS0(_inst.FD) = 0x7ff0000000000000;
-		}
-		else
-		{
-			u32 fsa = Common::swap32(Common::swap64(riPS0(_inst.FB)));
-			u32 fsb = Common::swap32(Common::swap64(riPS0(_inst.FB)) >> 32);
-			u32 idx=(fsa >> 5) % (sizeof(frsqrtex_lut) / sizeof(frsqrtex_lut[0]));
-
-			s32 e = fsa >> (32-12);
-			e &= 2047;
-			e -= 1023;
-			s32 oe =- ((e + 1) / 2);
-			oe -= ((e + 1) & 1);
-
-			u32 outb = frsqrtex_lut[idx] << 20;
-			u32 outa = ((oe + 1023) & 2047) << 20;
-			outa |= frsqrtex_lut[idx] >> 12;
-			riPS0(_inst.FD) = ((u64)outa << 32) + (u64)outb;
-		}
+		SetFPException(FPSCR_ZX);
 	}
+	rPS0(_inst.FD) = ApproximateReciprocalSquareRoot(b);
 	UpdateFPRF(rPS0(_inst.FD));
 	if (_inst.Rc) Helper_UpdateCR1();
 }
diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp
index babf80db09..bfd8007ead 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp
@@ -186,36 +186,19 @@ void Interpreter::ps_res(UGeckoInstruction _inst)
 
 void Interpreter::ps_rsqrte(UGeckoInstruction _inst)
 {
-	// this code is based on the real hardware tests
 	if (rPS0(_inst.FB) == 0.0 || rPS1(_inst.FB) == 0.0)
 	{
 		SetFPException(FPSCR_ZX);
 	}
-	// PS0
-	if (rPS0(_inst.FB) < 0.0)
+
+	if (rPS0(_inst.FB) < 0.0 || rPS1(_inst.FB) < 0.0)
 	{
 		SetFPException(FPSCR_VXSQRT);
-		rPS0(_inst.FD) = PPC_NAN;
-	}
-	else
-	{
-		rPS0(_inst.FD) = 1.0 / sqrt(rPS0(_inst.FB));
-		u32 t = ConvertToSingle(riPS0(_inst.FD));
-		rPS0(_inst.FD) = *(float*)&t;
-	}
-	// PS1
-	if (rPS1(_inst.FB) < 0.0)
-	{
-		SetFPException(FPSCR_VXSQRT);
-		rPS1(_inst.FD) = PPC_NAN;
-	}
-	else
-	{
-		rPS1(_inst.FD) = 1.0 / sqrt(rPS1(_inst.FB));
-		u32 t = ConvertToSingle(riPS1(_inst.FD));
-		rPS1(_inst.FD) = *(float*)&t;
 	}
 
+	rPS0(_inst.FD) = ApproximateReciprocalSquareRoot(rPS0(_inst.FB));
+	rPS1(_inst.FD) = ApproximateReciprocalSquareRoot(rPS1(_inst.FB));
+
 	UpdateFPRF(rPS0(_inst.FD));
 	if (_inst.Rc) Helper_UpdateCR1();
 }

From a9a2d3d98d727a72a0bc2e1fe457ade81b0a333f Mon Sep 17 00:00:00 2001
From: magumagu <magumagu9@gmail.com>
Date: Thu, 22 May 2014 06:12:29 -0700
Subject: [PATCH 5/7] New frsqrte implementation; verified accurate.

This is similar to the old implementation, but it uses smaller tables, and
handles more edge cases correctly.  (hwtest coming soon.)
---
 .../PowerPC/Interpreter/Interpreter_FPUtils.h | 74 ++++++++++++++-----
 1 file changed, 55 insertions(+), 19 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
index 6c112eac66..c5d24f7a03 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
@@ -6,7 +6,6 @@
 
 #include "Common/CPUDetect.h"
 #include "Common/MathUtil.h"
-#include "Core/PowerPC/LUT_frsqrtex.h"
 #include "Core/PowerPC/Interpreter/Interpreter.h"
 
 using namespace MathUtil;
@@ -333,31 +332,68 @@ inline double ApproximateReciprocal(double val)
 
 inline double ApproximateReciprocalSquareRoot(double val)
 {
-	if (val < 0)
-		return PPC_NAN;
-	if (val == 0.0)
-		return INFINITY;
+	static const int expected_base[] = {
+		0x3ffa000, 0x3c29000, 0x38aa000, 0x3572000,
+		0x3279000, 0x2fb7000, 0x2d26000, 0x2ac0000,
+		0x2881000, 0x2665000, 0x2468000, 0x2287000,
+		0x20c1000, 0x1f12000, 0x1d79000, 0x1bf4000,
+		0x1a7e800, 0x17cb800, 0x1552800, 0x130c000,
+		0x10f2000, 0x0eff000, 0x0d2e000, 0x0b7c000,
+		0x09e5000, 0x0867000, 0x06ff000, 0x05ab800,
+		0x046a000, 0x0339800, 0x0218800, 0x0105800,
+	};
+	static const int expected_dec[] = {
+		0x7a4, 0x700, 0x670, 0x5f2,
+		0x584, 0x524, 0x4cc, 0x47e,
+		0x43a, 0x3fa, 0x3c2, 0x38e,
+		0x35e, 0x332, 0x30a, 0x2e6,
+		0x568, 0x4f3, 0x48d, 0x435,
+		0x3e7, 0x3a2, 0x365, 0x32e,
+		0x2fc, 0x2d0, 0x2a8, 0x283,
+		0x261, 0x243, 0x226, 0x20b,
+	};
 
-	union
-	{
+	union {
 		double valf;
-		u64 vali;
+		long long vali;
 	};
 	valf = val;
+	long long mantissa = vali & ((1LL << 52) - 1);
+	long long sign = vali & (1ULL << 63);
+	long long exponent = vali & (0x7FFLL << 52);
 
-	u32 fsa = vali >> 32;
-	u32 idx = (fsa >> 5) % (sizeof(frsqrtex_lut) / sizeof(frsqrtex_lut[0]));
+	// Special case 0
+	if (mantissa == 0 && exponent == 0)
+		return sign ? -INFINITY : INFINITY;
+	// Special case NaN-ish numbers
+	if (exponent == (0x7FFLL << 52))
+	{
+		if (mantissa == 0)
+			return sign ? NAN : 0.0;
+		return 0.0 + valf;
+	}
+	// Negative numbers return NaN
+	if (sign)
+		return NAN;
 
-	s32 e = fsa >> (32 - 12);
-	e &= 2047;
-	e -= 1023;
-	s32 oe = -((e + 1) / 2);
-	oe -= ((e + 1) & 1);
+	if (!exponent)
+	{
+		// "Normalize" denormal values
+		do
+		{
+			exponent -= 1LL << 52;
+			mantissa <<= 1;
+		} while (!(mantissa & (1LL << 52)));
+		mantissa &= (1LL << 52) - 1;
+		exponent += 1LL << 52;
+	}
 
-	u32 outb = frsqrtex_lut[idx] << 20;
-	u32 outa = ((oe + 1023) & 2047) << 20;
-	outa |= frsqrtex_lut[idx] >> 12;
+	bool odd_exponent = !(exponent & (1LL << 52));
+	exponent = ((0x3FFLL << 52) - ((exponent - (0x3FELL << 52)) / 2)) & (0x7FFLL << 52);
 
-	vali = ((u64)outa << 32) + (u64)outb;
+	int i = (int)(mantissa >> 37);
+	vali = sign | exponent;
+	int index = i / 2048 + (odd_exponent ? 16 : 0);
+	vali |= (long long)(expected_base[index] - expected_dec[index] * (i % 2048)) << 26;
 	return valf;
 }

From d0ed3b81922fe1a6650853931b2a2b26b8d526c6 Mon Sep 17 00:00:00 2001
From: magumagu <magumagu9@gmail.com>
Date: Thu, 22 May 2014 19:20:43 -0700
Subject: [PATCH 6/7] Jit: Use infinity and NaN from numeric_limits.

MSVC's implementation of INFINITY is unusable.
---
 .../PowerPC/Interpreter/Interpreter_FPUtils.h    | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
index c5d24f7a03..906913c423 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
@@ -4,6 +4,8 @@
 
 #pragma once
 
+#include <limits>
+
 #include "Common/CPUDetect.h"
 #include "Common/MathUtil.h"
 #include "Core/PowerPC/Interpreter/Interpreter.h"
@@ -307,7 +309,8 @@ inline double ApproximateReciprocal(double val)
 
 	// Special case 0
 	if (mantissa == 0 && exponent == 0)
-		return sign ? -INFINITY : INFINITY;
+		return sign ? -std::numeric_limits<double>::infinity() :
+		               std::numeric_limits<double>::infinity();
 	// Special case NaN-ish numbers
 	if (exponent == (0x7FFLL << 52))
 	{
@@ -364,17 +367,22 @@ inline double ApproximateReciprocalSquareRoot(double val)
 
 	// Special case 0
 	if (mantissa == 0 && exponent == 0)
-		return sign ? -INFINITY : INFINITY;
+		return sign ? -std::numeric_limits<double>::infinity() :
+		               std::numeric_limits<double>::infinity();
 	// Special case NaN-ish numbers
 	if (exponent == (0x7FFLL << 52))
 	{
 		if (mantissa == 0)
-			return sign ? NAN : 0.0;
+		{
+			if (sign)
+				return std::numeric_limits<double>::quiet_NaN();
+			return 0.0;
+		}
 		return 0.0 + valf;
 	}
 	// Negative numbers return NaN
 	if (sign)
-		return NAN;
+		return std::numeric_limits<double>::quiet_NaN();
 
 	if (!exponent)
 	{

From 6955fef16171bc1ed2892b291ad1ff22a54c6882 Mon Sep 17 00:00:00 2001
From: magumagu <magumagu9@gmail.com>
Date: Fri, 23 May 2014 15:06:09 -0700
Subject: [PATCH 7/7] Interpreter: Code style fixes.

---
 .../PowerPC/Interpreter/Interpreter_FPUtils.h | 26 ++++++++++---------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
index 906913c423..02684e4d7a 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
@@ -298,14 +298,15 @@ inline double ApproximateReciprocal(double val)
 		0x11a, 0x11a, 0x108, 0x106,
 	};
 
-	union {
+	union
+	{
 		double valf;
-		long long vali;
+		s64 vali;
 	};
 	valf = val;
-	long long mantissa = vali & ((1LL << 52) - 1);
-	long long sign = vali & (1ULL << 63);
-	long long exponent = vali & (0x7FFLL << 52);
+	s64 mantissa = vali & ((1LL << 52) - 1);
+	s64 sign = vali & (1ULL << 63);
+	s64 exponent = vali & (0x7FFLL << 52);
 
 	// Special case 0
 	if (mantissa == 0 && exponent == 0)
@@ -329,7 +330,7 @@ inline double ApproximateReciprocal(double val)
 
 	int i = (int)(mantissa >> 37);
 	vali = sign | exponent;
-	vali |= (long long)(expected_base[i / 1024] - (expected_dec[i / 1024] * (i % 1024) + 1) / 2) << 29;
+	vali |= (s64)(expected_base[i / 1024] - (expected_dec[i / 1024] * (i % 1024) + 1) / 2) << 29;
 	return valf;
 }
 
@@ -356,14 +357,15 @@ inline double ApproximateReciprocalSquareRoot(double val)
 		0x261, 0x243, 0x226, 0x20b,
 	};
 
-	union {
+	union
+	{
 		double valf;
-		long long vali;
+		s64 vali;
 	};
 	valf = val;
-	long long mantissa = vali & ((1LL << 52) - 1);
-	long long sign = vali & (1ULL << 63);
-	long long exponent = vali & (0x7FFLL << 52);
+	s64 mantissa = vali & ((1LL << 52) - 1);
+	s64 sign = vali & (1ULL << 63);
+	s64 exponent = vali & (0x7FFLL << 52);
 
 	// Special case 0
 	if (mantissa == 0 && exponent == 0)
@@ -402,6 +404,6 @@ inline double ApproximateReciprocalSquareRoot(double val)
 	int i = (int)(mantissa >> 37);
 	vali = sign | exponent;
 	int index = i / 2048 + (odd_exponent ? 16 : 0);
-	vali |= (long long)(expected_base[index] - expected_dec[index] * (i % 2048)) << 26;
+	vali |= (s64)(expected_base[index] - expected_dec[index] * (i % 2048)) << 26;
 	return valf;
 }