Jit64[IL]: fix float conversions
Floating-point is complicated... Some background: Denormals are floats that are too close to zero to be stored in a normalized way (their exponent would need more bits). Since they are stored unnormalized, they are hard to work with, even in hardware. That's why both PowerPC and SSE can be configured to operate in faster but non-standard-compliant modes in which these numbers are simply rounded ('flushed') to zero. Internally, we do the same as the PowerPC CPU and store all floats in double format. This means that for loading and storing singles we need a conversion. The PowerPC CPU does this in hardware. We previously did this using CVTSS2SD/CVTSD2SS. Unfortunately, these instructions are considered arithmetic and therefore flush denormals to zero if non-IEEE mode is active. This normally wouldn't be a problem since the next arithmetic floating-point instruction would do the same anyway, but as it turns out some games actually use floating-point instructions for copying arbitrary data. My idea for fixing this problem was to use x87 instructions, since the x87 FPU never supported flush-to-zero and thus doesn't mangle denormals. However, there is one more problem to deal with: SNaNs are automatically converted to QNaNs (by setting the most-significant bit of the fraction). I opted to fix this by manually resetting the QNaN bit of all values with an all-1s exponent.
This commit is contained in:
parent
c25c4a6e20
commit
db196d8c5b
|
@ -1437,7 +1437,19 @@ void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {
|
|||
Write8(0x0f);
|
||||
Write8(0x38);
|
||||
Write8(0x00);
|
||||
arg.WriteRest(this, 0);
|
||||
arg.WriteRest(this);
|
||||
}
|
||||
|
||||
// Emit PTEST xmm, xmm/m128 (SSE4.1): 66 0F 38 17 /r.
// Sets ZF/CF from the AND / ANDN of the two operands without writing a register.
void XEmitter::PTEST(X64Reg dest, OpArg arg)
{
	// PTEST requires SSE4.1. After the alert, execution deliberately falls
	// through and the bytes are emitted anyway.
	if (!cpu_info.bSSE4_1)
	{
		PanicAlert("Trying to use PTEST on a system that doesn't support it. Nobody hears your screams.");
	}
	Write8(0x66); // mandatory operand-size prefix for this encoding
	Write8(0x0f);
	Write8(0x38); // three-byte opcode escape
	Write8(0x17); // PTEST
	arg.operandReg = dest;
	arg.WriteRest(this); // ModRM / SIB / displacement
}
|
||||
|
||||
void XEmitter::PAND(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDB, true, dest, arg);}
|
||||
|
@ -1497,6 +1509,8 @@ void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64,
|
|||
void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseMUL, false, regOp1, regOp2, arg);}
|
||||
void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseDIV, false, regOp1, regOp2, arg);}
|
||||
void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseSQRT, false, regOp1, regOp2, arg);}
|
||||
void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseAND, false, regOp1, regOp2, arg);}
|
||||
void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseANDN, false, regOp1, regOp2, arg);}
|
||||
|
||||
// Prefixes
|
||||
|
||||
|
@ -1526,6 +1540,7 @@ void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, OpArg arg)
|
|||
void XEmitter::FLD(int bits, OpArg src) {WriteFloatLoadStore(bits, floatLD, src);}
|
||||
void XEmitter::FST(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatST, dest);}
|
||||
void XEmitter::FSTP(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatSTP, dest);}
|
||||
void XEmitter::FNSTSW_AX() { Write8(0xDF); Write8(0xE0); }
|
||||
|
||||
void XEmitter::RTDSC() { Write8(0x0F); Write8(0x31); }
|
||||
|
||||
|
|
|
@ -433,9 +433,27 @@ public:
|
|||
void REPNE();
|
||||
|
||||
// x87
|
||||
// Bit masks for the x87 FPU status word, as read via FNSTSW.
enum x87StatusWordBits {
	// Exception flags (sticky until cleared).
	x87_InvalidOperation    = 0x1,   // IE: e.g. loading/operating on an SNaN
	x87_DenormalizedOperand = 0x2,   // DE
	x87_DivisionByZero      = 0x4,   // ZE
	x87_Overflow            = 0x8,   // OE
	x87_Underflow           = 0x10,  // UE
	x87_Precision           = 0x20,  // PE: inexact result
	x87_StackFault          = 0x40,  // SF
	x87_ErrorSummary        = 0x80,  // ES
	// Condition codes and state.
	x87_C0                  = 0x100,
	x87_C1                  = 0x200,
	x87_C2                  = 0x400,
	x87_TopOfStack          = 0x2000 | 0x1000 | 0x800, // 3-bit TOP field
	x87_C3                  = 0x4000,
	x87_FPUBusy             = 0x8000,
};
|
||||
|
||||
void FLD(int bits, OpArg src);
|
||||
void FST(int bits, OpArg dest);
|
||||
void FSTP(int bits, OpArg dest);
|
||||
void FNSTSW_AX();
|
||||
void FWAIT();
|
||||
|
||||
// SSE/SSE2: Floating point arithmetic
|
||||
|
@ -562,6 +580,7 @@ public:
|
|||
void PUNPCKLWD(X64Reg dest, const OpArg &arg);
|
||||
void PUNPCKLDQ(X64Reg dest, const OpArg &arg);
|
||||
|
||||
void PTEST(X64Reg dest, OpArg arg);
|
||||
void PAND(X64Reg dest, OpArg arg);
|
||||
void PANDN(X64Reg dest, OpArg arg);
|
||||
void PXOR(X64Reg dest, OpArg arg);
|
||||
|
@ -631,6 +650,8 @@ public:
|
|||
void VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
|
||||
void RTDSC();
|
||||
|
||||
|
|
|
@ -16,11 +16,11 @@ static const unsigned short FPU_ROUND_MASK = 3 << 10;
|
|||
#endif
|
||||
|
||||
// OR-mask for disabling FPU exceptions (bits 7-12 in the MXCSR register)
|
||||
const u32 EXCEPTION_MASK = 0x1F80;
|
||||
static const u32 EXCEPTION_MASK = 0x1F80;
|
||||
// Denormals-Are-Zero (non-IEEE mode: denormal inputs are set to +/- 0)
|
||||
const u32 DAZ = 0x40;
|
||||
static const u32 DAZ = 0x40;
|
||||
// Flush-To-Zero (non-IEEE mode: denormal outputs are set to +/- 0)
|
||||
const u32 FTZ = 0x8000;
|
||||
static const u32 FTZ = 0x8000;
|
||||
|
||||
namespace FPURoundMode
|
||||
{
|
||||
|
|
|
@ -378,7 +378,7 @@ void RegCache::Flush(FlushMode mode)
|
|||
{
|
||||
if (locks[i])
|
||||
{
|
||||
PanicAlert("Someone forgot to unlock PPC reg %i.", i);
|
||||
PanicAlert("Someone forgot to unlock PPC reg %i (X64 reg %i).", i, RX(i));
|
||||
}
|
||||
if (regs[i].away)
|
||||
{
|
||||
|
|
|
@ -12,6 +12,8 @@
|
|||
#include "JitAsm.h"
|
||||
#include "JitRegCache.h"
|
||||
|
||||
namespace {
|
||||
|
||||
// pshufb todo: MOVQ
|
||||
const u8 GC_ALIGNED16(bswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
const u8 GC_ALIGNED16(bswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
|
@ -19,11 +21,10 @@ const u8 GC_ALIGNED16(bswapShuffle1x8[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 8, 9, 10,
|
|||
const u8 GC_ALIGNED16(bswapShuffle1x8Dupe[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0};
|
||||
const u8 GC_ALIGNED16(bswapShuffle2x8[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
|
||||
|
||||
namespace {
|
||||
|
||||
u64 GC_ALIGNED16(temp64);
|
||||
u32 GC_ALIGNED16(temp32);
|
||||
|
||||
}
|
||||
|
||||
// TODO: Add peephole optimizations for multiple consecutive lfd/lfs/stfd/stfs since they are so common,
|
||||
// and pshufb could help a lot.
|
||||
// Also add hacks for things like lfs/stfs the same reg consecutively, that is, simple memory moves.
|
||||
|
@ -46,11 +47,9 @@ void Jit64::lfs(UGeckoInstruction inst)
|
|||
|
||||
MEMCHECK_START
|
||||
|
||||
MOV(32, M(&temp32), R(EAX));
|
||||
fpr.Lock(d);
|
||||
fpr.BindToRegister(d, false);
|
||||
CVTSS2SD(fpr.RX(d), M(&temp32));
|
||||
MOVDDUP(fpr.RX(d), fpr.R(d));
|
||||
ConvertSingleToDouble(fpr.RX(d), EAX, true);
|
||||
|
||||
MEMCHECK_END
|
||||
|
||||
|
@ -235,13 +234,15 @@ void Jit64::stfs(UGeckoInstruction inst)
|
|||
return;
|
||||
}
|
||||
|
||||
fpr.BindToRegister(s, true, false);
|
||||
ConvertDoubleToSingle(XMM0, fpr.RX(s));
|
||||
|
||||
if (gpr.R(a).IsImm())
|
||||
{
|
||||
u32 addr = (u32)(gpr.R(a).offset + offset);
|
||||
if (Memory::IsRAMAddress(addr))
|
||||
{
|
||||
if (cpu_info.bSSSE3) {
|
||||
CVTSD2SS(XMM0, fpr.R(s));
|
||||
PSHUFB(XMM0, M((void *)bswapShuffle1x4));
|
||||
WriteFloatToConstRamAddress(XMM0, addr);
|
||||
return;
|
||||
|
@ -250,7 +251,6 @@ void Jit64::stfs(UGeckoInstruction inst)
|
|||
else if (addr == 0xCC008000)
|
||||
{
|
||||
// Float directly to write gather pipe! Fun!
|
||||
CVTSD2SS(XMM0, fpr.R(s));
|
||||
CALL((void*)asm_routines.fifoDirectWriteFloat);
|
||||
// TODO
|
||||
js.fifoBytesThisBlock += 4;
|
||||
|
@ -260,7 +260,6 @@ void Jit64::stfs(UGeckoInstruction inst)
|
|||
|
||||
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
|
||||
gpr.Lock(a);
|
||||
fpr.Lock(s);
|
||||
MOV(32, R(ABI_PARAM2), gpr.R(a));
|
||||
ADD(32, R(ABI_PARAM2), Imm32(offset));
|
||||
if (update && offset)
|
||||
|
@ -275,7 +274,6 @@ void Jit64::stfs(UGeckoInstruction inst)
|
|||
|
||||
MEMCHECK_END
|
||||
}
|
||||
CVTSD2SS(XMM0, fpr.R(s));
|
||||
SafeWriteFloatToReg(XMM0, ABI_PARAM2, RegistersInUse());
|
||||
gpr.UnlockAll();
|
||||
gpr.UnlockAllX();
|
||||
|
@ -290,11 +288,14 @@ void Jit64::stfsx(UGeckoInstruction inst)
|
|||
|
||||
// We can take a shortcut here - it's not likely that a hardware access would use this instruction.
|
||||
gpr.FlushLockX(ABI_PARAM1);
|
||||
fpr.Lock(inst.RS);
|
||||
MOV(32, R(ABI_PARAM1), gpr.R(inst.RB));
|
||||
if (inst.RA)
|
||||
ADD(32, R(ABI_PARAM1), gpr.R(inst.RA));
|
||||
CVTSD2SS(XMM0, fpr.R(inst.RS));
|
||||
|
||||
int s = inst.RS;
|
||||
fpr.Lock(s);
|
||||
fpr.BindToRegister(s, true, false);
|
||||
ConvertDoubleToSingle(XMM0, fpr.RX(s));
|
||||
MOVD_xmm(R(EAX), XMM0);
|
||||
SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, RegistersInUse());
|
||||
|
||||
|
@ -313,21 +314,20 @@ void Jit64::lfsx(UGeckoInstruction inst)
|
|||
{
|
||||
ADD(32, R(EAX), gpr.R(inst.RA));
|
||||
}
|
||||
fpr.Lock(inst.RS);
|
||||
fpr.BindToRegister(inst.RS, false);
|
||||
X64Reg s = fpr.RX(inst.RS);
|
||||
if (cpu_info.bSSSE3 && !js.memcheck) {
|
||||
fpr.Lock(inst.RS);
|
||||
fpr.BindToRegister(inst.RS, false, true);
|
||||
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
|
||||
#ifdef _M_IX86
|
||||
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
|
||||
MOVD_xmm(r, MDisp(EAX, (u32)Memory::base));
|
||||
MOVD_xmm(XMM0, MDisp(EAX, (u32)Memory::base));
|
||||
#else
|
||||
MOVD_xmm(r, MComplex(RBX, EAX, SCALE_1, 0));
|
||||
MOVD_xmm(XMM0, MComplex(RBX, EAX, SCALE_1, 0));
|
||||
#endif
|
||||
MEMCHECK_START
|
||||
|
||||
PSHUFB(r, M((void *)bswapShuffle1x4));
|
||||
CVTSS2SD(r, R(r));
|
||||
MOVDDUP(r, R(r));
|
||||
PSHUFB(XMM0, M((void *)bswapShuffle1x4));
|
||||
ConvertSingleToDouble(s, XMM0);
|
||||
|
||||
MEMCHECK_END
|
||||
} else {
|
||||
|
@ -335,11 +335,7 @@ void Jit64::lfsx(UGeckoInstruction inst)
|
|||
|
||||
MEMCHECK_START
|
||||
|
||||
MOV(32, M(&temp32), R(EAX));
|
||||
CVTSS2SD(XMM0, M(&temp32));
|
||||
fpr.Lock(inst.RS);
|
||||
fpr.BindToRegister(inst.RS, false, true);
|
||||
MOVDDUP(fpr.R(inst.RS).GetSimpleReg(), R(XMM0));
|
||||
ConvertSingleToDouble(s, EAX, true);
|
||||
|
||||
MEMCHECK_END
|
||||
}
|
||||
|
|
|
@ -1288,10 +1288,12 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit) {
|
|||
}
|
||||
case DupSingleToMReg: {
|
||||
if (!thisUsed) break;
|
||||
X64Reg reg = fregURegWithoutMov(RI, I);
|
||||
Jit->CVTSS2SD(reg, fregLocForInst(RI, getOp1(I)));
|
||||
Jit->MOVDDUP(reg, R(reg));
|
||||
RI.fregs[reg] = I;
|
||||
|
||||
X64Reg input = fregEnsureInReg(RI, getOp1(I));
|
||||
X64Reg output = fregURegWithoutMov(RI, I);
|
||||
Jit->ConvertSingleToDouble(output, input);
|
||||
|
||||
RI.fregs[output] = I;
|
||||
fregNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
|
@ -1412,9 +1414,12 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit) {
|
|||
}
|
||||
case DoubleToSingle: {
|
||||
if (!thisUsed) break;
|
||||
X64Reg reg = fregURegWithoutMov(RI, I);
|
||||
Jit->CVTSD2SS(reg, fregLocForInst(RI, getOp1(I)));
|
||||
RI.fregs[reg] = I;
|
||||
|
||||
X64Reg input = fregEnsureInReg(RI, getOp1(I));
|
||||
X64Reg output = fregURegWithoutMov(RI, I);
|
||||
Jit->ConvertDoubleToSingle(output, input);
|
||||
|
||||
RI.fregs[output] = I;
|
||||
fregNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -416,6 +416,101 @@ void EmuCodeBlock::ForceSinglePrecisionP(X64Reg xmm) {
|
|||
}
|
||||
}
|
||||
|
||||
static u32 GC_ALIGNED16(temp32);
|
||||
static u64 GC_ALIGNED16(temp64);
|
||||
#ifdef _WIN32
|
||||
#include <intrin.h>
|
||||
#ifdef _M_X64
|
||||
static const __m128i GC_ALIGNED16(single_qnan_bit) = _mm_set_epi64x(0, 0x0000000000400000);
|
||||
static const __m128i GC_ALIGNED16(single_exponent) = _mm_set_epi64x(0, 0x000000007f800000);
|
||||
static const __m128i GC_ALIGNED16(double_qnan_bit) = _mm_set_epi64x(0, 0x0008000000000000);
|
||||
static const __m128i GC_ALIGNED16(double_exponent) = _mm_set_epi64x(0, 0x7ff0000000000000);
|
||||
#else
|
||||
static const __m128i GC_ALIGNED16(single_qnan_bit) = _mm_set_epi32(0, 0, 0x00000000, 0x00400000);
|
||||
static const __m128i GC_ALIGNED16(single_exponent) = _mm_set_epi32(0, 0, 0x00000000, 0x7f800000);
|
||||
static const __m128i GC_ALIGNED16(double_qnan_bit) = _mm_set_epi32(0, 0, 0x00080000, 0x00000000);
|
||||
static const __m128i GC_ALIGNED16(double_exponent) = _mm_set_epi32(0, 0, 0x7ff00000, 0x00000000);
|
||||
#endif
|
||||
#else
|
||||
static const __uint128_t GC_ALIGNED16(single_qnan_bit) = 0x0000000000400000;
|
||||
static const __uint128_t GC_ALIGNED16(single_exponent) = 0x000000007f800000;
|
||||
static const __uint128_t GC_ALIGNED16(double_qnan_bit) = 0x0008000000000000;
|
||||
static const __uint128_t GC_ALIGNED16(double_exponent) = 0x7ff0000000000000;
|
||||
#endif
|
||||
|
||||
// Since the following two functions are used in non-arithmetic PPC float instructions,
|
||||
// they must convert floats bitexact and never flush denormals to zero or turn SNaNs into QNaNs.
|
||||
// This means we can't use CVTSS2SD/CVTSD2SS :(
|
||||
// The x87 FPU doesn't even support flush-to-zero so we can use FLD+FSTP even on denormals.
|
||||
// If the number is a NaN, make sure to set the QNaN bit back to its original value.
|
||||
|
||||
// Bit-exactly widen a 32-bit single (in a GPR if src_is_gpr, else in an XMM
// register) to a 64-bit double in dst, going through the x87 FPU so that
// denormals survive and an SNaN stays an SNaN (FLD/FSTP quiets the NaN, so
// the quiet bit is restored afterwards). Finally duplicates the result into
// both 64-bit lanes of dst. Clobbers XMM1, temp32/temp64 and possibly AX.
void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr)
{
	// Stage the input twice: in memory for the x87 load, and in XMM1 for the
	// NaN/infinity exponent test.
	if (src_is_gpr)
	{
		MOV(32, M(&temp32), R(src));
		MOVD_xmm(XMM1, R(src));
	}
	else
	{
		MOVSS(M(&temp32), src);
		MOVSS(R(XMM1), src);
	}
	FLD(32, M(&temp32)); // x87 never flushes denormals, so this is bit-exact

	// Pick a condition that is true when no QNaN-bit fixup is needed.
	CCFlags skip_fixup;
	if (cpu_info.bSSE4_1)
	{
		// PTEST sets CF iff every bit of single_exponent is set in XMM1,
		// i.e. the value is a NaN or infinity; CC_NC skips the fixup.
		PTEST(XMM1, M((void *)&single_exponent));
		skip_fixup = CC_NC;
	}
	else
	{
		// Loading an SNaN raises the x87 invalid-operation flag.
		// NOTE(review): IE is sticky and never cleared here (no FNCLEX); a
		// stale flag only forces an extra pass through the masking code below,
		// which is a no-op when the quiet bit already matches the source.
		FNSTSW_AX();
		TEST(16, R(AX), Imm16(x87_InvalidOperation));
		skip_fixup = CC_Z;
	}

	FSTP(64, M(&temp64));
	MOVSD(dst, M(&temp64));
	FixupBranch no_qnan_fixup = J_CC(skip_fixup);

	// NaN path: ~source & qnan_bit isolates the quiet bit iff the source had
	// it clear (i.e. the source was an SNaN that FLD silently quieted) ...
	PANDN(XMM1, M((void *)&single_qnan_bit));
	// ... then move that mask from the single's bit position to the double's.
	PSLLQ(XMM1, 29);
	if (cpu_info.bAVX)
	{
		VPANDN(dst, XMM1, R(dst));
	}
	else
	{
		// Non-AVX fallback: compute ~mask & dst in XMM1, then copy it back.
		PANDN(XMM1, R(dst));
		MOVSD(dst, R(XMM1));
	}

	SetJumpTarget(no_qnan_fixup);
	MOVDDUP(dst, R(dst)); // duplicate the double into both halves of dst
}
|
||||
|
||||
// Bit-exactly narrow the 64-bit double in src to a 32-bit single, going
// through the x87 FPU so that denormals survive and an SNaN stays an SNaN
// (the quiet bit set by FLD/FSTP is restored afterwards). The result is
// duplicated into both lanes of dst. Clobbers XMM0, XMM1, temp32/temp64 and
// possibly AX.
void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
{
	// Stage the input twice: in memory for the x87 load, and in XMM1 for the
	// NaN/infinity exponent test.
	MOVSD(M(&temp64), src);
	MOVSD(XMM1, R(src));
	FLD(64, M(&temp64));

	// Pick a condition that is true when no QNaN-bit fixup is needed.
	CCFlags skip_fixup;
	if (cpu_info.bSSE4_1)
	{
		// CF from PTEST means an all-ones exponent (NaN/Inf); CC_NC skips.
		PTEST(XMM1, M((void *)&double_exponent));
		skip_fixup = CC_NC;
	}
	else
	{
		// Loading an SNaN raises the x87 invalid-operation flag.
		// NOTE(review): IE is sticky (no FNCLEX anywhere); a stale flag only
		// causes a redundant, value-preserving pass through the mask below.
		FNSTSW_AX();
		TEST(16, R(AX), Imm16(x87_InvalidOperation));
		skip_fixup = CC_Z;
	}

	FSTP(32, M(&temp32));
	MOVSS(XMM0, M(&temp32));
	FixupBranch no_qnan_fixup = J_CC(skip_fixup);

	// NaN path: ~source & qnan_bit isolates the quiet bit iff the source
	// double had it clear (an SNaN that the x87 quieted) ...
	PANDN(XMM1, M((void *)&double_qnan_bit));
	// ... then shift the mask from the double's bit position to the single's.
	PSRLQ(XMM1, 29);
	if (cpu_info.bAVX)
	{
		VPANDN(XMM0, XMM1, R(XMM0));
	}
	else
	{
		// Non-AVX fallback: compute ~mask & XMM0 in XMM1, then copy it back.
		PANDN(XMM1, R(XMM0));
		MOVSS(XMM0, R(XMM1));
	}

	SetJumpTarget(no_qnan_fixup);
	MOVDDUP(dst, R(XMM0)); // duplicate the single-bearing qword into dst
}
|
||||
|
||||
void EmuCodeBlock::JitClearCA()
|
||||
{
|
||||
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0
|
||||
|
|
|
@ -47,6 +47,10 @@ public:
|
|||
|
||||
void ForceSinglePrecisionS(Gen::X64Reg xmm);
|
||||
void ForceSinglePrecisionP(Gen::X64Reg xmm);
|
||||
|
||||
// AX might get trashed
|
||||
void ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false);
|
||||
void ConvertDoubleToSingle(Gen::X64Reg dst, Gen::X64Reg src);
|
||||
protected:
|
||||
std::unordered_map<u8 *, u32> registersInUseAtLoc;
|
||||
};
|
||||
|
|
Loading…
Reference in New Issue