From 59c1a46ab1990d902b1ef2d59b79bd44f6f11e12 Mon Sep 17 00:00:00 2001
From: Fiora <fioraaeterna@gmail.com>
Date: Fri, 22 Aug 2014 15:27:26 -0700
Subject: [PATCH] JIT: faster PPC_FP code

The PPC_FP conversion code can be made a lot simpler with the observation
that the only values that need to be sent through the slow x87 path are
denormals.

A whole bunch faster: 708->678 seconds on POV-RAY.
---
 .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp  | 113 ++++++------------
 1 file changed, 39 insertions(+), 74 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
index 2734e02715..e7d51213bb 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@@ -524,11 +524,6 @@ static u64 GC_ALIGNED16(temp64);
 
 static const float GC_ALIGNED16(m_zero[]) = { 0.0f, 0.0f, 0.0f, 0.0f };
 
-static const __m128i GC_ALIGNED16(single_qnan_bit) = _mm_set_epi64x(0, 0x0000000000400000);
-static const __m128i GC_ALIGNED16(single_exponent) = _mm_set_epi64x(0, 0x000000007f800000);
-static const __m128i GC_ALIGNED16(double_qnan_bit) = _mm_set_epi64x(0, 0x0008000000000000);
-static const __m128i GC_ALIGNED16(double_exponent) = _mm_set_epi64x(0, 0x7ff0000000000000);
-
 // Since the following float conversion functions are used in non-arithmetic PPC float instructions,
 // they must convert floats bitexact and never flush denormals to zero or turn SNaNs into QNaNs.
 // This means we can't use CVTSS2SD/CVTSD2SS :(
@@ -542,6 +537,7 @@ static const __m128i GC_ALIGNED16(double_exponent) = _mm_set_epi64x(0, 0x7ff0000
 //#define MORE_ACCURATE_DOUBLETOSINGLE
 #ifdef MORE_ACCURATE_DOUBLETOSINGLE
 
+static const __m128i GC_ALIGNED16(double_exponent) = _mm_set_epi64x(0, 0x7ff0000000000000);
 static const __m128i GC_ALIGNED16(double_fraction) = _mm_set_epi64x(0, 0x000fffffffffffff);
 static const __m128i GC_ALIGNED16(double_sign_bit) = _mm_set_epi64x(0, 0x8000000000000000);
 static const __m128i GC_ALIGNED16(double_explicit_top_bit) = _mm_set_epi64x(0, 0x0010000000000000);
@@ -619,95 +615,64 @@ void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
 
 void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
 {
+	// Most games have flush-to-zero enabled, which causes the single -> double -> single process here to be lossy.
+	// This is a problem when games use float operations to copy non-float data.
+	// Changing the FPU mode is very expensive, so we can't do that.
+	// Here, check to see if the exponent is small enough that it will result in a denormal, and pass it to the x87 unit
+	// if it is.
+	MOVQ_xmm(R(RAX), src);
+	SHR(64, R(RAX), Imm8(55));
+	// Exponents 0x369 <= x <= 0x380 are denormal. This code accepts the range 0x368 <= x <= 0x387
+	// to save an instruction, since diverting a few more floats to the slow path can't hurt much.
+	SUB(8, R(AL), Imm8(0x6D));
+	CMP(8, R(AL), Imm8(0x3));
+	FixupBranch x87Conversion = J_CC(CC_BE);
+	CVTSD2SS(dst, R(src));
+	FixupBranch continue1 = J();
+
+	SetJumpTarget(x87Conversion);
 	MOVSD(M(&temp64), src);
-	MOVSD(XMM1, R(src));
 	FLD(64, M(&temp64));
-	CCFlags cond;
-	if (cpu_info.bSSE4_1)
-	{
-		PTEST(XMM1, M((void *)&double_exponent));
-		cond = CC_NC;
-	}
-	else
-	{
-		// emulate PTEST; checking FPU flags is incorrect because the NaN bits
-		// are sticky (persist between instructions)
-		MOVSD(XMM0, M((void *)&double_exponent));
-		PAND(XMM0, R(XMM1));
-		PCMPEQB(XMM0, M((void *)&m_zero));
-		PMOVMSKB(EAX, R(XMM0));
-		CMP(32, R(EAX), Imm32(0xffff));
-		cond = CC_Z;
-	}
 	FSTP(32, M(&temp32));
-	MOVSS(XMM0, M(&temp32));
-	FixupBranch dont_reset_qnan_bit = J_CC(cond);
+	MOVSS(dst, M(&temp32));
 
-	PANDN(XMM1, M((void *)&double_qnan_bit));
-	PSRLQ(XMM1, 29);
-	if (cpu_info.bAVX)
-	{
-		VPANDN(XMM0, XMM1, R(XMM0));
-	}
-	else
-	{
-		PANDN(XMM1, R(XMM0));
-		MOVSS(XMM0, R(XMM1));
-	}
-
-	SetJumpTarget(dont_reset_qnan_bit);
-	MOVDDUP(dst, R(XMM0));
+	SetJumpTarget(continue1);
 }
 #endif // MORE_ACCURATE_DOUBLETOSINGLE
 
 void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr)
 {
+	// If the input isn't denormal, just do things the simple way -- otherwise, go through the x87 unit, which has
+	// flush-to-zero off.
+	X64Reg gprsrc = src_is_gpr ? src : EAX;
 	if (src_is_gpr)
 	{
-		MOV(32, M(&temp32), R(src));
-		MOVD_xmm(XMM1, R(src));
+		MOVD_xmm(dst, R(src));
 	}
 	else
 	{
-		MOVSS(M(&temp32), src);
-		MOVSS(R(XMM1), src);
+		if (dst != src)
+			MOVAPD(dst, R(src));
+		MOVD_xmm(EAX, R(src));
 	}
+	// A sneaky hack: floating-point zero is rather common and we don't want to confuse it for denormals and
+	// needlessly send it through the slow path. If we subtract 1 before doing the comparison, it turns
+	// float-zero into 0xffffffff (skipping the slow path). This results in a single non-denormal being sent
+	// through the slow path (0x00800000), but the performance effects of that should be negligible.
+	SUB(32, R(gprsrc), Imm8(1));
+	TEST(32, R(gprsrc), Imm32(0x7f800000));
 
+	FixupBranch x87Conversion = J_CC(CC_Z);
+	CVTSS2SD(dst, R(dst));
+	FixupBranch continue1 = J();
+
+	SetJumpTarget(x87Conversion);
+	MOVSS(M(&temp32), dst);
 	FLD(32, M(&temp32));
-	CCFlags cond;
-	if (cpu_info.bSSE4_1)
-	{
-		PTEST(XMM1, M((void *)&single_exponent));
-		cond = CC_NC;
-	}
-	else
-	{
-		// emulate PTEST; checking FPU flags is incorrect because the NaN bits
-		// are sticky (persist between instructions)
-		MOVSS(XMM0, M((void *)&single_exponent));
-		PAND(XMM0, R(XMM1));
-		PCMPEQB(XMM0, M((void *)&m_zero));
-		PMOVMSKB(EAX, R(XMM0));
-		CMP(32, R(EAX), Imm32(0xffff));
-		cond = CC_Z;
-	}
 	FSTP(64, M(&temp64));
 	MOVSD(dst, M(&temp64));
-	FixupBranch dont_reset_qnan_bit = J_CC(cond);
 
-	PANDN(XMM1, M((void *)&single_qnan_bit));
-	PSLLQ(XMM1, 29);
-	if (cpu_info.bAVX)
-	{
-		VPANDN(dst, XMM1, R(dst));
-	}
-	else
-	{
-		PANDN(XMM1, R(dst));
-		MOVSD(dst, R(XMM1));
-	}
-
-	SetJumpTarget(dont_reset_qnan_bit);
+	SetJumpTarget(continue1);
 	MOVDDUP(dst, R(dst));
 }