JIT: further optimize float ops based on known input characteristics

If both inputs are float singles and each is known to have a top half identical to its bottom half, we can use packed arithmetic instead of scalar and skip the MOVDDUP. Packed arithmetic is slower than scalar on a few rather old CPUs, as well as on Atom (including Silvermont), so detect Atom and disable the optimization in that case. Also avoid PPC_FP on stores if we know that the output came from a single-precision float op.
2014-10-11 14:22:44 -07:00 · 2014-10-11 14:22:44 -07:00 · 72c96c20d3
parent 4e0591cdf1
commit 72c96c20d3
9 changed files with 154 additions and 39 deletions
--- a/Source/Core/Common/CPUDetect.h
+++ b/Source/Core/Common/CPUDetect.h
@ -50,10 +50,10 @@ struct CPUInfo
 	bool bMOVBE;
 	// This flag indicates that the hardware supports some mode
 	// in which denormal inputs _and_ outputs are automatically set to (signed) zero.
-	// TODO: ARM
 	bool bFlushToZero;
 	bool bLAHFSAHF64;
 	bool bLongMode;
+	bool bAtom;

 	// ARM specific CPUInfo
 	bool bSwp;
--- a/Source/Core/Common/x64CPUDetect.cpp
+++ b/Source/Core/Common/x64CPUDetect.cpp
@ -129,6 +129,12 @@ void CPUInfo::Detect()
 	if (max_std_fn >= 1)
 	{
 		__cpuid(cpu_id, 0x00000001);
+		int family = ((cpu_id[0] >> 8) & 0xf) + ((cpu_id[0] >> 20) & 0xff);
+		int model = ((cpu_id[0] >> 4) & 0xf) + ((cpu_id[0] >> 12) & 0xf0);
+		// Detect people unfortunate enough to be running Dolphin on an Atom
+		if (family == 6 && (model == 0x1C || model == 0x26 ||model == 0x27 || model == 0x35 || model == 0x36 ||
+		                    model == 0x37 || model == 0x4A || model == 0x4D || model == 0x5A || model == 0x5D))
+			bAtom = true;
 		logical_cpu_count = (cpu_id[1] >> 16) & 0xFF;
 		ht = (cpu_id[3] >> 28) & 1;

--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@ -151,7 +151,7 @@ public:
 	void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&),
 		          bool Rc = false, bool carry = false);
 	void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg),
-	               void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
+	               void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool packed = false, bool roundRHS = false);
 	void FloatCompare(UGeckoInstruction inst, bool upper = false);

 	// OPCODES
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@ -11,11 +11,12 @@
 using namespace Gen;

 static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL};
+static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
 static const u64 GC_ALIGNED16(psAbsMask[2])  = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
 static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};

 void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg),
-                      void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
+                      void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool packed, bool roundRHS)
 {
 	fpr.Lock(d, a, b);
 	fpr.BindToRegister(d, d == a || d == b || !single);
@ -34,12 +35,19 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (X
 	}
 	else
 	{
-		avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), false, reversible);
+		avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), packed, reversible);
 	}
 	if (single)
 	{
-		ForceSinglePrecisionS(fpr.RX(d));
-		MOVDDUP(fpr.RX(d), fpr.R(d));
+		if (packed)
+		{
+			ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d));
+		}
+		else
+		{
+			ForceSinglePrecisionS(fpr.RX(d), fpr.RX(d));
+			MOVDDUP(fpr.RX(d), fpr.R(d));
+		}
 	}
 	SetFPRFIfNeeded(inst, fpr.RX(d));
 	fpr.UnlockAll();
@ -63,14 +71,32 @@ void Jit64::fp_arith(UGeckoInstruction inst)
 	JITDISABLE(bJITFloatingPointOff);
 	FALLBACK_IF(inst.Rc);

+	int a = inst.FA;
+	int b = inst.FB;
+	int c = inst.FC;
+	int d = inst.FD;
+	int arg2 = inst.SUBOP5 == 25 ? c : b;
+
 	bool single = inst.OPCD == 59;
 	bool round_input = single && !jit->js.op->fprIsSingle[inst.FC];
+	// If both the inputs are known to have identical top and bottom halves, we can skip the MOVDDUP at the end by
+	// using packed arithmetic instead.
+	bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[arg2];
+	// Packed divides are slower than scalar divides on basically all x86, so this optimization isn't worth it in that case.
+	// Atoms (and a few really old CPUs) are also slower on packed operations than scalar ones.
+	if (inst.SUBOP5 == 18 || cpu_info.bAtom)
+		packed = false;
+
 	switch (inst.SUBOP5)
 	{
-	case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VDIVSD, &XEmitter::DIVSD, inst); break; //div
-	case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VSUBSD, &XEmitter::SUBSD, inst); break; //sub
-	case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::VADDSD, &XEmitter::ADDSD, inst); break; //add
-	case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::VMULSD, &XEmitter::MULSD, inst, round_input); break; //mul
+	case 18: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD,
+	                   packed ? &XEmitter::DIVPD : &XEmitter::DIVSD, inst, packed); break;
+	case 20: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD,
+	                   packed ? &XEmitter::SUBPD : &XEmitter::SUBSD, inst, packed); break;
+	case 21: fp_tri_op(d, a, b, true, single, packed ? &XEmitter::VADDPD : &XEmitter::VADDSD,
+	                   packed ? &XEmitter::ADDPD : &XEmitter::ADDSD, inst, packed); break;
+	case 25: fp_tri_op(d, a, c, true, single, packed ? &XEmitter::VMULPD : &XEmitter::VMULSD,
+	                   packed ? &XEmitter::MULPD : &XEmitter::MULSD, inst, packed, round_input); break;
 	default:
 		_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
 	}
@ -88,6 +114,9 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
 	int d = inst.FD;
 	bool single = inst.OPCD == 59;
 	bool round_input = single && !jit->js.op->fprIsSingle[c];
+	bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[b] && jit->js.op->fprIsDuplicated[c];
+	if (cpu_info.bAtom)
+		packed = false;

 	fpr.Lock(a, b, c, d);

@ -109,20 +138,32 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
 		switch (inst.SUBOP5)
 		{
 		case 28: //msub
-			VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
+			if (packed)
+				VFMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
+			else
+				VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
 			break;
 		case 29: //madd
-			VFMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
+			if (packed)
+				VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
+			else
+				VFMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
 			break;
 			// PowerPC and x86 define NMADD/NMSUB differently
 			// x86: D = -A*C (+/-) B
 			// PPC: D = -(A*C (+/-) B)
 			// so we have to swap them; the ADD/SUB here isn't a typo.
 		case 30: //nmsub
-			VFNMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
+			if (packed)
+				VFNMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
+			else
+				VFNMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
 			break;
 		case 31: //nmadd
-			VFNMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
+			if (packed)
+				VFNMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
+			else
+				VFNMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
 			break;
 		}
 	}
@ -133,9 +174,17 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
 			Force25BitPrecision(XMM1, fpr.R(c), XMM0);
 		else
 			MOVAPD(XMM1, fpr.R(c));
-		MULSD(XMM1, fpr.R(a));
 		MOVAPD(XMM0, fpr.R(b));
-		SUBSD(XMM0, R(XMM1));
+		if (packed)
+		{
+			MULPD(XMM1, fpr.R(a));
+			SUBPD(XMM0, R(XMM1));
+		}
+		else
+		{
+			MULSD(XMM1, fpr.R(a));
+			SUBSD(XMM0, R(XMM1));
+		}
 	}
 	else
 	{
@ -143,22 +192,39 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
 			Force25BitPrecision(XMM0, fpr.R(c), XMM1);
 		else
 			MOVAPD(XMM0, fpr.R(c));
-		MULSD(XMM0, fpr.R(a));
-		if (inst.SUBOP5 == 28) //msub
-			SUBSD(XMM0, fpr.R(b));
-		else                   //(n)madd
-			ADDSD(XMM0, fpr.R(b));
+		if (packed)
+		{
+			MULPD(XMM0, fpr.R(a));
+			if (inst.SUBOP5 == 28) //msub
+				SUBPD(XMM0, fpr.R(b));
+			else                   //(n)madd
+				ADDPD(XMM0, fpr.R(b));
+		}
+		else
+		{
+			MULSD(XMM0, fpr.R(a));
+			if (inst.SUBOP5 == 28)
+				SUBSD(XMM0, fpr.R(b));
+			else
+				ADDSD(XMM0, fpr.R(b));
+		}
 		if (inst.SUBOP5 == 31) //nmadd
-			PXOR(XMM0, M((void*)&psSignBits));
+			PXOR(XMM0, M((void*)&(packed ? psSignBits2 : psSignBits)));
 	}

 	fpr.BindToRegister(d, !single);
-	//YES it is necessary to dupe the result :(
-	//TODO : analysis - does the top reg get used? If so, dupe, if not, don't.
+
 	if (single)
 	{
-		ForceSinglePrecisionS(XMM0);
-		MOVDDUP(fpr.RX(d), R(XMM0));
+		if (packed)
+		{
+			ForceSinglePrecisionP(fpr.RX(d), XMM0);
+		}
+		else
+		{
+			ForceSinglePrecisionS(fpr.RX(d), XMM0);
+			MOVDDUP(fpr.RX(d), fpr.R(d));
+		}
 	}
 	else
 	{
@ -427,7 +493,7 @@ void Jit64::frspx(UGeckoInstruction inst)
 	fpr.BindToRegister(d, d == b);
 	if (b != d)
 		MOVAPD(fpr.RX(d), fpr.R(b));
-	ForceSinglePrecisionS(fpr.RX(d));
+	ForceSinglePrecisionS(fpr.RX(d), fpr.RX(d));
 	MOVDDUP(fpr.RX(d), fpr.R(d));
 	SetFPRFIfNeeded(inst, fpr.RX(d));
 	fpr.UnlockAll();
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
@ -108,8 +108,15 @@ void Jit64::stfXXX(UGeckoInstruction inst)

 	if (single)
 	{
-		fpr.BindToRegister(s, true, false);
-		ConvertDoubleToSingle(XMM0, fpr.RX(s));
+		if (jit->js.op->fprIsStoreSafe[s])
+		{
+			CVTSD2SS(XMM0, fpr.R(s));
+		}
+		else
+		{
+			fpr.BindToRegister(s, true, false);
+			ConvertDoubleToSingle(XMM0, fpr.RX(s));
+		}
 		MOVD_xmm(R(RSCRATCH), XMM0);
 	}
 	else
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@ -667,13 +667,17 @@ void EmuCodeBlock::WriteToConstRamAddress(int accessSize, OpArg arg, u32 address
 		MOV(accessSize, MDisp(RMEM, address & 0x3FFFFFFF), R(reg));
 }

-void EmuCodeBlock::ForceSinglePrecisionS(X64Reg xmm)
+void EmuCodeBlock::ForceSinglePrecisionS(X64Reg output, X64Reg input)
 {
 	// Most games don't need these. Zelda requires it though - some platforms get stuck without them.
 	if (jit->jo.accurateSinglePrecision)
 	{
-		CVTSD2SS(xmm, R(xmm));
-		CVTSS2SD(xmm, R(xmm));
+		CVTSD2SS(input, R(input));
+		CVTSS2SD(output, R(input));
+	}
+	else if (output != input)
+	{
+		MOVAPD(output, R(input));
 	}
 }

--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
@ -130,7 +130,7 @@ public:
 	void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg, u8), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg, u8),
 	            Gen::X64Reg regOp, Gen::OpArg arg1, Gen::OpArg arg2, u8 imm);

-	void ForceSinglePrecisionS(Gen::X64Reg xmm);
+	void ForceSinglePrecisionS(Gen::X64Reg output, Gen::X64Reg input);
 	void ForceSinglePrecisionP(Gen::X64Reg output, Gen::X64Reg input);
 	void Force25BitPrecision(Gen::X64Reg output, Gen::OpArg input, Gen::X64Reg tmp);

--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@ -830,18 +830,45 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
 			fprInUse[code[i].fregOut] = true;
 	}

-	// Forward scan, for flags that need the other direction for calculation
-	BitSet32 fprIsSingle;
+	// Forward scan, for flags that need the other direction for calculation.
+	BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe;
 	for (u32 i = 0; i < block->m_num_instructions; i++)
 	{
 		code[i].fprIsSingle = fprIsSingle;
+		code[i].fprIsDuplicated = fprIsDuplicated;
+		code[i].fprIsStoreSafe = fprIsStoreSafe;
 		if (code[i].fregOut >= 0)
 		{
-			// This instruction outputs float, so we can omit the special rounding done in fmuls/fmadds
-			if (code[i].opinfo->type == OPTYPE_SINGLEFP || code[i].opinfo->type == OPTYPE_PS || strncmp(code[i].opinfo->opname, "lfs", 3))
+			fprIsSingle[code[i].fregOut] = false;
+			fprIsDuplicated[code[i].fregOut] = false;
+			fprIsStoreSafe[code[i].fregOut] = false;
+			// Single, duplicated, and doesn't need PPC_FP.
+			if (code[i].opinfo->type == OPTYPE_SINGLEFP)
+			{
 				fprIsSingle[code[i].fregOut] = true;
-			else
-				fprIsSingle[code[i].fregOut] = false;
+				fprIsDuplicated[code[i].fregOut] = true;
+				fprIsStoreSafe[code[i].fregOut] = true;
+			}
+			// Single and duplicated, but might be a denormal (not safe to skip PPC_FP).
+			// TODO: if we go directly from a load to store, skip conversion entirely?
+			// TODO: if we go directly from a load to a float instruction, and the value isn't used
+			// for anything else, we can skip PPC_FP on a load too.
+			if (!strncmp(code[i].opinfo->opname, "lfs", 3))
+			{
+				fprIsSingle[code[i].fregOut] = true;
+				fprIsDuplicated[code[i].fregOut] = true;
+			}
+			// Paired are still floats, but the top/bottom halves may differ.
+			if (code[i].opinfo->type == OPTYPE_PS || code[i].opinfo->type == OPTYPE_LOADPS)
+			{
+				fprIsSingle[code[i].fregOut] = true;
+				fprIsStoreSafe[code[i].fregOut] = true;
+			}
+			// Careful: changing the float mode in a block breaks this optimization, since
+			// a previous float op might have had FTZ off while the later store has FTZ
+			// on. So, discard all information we have.
+			if (!strncmp(code[i].opinfo->opname, "mtfs", 4))
+				fprIsStoreSafe = BitSet32(0);
 		}
 	}
 	return address;
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@ -53,6 +53,11 @@ struct CodeOp //16B
 	BitSet32 fprInXmm;
 	// whether an fpr is known to be an actual single-precision value at this point in the block.
 	BitSet32 fprIsSingle;
+	// whether an fpr is known to have identical top and bottom halves (e.g. due to a single-precision instruction)
+	BitSet32 fprIsDuplicated;
+	// whether an fpr is the output of a single-precision arithmetic instruction, i.e. whether we can safely
+	// skip PPC_FP.
+	BitSet32 fprIsStoreSafe;
 };

 struct BlockStats