diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index 6a055f7cf8..6ef87ccac9 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -64,12 +64,13 @@ void Jit64::fp_arith(UGeckoInstruction inst)
 	FALLBACK_IF(inst.Rc);
 
 	bool single = inst.OPCD == 59;
+	bool round_input = single && !jit->js.op->fprIsSingle[inst.FC];
 	switch (inst.SUBOP5)
 	{
 	case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VDIVSD, &XEmitter::DIVSD, inst); break; //div
 	case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VSUBSD, &XEmitter::SUBSD, inst); break; //sub
 	case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::VADDSD, &XEmitter::ADDSD, inst); break; //add
-	case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::VMULSD, &XEmitter::MULSD, inst, single); break; //mul
+	case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::VMULSD, &XEmitter::MULSD, inst, round_input); break; //mul
 	default:
 		_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
 	}
@@ -81,12 +82,12 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
 	JITDISABLE(bJITFloatingPointOff);
 	FALLBACK_IF(inst.Rc);
 
-	bool single_precision = inst.OPCD == 59;
-
 	int a = inst.FA;
 	int b = inst.FB;
 	int c = inst.FC;
 	int d = inst.FD;
+	bool single_precision = inst.OPCD == 59;
+	bool round_input = single_precision && !jit->js.op->fprIsSingle[c];
 
 	fpr.Lock(a, b, c, d);
 
@@ -98,10 +99,10 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
 	// instances on different computers giving identical results.
 	if (cpu_info.bFMA && !Core::g_want_determinism)
 	{
-		if (single_precision)
+		if (single_precision && round_input)
 			Force25BitPrecision(XMM0, fpr.R(c), XMM1);
 		else
-			MOVSD(XMM0, fpr.R(c));
+			MOVAPD(XMM0, fpr.R(c));
 		// Statistics suggests b is a lot less likely to be unbound in practice, so
 		// if we have to pick one of a or b to bind, let's make it b.
 		fpr.BindToRegister(b, true, false);
@@ -128,20 +129,20 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
 	else if (inst.SUBOP5 == 30) //nmsub
 	{
 		// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
-		if (single_precision)
+		if (single_precision && round_input)
 			Force25BitPrecision(XMM1, fpr.R(c), XMM0);
 		else
-			MOVSD(XMM1, fpr.R(c));
+			MOVAPD(XMM1, fpr.R(c));
 		MULSD(XMM1, fpr.R(a));
 		MOVSD(XMM0, fpr.R(b));
 		SUBSD(XMM0, R(XMM1));
 	}
 	else
 	{
-		if (single_precision)
+		if (single_precision && round_input)
 			Force25BitPrecision(XMM0, fpr.R(c), XMM1);
 		else
-			MOVSD(XMM0, fpr.R(c));
+			MOVAPD(XMM0, fpr.R(c));
 		MULSD(XMM0, fpr.R(a));
 		if (inst.SUBOP5 == 28) //msub
 			SUBSD(XMM0, fpr.R(b));
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
index fe36351b25..815cd77c92 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
@@ -124,6 +124,7 @@ void Jit64::ps_arith(UGeckoInstruction inst)
 	JITDISABLE(bJITPairedOff);
 	FALLBACK_IF(inst.Rc);
 
+	bool round_input = !jit->js.op->fprIsSingle[inst.FC];
 	switch (inst.SUBOP5)
 	{
 	case 18: // div
@@ -136,7 +137,7 @@ void Jit64::ps_arith(UGeckoInstruction inst)
 		tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::VADDPD, &XEmitter::ADDPD, inst);
 		break;
 	case 25: // mul
-		tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::VMULPD, &XEmitter::MULPD, inst, true);
+		tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::VMULPD, &XEmitter::MULPD, inst, round_input);
 		break;
 	default:
 		_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
@@ -187,6 +188,7 @@ void Jit64::ps_muls(UGeckoInstruction inst)
 	int d = inst.FD;
 	int a = inst.FA;
 	int c = inst.FC;
+	bool round_input = !jit->js.op->fprIsSingle[c];
 	fpr.Lock(a, c, d);
 	switch (inst.SUBOP5)
 	{
@@ -199,7 +201,8 @@ void Jit64::ps_muls(UGeckoInstruction inst)
 	default:
 		PanicAlert("ps_muls WTF!!!");
 	}
-	Force25BitPrecision(XMM0, R(XMM0), XMM1);
+	if (round_input)
+		Force25BitPrecision(XMM0, R(XMM0), XMM1);
 	MULPD(XMM0, fpr.R(a));
 	fpr.BindToRegister(d, false);
 	ForceSinglePrecisionP(fpr.RX(d), XMM0);
@@ -306,6 +309,7 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
 	int c = inst.FC;
 	int d = inst.FD;
 	bool fma = cpu_info.bFMA && !Core::g_want_determinism;
+	bool round_input = !jit->js.op->fprIsSingle[c];
 	fpr.Lock(a,b,c,d);
 
 	if (fma)
@@ -314,16 +318,21 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
 	if (inst.SUBOP5 == 14)
 	{
 		MOVDDUP(XMM0, fpr.R(c));
-		Force25BitPrecision(XMM0, R(XMM0), XMM1);
+		if (round_input)
+			Force25BitPrecision(XMM0, R(XMM0), XMM1);
 	}
 	else if (inst.SUBOP5 == 15)
 	{
 		avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
-		Force25BitPrecision(XMM0, R(XMM0), XMM1);
+		if (round_input)
+			Force25BitPrecision(XMM0, R(XMM0), XMM1);
 	}
 	else
 	{
-		Force25BitPrecision(XMM0, fpr.R(c), XMM1);
+		if (round_input)
+			Force25BitPrecision(XMM0, fpr.R(c), XMM1);
+		else
+			MOVAPD(XMM0, fpr.R(c));
 	}
 
 	if (fma)
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
index cefba76d93..0d72e8a5a4 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@@ -827,10 +827,21 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
 		// the same location later).
 		gprInUse |= code[i].regsOut;
 		if (code[i].fregOut >= 0)
-		{
 			fprInUse[code[i].fregOut] = true;
-			if (strncmp(code[i].opinfo->opname, "stfd", 4))
-				fprInXmm[code[i].fregOut] = true;
+	}
+
+	// Forward scan: each op records which FPRs are known single-precision before it runs, then its own output updates the set
+	BitSet32 fprIsSingle;
+	for (u32 i = 0; i < block->m_num_instructions; i++)
+	{
+		code[i].fprIsSingle = fprIsSingle;
+		if (code[i].fregOut >= 0)
+		{
+			// Single-precision producers (single/paired ops and lfs* loads; strncmp() returns 0 on a match) let fmuls/fmadds skip input rounding
+			if (code[i].opinfo->type == OPTYPE_SINGLEFP || code[i].opinfo->type == OPTYPE_PS || !strncmp(code[i].opinfo->opname, "lfs", 3))
+				fprIsSingle[code[i].fregOut] = true;
+			else
+				fprIsSingle[code[i].fregOut] = false;
 		}
 	}
 	return address;
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h
index 8abf4bbdfe..e68be7a5ee 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@@ -51,6 +51,8 @@ struct CodeOp //16B
 	// we do double stores from GPRs, so we don't want to load a PowerPC floating point register into
 	// an XMM only to move it again to a GPR afterwards.
 	BitSet32 fprInXmm;
+	// Bit n is set when FPR n is known to already hold a single-precision value as of the start of this op.
+	BitSet32 fprIsSingle;
 };
 
 struct BlockStats