Jit64: merge ps_arith into fp_arith

2015-05-21 12:33:37 +02:00 · 2015-05-21 12:33:37 +02:00 · c6147c5ed5
parent 6d23b511a6
commit c6147c5ed5
4 changed files with 11 additions and 36 deletions
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@@ -193,7 +193,6 @@ public:

 	void ps_mr(UGeckoInstruction inst);
 	void ps_sign(UGeckoInstruction inst); //aggregate
-	void ps_arith(UGeckoInstruction inst); //aggregate
 	void ps_mergeXX(UGeckoInstruction inst);
 	void ps_maddXX(UGeckoInstruction inst);
 	void ps_res(UGeckoInstruction inst);
--- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
@@ -124,12 +124,12 @@ static GekkoOPTemplate table4_2[] =
 	{13, &Jit64::ps_muls},   // ps_muls1
 	{14, &Jit64::ps_maddXX}, // ps_madds0
 	{15, &Jit64::ps_maddXX}, // ps_madds1
-	{18, &Jit64::ps_arith},  // ps_div
-	{20, &Jit64::ps_arith},  // ps_sub
-	{21, &Jit64::ps_arith},  // ps_add
+	{18, &Jit64::fp_arith},  // ps_div
+	{20, &Jit64::fp_arith},  // ps_sub
+	{21, &Jit64::fp_arith},  // ps_add
 	{23, &Jit64::fselx},     // ps_sel
 	{24, &Jit64::ps_res},    // ps_res
-	{25, &Jit64::ps_arith},  // ps_mul
+	{25, &Jit64::fp_arith},  // ps_mul
 	{26, &Jit64::ps_rsqrte}, // ps_rsqrte
 	{28, &Jit64::ps_maddXX}, // ps_msub
 	{29, &Jit64::ps_maddXX}, // ps_madd
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -67,16 +67,19 @@ void Jit64::fp_arith(UGeckoInstruction inst)
 	int d = inst.FD;
 	int arg2 = inst.SUBOP5 == 25 ? c : b;

-	bool single = inst.OPCD == 59;
-	bool round_input = single && !jit->js.op->fprIsSingle[inst.FC];
+	bool single = inst.OPCD == 4 || inst.OPCD == 59;
 	// If both the inputs are known to have identical top and bottom halves, we can skip the MOVDDUP at the end by
 	// using packed arithmetic instead.
-	bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[arg2];
+	bool packed = inst.OPCD == 4 || (inst.OPCD == 59 &&
+	                                 jit->js.op->fprIsDuplicated[a] &&
+	                                 jit->js.op->fprIsDuplicated[arg2]);
 	// Packed divides are slower than scalar divides on basically all x86, so this optimization isn't worth it in that case.
 	// Atoms (and a few really old CPUs) are also slower on packed operations than scalar ones.
-	if (inst.SUBOP5 == 18 || cpu_info.bAtom)
+	if (inst.OPCD == 59 && (inst.SUBOP5 == 18 || cpu_info.bAtom))
 		packed = false;

+	bool round_input = single && !jit->js.op->fprIsSingle[inst.FC];
+
 	switch (inst.SUBOP5)
 	{
 	case 18: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD,
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
@@ -56,33 +56,6 @@ void Jit64::ps_sign(UGeckoInstruction inst)
 	fpr.UnlockAll();
 }

-void Jit64::ps_arith(UGeckoInstruction inst)
-{
-	INSTRUCTION_START
-	JITDISABLE(bJITPairedOff);
-	FALLBACK_IF(inst.Rc);
-
-	bool round_input = !jit->js.op->fprIsSingle[inst.FC];
-	switch (inst.SUBOP5)
-	{
-	case 18: // div
-		fp_tri_op(inst.FD, inst.FA, inst.FB, false, true, &XEmitter::VDIVPD, &XEmitter::DIVPD, true);
-		break;
-	case 20: // sub
-		fp_tri_op(inst.FD, inst.FA, inst.FB, false, true, &XEmitter::VSUBPD, &XEmitter::SUBPD, true);
-		break;
-	case 21: // add
-		fp_tri_op(inst.FD, inst.FA, inst.FB, true, true, &XEmitter::VADDPD, &XEmitter::ADDPD, true);
-		break;
-	case 25: // mul
-		fp_tri_op(inst.FD, inst.FA, inst.FC, true, true, &XEmitter::VMULPD, &XEmitter::MULPD, true, round_input);
-		break;
-	default:
-		_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
-		break;
-	}
-}
-
 void Jit64::ps_sum(UGeckoInstruction inst)
 {
 	INSTRUCTION_START