diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index b468e8aaad..0a367f8fa4 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -193,7 +193,6 @@ public: void ps_mr(UGeckoInstruction inst); void ps_sign(UGeckoInstruction inst); //aggregate - void ps_arith(UGeckoInstruction inst); //aggregate void ps_mergeXX(UGeckoInstruction inst); void ps_maddXX(UGeckoInstruction inst); void ps_res(UGeckoInstruction inst); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp index 0d8cd10cf3..8b9dc5d3c2 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp @@ -124,12 +124,12 @@ static GekkoOPTemplate table4_2[] = {13, &Jit64::ps_muls}, // ps_muls1 {14, &Jit64::ps_maddXX}, // ps_madds0 {15, &Jit64::ps_maddXX}, // ps_madds1 - {18, &Jit64::ps_arith}, // ps_div - {20, &Jit64::ps_arith}, // ps_sub - {21, &Jit64::ps_arith}, // ps_add + {18, &Jit64::fp_arith}, // ps_div + {20, &Jit64::fp_arith}, // ps_sub + {21, &Jit64::fp_arith}, // ps_add {23, &Jit64::fselx}, // ps_sel {24, &Jit64::ps_res}, // ps_res - {25, &Jit64::ps_arith}, // ps_mul + {25, &Jit64::fp_arith}, // ps_mul {26, &Jit64::ps_rsqrte}, // ps_rsqrte {28, &Jit64::ps_maddXX}, // ps_msub {29, &Jit64::ps_maddXX}, // ps_madd diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index 0201ed5936..5313a76da0 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -67,16 +67,19 @@ void Jit64::fp_arith(UGeckoInstruction inst) int d = inst.FD; int arg2 = inst.SUBOP5 == 25 ? c : b; - bool single = inst.OPCD == 59; - bool round_input = single && !jit->js.op->fprIsSingle[inst.FC]; + bool single = inst.OPCD == 4 || inst.OPCD == 59; // If both the inputs are known to have identical top and bottom halves, we can skip the MOVDDUP at the end by // using packed arithmetic instead. - bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[arg2]; + bool packed = inst.OPCD == 4 || (inst.OPCD == 59 && + jit->js.op->fprIsDuplicated[a] && + jit->js.op->fprIsDuplicated[arg2]); // Packed divides are slower than scalar divides on basically all x86, so this optimization isn't worth it in that case. // Atoms (and a few really old CPUs) are also slower on packed operations than scalar ones. - if (inst.SUBOP5 == 18 || cpu_info.bAtom) + if (inst.OPCD == 59 && (inst.SUBOP5 == 18 || cpu_info.bAtom)) packed = false; + bool round_input = single && !jit->js.op->fprIsSingle[inst.FC]; + switch (inst.SUBOP5) { case 18: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD, diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index 2b66c7bef1..8154151be5 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -56,33 +56,6 @@ void Jit64::ps_sign(UGeckoInstruction inst) fpr.UnlockAll(); } -void Jit64::ps_arith(UGeckoInstruction inst) -{ - INSTRUCTION_START - JITDISABLE(bJITPairedOff); - FALLBACK_IF(inst.Rc); - - bool round_input = !jit->js.op->fprIsSingle[inst.FC]; - switch (inst.SUBOP5) - { - case 18: // div - fp_tri_op(inst.FD, inst.FA, inst.FB, false, true, &XEmitter::VDIVPD, &XEmitter::DIVPD, true); - break; - case 20: // sub - fp_tri_op(inst.FD, inst.FA, inst.FB, false, true, &XEmitter::VSUBPD, &XEmitter::SUBPD, true); - break; - case 21: // add - fp_tri_op(inst.FD, inst.FA, inst.FB, true, true, &XEmitter::VADDPD, &XEmitter::ADDPD, true); - break; - case 25: // mul - fp_tri_op(inst.FD, inst.FA, inst.FC, true, true, &XEmitter::VMULPD, &XEmitter::MULPD, true, round_input); - break; - default: - _assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!"); - break; - } -} - void Jit64::ps_sum(UGeckoInstruction inst) { INSTRUCTION_START