Jit64: merge ps_arith into fp_arith
This commit is contained in:
parent
6d23b511a6
commit
c6147c5ed5
|
@ -193,7 +193,6 @@ public:
|
|||
|
||||
void ps_mr(UGeckoInstruction inst);
|
||||
void ps_sign(UGeckoInstruction inst); //aggregate
|
||||
void ps_arith(UGeckoInstruction inst); //aggregate
|
||||
void ps_mergeXX(UGeckoInstruction inst);
|
||||
void ps_maddXX(UGeckoInstruction inst);
|
||||
void ps_res(UGeckoInstruction inst);
|
||||
|
|
|
@ -124,12 +124,12 @@ static GekkoOPTemplate table4_2[] =
|
|||
{13, &Jit64::ps_muls}, // ps_muls1
|
||||
{14, &Jit64::ps_maddXX}, // ps_madds0
|
||||
{15, &Jit64::ps_maddXX}, // ps_madds1
|
||||
{18, &Jit64::ps_arith}, // ps_div
|
||||
{20, &Jit64::ps_arith}, // ps_sub
|
||||
{21, &Jit64::ps_arith}, // ps_add
|
||||
{18, &Jit64::fp_arith}, // ps_div
|
||||
{20, &Jit64::fp_arith}, // ps_sub
|
||||
{21, &Jit64::fp_arith}, // ps_add
|
||||
{23, &Jit64::fselx}, // ps_sel
|
||||
{24, &Jit64::ps_res}, // ps_res
|
||||
{25, &Jit64::ps_arith}, // ps_mul
|
||||
{25, &Jit64::fp_arith}, // ps_mul
|
||||
{26, &Jit64::ps_rsqrte}, // ps_rsqrte
|
||||
{28, &Jit64::ps_maddXX}, // ps_msub
|
||||
{29, &Jit64::ps_maddXX}, // ps_madd
|
||||
|
|
|
@ -67,16 +67,19 @@ void Jit64::fp_arith(UGeckoInstruction inst)
|
|||
int d = inst.FD;
|
||||
int arg2 = inst.SUBOP5 == 25 ? c : b;
|
||||
|
||||
bool single = inst.OPCD == 59;
|
||||
bool round_input = single && !jit->js.op->fprIsSingle[inst.FC];
|
||||
bool single = inst.OPCD == 4 || inst.OPCD == 59;
|
||||
// If both the inputs are known to have identical top and bottom halves, we can skip the MOVDDUP at the end by
|
||||
// using packed arithmetic instead.
|
||||
bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[arg2];
|
||||
bool packed = inst.OPCD == 4 || (inst.OPCD == 59 &&
|
||||
jit->js.op->fprIsDuplicated[a] &&
|
||||
jit->js.op->fprIsDuplicated[arg2]);
|
||||
// Packed divides are slower than scalar divides on basically all x86, so this optimization isn't worth it in that case.
|
||||
// Atoms (and a few really old CPUs) are also slower on packed operations than scalar ones.
|
||||
if (inst.SUBOP5 == 18 || cpu_info.bAtom)
|
||||
if (inst.OPCD == 59 && (inst.SUBOP5 == 18 || cpu_info.bAtom))
|
||||
packed = false;
|
||||
|
||||
bool round_input = single && !jit->js.op->fprIsSingle[inst.FC];
|
||||
|
||||
switch (inst.SUBOP5)
|
||||
{
|
||||
case 18: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD,
|
||||
|
|
|
@ -56,33 +56,6 @@ void Jit64::ps_sign(UGeckoInstruction inst)
|
|||
fpr.UnlockAll();
|
||||
}
|
||||
|
||||
// Emits code for the four basic paired-single arithmetic instructions by
// dispatching on inst.SUBOP5: 18 = div, 20 = sub, 21 = add, 25 = mul.
// Every case forwards to fp_tri_op with inst.FD as the first register argument
// and inst.FA as the second; the third is inst.FB for div/sub/add and inst.FC
// for mul. Each case supplies both a VEX-named (V*PD) and a non-VEX (*PD)
// packed-double emitter.
// NOTE(review): the meaning of fp_tri_op's positional bool arguments is not
// visible in this hunk -- the first bool is true only for add and mul
// (presumably "operands may be swapped"); confirm against fp_tri_op's
// declaration.
void Jit64::ps_arith(UGeckoInstruction inst)
|
||||
{
|
||||
INSTRUCTION_START
|
||||
JITDISABLE(bJITPairedOff);
|
||||
// Going by the macro name, the record form (Rc set) is not compiled here and
// is handed to a fallback path -- confirm against FALLBACK_IF's definition.
FALLBACK_IF(inst.Rc);
|
||||
|
||||
// True when the analysis data for this op does not mark FC as single; it is
// only consumed by the mul case below, where FC is the second source operand.
bool round_input = !jit->js.op->fprIsSingle[inst.FC];
|
||||
switch (inst.SUBOP5)
|
||||
{
|
||||
case 18: // div
|
||||
fp_tri_op(inst.FD, inst.FA, inst.FB, false, true, &XEmitter::VDIVPD, &XEmitter::DIVPD, true);
|
||||
break;
|
||||
case 20: // sub
|
||||
fp_tri_op(inst.FD, inst.FA, inst.FB, false, true, &XEmitter::VSUBPD, &XEmitter::SUBPD, true);
|
||||
break;
|
||||
case 21: // add
|
||||
fp_tri_op(inst.FD, inst.FA, inst.FB, true, true, &XEmitter::VADDPD, &XEmitter::ADDPD, true);
|
||||
break;
|
||||
case 25: // mul -- the only case that reads FC and passes round_input
|
||||
fp_tri_op(inst.FD, inst.FA, inst.FC, true, true, &XEmitter::VMULPD, &XEmitter::MULPD, true, round_input);
|
||||
break;
|
||||
// Any other SUBOP5 value is not handled by this function; assert.
default:
|
||||
_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void Jit64::ps_sum(UGeckoInstruction inst)
|
||||
{
|
||||
INSTRUCTION_START
|
||||
|
|
Loading…
Reference in New Issue