JIT64: clean up and unify float load/store code
While we're at it, support a bunch of float load/store variants that weren't implemented in the JIT. This might not have a big speed impact on typical games, but the variants are used at least a bit in povray and luabench: 694 -> 644 seconds on povray.
Parent: 44ee2f20b9
Commit: aaca1b01e5
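The unified lfXXX/stfXXX handlers in this change pick the concrete PowerPC variant (single vs. double precision, update vs. non-update, indexed vs. immediate offset) from a few opcode bits. A minimal standalone sketch of that decoding follows; it is not part of the commit, and the Inst struct is only a simplified stand-in for Dolphin's UGeckoInstruction, using the standard PowerPC opcode values.

#include <cstdint>
#include <cstdio>

struct Inst             // simplified stand-in for UGeckoInstruction
{
	uint32_t OPCD;      // primary opcode
	uint32_t SUBOP10;   // extended opcode, used when OPCD == 31
	int16_t  SIMM_16;   // signed displacement for the non-indexed forms
};

static void Classify(const Inst& inst)
{
	// Indexed forms (lfsx/lfdx/stfsx/... and their "u" variants) live under primary opcode 31.
	bool indexed = inst.OPCD == 31;
	// "Update" forms write the effective address back to rA.
	bool update  = indexed ? !!(inst.SUBOP10 & 0x20) : !!(inst.OPCD & 1);
	// Single-precision forms need a convert step; double forms move the bits unchanged.
	bool single  = indexed ? !(inst.SUBOP10 & 0x40) : !(inst.OPCD & 2);
	// A non-indexed update with a zero displacement leaves rA unchanged, so treat it as non-update.
	update &= indexed || inst.SIMM_16 != 0;

	std::printf("indexed=%d update=%d single=%d\n", indexed, update, single);
}

int main()
{
	Classify({48, 0, 16});    // lfs    f, 16(r)   -> single load, no update
	Classify({51, 0, 16});    // lfdu   f, 16(r)   -> double load with update
	Classify({31, 663, 0});   // stfsx  f, rA, rB  -> indexed single store
	Classify({31, 759, 0});   // stfdux f, rA, rB  -> indexed double store with update
	return 0;
}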
@@ -192,11 +192,9 @@ public:
 	void cntlzwx(UGeckoInstruction inst);
 
-	void lfs(UGeckoInstruction inst);
-	void lfd(UGeckoInstruction inst);
-	void stfd(UGeckoInstruction inst);
-	void stfs(UGeckoInstruction inst);
-	void stfsx(UGeckoInstruction inst);
+	void lfXXX(UGeckoInstruction inst);
+	void stfXXX(UGeckoInstruction inst);
+	void stfiwx(UGeckoInstruction inst);
 
 	void psq_l(UGeckoInstruction inst);
 	void psq_st(UGeckoInstruction inst);
 
@@ -211,7 +209,6 @@ public:
 	void srwx(UGeckoInstruction inst);
 	void dcbst(UGeckoInstruction inst);
 	void dcbz(UGeckoInstruction inst);
-	void lfsx(UGeckoInstruction inst);
 	void subfic(UGeckoInstruction inst);
 	void subfcx(UGeckoInstruction inst);
 
@@ -82,15 +82,15 @@ static GekkoOPTemplate primarytable[] =
 	{46, &Jit64::lmw}, //"lmw", OPTYPE_SYSTEM, FL_EVIL, 10}},
 	{47, &Jit64::stmw}, //"stmw", OPTYPE_SYSTEM, FL_EVIL, 10}},
 
-	{48, &Jit64::lfs}, //"lfs", OPTYPE_LOADFP, FL_IN_A}},
-	{49, &Jit64::FallBackToInterpreter}, //"lfsu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}},
-	{50, &Jit64::lfd}, //"lfd", OPTYPE_LOADFP, FL_IN_A}},
-	{51, &Jit64::FallBackToInterpreter}, //"lfdu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}},
+	{48, &Jit64::lfXXX}, //"lfs", OPTYPE_LOADFP, FL_IN_A}},
+	{49, &Jit64::lfXXX}, //"lfsu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}},
+	{50, &Jit64::lfXXX}, //"lfd", OPTYPE_LOADFP, FL_IN_A}},
+	{51, &Jit64::lfXXX}, //"lfdu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}},
 
-	{52, &Jit64::stfs}, //"stfs", OPTYPE_STOREFP, FL_IN_A}},
-	{53, &Jit64::FallBackToInterpreter}, //"stfsu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
-	{54, &Jit64::stfd}, //"stfd", OPTYPE_STOREFP, FL_IN_A}},
-	{55, &Jit64::FallBackToInterpreter}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
+	{52, &Jit64::stfXXX}, //"stfs", OPTYPE_STOREFP, FL_IN_A}},
+	{53, &Jit64::stfXXX}, //"stfsu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
+	{54, &Jit64::stfXXX}, //"stfd", OPTYPE_STOREFP, FL_IN_A}},
+	{55, &Jit64::stfXXX}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
 
 	{56, &Jit64::psq_l}, //"psq_l", OPTYPE_PS, FL_IN_A}},
 	{57, &Jit64::psq_l}, //"psq_lu", OPTYPE_PS, FL_OUT_A | FL_IN_A}},
 
@@ -253,16 +253,16 @@ static GekkoOPTemplate table31[] =
 	{725, &Jit64::FallBackToInterpreter}, //"stswi", OPTYPE_STORE, FL_EVIL}},
 
 	// fp load/store
-	{535, &Jit64::lfsx}, //"lfsx", OPTYPE_LOADFP, FL_IN_A0 | FL_IN_B}},
-	{567, &Jit64::FallBackToInterpreter}, //"lfsux", OPTYPE_LOADFP, FL_IN_A | FL_IN_B}},
-	{599, &Jit64::FallBackToInterpreter}, //"lfdx", OPTYPE_LOADFP, FL_IN_A0 | FL_IN_B}},
-	{631, &Jit64::FallBackToInterpreter}, //"lfdux", OPTYPE_LOADFP, FL_IN_A | FL_IN_B}},
+	{535, &Jit64::lfXXX}, //"lfsx", OPTYPE_LOADFP, FL_IN_A0 | FL_IN_B}},
+	{567, &Jit64::lfXXX}, //"lfsux", OPTYPE_LOADFP, FL_IN_A | FL_IN_B}},
+	{599, &Jit64::lfXXX}, //"lfdx", OPTYPE_LOADFP, FL_IN_A0 | FL_IN_B}},
+	{631, &Jit64::lfXXX}, //"lfdux", OPTYPE_LOADFP, FL_IN_A | FL_IN_B}},
 
-	{663, &Jit64::stfsx}, //"stfsx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}},
-	{695, &Jit64::FallBackToInterpreter}, //"stfsux", OPTYPE_STOREFP, FL_IN_A | FL_IN_B}},
-	{727, &Jit64::FallBackToInterpreter}, //"stfdx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}},
-	{759, &Jit64::FallBackToInterpreter}, //"stfdux", OPTYPE_STOREFP, FL_IN_A | FL_IN_B}},
-	{983, &Jit64::FallBackToInterpreter}, //"stfiwx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}},
+	{663, &Jit64::stfXXX}, //"stfsx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}},
+	{695, &Jit64::stfXXX}, //"stfsux", OPTYPE_STOREFP, FL_IN_A | FL_IN_B}},
+	{727, &Jit64::stfXXX}, //"stfdx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}},
+	{759, &Jit64::stfXXX}, //"stfdux", OPTYPE_STOREFP, FL_IN_A | FL_IN_B}},
+	{983, &Jit64::stfiwx}, //"stfiwx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}},
 
 	{19, &Jit64::mfcr}, //"mfcr", OPTYPE_SYSTEM, FL_OUT_D}},
 	{83, &Jit64::mfmsr}, //"mfmsr", OPTYPE_SYSTEM, FL_OUT_D}},
 
@@ -14,134 +14,161 @@ using namespace Gen;
 // TODO: Add peephole optimizations for multiple consecutive lfd/lfs/stfd/stfs since they are so common,
 // and pshufb could help a lot.
 
-void Jit64::lfs(UGeckoInstruction inst)
+void Jit64::lfXXX(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
 	JITDISABLE(bJITLoadStoreFloatingOff);
+	bool indexed = inst.OPCD == 31;
+	bool update = indexed ? !!(inst.SUBOP10 & 0x20) : !!(inst.OPCD & 1);
+	bool single = indexed ? !(inst.SUBOP10 & 0x40) : !(inst.OPCD & 2);
+	update &= indexed || inst.SIMM_16;
 
 	int d = inst.RD;
 	int a = inst.RA;
-	FALLBACK_IF(!a);
+	int b = inst.RB;
 
-	s32 offset = (s32)(s16)inst.SIMM_16;
+	FALLBACK_IF(!indexed && !a);
 
-	SafeLoadToReg(EAX, gpr.R(a), 32, offset, CallerSavedRegistersInUse(), false);
+	if (update)
+		gpr.BindToRegister(a, true, true);
 
+	s32 offset = 0;
+	OpArg addr = gpr.R(a);
+	if (indexed)
+	{
+		if (update)
+		{
+			ADD(32, addr, gpr.R(b));
+		}
+		else
+		{
+			addr = R(EAX);
+			if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg())
+				LEA(32, EAX, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0));
+			else
+			{
+				MOV(32, addr, gpr.R(b));
+				if (a)
+					ADD(32, addr, gpr.R(a));
+			}
+		}
+	}
+	else
+	{
+		if (update)
+			ADD(32, addr, Imm32((s32)(s16)inst.SIMM_16));
+		else
+			offset = (s32)(s16)inst.SIMM_16;
+	}
+
+	SafeLoadToReg(RAX, addr, single ? 32 : 64, offset, CallerSavedRegistersInUse(), false);
 	fpr.Lock(d);
-	fpr.BindToRegister(d, js.memcheck);
+	fpr.BindToRegister(d, js.memcheck || !single);
 
 	MEMCHECK_START
-	ConvertSingleToDouble(fpr.RX(d), EAX, true);
+	if (single)
+	{
+		ConvertSingleToDouble(fpr.RX(d), EAX, true);
+	}
+	else
+	{
+		MOVQ_xmm(XMM0, R(RAX));
+		MOVSD(fpr.RX(d), R(XMM0));
+	}
 	MEMCHECK_END
 
 	fpr.UnlockAll();
+	gpr.UnlockAll();
 }
 
-void Jit64::lfd(UGeckoInstruction inst)
+void Jit64::stfXXX(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
 	JITDISABLE(bJITLoadStoreFloatingOff);
-	FALLBACK_IF(!inst.RA);
+	bool indexed = inst.OPCD == 31;
+	bool update = indexed ? !!(inst.SUBOP10&0x20) : !!(inst.OPCD&1);
+	bool single = indexed ? !(inst.SUBOP10&0x40) : !(inst.OPCD&2);
+	update &= indexed || inst.SIMM_16;
 
-	int d = inst.RD;
-	int a = inst.RA;
-	s32 offset = (s32)(s16)inst.SIMM_16;
-
-	SafeLoadToReg(RAX, gpr.R(a), 64, offset, CallerSavedRegistersInUse(), false);
-
-	fpr.Lock(d);
-	fpr.BindToRegister(d, true);
-
-	MEMCHECK_START
-	MOVQ_xmm(XMM0, R(RAX));
-	MOVSD(fpr.RX(d), R(XMM0));
-	MEMCHECK_END
-
-	fpr.UnlockAll();
-}
-
-void Jit64::stfd(UGeckoInstruction inst)
-{
-	INSTRUCTION_START
-	JITDISABLE(bJITLoadStoreFloatingOff);
-	FALLBACK_IF(!inst.RA);
-
 	int s = inst.RS;
 	int a = inst.RA;
+	int b = inst.RB;
 
+	FALLBACK_IF(!indexed && !a);
+
+	s32 offset = 0;
+	gpr.FlushLockX(ABI_PARAM1);
+	if (indexed)
+	{
+		if (update)
+		{
+			gpr.BindToRegister(a, true, true);
+			ADD(32, gpr.R(a), gpr.R(b));
+			MOV(32, R(ABI_PARAM1), gpr.R(a));
+		}
+		else
+		{
+			if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg())
+				LEA(32, ABI_PARAM1, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0));
+			else
+			{
+				MOV(32, R(ABI_PARAM1), gpr.R(b));
+				if (a)
+					ADD(32, R(ABI_PARAM1), gpr.R(a));
+			}
+		}
+	}
+	else
+	{
+		if (update)
+		{
+			gpr.BindToRegister(a, true, true);
+			ADD(32, gpr.R(a), Imm32((s32)(s16)inst.SIMM_16));
+		}
+		else
+		{
+			offset = (s32)(s16)inst.SIMM_16;
+		}
+		MOV(32, R(ABI_PARAM1), gpr.R(a));
+	}
+
+	if (single)
+	{
+		fpr.BindToRegister(s, true, false);
+		ConvertDoubleToSingle(XMM0, fpr.RX(s));
+		SafeWriteF32ToReg(XMM0, ABI_PARAM1, offset, CallerSavedRegistersInUse());
+		fpr.UnlockAll();
+	}
+	else
+	{
+		if (fpr.R(s).IsSimpleReg())
+			MOVQ_xmm(R(RAX), fpr.RX(s));
+		else
+			MOV(64, R(RAX), fpr.R(s));
+		SafeWriteRegToReg(RAX, ABI_PARAM1, 64, offset, CallerSavedRegistersInUse());
+	}
+	gpr.UnlockAll();
+	gpr.UnlockAllX();
+}
+
+// This one is a little bit weird; it stores the low 32 bits of a double without converting it
+void Jit64::stfiwx(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITLoadStoreFloatingOff);
+
+	int s = inst.RS;
+	int a = inst.RA;
+	int b = inst.RB;
+
 	gpr.FlushLockX(ABI_PARAM1);
-	MOV(32, R(ABI_PARAM1), gpr.R(a));
+	MOV(32, R(ABI_PARAM1), gpr.R(b));
+	if (a)
+		ADD(32, R(ABI_PARAM1), gpr.R(a));
 
 	if (fpr.R(s).IsSimpleReg())
-		MOVQ_xmm(R(RAX), fpr.RX(s));
+		MOVD_xmm(R(EAX), fpr.RX(s));
 	else
-		MOV(64, R(RAX), fpr.R(s));
+		MOV(32, R(EAX), fpr.R(s));
+	SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, CallerSavedRegistersInUse());
 
-	s32 offset = (s32)(s16)inst.SIMM_16;
-	SafeWriteRegToReg(RAX, ABI_PARAM1, 64, offset, CallerSavedRegistersInUse());
-
-	gpr.UnlockAllX();
-}
-
-void Jit64::stfs(UGeckoInstruction inst)
-{
-	INSTRUCTION_START
-	JITDISABLE(bJITLoadStoreFloatingOff);
-	FALLBACK_IF(!inst.RA);
-
-	int s = inst.RS;
-	int a = inst.RA;
-	s32 offset = (s32)(s16)inst.SIMM_16;
-
-	fpr.BindToRegister(s, true, false);
-	ConvertDoubleToSingle(XMM0, fpr.RX(s));
-	gpr.FlushLockX(ABI_PARAM1);
-	MOV(32, R(ABI_PARAM1), gpr.R(a));
-	SafeWriteF32ToReg(XMM0, ABI_PARAM1, offset, CallerSavedRegistersInUse());
-	fpr.UnlockAll();
-	gpr.UnlockAllX();
-}
-
-void Jit64::stfsx(UGeckoInstruction inst)
-{
-	INSTRUCTION_START
-	JITDISABLE(bJITLoadStoreFloatingOff);
-
-	gpr.FlushLockX(ABI_PARAM1);
-	MOV(32, R(ABI_PARAM1), gpr.R(inst.RB));
-	if (inst.RA)
-		ADD(32, R(ABI_PARAM1), gpr.R(inst.RA));
-
-	int s = inst.RS;
-	fpr.Lock(s);
-	fpr.BindToRegister(s, true, false);
-	ConvertDoubleToSingle(XMM0, fpr.RX(s));
-	SafeWriteF32ToReg(XMM0, ABI_PARAM1, 0, CallerSavedRegistersInUse());
-	fpr.UnlockAll();
-	gpr.UnlockAllX();
-}
-
-void Jit64::lfsx(UGeckoInstruction inst)
-{
-	INSTRUCTION_START
-	JITDISABLE(bJITLoadStoreFloatingOff);
-
-	MOV(32, R(EAX), gpr.R(inst.RB));
-	if (inst.RA)
-		ADD(32, R(EAX), gpr.R(inst.RA));
-
-	SafeLoadToReg(EAX, R(EAX), 32, 0, CallerSavedRegistersInUse(), false);
-
-	fpr.Lock(inst.RS);
-	fpr.BindToRegister(inst.RS, js.memcheck);
-
-	MEMCHECK_START
-	ConvertSingleToDouble(fpr.RX(inst.RS), EAX, true);
-	MEMCHECK_END
-
-	fpr.UnlockAll();
 	gpr.UnlockAllX();
 }