JIT64: clean up and unify float load/store code

While we're at it, support a bunch of float load/store variants that weren't
implemented in the JIT. Might not have a big speed impact on typical games but
they're used at least a bit in povray and luabench.

694 -> 644 seconds on povray.
This commit is contained in:
Fiora 2014-08-25 13:56:01 -07:00
parent 44ee2f20b9
commit aaca1b01e5
3 changed files with 149 additions and 125 deletions

View File

@@ -192,11 +192,9 @@ public:
void cntlzwx(UGeckoInstruction inst);
void lfs(UGeckoInstruction inst);
void lfd(UGeckoInstruction inst);
void stfd(UGeckoInstruction inst);
void stfs(UGeckoInstruction inst);
void stfsx(UGeckoInstruction inst);
void lfXXX(UGeckoInstruction inst);
void stfXXX(UGeckoInstruction inst);
void stfiwx(UGeckoInstruction inst);
void psq_l(UGeckoInstruction inst);
void psq_st(UGeckoInstruction inst);
@@ -211,7 +209,6 @@ public:
void srwx(UGeckoInstruction inst);
void dcbst(UGeckoInstruction inst);
void dcbz(UGeckoInstruction inst);
void lfsx(UGeckoInstruction inst);
void subfic(UGeckoInstruction inst);
void subfcx(UGeckoInstruction inst);

View File

@@ -82,15 +82,15 @@ static GekkoOPTemplate primarytable[] =
{46, &Jit64::lmw}, //"lmw", OPTYPE_SYSTEM, FL_EVIL, 10}},
{47, &Jit64::stmw}, //"stmw", OPTYPE_SYSTEM, FL_EVIL, 10}},
{48, &Jit64::lfs}, //"lfs", OPTYPE_LOADFP, FL_IN_A}},
{49, &Jit64::FallBackToInterpreter}, //"lfsu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}},
{50, &Jit64::lfd}, //"lfd", OPTYPE_LOADFP, FL_IN_A}},
{51, &Jit64::FallBackToInterpreter}, //"lfdu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}},
{48, &Jit64::lfXXX}, //"lfs", OPTYPE_LOADFP, FL_IN_A}},
{49, &Jit64::lfXXX}, //"lfsu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}},
{50, &Jit64::lfXXX}, //"lfd", OPTYPE_LOADFP, FL_IN_A}},
{51, &Jit64::lfXXX}, //"lfdu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}},
{52, &Jit64::stfs}, //"stfs", OPTYPE_STOREFP, FL_IN_A}},
{53, &Jit64::FallBackToInterpreter}, //"stfsu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
{54, &Jit64::stfd}, //"stfd", OPTYPE_STOREFP, FL_IN_A}},
{55, &Jit64::FallBackToInterpreter}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
{52, &Jit64::stfXXX}, //"stfs", OPTYPE_STOREFP, FL_IN_A}},
{53, &Jit64::stfXXX}, //"stfsu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
{54, &Jit64::stfXXX}, //"stfd", OPTYPE_STOREFP, FL_IN_A}},
{55, &Jit64::stfXXX}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
{56, &Jit64::psq_l}, //"psq_l", OPTYPE_PS, FL_IN_A}},
{57, &Jit64::psq_l}, //"psq_lu", OPTYPE_PS, FL_OUT_A | FL_IN_A}},
@@ -253,16 +253,16 @@ static GekkoOPTemplate table31[] =
{725, &Jit64::FallBackToInterpreter}, //"stswi", OPTYPE_STORE, FL_EVIL}},
// fp load/store
{535, &Jit64::lfsx}, //"lfsx", OPTYPE_LOADFP, FL_IN_A0 | FL_IN_B}},
{567, &Jit64::FallBackToInterpreter}, //"lfsux", OPTYPE_LOADFP, FL_IN_A | FL_IN_B}},
{599, &Jit64::FallBackToInterpreter}, //"lfdx", OPTYPE_LOADFP, FL_IN_A0 | FL_IN_B}},
{631, &Jit64::FallBackToInterpreter}, //"lfdux", OPTYPE_LOADFP, FL_IN_A | FL_IN_B}},
{535, &Jit64::lfXXX}, //"lfsx", OPTYPE_LOADFP, FL_IN_A0 | FL_IN_B}},
{567, &Jit64::lfXXX}, //"lfsux", OPTYPE_LOADFP, FL_IN_A | FL_IN_B}},
{599, &Jit64::lfXXX}, //"lfdx", OPTYPE_LOADFP, FL_IN_A0 | FL_IN_B}},
{631, &Jit64::lfXXX}, //"lfdux", OPTYPE_LOADFP, FL_IN_A | FL_IN_B}},
{663, &Jit64::stfsx}, //"stfsx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}},
{695, &Jit64::FallBackToInterpreter}, //"stfsux", OPTYPE_STOREFP, FL_IN_A | FL_IN_B}},
{727, &Jit64::FallBackToInterpreter}, //"stfdx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}},
{759, &Jit64::FallBackToInterpreter}, //"stfdux", OPTYPE_STOREFP, FL_IN_A | FL_IN_B}},
{983, &Jit64::FallBackToInterpreter}, //"stfiwx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}},
{663, &Jit64::stfXXX}, //"stfsx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}},
{695, &Jit64::stfXXX}, //"stfsux", OPTYPE_STOREFP, FL_IN_A | FL_IN_B}},
{727, &Jit64::stfXXX}, //"stfdx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}},
{759, &Jit64::stfXXX}, //"stfdux", OPTYPE_STOREFP, FL_IN_A | FL_IN_B}},
{983, &Jit64::stfiwx}, //"stfiwx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}},
{19, &Jit64::mfcr}, //"mfcr", OPTYPE_SYSTEM, FL_OUT_D}},
{83, &Jit64::mfmsr}, //"mfmsr", OPTYPE_SYSTEM, FL_OUT_D}},

View File

@@ -14,134 +14,161 @@ using namespace Gen;
// TODO: Add peephole optimizations for multiple consecutive lfd/lfs/stfd/stfs since they are so common,
// and pshufb could help a lot.
void Jit64::lfs(UGeckoInstruction inst)
void Jit64::lfXXX(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStoreFloatingOff);
bool indexed = inst.OPCD == 31;
bool update = indexed ? !!(inst.SUBOP10 & 0x20) : !!(inst.OPCD & 1);
bool single = indexed ? !(inst.SUBOP10 & 0x40) : !(inst.OPCD & 2);
update &= indexed || inst.SIMM_16;
int d = inst.RD;
int a = inst.RA;
FALLBACK_IF(!a);
int b = inst.RB;
s32 offset = (s32)(s16)inst.SIMM_16;
FALLBACK_IF(!indexed && !a);
SafeLoadToReg(EAX, gpr.R(a), 32, offset, CallerSavedRegistersInUse(), false);
if (update)
gpr.BindToRegister(a, true, true);
s32 offset = 0;
OpArg addr = gpr.R(a);
if (indexed)
{
if (update)
{
ADD(32, addr, gpr.R(b));
}
else
{
addr = R(EAX);
if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg())
LEA(32, EAX, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0));
else
{
MOV(32, addr, gpr.R(b));
if (a)
ADD(32, addr, gpr.R(a));
}
}
}
else
{
if (update)
ADD(32, addr, Imm32((s32)(s16)inst.SIMM_16));
else
offset = (s32)(s16)inst.SIMM_16;
}
SafeLoadToReg(RAX, addr, single ? 32 : 64, offset, CallerSavedRegistersInUse(), false);
fpr.Lock(d);
fpr.BindToRegister(d, js.memcheck);
fpr.BindToRegister(d, js.memcheck || !single);
MEMCHECK_START
ConvertSingleToDouble(fpr.RX(d), EAX, true);
if (single)
{
ConvertSingleToDouble(fpr.RX(d), EAX, true);
}
else
{
MOVQ_xmm(XMM0, R(RAX));
MOVSD(fpr.RX(d), R(XMM0));
}
MEMCHECK_END
fpr.UnlockAll();
gpr.UnlockAll();
}
void Jit64::lfd(UGeckoInstruction inst)
void Jit64::stfXXX(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStoreFloatingOff);
FALLBACK_IF(!inst.RA);
int d = inst.RD;
int a = inst.RA;
s32 offset = (s32)(s16)inst.SIMM_16;
SafeLoadToReg(RAX, gpr.R(a), 64, offset, CallerSavedRegistersInUse(), false);
fpr.Lock(d);
fpr.BindToRegister(d, true);
MEMCHECK_START
MOVQ_xmm(XMM0, R(RAX));
MOVSD(fpr.RX(d), R(XMM0));
MEMCHECK_END
fpr.UnlockAll();
}
void Jit64::stfd(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStoreFloatingOff);
FALLBACK_IF(!inst.RA);
bool indexed = inst.OPCD == 31;
bool update = indexed ? !!(inst.SUBOP10&0x20) : !!(inst.OPCD&1);
bool single = indexed ? !(inst.SUBOP10&0x40) : !(inst.OPCD&2);
update &= indexed || inst.SIMM_16;
int s = inst.RS;
int a = inst.RA;
int b = inst.RB;
FALLBACK_IF(!indexed && !a);
s32 offset = 0;
gpr.FlushLockX(ABI_PARAM1);
if (indexed)
{
if (update)
{
gpr.BindToRegister(a, true, true);
ADD(32, gpr.R(a), gpr.R(b));
MOV(32, R(ABI_PARAM1), gpr.R(a));
}
else
{
if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg())
LEA(32, ABI_PARAM1, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0));
else
{
MOV(32, R(ABI_PARAM1), gpr.R(b));
if (a)
ADD(32, R(ABI_PARAM1), gpr.R(a));
}
}
}
else
{
if (update)
{
gpr.BindToRegister(a, true, true);
ADD(32, gpr.R(a), Imm32((s32)(s16)inst.SIMM_16));
}
else
{
offset = (s32)(s16)inst.SIMM_16;
}
MOV(32, R(ABI_PARAM1), gpr.R(a));
}
if (single)
{
fpr.BindToRegister(s, true, false);
ConvertDoubleToSingle(XMM0, fpr.RX(s));
SafeWriteF32ToReg(XMM0, ABI_PARAM1, offset, CallerSavedRegistersInUse());
fpr.UnlockAll();
}
else
{
if (fpr.R(s).IsSimpleReg())
MOVQ_xmm(R(RAX), fpr.RX(s));
else
MOV(64, R(RAX), fpr.R(s));
SafeWriteRegToReg(RAX, ABI_PARAM1, 64, offset, CallerSavedRegistersInUse());
}
gpr.UnlockAll();
gpr.UnlockAllX();
}
// This one is a little bit weird; it stores the low 32 bits of a double without converting it
void Jit64::stfiwx(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStoreFloatingOff);
int s = inst.RS;
int a = inst.RA;
int b = inst.RB;
gpr.FlushLockX(ABI_PARAM1);
MOV(32, R(ABI_PARAM1), gpr.R(a));
MOV(32, R(ABI_PARAM1), gpr.R(b));
if (a)
ADD(32, R(ABI_PARAM1), gpr.R(a));
if (fpr.R(s).IsSimpleReg())
MOVQ_xmm(R(RAX), fpr.RX(s));
MOVD_xmm(R(EAX), fpr.RX(s));
else
MOV(64, R(RAX), fpr.R(s));
s32 offset = (s32)(s16)inst.SIMM_16;
SafeWriteRegToReg(RAX, ABI_PARAM1, 64, offset, CallerSavedRegistersInUse());
gpr.UnlockAllX();
}
void Jit64::stfs(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStoreFloatingOff);
FALLBACK_IF(!inst.RA);
int s = inst.RS;
int a = inst.RA;
s32 offset = (s32)(s16)inst.SIMM_16;
fpr.BindToRegister(s, true, false);
ConvertDoubleToSingle(XMM0, fpr.RX(s));
gpr.FlushLockX(ABI_PARAM1);
MOV(32, R(ABI_PARAM1), gpr.R(a));
SafeWriteF32ToReg(XMM0, ABI_PARAM1, offset, CallerSavedRegistersInUse());
fpr.UnlockAll();
gpr.UnlockAllX();
}
void Jit64::stfsx(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStoreFloatingOff);
gpr.FlushLockX(ABI_PARAM1);
MOV(32, R(ABI_PARAM1), gpr.R(inst.RB));
if (inst.RA)
ADD(32, R(ABI_PARAM1), gpr.R(inst.RA));
int s = inst.RS;
fpr.Lock(s);
fpr.BindToRegister(s, true, false);
ConvertDoubleToSingle(XMM0, fpr.RX(s));
SafeWriteF32ToReg(XMM0, ABI_PARAM1, 0, CallerSavedRegistersInUse());
fpr.UnlockAll();
gpr.UnlockAllX();
}
void Jit64::lfsx(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStoreFloatingOff);
MOV(32, R(EAX), gpr.R(inst.RB));
if (inst.RA)
ADD(32, R(EAX), gpr.R(inst.RA));
SafeLoadToReg(EAX, R(EAX), 32, 0, CallerSavedRegistersInUse(), false);
fpr.Lock(inst.RS);
fpr.BindToRegister(inst.RS, js.memcheck);
MEMCHECK_START
ConvertSingleToDouble(fpr.RX(inst.RS), EAX, true);
MEMCHECK_END
fpr.UnlockAll();
MOV(32, R(EAX), fpr.R(s));
SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, CallerSavedRegistersInUse());
gpr.UnlockAllX();
}