EERec: Don't load in skip case of SD[LR]

Authored by TellowKrinkle on 2021-09-18 01:25:04 -05:00; committed by refractionpcsx2
parent e9518f78c7
commit 23578e963f
1 changed file with 195 additions and 192 deletions
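
SDL and SDR are the EE's unaligned-store instructions: each writes only part of the aligned doubleword containing the effective address, so the recompiled code normally has to read that doubleword, merge the relevant bytes of rt into it, and write it back. The exception is the skip case ((addr & 7) == 7 for SDL, (addr & 7) == 0 for SDR), where the store covers all eight bytes; this commit stops emitting the 64-bit load in exactly that case. As a plain scalar sketch of the merges the emitted code performs (an illustration under the usual little-endian EE assumption, not code from the commit):

#include <cstdint>

// Scalar model of little-endian SDL/SDR: 'mem' is the aligned doubleword containing
// addr, 'rt' is the register being stored; the return value is what gets written back.
static uint64_t sdl_merge(uint64_t mem, uint64_t rt, uint32_t addr)
{
    uint32_t shift = ((addr & 7) + 1) * 8;   // same formula the recompiler uses
    if (shift == 64)
        return rt;                           // whole doubleword covered: no load needed
    return (mem & (~0ull << shift)) | (rt >> (64 - shift));
}

static uint64_t sdr_merge(uint64_t mem, uint64_t rt, uint32_t addr)
{
    uint32_t shift = (addr & 7) * 8;
    if (shift == 0)
        return rt;                           // aligned address: again no load needed
    return (mem & (~0ull >> (64 - shift))) | (rt << shift);
}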


@@ -458,6 +458,43 @@ void recSWR()
 ////////////////////////////////////////////////////
+/// Masks rt with (0xffffffffffffffff maskshift maskamt), merges with (value shift amt), leaves result in value
+static void ldlrhelper_const(int maskamt, const xImplSimd_Shift& maskshift, int amt, const xImplSimd_Shift& shift, const xRegisterSSE& value, const xRegisterSSE& rt)
+{
+    int t0reg = _allocTempXMMreg(XMMT_INT, -1);
+    xRegisterSSE t0(t0reg);
+
+    xPCMP.EQD(t0, t0);
+    maskshift.Q(t0, maskamt);
+    xPAND(t0, rt);
+
+    shift.Q(value, amt);
+    xPOR(value, t0);
+
+    _freeXMMreg(t0reg);
+}
+
+/// Masks rt with (0xffffffffffffffff maskshift maskamt), merges with (value shift amt), leaves result in value
+static void ldlrhelper(const xRegister32& maskamt, const xImplSimd_Shift& maskshift, const xRegister32& amt, const xImplSimd_Shift& shift, const xRegisterSSE& value, const xRegisterSSE& rt)
+{
+    int t0reg = _allocTempXMMreg(XMMT_INT, -1);
+    int t1reg = _allocTempXMMreg(XMMT_INT, -1);
+    xRegisterSSE t0(t0reg);
+    xRegisterSSE t1(t1reg);
+
+    xMOVDZX(t1, maskamt);
+    xPCMP.EQD(t0, t0);
+    maskshift.Q(t0, t1);
+    xPAND(t0, rt);
+
+    xMOVDZX(t1, amt);
+    shift.Q(value, t1);
+    xPOR(value, t0);
+
+    _freeXMMreg(t1reg);
+    _freeXMMreg(t0reg);
+}
+
 void recLDL()
 {
     if (!_Rt_)
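
The two new helpers factor out the mask-and-merge sequence that recLDL/recLDR previously emitted inline: PCMPEQD produces an all-ones value, the first shift turns it into the byte mask, and the masked part of rt is OR-ed with the shifted load result. In scalar terms, for the (xPSRL, xPSLL) pair that recLDL passes in, the helper computes roughly the following (an illustrative sketch, not code from the commit):

#include <cstdint>

// Scalar equivalent of ldlrhelper_const(maskamt, xPSRL, amt, xPSLL, value, rt);
// the other call sites simply swap the two shift directions.
static uint64_t ldlrhelper_scalar(int maskamt, int amt, uint64_t value, uint64_t rt)
{
    uint64_t mask = ~0ull >> maskamt; // xPCMP.EQD(t0, t0) then maskshift.Q(t0, maskamt)
    uint64_t keep = rt & mask;        // xPAND(t0, rt): bytes of rt that survive
    return (value << amt) | keep;     // shift.Q(value, amt) then xPOR(value, t0)
}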
@@ -480,6 +517,7 @@ void recLDL()
         if (_Imm_ != 0)
             xADD(arg1regd, _Imm_);
+        xMOV(calleeSavedReg1d, arg1regd);
         xAND(arg1regd, ~0x07);

         iFlushCall(FLUSH_FULLVTLB);
@@ -488,46 +526,34 @@ void recLDL()
     }

     int rtreg = _allocGPRtoXMMreg(-1, _Rt_, MODE_READ | MODE_WRITE);
-    int t0reg = _allocTempXMMreg(XMMT_INT, -1);
-    int t1reg = _allocTempXMMreg(XMMT_INT, -1);

     if (GPR_IS_CONST1(_Rs_))
     {
-        u32 shiftval = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
-        shiftval &= 0x7;
-        xMOV(eax, shiftval + 1);
+        u32 shift = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
+        shift = ((shift & 0x7) + 1) * 8;
+        if (shift != 64)
+        {
+            ldlrhelper_const(shift, xPSRL, 64 - shift, xPSLL, xRegisterSSE(t2reg), xRegisterSSE(rtreg));
+        }
     }
     else
     {
-        _eeMoveGPRtoR(eax, _Rs_);
-        if (_Imm_ != 0)
-            xADD(eax, _Imm_);
-        xAND(eax, 0x7);
-        xADD(eax, 1);
-    }
-    xCMP(eax, 8);
+        xAND(calleeSavedReg1d, 0x7);
+        xCMP(calleeSavedReg1d, 7);
         xForwardJE8 skip;
-    //Calculate the shift from top bit to lowest
+        // Calculate the shift from top bit to lowest
+        xADD(calleeSavedReg1d, 1);
         xMOV(edx, 64);
-    xSHL(eax, 3);
-    xSUB(edx, eax);
-    xMOVDZX(xRegisterSSE(t1reg), eax);
-    xPCMP.EQD(xRegisterSSE(t0reg), xRegisterSSE(t0reg));
-    xPSRL.Q(xRegisterSSE(t0reg), xRegisterSSE(t1reg));
-    xPAND(xRegisterSSE(t0reg), xRegisterSSE(rtreg));
-    xMOVDZX(xRegisterSSE(t1reg), edx);
-    xPSLL.Q(xRegisterSSE(t2reg), xRegisterSSE(t1reg));
-    xPOR(xRegisterSSE(t2reg), xRegisterSSE(t0reg));
+        xSHL(calleeSavedReg1d, 3);
+        xSUB(edx, calleeSavedReg1d);
+        ldlrhelper(calleeSavedReg1d, xPSRL, edx, xPSLL, xRegisterSSE(t2reg), xRegisterSSE(rtreg));
         skip.SetTarget();
+    }

     xMOVSD(xRegisterSSE(rtreg), xRegisterSSE(t2reg));

-    _freeXMMreg(t0reg);
-    _freeXMMreg(t1reg);
     _freeXMMreg(t2reg);
+    _clearNeededXMMregs();

 #else

     iFlushCall(FLUSH_INTERPRETER);
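
For LDL the recompiler now computes shift = ((addr & 7) + 1) * 8 up front: when the shift is 64 the whole doubleword is loaded and the trailing xMOVSD alone is enough, otherwise the helper merges the low shift bits of the loaded doubleword into the top of rt. A scalar restatement of that case split (illustrative only, not code from the commit):

#include <cstdint>

// What the recLDL path above computes, in scalar form (little-endian LDL).
static uint64_t ldl_merge(uint64_t mem, uint64_t rt, uint32_t addr)
{
    uint32_t shift = ((addr & 7) + 1) * 8;
    if (shift == 64)
        return mem;                                          // the xMOVSD copy covers this case
    return (mem << (64 - shift)) | (rt & (~0ull >> shift));  // ldlrhelper_const(shift, xPSRL, 64 - shift, xPSLL, ...)
}
// e.g. (addr & 7) == 3 gives shift == 32: rt = (mem << 32) | (rt & 0xFFFFFFFFull)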
@@ -562,6 +588,7 @@ void recLDR()
         if (_Imm_ != 0)
             xADD(arg1regd, _Imm_);
+        xMOV(calleeSavedReg1d, arg1regd);
         xAND(arg1regd, ~0x07);

         iFlushCall(FLUSH_FULLVTLB);
@@ -570,45 +597,33 @@ void recLDR()
     }

     int rtreg = _allocGPRtoXMMreg(-1, _Rt_, MODE_READ | MODE_WRITE);
-    int t0reg = _allocTempXMMreg(XMMT_INT, -1);
-    int t1reg = _allocTempXMMreg(XMMT_INT, -1);

     if (GPR_IS_CONST1(_Rs_))
     {
-        u32 shiftval = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
-        shiftval &= 0x7;
-        xMOV(eax, shiftval);
+        u32 shift = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
+        shift = (shift & 0x7) * 8;
+        if (shift != 0)
+        {
+            ldlrhelper_const(64 - shift, xPSLL, shift, xPSRL, xRegisterSSE(t2reg), xRegisterSSE(rtreg));
+        }
     }
     else
     {
-        _eeMoveGPRtoR(eax, _Rs_);
-        if (_Imm_ != 0)
-            xADD(eax, _Imm_);
-        xAND(eax, 0x7);
+        xAND(calleeSavedReg1d, 0x7);
+        xForwardJE8 skip;
+        // Calculate the shift from top bit to lowest
+        xMOV(edx, 64);
+        xSHL(calleeSavedReg1d, 3);
+        xSUB(edx, calleeSavedReg1d);
+        ldlrhelper(edx, xPSLL, calleeSavedReg1d, xPSRL, xRegisterSSE(t2reg), xRegisterSSE(rtreg));
+        skip.SetTarget();
     }
-    xCMP(eax, 0);
-    xForwardJE32 skip;
-    //Calculate the shift from top bit to lowest
-    xMOV(edx, 64);
-    xSHL(eax, 3);
-    xSUB(edx, eax);
-    xMOVDZX(xRegisterSSE(t1reg), edx); //64-shift*8
-    xPCMP.EQD(xRegisterSSE(t0reg), xRegisterSSE(t0reg));
-    xPSLL.Q(xRegisterSSE(t0reg), xRegisterSSE(t1reg));
-    xPAND(xRegisterSSE(t0reg), xRegisterSSE(rtreg));
-    xMOVDZX(xRegisterSSE(t1reg), eax); //shift*8
-    xPSRL.Q(xRegisterSSE(t2reg), xRegisterSSE(t1reg));
-    xPOR(xRegisterSSE(t2reg), xRegisterSSE(t0reg));
-    skip.SetTarget();

     xMOVSD(xRegisterSSE(rtreg), xRegisterSSE(t2reg));

-    _freeXMMreg(t0reg);
-    _freeXMMreg(t1reg);
     _freeXMMreg(t2reg);
+    _clearNeededXMMregs();

 #else

     iFlushCall(FLUSH_INTERPRETER);
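
recLDR is the mirror image: shift = (addr & 7) * 8, a shift of 0 means the load replaces rt outright, and otherwise the high bytes of rt are kept while the loaded doubleword is shifted down underneath them. The scalar counterpart (again just an illustration of what the emitted code computes):

#include <cstdint>

// Scalar form of the recLDR merge above (little-endian LDR).
static uint64_t ldr_merge(uint64_t mem, uint64_t rt, uint32_t addr)
{
    uint32_t shift = (addr & 7) * 8;
    if (shift == 0)
        return mem;                                          // aligned: plain 64-bit load
    return (mem >> shift) | (rt & (~0ull << (64 - shift)));  // ldlrhelper_const(64 - shift, xPSLL, shift, xPSRL, ...)
}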
@@ -622,16 +637,73 @@ void recLDR()
 ////////////////////////////////////////////////////
+/// Masks value with (0xffffffffffffffff maskshift maskamt), merges with (rt shift amt), saves to dummyValue
+static void sdlrhelper_const(int maskamt, const xImplSimd_Shift& maskshift, int amt, const xImplSimd_Shift& shift, const xRegisterSSE& value, const xRegisterSSE& rt)
+{
+    int t0reg = _allocTempXMMreg(XMMT_INT, -1);
+    xRegisterSSE t0(t0reg);
+
+    xPCMP.EQD(t0, t0);
+    maskshift.Q(t0, maskamt);
+    xPAND(t0, value);
+
+    shift.Q(rt, amt);
+    xPOR(rt, t0);
+
+    xLEA(arg2reg, ptr[&dummyValue[0]]);
+    xMOVQ(ptr64[arg2reg], rt);
+
+    _freeXMMreg(t0reg);
+}
+
+/// Masks value with (0xffffffffffffffff maskshift maskamt), merges with (rt shift amt), saves to dummyValue
+static void sdlrhelper(const xRegister32& maskamt, const xImplSimd_Shift& maskshift, const xRegister32& amt, const xImplSimd_Shift& shift, const xRegisterSSE& value, const xRegisterSSE& rt)
+{
+    int t0reg = _allocTempXMMreg(XMMT_INT, -1);
+    int t1reg = _allocTempXMMreg(XMMT_INT, -1);
+    xRegisterSSE t0(t0reg);
+    xRegisterSSE t1(t1reg);
+
+    // Generate mask 128-(shiftx8)
+    xMOVDZX(t1, maskamt);
+    xPCMP.EQD(t0, t0);
+    maskshift.Q(t0, t1);
+    xPAND(t0, value);
+
+    // Shift over reg value
+    xMOVDZX(t1, amt);
+    shift.Q(rt, t1);
+    xPOR(rt, t0);
+
+    xLEA(arg2reg, ptr[&dummyValue[0]]);
+    xMOVQ(ptr64[arg2reg], rt);
+
+    _freeXMMreg(t1reg);
+    _freeXMMreg(t0reg);
+}
+
 void recSDL()
 {
 #ifdef LOADSTORE_RECOMPILE
-    int t2reg;
+    _flushEEreg(_Rt_); // flush register to mem
     if (GPR_IS_CONST1(_Rs_))
     {
-        u32 srcadr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
-        srcadr &= ~0x07;
-        t2reg = vtlb_DynGenRead64_Const(64, srcadr, -1);
+        u32 adr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
+        u32 aligned = adr & ~0x07;
+        u32 shift = ((adr & 0x7) + 1) * 8;
+        if (shift == 64)
+        {
+            xLEA(arg2reg, ptr[&cpuRegs.GPR.r[_Rt_].UL[0]]);
+        }
+        else
+        {
+            int t2reg = vtlb_DynGenRead64_Const(64, aligned, -1);
+            int rtreg = _allocGPRtoXMMreg(-1, _Rt_, MODE_READ);
+            sdlrhelper_const(shift, xPSLL, 64 - shift, xPSRL, xRegisterSSE(t2reg), xRegisterSSE(rtreg));
+            _deleteGPRtoXMMreg(_Rt_, 3);
+            _freeXMMreg(t2reg);
+        }
+        vtlb_DynGenWrite_Const(64, aligned);
     }
     else
     {
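
With a constant address the skip case is resolved at compile time: for shift == 64, arg2reg is simply pointed at the guest register (already written to memory by _flushEEreg) and the 64-bit write goes out with no vtlb read at all; otherwise the doubleword is read, merged through sdlrhelper_const into dummyValue, and written back. A small worked example (values purely illustrative):

// adr = 0x1003 -> aligned = 0x1000, shift = ((0x1003 & 7) + 1) * 8 = 32
//   written doubleword = (mem & 0xFFFFFFFF00000000ull) | (rt >> 32)
// adr = 0x1007 -> shift = 64
//   no read: vtlb_DynGenWrite_Const(64, 0x1000) stores cpuRegs.GPR.r[_Rt_] directly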
@@ -640,74 +712,34 @@ void recSDL()
         if (_Imm_ != 0)
             xADD(arg1regd, _Imm_);
-        xAND(arg1regd, ~0x07);

         iFlushCall(FLUSH_FULLVTLB);

-        t2reg = vtlb_DynGenRead64(64, -1);
-    }
-    _flushEEreg(_Rt_); // flush register to mem
+        xMOV(calleeSavedReg1d, arg1regd);
+        xAND(arg1regd, ~0x07);
+        xAND(calleeSavedReg1d, 0x7);
+        xCMP(calleeSavedReg1d, 7);
+        xForwardJE32 skip;
+        xADD(calleeSavedReg1d, 1);
+        int t2reg = vtlb_DynGenRead64(64, -1);
         int rtreg = _allocGPRtoXMMreg(-1, _Rt_, MODE_READ);
-    int t0reg = _allocTempXMMreg(XMMT_INT, -1);
-    int t1reg = _allocTempXMMreg(XMMT_INT, -1);
-    if (GPR_IS_CONST1(_Rs_))
-    {
-        u32 shiftval = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
-        shiftval &= 0x7;
-        xMOV(eax, shiftval + 1);
-    }
-    else
-    {
-        _eeMoveGPRtoR(eax, _Rs_);
-        if (_Imm_ != 0)
-            xADD(eax, _Imm_);
-        xAND(eax, 0x7);
-        xADD(eax, 1);
-    }
-    xCMP(eax, 8);
-    xForwardJE8 skip;
         //Calculate the shift from top bit to lowest
         xMOV(edx, 64);
-    xSHL(eax, 3);
-    xSUB(edx, eax);
-    // Generate mask 128-(shiftx8) xPSRA.W does bit for bit
-    xMOVDZX(xRegisterSSE(t1reg), eax);
-    xPCMP.EQD(xRegisterSSE(t0reg), xRegisterSSE(t0reg));
-    xPSLL.Q(xRegisterSSE(t0reg), xRegisterSSE(t1reg));
-    xPAND(xRegisterSSE(t0reg), xRegisterSSE(t2reg));
-    // Shift over reg value (shift, PSLL.Q multiplies by 8)
-    xMOVDZX(xRegisterSSE(t1reg), edx);
-    xPSRL.Q(xRegisterSSE(rtreg), xRegisterSSE(t1reg));
-    xPOR(xRegisterSSE(rtreg), xRegisterSSE(t0reg));
-    skip.SetTarget();
-    xMOVQ(ptr64[&dummyValue[0]], xRegisterSSE(rtreg));
+        xSHL(calleeSavedReg1d, 3);
+        xSUB(edx, calleeSavedReg1d);
+        sdlrhelper(calleeSavedReg1d, xPSLL, edx, xPSRL, xRegisterSSE(t2reg), xRegisterSSE(rtreg));

         _deleteGPRtoXMMreg(_Rt_, 3);
-    _freeXMMreg(t0reg);
-    _freeXMMreg(t1reg);
         _freeXMMreg(t2reg);

-    xLEA(arg2reg, ptr128[&dummyValue[0]]);
-    if (GPR_IS_CONST1(_Rs_))
-    {
-        u32 dstadr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
-        dstadr &= ~0x07;
-        vtlb_DynGenWrite_Const(64, dstadr);
-    }
-    else
-    {
         _eeMoveGPRtoR(arg1regd, _Rs_);
         if (_Imm_ != 0)
             xADD(arg1regd, _Imm_);
         xAND(arg1regd, ~0x7);
+        xForwardJump8 end;
+        skip.SetTarget();
+        xLEA(arg2reg, ptr[&cpuRegs.GPR.r[_Rt_].UL[0]]);
+        end.SetTarget();

         iFlushCall(FLUSH_FULLVTLB);
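
When the address is only known at run time the same decision becomes a branch: the low three address bits are kept in calleeSavedReg1d, and if they equal 7 the generated code jumps straight past the read, the merge, and the address recomputation, pointing arg2reg at the flushed guest register instead. Roughly, the control flow emitted above is as follows (a reconstruction for illustration, not literal emitter output):

// if ((addr & 7) == 7) goto skip;            // xCMP(calleeSavedReg1d, 7) / xForwardJE32 skip
//     t2  = read aligned doubleword;         // vtlb_DynGenRead64(64, -1)
//     merge rt into t2 -> dummyValue;        // sdlrhelper(...), arg2reg = &dummyValue
//     recompute aligned address in arg1regd; // the read helper clobbered it
//     goto end;                              // xForwardJump8 end
// skip:
//     arg2reg = &cpuRegs.GPR.r[_Rt_];        // register was already flushed to memory
// end:
//     issue the 64-bit write                 // follows after the iFlushCall below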
@@ -726,13 +758,26 @@ void recSDL()
 void recSDR()
 {
 #ifdef LOADSTORE_RECOMPILE
-    int t2reg;
+    _flushEEreg(_Rt_); // flush register to mem
     if (GPR_IS_CONST1(_Rs_))
     {
-        u32 srcadr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
-        srcadr &= ~0x07;
-        t2reg = vtlb_DynGenRead64_Const(64, srcadr, -1);
+        u32 adr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
+        u32 aligned = adr & ~0x07;
+        u32 shift = (adr & 0x7) * 8;
+        if (shift == 0)
+        {
+            xLEA(arg2reg, ptr[&cpuRegs.GPR.r[_Rt_].UL[0]]);
+        }
+        else
+        {
+            int t2reg = vtlb_DynGenRead64_Const(64, aligned, -1);
+            int rtreg = _allocGPRtoXMMreg(-1, _Rt_, MODE_READ);
+            sdlrhelper_const(64 - shift, xPSRL, shift, xPSLL, xRegisterSSE(t2reg), xRegisterSSE(rtreg));
+            _deleteGPRtoXMMreg(_Rt_, 3);
+            _freeXMMreg(t2reg);
+        }
+        vtlb_DynGenWrite_Const(64, aligned);
     }
     else
     {
@@ -741,73 +786,31 @@ void recSDR()
         if (_Imm_ != 0)
             xADD(arg1regd, _Imm_);
-        xAND(arg1regd, ~0x07);

         iFlushCall(FLUSH_FULLVTLB);

-        t2reg = vtlb_DynGenRead64(64, -1);
-    }
-    _flushEEreg(_Rt_); // flush register to mem
+        xMOV(calleeSavedReg1d, arg1regd);
+        xAND(arg1regd, ~0x07);
+        xAND(calleeSavedReg1d, 0x7);
+        xForwardJE32 skip;
+        int t2reg = vtlb_DynGenRead64(64, -1);
         int rtreg = _allocGPRtoXMMreg(-1, _Rt_, MODE_READ);
-    int t0reg = _allocTempXMMreg(XMMT_INT, -1);
-    int t1reg = _allocTempXMMreg(XMMT_INT, -1);
-    if (GPR_IS_CONST1(_Rs_))
-    {
-        u32 shiftval = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
-        shiftval &= 0x7;
-        xMOV(eax, shiftval);
-    }
-    else
-    {
-        _eeMoveGPRtoR(eax, _Rs_);
-        if (_Imm_ != 0)
-            xADD(eax, _Imm_);
-        xAND(eax, 0x7);
-    }
-    xCMP(eax, 0);
-    xForwardJE8 skip;
-    //Calculate the shift from top bit to lowest
         xMOV(edx, 64);
-    xSHL(eax, 3);
-    xSUB(edx, eax);
-    // Generate mask 128-(shiftx8) xPSRA.W does bit for bit
-    xMOVDZX(xRegisterSSE(t1reg), edx);
-    xPCMP.EQD(xRegisterSSE(t0reg), xRegisterSSE(t0reg));
-    xPSRL.Q(xRegisterSSE(t0reg), xRegisterSSE(t1reg));
-    xPAND(xRegisterSSE(t0reg), xRegisterSSE(t2reg));
-    // Shift over reg value (shift, PSLL.Q multiplies by 8)
-    xMOVDZX(xRegisterSSE(t1reg), eax);
-    xPSLL.Q(xRegisterSSE(rtreg), xRegisterSSE(t1reg));
-    xPOR(xRegisterSSE(rtreg), xRegisterSSE(t0reg));
-    skip.SetTarget();
-    xMOVQ(ptr64[&dummyValue[0]], xRegisterSSE(rtreg));
+        xSHL(calleeSavedReg1d, 3);
+        xSUB(edx, calleeSavedReg1d);
+        sdlrhelper(edx, xPSRL, calleeSavedReg1d, xPSLL, xRegisterSSE(t2reg), xRegisterSSE(rtreg));

         _deleteGPRtoXMMreg(_Rt_, 3);
-    _freeXMMreg(t0reg);
-    _freeXMMreg(t1reg);
         _freeXMMreg(t2reg);

-    xLEA(arg2reg, ptr128[&dummyValue[0]]);
-    if (GPR_IS_CONST1(_Rs_))
-    {
-        u32 dstadr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
-        dstadr &= ~0x07;
-        vtlb_DynGenWrite_Const(64, dstadr);
-    }
-    else
-    {
         _eeMoveGPRtoR(arg1regd, _Rs_);
         if (_Imm_ != 0)
             xADD(arg1regd, _Imm_);
         xAND(arg1regd, ~0x7);
+        xForwardJump8 end;
+        skip.SetTarget();
+        xLEA(arg2reg, ptr[&cpuRegs.GPR.r[_Rt_].UL[0]]);
+        end.SetTarget();

         iFlushCall(FLUSH_FULLVTLB);