EE Rec: Added LDR/LDL

Also fixed slight optimisation bug in SDL
This commit is contained in:
refractionpcsx2 2021-09-16 19:11:41 +01:00
parent 5f58c325ca
commit 862d606514
1 changed file with 164 additions and 8 deletions

View File

@ -451,12 +451,96 @@ void recSWR()
} }
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// 16-byte-aligned base masks for the unaligned load/store recompilers.
// Row 0 holds a 64-bit all-ones value in its low quadword: it is loaded with
// MOVQ and shifted with PSRLQ/PSLLQ to build the byte-select mask used by
// recLDL/recLDR (and, per this diff, recSDL/recSDR) below.
// NOTE(review): row 1 (128 bits of ones) is never referenced in the visible
// code — presumably used elsewhere; confirm before removing.
alignas(16) const u32 SHIFT_MASKS[2][4] = {
{ 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 },
{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }
};
// Recompiler for the MIPS LDL (Load Doubleword Left) instruction: reads the
// aligned doubleword containing the effective address and merges its bytes
// into the most-significant part of rt, keeping rt's remaining low bytes.
// With shift = (addr & 7), rt receives the low (shift+1) bytes of memory
// shifted into its top (shift+1) bytes; the low (7-shift) bytes of rt keep
// their old value.
// NOTE(review): this text is a side-by-side diff rendering, so unchanged
// context lines appear duplicated on one line (e.g. "} }"); kept verbatim.
void recLDL() void recLDL()
{ {
// Writes to $zero are architectural no-ops; emit nothing.
if (!_Rt_)
return;
#ifdef LOADSTORE_RECOMPILE
// The 64-bit memory read is delivered into dummyValue via arg2reg.
xLEA(arg2reg, ptr128[&dummyValue[0]]);
if (GPR_IS_CONST1(_Rs_))
{
// Constant-propagated base register: compute the effective address at
// compile time, align it down to the containing doubleword, and emit a
// constant-address vtlb read.
u32 srcadr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
srcadr &= ~0x07;
vtlb_DynGenRead64_Const(64, srcadr);
}
else
{
// Load ECX with the source memory address that we're reading from.
_eeMoveGPRtoR(arg1regd, _Rs_);
if (_Imm_ != 0)
xADD(arg1regd, _Imm_);
// Align down to the 8-byte boundary of the doubleword being read.
xAND(arg1regd, ~0x07);
iFlushCall(FLUSH_FULLVTLB);
vtlb_DynGenRead64(64);
}
// rt is held in an XMM register (read+write: we merge into it); t0-t2 are
// integer-typed scratch XMM registers.
int rtreg = _allocGPRtoXMMreg(-1, _Rt_, MODE_READ | MODE_WRITE);
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t1reg = _allocTempXMMreg(XMMT_INT, -1);
int t2reg = _allocTempXMMreg(XMMT_INT, -1);
// eax = (addr & 7) + 1 = number of bytes LDL takes from memory.
if (GPR_IS_CONST1(_Rs_))
{
u32 shiftval = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
shiftval &= 0x7;
xMOV(eax, shiftval + 1);
}
else
{
_eeMoveGPRtoR(eax, _Rs_);
if (_Imm_ != 0)
xADD(eax, _Imm_);
xAND(eax, 0x7);
xADD(eax, 1);
}
// All 8 bytes come from memory (addr & 7 == 7): skip the merge entirely.
xCMP(eax, 8);
xForwardJE32 skip;
//Calculate the shift from top bit to lowest
xMOV(edx, 64);
xSHL(eax, 3); // eax = bytes-from-memory * 8 (a bit count for PSRLQ)
xSUB(edx, eax); // edx = 64 - eax = left shift applied to the memory value
// Keep rt's low bytes: t0 = (all-ones >> eax) & rt, stashed in t2.
xMOVDZX(xRegisterSSE(t1reg), eax);
xMOVQZX(xRegisterSSE(t0reg), ptr128[&SHIFT_MASKS[0][0]]);
xPSRL.Q(xRegisterSSE(t0reg), xRegisterSSE(t1reg));
xPAND(xRegisterSSE(t0reg), xRegisterSSE(rtreg));
xMOVDQA(xRegisterSSE(t2reg), xRegisterSSE(t0reg));
// Shift the loaded doubleword up into rt's top bytes and merge with the
// preserved low bytes.
xMOVDZX(xRegisterSSE(t1reg), edx);
xMOVQZX(xRegisterSSE(t0reg), ptr128[&dummyValue[0]]);
xPSLL.Q(xRegisterSSE(t0reg), xRegisterSSE(t1reg));
xPOR(xRegisterSSE(t0reg), xRegisterSSE(t2reg));
xForwardJump32 full;
skip.SetTarget();
// Fully-covering case: the whole doubleword replaces rt's low 64 bits.
xMOVQZX(xRegisterSSE(t0reg), ptr128[&dummyValue[0]]);
full.SetTarget();
// Blend mask 0x3 selects the low two dwords (the 64-bit result) from t0,
// preserving the upper 64 bits of the 128-bit EE register in rt.
xBLEND.PS(xRegisterSSE(rtreg), xRegisterSSE(t0reg), 0x3);
_freeXMMreg(t0reg);
_freeXMMreg(t1reg);
_freeXMMreg(t2reg);
#else
// Fallback: flush recompiler state and call the interpreter's LDL.
iFlushCall(FLUSH_INTERPRETER); iFlushCall(FLUSH_INTERPRETER);
_deleteEEreg(_Rs_, 1); _deleteEEreg(_Rs_, 1);
_deleteEEreg(_Rt_, 1); _deleteEEreg(_Rt_, 1);
recCall(LDL); recCall(LDL);
#endif
EE::Profiler.EmitOp(eeOpcode::LDL); EE::Profiler.EmitOp(eeOpcode::LDL);
} }
@ -464,21 +548,93 @@ void recLDL()
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// Recompiler for the MIPS LDR (Load Doubleword Right) instruction: the
// mirror of recLDL above. With shift = (addr & 7), rt receives the high
// (8-shift) bytes of the aligned doubleword shifted down into its low
// (8-shift) bytes, while rt's top `shift` bytes keep their old value.
// NOTE(review): side-by-side diff rendering — unchanged context lines are
// duplicated on one line (e.g. "} }"); kept verbatim.
void recLDR() void recLDR()
{ {
// Writes to $zero are architectural no-ops; emit nothing.
if (!_Rt_)
return;
#ifdef LOADSTORE_RECOMPILE
// The 64-bit memory read is delivered into dummyValue via arg2reg.
xLEA(arg2reg, ptr128[&dummyValue[0]]);
if (GPR_IS_CONST1(_Rs_))
{
// Constant-propagated address: align down to the doubleword boundary
// and emit a constant-address vtlb read.
u32 srcadr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
srcadr &= ~0x07;
vtlb_DynGenRead64_Const(64, srcadr);
}
else
{
// Load ECX with the source memory address that we're reading from.
_eeMoveGPRtoR(arg1regd, _Rs_);
if (_Imm_ != 0)
xADD(arg1regd, _Imm_);
// Align down to the 8-byte boundary of the doubleword being read.
xAND(arg1regd, ~0x07);
iFlushCall(FLUSH_FULLVTLB);
vtlb_DynGenRead64(64);
}
// rt is held in an XMM register (read+write: we merge into it); t0-t2 are
// integer-typed scratch XMM registers.
int rtreg = _allocGPRtoXMMreg(-1, _Rt_, MODE_READ | MODE_WRITE);
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t1reg = _allocTempXMMreg(XMMT_INT, -1);
int t2reg = _allocTempXMMreg(XMMT_INT, -1);
// eax = addr & 7 = number of rt's top bytes to preserve.
if (GPR_IS_CONST1(_Rs_))
{
u32 shiftval = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
shiftval &= 0x7;
xMOV(eax, shiftval);
}
else
{
_eeMoveGPRtoR(eax, _Rs_);
if (_Imm_ != 0)
xADD(eax, _Imm_);
xAND(eax, 0x7);
}
// Aligned case (addr & 7 == 0): the full doubleword replaces rt; skip merge.
xCMP(eax, 0);
xForwardJE32 skip;
//Calculate the shift from top bit to lowest
xMOV(edx, 64);
xSHL(eax, 3); // eax = shift * 8 (a bit count for PSRLQ)
xSUB(edx, eax); // edx = 64 - shift*8 (left shift for the preserve mask)
// Keep rt's top `shift` bytes: t0 = (all-ones << edx) & rt, stashed in t2.
xMOVDZX(xRegisterSSE(t1reg), edx); //64-shift*8
xMOVQZX(xRegisterSSE(t0reg), ptr128[&SHIFT_MASKS[0][0]]);
xPSLL.Q(xRegisterSSE(t0reg), xRegisterSSE(t1reg));
xPAND(xRegisterSSE(t0reg), xRegisterSSE(rtreg));
xMOVQZX(xRegisterSSE(t2reg), xRegisterSSE(t0reg));
// Shift the loaded doubleword down into rt's low bytes and merge with the
// preserved top bytes.
xMOVDZX(xRegisterSSE(t1reg), eax); //shift*8
xMOVQZX(xRegisterSSE(t0reg), ptr128[&dummyValue[0]]);
xPSRL.Q(xRegisterSSE(t0reg), xRegisterSSE(t1reg));
xPOR(xRegisterSSE(t0reg), xRegisterSSE(t2reg));
xForwardJump32 full;
skip.SetTarget();
// Fully-aligned case: the whole doubleword replaces rt's low 64 bits.
xMOVQZX(xRegisterSSE(t0reg), ptr128[&dummyValue[0]]);
full.SetTarget();
// Blend mask 0x3 selects the low two dwords (the 64-bit result) from t0,
// preserving the upper 64 bits of the 128-bit EE register in rt.
xBLEND.PS(xRegisterSSE(rtreg), xRegisterSSE(t0reg), 0x3);
_freeXMMreg(t0reg);
_freeXMMreg(t1reg);
_freeXMMreg(t2reg);
#else
// Fallback: flush recompiler state and call the interpreter's LDR.
iFlushCall(FLUSH_INTERPRETER); iFlushCall(FLUSH_INTERPRETER);
_deleteEEreg(_Rs_, 1); _deleteEEreg(_Rs_, 1);
_deleteEEreg(_Rt_, 1); _deleteEEreg(_Rt_, 1);
recCall(LDR); recCall(LDR);
#endif
EE::Profiler.EmitOp(eeOpcode::LDR); EE::Profiler.EmitOp(eeOpcode::LDR);
} }
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// NOTE(review): SD_MASK is byte-identical to SHIFT_MASKS defined earlier in
// this file, and this diff's recSDL/recSDR hunks switch their loads from
// SD_MASK to SHIFT_MASKS — i.e. this table is being removed as redundant in
// the new revision. Shown here as the old (deleted) side of the diff.
alignas(16) const u32 SD_MASK[2][4] = {
{ 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 },
{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }
};
void recSDL() void recSDL()
{ {
#ifdef LOADSTORE_RECOMPILE #ifdef LOADSTORE_RECOMPILE
@ -525,7 +681,7 @@ void recSDL()
xADD(eax, 1); xADD(eax, 1);
} }
xCMP(eax, 1); xCMP(eax, 8);
xForwardJE32 skip; xForwardJE32 skip;
//Calculate the shift from top bit to lowest //Calculate the shift from top bit to lowest
xMOV(edx, 64); xMOV(edx, 64);
@ -533,7 +689,7 @@ void recSDL()
xSUB(edx, eax); xSUB(edx, eax);
// Generate mask 128-(shiftx8) xPSRA.W does bit for bit // Generate mask 128-(shiftx8) xPSRA.W does bit for bit
xMOVDZX(xRegisterSSE(t1reg), eax); xMOVDZX(xRegisterSSE(t1reg), eax);
xMOVQZX(xRegisterSSE(t0reg), ptr128[&SD_MASK[0][0]]); xMOVQZX(xRegisterSSE(t0reg), ptr128[&SHIFT_MASKS[0][0]]);
xPSLL.Q(xRegisterSSE(t0reg), xRegisterSSE(t1reg)); xPSLL.Q(xRegisterSSE(t0reg), xRegisterSSE(t1reg));
xMOVQZX(xRegisterSSE(t1reg), ptr128[&dummyValue[0]]); // This line is super slow, but using MOVDQA/MOVAPS is even slower! xMOVQZX(xRegisterSSE(t1reg), ptr128[&dummyValue[0]]); // This line is super slow, but using MOVDQA/MOVAPS is even slower!
xPAND(xRegisterSSE(t0reg), xRegisterSSE(t1reg)); xPAND(xRegisterSSE(t0reg), xRegisterSSE(t1reg));
@ -634,7 +790,7 @@ void recSDR()
xSUB(edx, eax); xSUB(edx, eax);
// Generate mask 128-(shiftx8) xPSRA.W does bit for bit // Generate mask 128-(shiftx8) xPSRA.W does bit for bit
xMOVDZX(xRegisterSSE(t1reg), edx); xMOVDZX(xRegisterSSE(t1reg), edx);
xMOVQZX(xRegisterSSE(t0reg), ptr128[&SD_MASK[0][0]]); xMOVQZX(xRegisterSSE(t0reg), ptr128[&SHIFT_MASKS[0][0]]);
xPSRL.Q(xRegisterSSE(t0reg), xRegisterSSE(t1reg)); xPSRL.Q(xRegisterSSE(t0reg), xRegisterSSE(t1reg));
xMOVQZX(xRegisterSSE(t1reg), ptr128[&dummyValue[0]]); // This line is super slow, but using MOVDQA/MOVAPS is even slower! xMOVQZX(xRegisterSSE(t1reg), ptr128[&dummyValue[0]]); // This line is super slow, but using MOVDQA/MOVAPS is even slower!
xPAND(xRegisterSSE(t0reg), xRegisterSSE(t1reg)); xPAND(xRegisterSSE(t0reg), xRegisterSSE(t1reg));