EE Rec: Added LDR/LDL

Also fixed slight optimisation bug in SDL
refractionpcsx2 2021-09-16 19:11:41 +01:00
parent 5f58c325ca
commit 862d606514
1 changed file with 164 additions and 8 deletions


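LDL and LDR are the EE's unaligned doubleword loads: each reads the aligned doubleword containing the effective address and merges part of it into rt, preserving the remaining bytes of the register. As a reference for the two new blocks below, here is a scalar sketch of that merge; the helpers are illustrative only (not engine APIs), mem stands for the 64-bit value the recompiled code reads into dummyValue from addr & ~7, and u32/u64 are the project's fixed-width typedefs.

// Illustrative scalar form of the LDL/LDR merge (assumed helpers, not PCSX2 APIs).
static u64 ldl_merge(u64 rt, u64 mem, u32 addr)
{
	u32 off = addr & 7;                  // eax - 1 in recLDL
	if (off == 7)                        // recLDL's skip branch (eax == 8)
		return mem;                      // the load covers the whole doubleword
	u64 keep = ~0ULL >> (8 * (off + 1)); // keep rt's low (7 - off) bytes
	return (rt & keep) | (mem << (64 - 8 * (off + 1)));
}

static u64 ldr_merge(u64 rt, u64 mem, u32 addr)
{
	u32 off = addr & 7;                  // eax in recLDR
	if (off == 0)                        // recLDR's skip branch
		return mem;
	u64 keep = ~0ULL << (64 - 8 * off);  // keep rt's high off bytes
	return (rt & keep) | (mem >> (8 * off));
}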
@@ -451,12 +451,96 @@ void recSWR()
}
////////////////////////////////////////////////////
alignas(16) const u32 SHIFT_MASKS[2][4] = {
{ 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 },
{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }
};
void recLDL()
{
if (!_Rt_)
return;
#ifdef LOADSTORE_RECOMPILE
xLEA(arg2reg, ptr128[&dummyValue[0]]);
if (GPR_IS_CONST1(_Rs_))
{
u32 srcadr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
srcadr &= ~0x07;
vtlb_DynGenRead64_Const(64, srcadr);
}
else
{
// Load ECX with the source memory address that we're reading from.
_eeMoveGPRtoR(arg1regd, _Rs_);
if (_Imm_ != 0)
xADD(arg1regd, _Imm_);
xAND(arg1regd, ~0x07);
iFlushCall(FLUSH_FULLVTLB);
vtlb_DynGenRead64(64);
}
int rtreg = _allocGPRtoXMMreg(-1, _Rt_, MODE_READ | MODE_WRITE);
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t1reg = _allocTempXMMreg(XMMT_INT, -1);
int t2reg = _allocTempXMMreg(XMMT_INT, -1);
if (GPR_IS_CONST1(_Rs_))
{
u32 shiftval = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
shiftval &= 0x7;
xMOV(eax, shiftval + 1);
}
else
{
_eeMoveGPRtoR(eax, _Rs_);
if (_Imm_ != 0)
xADD(eax, _Imm_);
xAND(eax, 0x7);
xADD(eax, 1);
}
xCMP(eax, 8);
xForwardJE32 skip;
//Calculate the shift from top bit to lowest
xMOV(edx, 64);
xSHL(eax, 3);
xSUB(edx, eax);
xMOVDZX(xRegisterSSE(t1reg), eax);
xMOVQZX(xRegisterSSE(t0reg), ptr128[&SHIFT_MASKS[0][0]]);
xPSRL.Q(xRegisterSSE(t0reg), xRegisterSSE(t1reg));
xPAND(xRegisterSSE(t0reg), xRegisterSSE(rtreg));
xMOVDQA(xRegisterSSE(t2reg), xRegisterSSE(t0reg));
xMOVDZX(xRegisterSSE(t1reg), edx);
xMOVQZX(xRegisterSSE(t0reg), ptr128[&dummyValue[0]]);
xPSLL.Q(xRegisterSSE(t0reg), xRegisterSSE(t1reg));
xPOR(xRegisterSSE(t0reg), xRegisterSSE(t2reg));
xForwardJump32 full;
skip.SetTarget();
xMOVQZX(xRegisterSSE(t0reg), ptr128[&dummyValue[0]]);
full.SetTarget();
xBLEND.PS(xRegisterSSE(rtreg), xRegisterSSE(t0reg), 0x3);
_freeXMMreg(t0reg);
_freeXMMreg(t1reg);
_freeXMMreg(t2reg);
#else
iFlushCall(FLUSH_INTERPRETER);
_deleteEEreg(_Rs_, 1);
_deleteEEreg(_Rt_, 1);
recCall(LDL);
#endif
EE::Profiler.EmitOp(eeOpcode::LDL);
}
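In SSE terms, recLDL above follows the same recipe as the sketch: the all-ones qword in SHIFT_MASKS[0] is shifted right by 8*(offset+1) bits with PSRLQ to form the keep mask, PAND applies it to rt, the doubleword fetched into dummyValue is shifted left by the complementary amount with PSLLQ, POR joins the two halves, and the closing BLENDPS with immediate 0x3 writes only the low 64 bits back into the 128-bit GPR, leaving its upper half untouched. recLDR below is the mirror image, with the shift directions swapped.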
@@ -464,21 +548,93 @@ void recLDL()
////////////////////////////////////////////////////
void recLDR()
{
if (!_Rt_)
return;
#ifdef LOADSTORE_RECOMPILE
xLEA(arg2reg, ptr128[&dummyValue[0]]);
if (GPR_IS_CONST1(_Rs_))
{
u32 srcadr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
srcadr &= ~0x07;
vtlb_DynGenRead64_Const(64, srcadr);
}
else
{
// Load ECX with the source memory address that we're reading from.
_eeMoveGPRtoR(arg1regd, _Rs_);
if (_Imm_ != 0)
xADD(arg1regd, _Imm_);
xAND(arg1regd, ~0x07);
iFlushCall(FLUSH_FULLVTLB);
vtlb_DynGenRead64(64);
}
int rtreg = _allocGPRtoXMMreg(-1, _Rt_, MODE_READ | MODE_WRITE);
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t1reg = _allocTempXMMreg(XMMT_INT, -1);
int t2reg = _allocTempXMMreg(XMMT_INT, -1);
if (GPR_IS_CONST1(_Rs_))
{
u32 shiftval = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
shiftval &= 0x7;
xMOV(eax, shiftval);
}
else
{
_eeMoveGPRtoR(eax, _Rs_);
if (_Imm_ != 0)
xADD(eax, _Imm_);
xAND(eax, 0x7);
}
xCMP(eax, 0);
xForwardJE32 skip;
//Calculate the shift from top bit to lowest
xMOV(edx, 64);
xSHL(eax, 3);
xSUB(edx, eax);
xMOVDZX(xRegisterSSE(t1reg), edx); //64-shift*8
xMOVQZX(xRegisterSSE(t0reg), ptr128[&SHIFT_MASKS[0][0]]);
xPSLL.Q(xRegisterSSE(t0reg), xRegisterSSE(t1reg));
xPAND(xRegisterSSE(t0reg), xRegisterSSE(rtreg));
xMOVQZX(xRegisterSSE(t2reg), xRegisterSSE(t0reg));
xMOVDZX(xRegisterSSE(t1reg), eax); //shift*8
xMOVQZX(xRegisterSSE(t0reg), ptr128[&dummyValue[0]]);
xPSRL.Q(xRegisterSSE(t0reg), xRegisterSSE(t1reg));
xPOR(xRegisterSSE(t0reg), xRegisterSSE(t2reg));
xForwardJump32 full;
skip.SetTarget();
xMOVQZX(xRegisterSSE(t0reg), ptr128[&dummyValue[0]]);
full.SetTarget();
xBLEND.PS(xRegisterSSE(rtreg), xRegisterSSE(t0reg), 0x3);
_freeXMMreg(t0reg);
_freeXMMreg(t1reg);
_freeXMMreg(t2reg);
#else
iFlushCall(FLUSH_INTERPRETER);
_deleteEEreg(_Rs_, 1);
_deleteEEreg(_Rt_, 1);
recCall(LDR);
#endif
EE::Profiler.EmitOp(eeOpcode::LDR);
}
////////////////////////////////////////////////////
- alignas(16) const u32 SD_MASK[2][4] = {
- { 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 },
- { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }
- };
void recSDL()
{
#ifdef LOADSTORE_RECOMPILE
@@ -525,7 +681,7 @@ void recSDL()
xADD(eax, 1);
}
- xCMP(eax, 1);
+ xCMP(eax, 8);
xForwardJE32 skip;
//Calculate the shift from top bit to lowest
xMOV(edx, 64);
@@ -533,7 +689,7 @@ void recSDL()
xSUB(edx, eax);
// Generate mask 128-(shiftx8) xPSRA.W does bit for bit
xMOVDZX(xRegisterSSE(t1reg), eax);
- xMOVQZX(xRegisterSSE(t0reg), ptr128[&SD_MASK[0][0]]);
+ xMOVQZX(xRegisterSSE(t0reg), ptr128[&SHIFT_MASKS[0][0]]);
xPSLL.Q(xRegisterSSE(t0reg), xRegisterSSE(t1reg));
xMOVQZX(xRegisterSSE(t1reg), ptr128[&dummyValue[0]]); // This line is super slow, but using MOVDQA/MOVAPS is even slower!
xPAND(xRegisterSSE(t0reg), xRegisterSSE(t1reg));
@@ -634,7 +790,7 @@ void recSDR()
xSUB(edx, eax);
// Generate mask 128-(shiftx8) xPSRA.W does bit for bit
xMOVDZX(xRegisterSSE(t1reg), edx);
- xMOVQZX(xRegisterSSE(t0reg), ptr128[&SD_MASK[0][0]]);
+ xMOVQZX(xRegisterSSE(t0reg), ptr128[&SHIFT_MASKS[0][0]]);
xPSRL.Q(xRegisterSSE(t0reg), xRegisterSSE(t1reg));
xMOVQZX(xRegisterSSE(t1reg), ptr128[&dummyValue[0]]); // This line is super slow, but using MOVDQA/MOVAPS is even slower!
xPAND(xRegisterSSE(t0reg), xRegisterSSE(t1reg));
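The SDL/SDR hunks are the store-side counterparts: a read-modify-write that keeps part of the doubleword already in memory and overlays the relevant bytes of rt before writing it back. A scalar sketch under the same assumptions as above:

// Illustrative scalar form of the SDL/SDR merge written back to addr & ~7.
static u64 sdl_merge(u64 rt, u64 mem, u32 addr)
{
	u32 off = addr & 7;                   // eax - 1 in recSDL
	if (off == 7)                         // fast path, now taken when eax == 8
		return rt;                        // the store covers the whole doubleword
	u64 keep = ~0ULL << (8 * (off + 1));  // preserve memory's high (7 - off) bytes
	return (mem & keep) | (rt >> (64 - 8 * (off + 1)));
}

static u64 sdr_merge(u64 rt, u64 mem, u32 addr)
{
	u32 off = addr & 7;
	if (off == 0)                         // fully covering store
		return rt;
	u64 keep = ~0ULL >> (64 - 8 * off);   // preserve memory's low off bytes
	return (mem & keep) | (rt << (8 * off));
}

With eax holding (addr & 7) + 1 in recSDL, the corrected xCMP(eax, 8) ties the skip branch to the offset-7 case, where the store covers the entire doubleword and no merging is needed; the old compare against 1 keyed that branch off the wrong offset, which is presumably the slight optimisation bug the commit message mentions. The SD_MASK references simply switch to the identical SHIFT_MASKS table introduced before recLDL, making the old array redundant.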