EE:Rec: Allow rec memory anywhere

TellowKrinkle 2024-08-21 00:54:07 -05:00
parent 8a9fbb43e6
commit fe2f97eeb5
15 changed files with 107 additions and 36 deletions
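
In short: recompiled code no longer has to be allocated within ±2 GB of the emulator's data. The emitter gains an optional "text pointer" (xSetTextPtr / xGetTextPtr), a data base address that generated code keeps pinned in RTEXTPTR (rbx). When an absolute or rip-relative operand no longer fits in a signed 32-bit displacement, EmitSibMagic falls back to addressing relative to that base, and the remaining absolute-address users (microVU's pipeline-state copy, mVUaddrFix's VU1 offset, the VIF unpack masks) are rewritten to go through an explicitly loaded register.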


@@ -49,6 +49,7 @@
 thread_local u8* x86Ptr;
+thread_local u8* xTextPtr;
 thread_local XMMSSEType g_xmmtypes[iREGCNT_XMM] = {XMMT_INT};

 namespace x86Emitter
@@ -295,13 +296,27 @@ const xRegister32
 	void EmitSibMagic(uint regfield, const void* address, int extraRIPOffset)
 	{
 		sptr displacement = (sptr)address;
+		sptr textRelative = (sptr)address - (sptr)xTextPtr;
 		sptr ripRelative = (sptr)address - ((sptr)x86Ptr + sizeof(s8) + sizeof(s32) + extraRIPOffset);
+		// Can we use an 8-bit offset from the text pointer?
+		if (textRelative == (s8)textRelative && xTextPtr)
+		{
+			ModRM(1, regfield, RTEXTPTR.GetId());
+			xWrite<s8>((s8)textRelative);
+			return;
+		}
 		// Can we use a rip-relative address? (Prefer this over eiz because it's a byte shorter)
-		if (ripRelative == (s32)ripRelative)
+		else if (ripRelative == (s32)ripRelative)
 		{
 			ModRM(0, regfield, ModRm_UseDisp32);
 			displacement = ripRelative;
 		}
+		// How about from the text pointer?
+		else if (textRelative == (s32)textRelative && xTextPtr)
+		{
+			ModRM(2, regfield, RTEXTPTR.GetId());
+			displacement = textRelative;
+		}
 		else
 		{
 			pxAssertMsg(displacement == (s32)displacement, "SIB target is too far away, needs an indirect register");
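
For orientation, here is a minimal standalone sketch of the reachability tests EmitSibMagic now applies, in this order; the helper names are hypothetical, not part of the commit:

    #include <cstdint>

    // 1) Cheapest: ModRM mod=01 with a single s8 displacement off the pinned text base.
    bool fits_text_disp8(intptr_t target, intptr_t text_base)
    {
        intptr_t disp = target - text_base;
        return disp == (int8_t)disp;
    }

    // 2) rip-relative disp32. The displacement is measured from the end of the
    // instruction, which is why EmitSibMagic adds sizeof(s8) + sizeof(s32) plus
    // any trailing immediate (extraRIPOffset) before comparing.
    bool fits_rip_disp32(intptr_t target, intptr_t end_of_instruction)
    {
        intptr_t disp = target - end_of_instruction;
        return disp == (int32_t)disp;
    }

    // 3) ModRM mod=10 with an s32 displacement off the text base.
    bool fits_text_disp32(intptr_t target, intptr_t text_base)
    {
        intptr_t disp = target - text_base;
        return disp == (int32_t)disp;
    }

    // 4) Otherwise the address itself must still fit in an s32 absolute
    //    operand, which is what the pxAssertMsg above enforces.
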
@@ -539,6 +554,12 @@ const xRegister32
 		x86Ptr = (u8*)ptr;
 	}

+	// Assigns the current emitter text base address.
+	__emitinline void xSetTextPtr(void* ptr)
+	{
+		xTextPtr = (u8*)ptr;
+	}
+
 	// Retrieves the current emitter buffer target address.
 	// This is provided instead of using x86Ptr directly, since we may in the future find
 	// a need to change the storage class system for the x86Ptr 'under the hood.'
@@ -547,6 +568,12 @@ const xRegister32
 		return x86Ptr;
 	}

+	// Retrieves the current emitter text base address.
+	__emitinline u8* xGetTextPtr()
+	{
+		return xTextPtr;
+	}
+
 	__emitinline void xAlignPtr(uint bytes)
 	{
 		// forward align
@@ -1229,6 +1256,9 @@ const xRegister32
 #endif
 		stackAlign(m_offset, true);
+
+		if (u8* ptr = xGetTextPtr())
+			xMOV64(RTEXTPTR, (sptr)ptr);
 	}

 	xScopedStackFrame::~xScopedStackFrame()
@@ -1285,11 +1315,14 @@ const xRegister32
 		{
 			return offset + base;
 		}
-		else
+		else if (u8* ptr = xGetTextPtr())
 		{
-			xLEA(tmpRegister, ptr[base]);
-			return offset + tmpRegister;
+			sptr tbase = (sptr)base - (sptr)ptr;
+			if (tbase == (s32)tbase)
+				return offset + RTEXTPTR + tbase;
 		}
+		xLEA(tmpRegister, ptr[base]);
+		return offset + tmpRegister;
 	}

 	void xLoadFarAddr(const xAddressReg& dst, void* addr)
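
A hypothetical call site (someFarTable is illustrative, not from the commit), mirroring how the mVU code below uses xComplexAddress:

    // Index a u16 table that may live anywhere in the address space.
    // If its address fits in s32, no extra code is emitted; if it is within
    // ±2GB of the text base, the operand becomes [RTEXTPTR + disp32]; only
    // otherwise is rcx spent on an xLEA of the absolute address.
    xMOVZX(eax, ptr16[xComplexAddress(rcx, someFarTable, rdx)]);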


@@ -149,11 +149,13 @@ namespace x86Emitter
 	static const int Sib_UseDisp32 = 5; // same index value as EBP (used in Base field)

 	extern void xSetPtr(void* ptr);
+	extern void xSetTextPtr(void* ptr);
 	extern void xAlignPtr(uint bytes);
 	extern void xAdvancePtr(uint bytes);
 	extern void xAlignCallTarget();

 	extern u8* xGetPtr();
+	extern u8* xGetTextPtr();
 	extern u8* xGetAlignedCallTarget();

 	extern JccComparisonType xInvertCond(JccComparisonType src);
@@ -646,6 +648,8 @@ extern const xRegister32
 	calleeSavedReg1d,
 	calleeSavedReg2d;

+/// Holds a pointer to program text at all times so we don't need to be within 2GB of text
+static constexpr const xAddressReg& RTEXTPTR = rbx;
+
 // clang-format on
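
RTEXTPTR aliasing rbx is a deliberate fit: rbx is callee-saved in both the SysV and Windows x64 ABIs, so C++ helpers invoked from generated code return with the text base intact. The register-allocator change later in this commit additionally marks the register unusable whenever a text pointer is active.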


@@ -890,10 +890,13 @@ static void recReserve()
 		pxFailRel("Failed to allocate R3000 InstCache array.");
 }

+#define R3000A_TEXTPTR (&psxRegs.GPR.r[33])
+
 void recResetIOP()
 {
 	DevCon.WriteLn("iR3000A Recompiler reset.");

+	xSetTextPtr(R3000A_TEXTPTR);
 	xSetPtr(SysMemory::GetIOPRec());
 	_DynGen_Dispatchers();
 	recPtr = xGetPtr();
@@ -1565,6 +1568,7 @@ static void iopRecRecompile(const u32 startpc)
 		recResetIOP();
 	}

+	xSetTextPtr(R3000A_TEXTPTR);
 	xSetPtr(recPtr);
 	recPtr = xGetAlignedCallTarget();


@@ -21,6 +21,11 @@ extern u32 target; // branch target
 extern u32 s_nBlockCycles; // cycles of current block recompiling
 extern bool s_nBlockInterlocked; // Current block has VU0 interlocking

+// x86 can use a shorter displacement if it fits in an s8, so offset 144 bytes into cpuRegs.
+// This allows us to reach r1-r16 with the shorter encoding.
+// TODO: Actually figure out what things are used most often, maybe rearrange the cpuRegs struct, and point at that.
+#define R5900_TEXTPTR (&cpuRegs.GPR.r[9])
+
 //////////////////////////////////////////////////////////////////////////////////////////
 //
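
Spelling out the comment's arithmetic (a sketch, assuming the EE GPRs are the usual 16-byte unions):

    // r[9] sits 9 * 16 = 144 bytes into the register file, and an s8
    // displacement covers [-128, +127] around it:
    //   &r[1]  - &r[9] = (1 - 9)  * 16 = -128  -> just reachable
    //   &r[16] - &r[9] = (16 - 9) * 16 = +112  -> reachable (r[17] at +128 is not)
    static_assert(sizeof(cpuRegs.GPR.r[0]) == 16, "disp8 math assumes 16-byte GPRs");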


@@ -445,6 +445,8 @@ static const void* _DynGen_EnterRecompiledCode()
 	xSUB(rsp, stack_size);
 #endif

+	if (u8* ptr = xGetTextPtr())
+		xMOV64(RTEXTPTR, (sptr)ptr);
 	if (CHECK_FASTMEM)
 		xMOV(RFASTMEMBASE, ptrNative[&vtlb_private::vtlbdata.fastmem_base]);
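
Both ways into generated code now pin the base: _DynGen_EnterRecompiledCode here, and xScopedStackFrame's constructor in the emitter change above, each emit xMOV64(RTEXTPTR, ptr) whenever a text pointer is configured.
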
@@ -585,6 +587,7 @@ static void recResetRaw()
 	EE::Profiler.Reset();

+	xSetTextPtr(R5900_TEXTPTR);
 	xSetPtr(SysMemory::GetEERec());
 	_DynGen_Dispatchers();
 	vtlb_DynGenDispatchers();

@@ -897,6 +900,7 @@ u8* recBeginThunk()
 	if (recPtr >= recPtrEnd)
 		eeRecNeedsReset = true;

+	xSetTextPtr(R5900_TEXTPTR);
 	xSetPtr(recPtr);
 	recPtr = xGetAlignedCallTarget();

@@ -2191,6 +2195,7 @@ static void recRecompile(const u32 startpc)
 		recResetRaw();
 	}

+	xSetTextPtr(R5900_TEXTPTR);
 	xSetPtr(recPtr);
 	recPtr = xGetAlignedCallTarget();


@@ -345,6 +345,7 @@ void vtlb_DynGenDispatchers()
 			for (int sign = 0; sign < (!mode && bits < 3 ? 2 : 1); sign++)
 			{
 				xSetPtr(GetIndirectDispatcherPtr(mode, bits, !!sign));
+				xSetTextPtr(R5900_TEXTPTR);

 				DynGen_IndirectTlbDispatcher(mode, bits, !!sign);
 			}


@@ -42,6 +42,7 @@ void mVUreset(microVU& mVU, bool resetReserve)
 		VU0.VI[REG_VPU_STAT].UL &= ~0x100;
 	}

+	xSetTextPtr(mVU.textPtr());
 	xSetPtr(mVU.cache);
 	mVUdispatcherAB(mVU);
 	mVUdispatcherCD(mVU);


@@ -123,6 +123,7 @@ struct microVU
 	s32 cycles; // Cycles Counter

 	VURegs& regs() const { return ::vuRegs[index]; }
+	void* textPtr() const { return (index && THREAD_VU1) ? (void*)&regs().VF[9] : (void*)R5900_TEXTPTR; }

 	__fi REG_VI& getVI(uint reg) const { return regs().VI[reg]; }
 	__fi VECTOR& getVF(uint reg) const { return regs().VF[reg]; }
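
The VF[9] pick for a threaded VU1 appears to follow the same disp8 reasoning as R5900_TEXTPTR: VF registers are also 16 bytes, so basing 144 bytes into VU1's own register file keeps its hottest neighbours within the short-encoding window (this rationale is inferred; the commit itself does not comment on it).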


@@ -207,15 +207,17 @@ static void mVUGenerateCopyPipelineState(mV)
 {
 	mVU.copyPLState = xGetAlignedCallTarget();

+	xLoadFarAddr(rdx, reinterpret_cast<u8*>(&mVU.prog.lpState));
+
 	if (cpuinfo_has_x86_avx())
 	{
 		xVMOVAPS(ymm0, ptr[rax]);
 		xVMOVAPS(ymm1, ptr[rax + 32u]);
 		xVMOVAPS(ymm2, ptr[rax + 64u]);

-		xVMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState)], ymm0);
-		xVMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 32u], ymm1);
-		xVMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 64u], ymm2);
+		xVMOVUPS(ptr[rdx], ymm0);
+		xVMOVUPS(ptr[rdx + 32u], ymm1);
+		xVMOVUPS(ptr[rdx + 64u], ymm2);

 		xVZEROUPPER();
 	}

@@ -228,12 +230,12 @@ static void mVUGenerateCopyPipelineState(mV)
 		xMOVAPS(xmm4, ptr[rax + 64u]);
 		xMOVAPS(xmm5, ptr[rax + 80u]);

-		xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState)], xmm0);
-		xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 16u], xmm1);
-		xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 32u], xmm2);
-		xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 48u], xmm3);
-		xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 64u], xmm4);
-		xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 80u], xmm5);
+		xMOVUPS(ptr[rdx], xmm0);
+		xMOVUPS(ptr[rdx + 16u], xmm1);
+		xMOVUPS(ptr[rdx + 32u], xmm2);
+		xMOVUPS(ptr[rdx + 48u], xmm3);
+		xMOVUPS(ptr[rdx + 64u], xmm4);
+		xMOVUPS(ptr[rdx + 80u], xmm5);
 	}

 	xRET();
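
Previously each store encoded the absolute address of mVU.prog.lpState directly; with the rec cache now allowed far from host data those operands can stop fitting, so the address is materialized once in rdx via xLoadFarAddr and every store becomes a plain [rdx + offset].
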
@@ -326,6 +328,7 @@ _mVUt void* mVUexecute(u32 startPC, u32 cycles)
 	mVU.cycles = cycles;
 	mVU.totalCycles = cycles;

+	xSetTextPtr(mVU.textPtr());
 	xSetPtr(mVU.prog.x86ptr); // Set x86ptr to where last program left off
 	return mVUsearchProg<vuIndex>(startPC & vuLimit, (uptr)&mVU.prog.lpState); // Find and set correct program
 }


@@ -411,6 +411,7 @@ public:
 			}
 		}

+		gprMap[RTEXTPTR.GetId()].usable = !xGetTextPtr();
 		gprMap[RFASTMEMBASE.GetId()].usable = !cop2mode || !CHECK_FASTMEM;
 	}


@@ -1106,7 +1106,7 @@ mVUop(mVU_ILW)
 		mVU.regAlloc->moveVIToGPR(gprT1, _Is_);
 		if (_Imm11_ != 0)
 			xADD(gprT1, _Imm11_);
-		mVUaddrFix(mVU, gprT1q);
+		mVUaddrFix(mVU, gprT1q, gprT2q);
 	}

 	const xRegister32& regT = mVU.regAlloc->allocGPR(-1, _It_, mVUlow.backupVI);

@@ -1133,7 +1133,7 @@ mVUop(mVU_ILWR)
 	if (_Is_)
 	{
 		mVU.regAlloc->moveVIToGPR(gprT1, _Is_);
-		mVUaddrFix (mVU, gprT1q);
+		mVUaddrFix (mVU, gprT1q, gprT2q);

 		const xRegister32& regT = mVU.regAlloc->allocGPR(-1, _It_, mVUlow.backupVI);
 		xMOVZX(regT, ptr16[xComplexAddress(gprT2q, ptr, gprT1q)]);

@@ -1170,7 +1170,7 @@ mVUop(mVU_ISW)
 		mVU.regAlloc->moveVIToGPR(gprT1, _Is_);
 		if (_Imm11_ != 0)
 			xADD(gprT1, _Imm11_);
-		mVUaddrFix(mVU, gprT1q);
+		mVUaddrFix(mVU, gprT1q, gprT2q);
 	}

 	// If regT is dirty, the high bits might not be zero.

@@ -1201,7 +1201,7 @@ mVUop(mVU_ISWR)
 	if (_Is_)
 	{
 		mVU.regAlloc->moveVIToGPR(gprT1, _Is_);
-		mVUaddrFix(mVU, gprT1q);
+		mVUaddrFix(mVU, gprT1q, gprT2q);
 		is = gprT1q;
 	}
 	const xRegister32& regT = mVU.regAlloc->allocGPR(_It_, -1, false, true);

@@ -1257,7 +1257,7 @@ mVUop(mVU_LQ)
 		mVU.regAlloc->moveVIToGPR(gprT1, _Is_);
 		if (_Imm11_ != 0)
 			xADD(gprT1, _Imm11_);
-		mVUaddrFix(mVU, gprT1q);
+		mVUaddrFix(mVU, gprT1q, gprT2q);
 	}

 	const xmm& Ft = mVU.regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W);

@@ -1281,7 +1281,7 @@ mVUop(mVU_LQD)
 		xDEC(regS);
 		xMOVSX(gprT1, xRegister16(regS)); // TODO: Confirm
 		mVU.regAlloc->clearNeeded(regS);
-		mVUaddrFix(mVU, gprT1q);
+		mVUaddrFix(mVU, gprT1q, gprT2q);
 		is = gprT1q;
 	}
 	else

@@ -1319,7 +1319,7 @@ mVUop(mVU_LQI)
 		xMOVSX(gprT1, xRegister16(regS)); // TODO: Confirm
 		xINC(regS);
 		mVU.regAlloc->clearNeeded(regS);
-		mVUaddrFix(mVU, gprT1q);
+		mVUaddrFix(mVU, gprT1q, gprT2q);
 		is = gprT1q;
 	}
 	if (!mVUlow.noWriteVF)

@@ -1351,7 +1351,7 @@ mVUop(mVU_SQ)
 		mVU.regAlloc->moveVIToGPR(gprT1, _It_);
 		if (_Imm11_ != 0)
 			xADD(gprT1, _Imm11_);
-		mVUaddrFix(mVU, gprT1q);
+		mVUaddrFix(mVU, gprT1q, gprT2q);
 	}

 	const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, _XYZW_PS ? -1 : 0, _X_Y_Z_W);

@@ -1375,7 +1375,7 @@ mVUop(mVU_SQD)
 		xDEC(regT);
 		xMOVZX(gprT1, xRegister16(regT));
 		mVU.regAlloc->clearNeeded(regT);
-		mVUaddrFix(mVU, gprT1q);
+		mVUaddrFix(mVU, gprT1q, gprT2q);
 		it = gprT1q;
 	}
 	else

@@ -1405,7 +1405,7 @@ mVUop(mVU_SQI)
 		xMOVZX(gprT1, xRegister16(regT));
 		xINC(regT);
 		mVU.regAlloc->clearNeeded(regT);
-		mVUaddrFix(mVU, gprT1q);
+		mVUaddrFix(mVU, gprT1q, gprT2q);
 	}
 	const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, _XYZW_PS ? -1 : 0, _X_Y_Z_W);
 	if (_It_)


@@ -295,7 +295,7 @@ static void mVUwaitMTVU()
 }

 // Transforms the Address in gprReg to valid VU0/VU1 Address
-__fi void mVUaddrFix(mV, const xAddressReg& gprReg)
+__fi void mVUaddrFix(mV, const xAddressReg& gprReg, const xAddressReg& tmpReg)
 {
 	if (isVU1)
 	{

@@ -324,7 +324,16 @@ __fi void mVUaddrFix(mV, const xAddressReg& gprReg)
 			xFastCall((void*)mVU.waitMTVU);
 		}
 		xAND(xRegister32(gprReg.Id), 0x3f); // ToDo: theres a potential problem if VU0 overrides VU1's VF0/VI0 regs!
-		xADD(gprReg, (u128*)VU1.VF - (u128*)VU0.Mem);
+		sptr offset = (u128*)VU1.VF - (u128*)VU0.Mem;
+		if (offset == (s32)offset)
+		{
+			xADD(gprReg, offset);
+		}
+		else
+		{
+			xMOV64(tmpReg, offset);
+			xADD(gprReg, tmpReg);
+		}
 		jmpB.SetTarget();
 		xSHL(gprReg, 4); // multiply by 16 (shift left by 4)
 	}
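
The same pattern could be factored into a helper; a minimal sketch (the function name is hypothetical) in the emitter's own vocabulary:

    // x86-64 ADD accepts at most a sign-extended 32-bit immediate; there is
    // no add r64, imm64. Larger deltas detour through a scratch register.
    static void xAddConstant(const xAddressReg& dst, const xAddressReg& tmp, sptr delta)
    {
        if (delta == (s32)delta)
        {
            xADD(dst, delta);   // add r/m64, imm32 (sign-extended)
        }
        else
        {
            xMOV64(tmp, delta); // mov r64, imm64
            xADD(dst, tmp);     // add r64, r64
        }
    }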


@@ -23,7 +23,8 @@ void dVifRelease(int idx)
 }

 VifUnpackSSE_Dynarec::VifUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlock& vifBlock_)
-	: v(vif_)
+	: vifPtr(rax)
+	, v(vif_)
 	, vB(vifBlock_)
 {
 	const int wl = vB.wl ? vB.wl : 256; //0 is taken as 256 (KH2)

@@ -42,9 +43,6 @@ __fi void makeMergeMask(u32& x)

 __fi void VifUnpackSSE_Dynarec::SetMasks(int cS) const
 {
-	const int idx = v.idx;
-	const vifStruct& vif = MTVU_VifX;
-
 	//This could have ended up copying the row when there was no row to write.1810080
 	u32 m0 = vB.mask; //The actual mask example 0x03020100
 	u32 m3 = ((m0 & 0xaaaaaaaa) >> 1) & ~m0; //all the upper bits, so our example 0x01010000 & 0xFCFDFEFF = 0x00010000 just the cols (shifted right for maskmerge)

@@ -52,14 +50,14 @@ __fi void VifUnpackSSE_Dynarec::SetMasks(int cS) const

 	if ((doMask && m2) || doMode)
 	{
-		xMOVAPS(xmmRow, ptr128[&vif.MaskRow]);
+		xMOVAPS(xmmRow, ptr128[vifPtr + (sptr)offsetof(vifStruct, MaskRow)]);
 		MSKPATH3_LOG("Moving row");
 	}

 	if (doMask && m3)
 	{
 		VIF_LOG("Merging Cols");
-		xMOVAPS(xmmCol0, ptr128[&vif.MaskCol]);
+		xMOVAPS(xmmCol0, ptr128[vifPtr + (sptr)offsetof(vifStruct, MaskCol)]);
 		if ((cS >= 2) && (m3 & 0x0000ff00)) xPSHUF.D(xmmCol1, xmmCol0, _v1);
 		if ((cS >= 3) && (m3 & 0x00ff0000)) xPSHUF.D(xmmCol2, xmmCol0, _v2);
 		if ((cS >= 4) && (m3 & 0xff000000)) xPSHUF.D(xmmCol3, xmmCol0, _v3);

@@ -137,8 +135,7 @@ void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const

 void VifUnpackSSE_Dynarec::writeBackRow() const
 {
-	const int idx = v.idx;
-	xMOVAPS(ptr128[&(MTVU_VifX.MaskRow)], xmmRow);
+	xMOVAPS(ptr128[vifPtr + (sptr)offsetof(vifStruct, MaskRow)], xmmRow);
 	VIF_LOG("nVif: writing back row reg! [doMode = %d]", doMode);
 }

@@ -239,6 +236,7 @@ void VifUnpackSSE_Dynarec::ProcessMasks()

 void VifUnpackSSE_Dynarec::CompileRoutine()
 {
+	const int idx = v.idx;
 	const int wl = vB.wl ? vB.wl : 256; // 0 is taken as 256 (KH2)
 	const int upkNum = vB.upkType & 0xf;
 	const u8& vift = nVifT[upkNum];

@@ -252,6 +250,7 @@ void VifUnpackSSE_Dynarec::CompileRoutine()
 	VIF_LOG("Compiling new block, unpack number %x, mode %x, masking %x, vNum %x", upkNum, doMode, doMask, vNum);
 	pxAssume(vCL == 0);

+	xLoadFarAddr(vifPtr, &MTVU_VifX);
 	// Value passed determines # of col regs we need to load
 	SetMasks(isFill ? blockSize : cycleSize);

@@ -336,6 +335,7 @@ _vifT __fi nVifBlock* dVifCompile(nVifBlock& block, bool isFill)
 	}

 	// Compile the block now
+	xSetTextPtr(nullptr);
 	xSetPtr(v.recWritePtr);
 	block.startPtr = (uptr)xGetAlignedCallTarget();
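
Net effect in this file: SetMasks and writeBackRow no longer bake the absolute address of the (possibly MTVU-owned) vifStruct into every access; CompileRoutine loads it once into vifPtr (rax) with xLoadFarAddr, and the mask row/column operands become [vifPtr + offsetof(...)]. dVifCompile clears the text pointer, presumably because these routines are entered without RTEXTPTR having been set up.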


@@ -329,9 +329,11 @@ void VifUnpackSSE_Simple::doMaskWrite(const xRegisterSSE& regX) const
 {
 	xMOVAPS(xmm7, ptr[dstIndirect]);
 	int offX = std::min(curCycle, 3);
-	xPAND(regX, ptr32[nVifMask[0][offX]]);
-	xPAND(xmm7, ptr32[nVifMask[1][offX]]);
-	xPOR (regX, ptr32[nVifMask[2][offX]]);
+	sptr base = reinterpret_cast<sptr>(nVifMask[2]);
+	xLoadFarAddr(rax, nVifMask[2]);
+	xPAND(regX, ptr128[rax + (reinterpret_cast<sptr>(nVifMask[0][offX]) - base)]);
+	xPAND(xmm7, ptr128[rax + (reinterpret_cast<sptr>(nVifMask[1][offX]) - base)]);
+	xPOR (regX, ptr128[rax + (reinterpret_cast<sptr>(nVifMask[2][offX]) - base)]);
 	xPOR (regX, xmm7);
 	xMOVAPS(ptr[dstIndirect], regX);
 }
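
Note the base is taken at nVifMask[2] rather than at the start of the table: assuming the u32[3][4][4] layout, offsets from that point span -128 to +48 bytes, so all three mask accesses stay within s8 displacement range of rax, the same short-encoding trick used for R5900_TEXTPTR.
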
@@ -362,6 +364,7 @@ void VifUnpackSSE_Init()
 {
 	DevCon.WriteLn("Generating SSE-optimized unpacking functions for VIF interpreters...");

+	xSetTextPtr(nullptr);
 	xSetPtr(SysMemory::GetVIFUnpackRec());

 	for (int a = 0; a < 2; a++)


@@ -98,6 +98,7 @@ public:
 	bool inputMasked;

 protected:
+	xAddressReg vifPtr;
 	const nVifStruct& v; // vif0 or vif1
 	const nVifBlock& vB; // some pre-collected data from VifStruct
 	int vCL; // internal copy of vif->cl