EE:Rec: Allow rec memory anywhere

This commit is contained in:
TellowKrinkle 2024-08-21 00:54:07 -05:00
parent 8a9fbb43e6
commit fe2f97eeb5
15 changed files with 107 additions and 36 deletions

View File

@ -49,6 +49,7 @@
thread_local u8* x86Ptr;
thread_local u8* xTextPtr;
thread_local XMMSSEType g_xmmtypes[iREGCNT_XMM] = {XMMT_INT};
namespace x86Emitter
@ -295,13 +296,27 @@ const xRegister32
void EmitSibMagic(uint regfield, const void* address, int extraRIPOffset)
{
sptr displacement = (sptr)address;
sptr textRelative = (sptr)address - (sptr)xTextPtr;
sptr ripRelative = (sptr)address - ((sptr)x86Ptr + sizeof(s8) + sizeof(s32) + extraRIPOffset);
// Can we use an 8-bit offset from the text pointer?
if (textRelative == (s8)textRelative && xTextPtr)
{
ModRM(1, regfield, RTEXTPTR.GetId());
xWrite<s8>((s8)textRelative);
return;
}
// Can we use a rip-relative address? (Prefer this over eiz because it's a byte shorter)
if (ripRelative == (s32)ripRelative)
else if (ripRelative == (s32)ripRelative)
{
ModRM(0, regfield, ModRm_UseDisp32);
displacement = ripRelative;
}
// How about from the text pointer?
else if (textRelative == (s32)textRelative && xTextPtr)
{
ModRM(2, regfield, RTEXTPTR.GetId());
displacement = textRelative;
}
else
{
pxAssertMsg(displacement == (s32)displacement, "SIB target is too far away, needs an indirect register");
@ -539,6 +554,12 @@ const xRegister32
x86Ptr = (u8*)ptr;
}
// Assigns the current emitter text base address.
// A non-null base enables RTEXTPTR-relative addressing in the emitter; pass
// nullptr to disable it (see EmitSibMagic / xGetTextPtr callers).
__emitinline void xSetTextPtr(void* ptr)
{
	xTextPtr = static_cast<u8*>(ptr);
}
// Retrieves the current emitter buffer target address.
// This is provided instead of using x86Ptr directly, since we may in the future find
// a need to change the storage class system for the x86Ptr 'under the hood.'
@ -547,6 +568,12 @@ const xRegister32
return x86Ptr;
}
// Retrieves the current emitter text base address.
// Returns the pointer last assigned via xSetTextPtr(), or null when none has
// been set — callers test this to decide whether RTEXTPTR-relative addressing
// is available (e.g. the RTEXTPTR load in the stack-frame prologue and the
// regalloc's RTEXTPTR.usable flag).
__emitinline u8* xGetTextPtr()
{
return xTextPtr;
}
__emitinline void xAlignPtr(uint bytes)
{
// forward align
@ -1229,6 +1256,9 @@ const xRegister32
#endif
stackAlign(m_offset, true);
if (u8* ptr = xGetTextPtr())
xMOV64(RTEXTPTR, (sptr)ptr);
}
xScopedStackFrame::~xScopedStackFrame()
@ -1285,11 +1315,14 @@ const xRegister32
{
return offset + base;
}
else
if (u8* ptr = xGetTextPtr())
{
xLEA(tmpRegister, ptr[base]);
return offset + tmpRegister;
sptr tbase = (sptr)base - (sptr)ptr;
if (tbase == (s32)tbase)
return offset + RTEXTPTR + tbase;
}
xLEA(tmpRegister, ptr[base]);
return offset + tmpRegister;
}
void xLoadFarAddr(const xAddressReg& dst, void* addr)

View File

@ -149,11 +149,13 @@ namespace x86Emitter
static const int Sib_UseDisp32 = 5; // same index value as EBP (used in Base field)
extern void xSetPtr(void* ptr);
extern void xSetTextPtr(void* ptr);
extern void xAlignPtr(uint bytes);
extern void xAdvancePtr(uint bytes);
extern void xAlignCallTarget();
extern u8* xGetPtr();
extern u8* xGetTextPtr();
extern u8* xGetAlignedCallTarget();
extern JccComparisonType xInvertCond(JccComparisonType src);
@ -646,6 +648,8 @@ extern const xRegister32
calleeSavedReg1d,
calleeSavedReg2d;
/// Holds a pointer to program text at all times so we don't need to be within 2GB of text
static constexpr const xAddressReg& RTEXTPTR = rbx;
// clang-format on

View File

@ -890,10 +890,13 @@ static void recReserve()
pxFailRel("Failed to allocate R3000 InstCache array.");
}
#define R3000A_TEXTPTR (&psxRegs.GPR.r[33])
void recResetIOP()
{
DevCon.WriteLn("iR3000A Recompiler reset.");
xSetTextPtr(R3000A_TEXTPTR);
xSetPtr(SysMemory::GetIOPRec());
_DynGen_Dispatchers();
recPtr = xGetPtr();
@ -1565,6 +1568,7 @@ static void iopRecRecompile(const u32 startpc)
recResetIOP();
}
xSetTextPtr(R3000A_TEXTPTR);
xSetPtr(recPtr);
recPtr = xGetAlignedCallTarget();

View File

@ -21,6 +21,11 @@ extern u32 target; // branch target
extern u32 s_nBlockCycles; // cycles of current block recompiling
extern bool s_nBlockInterlocked; // Current block has VU0 interlocking
// x86 can use shorter displacement if it fits in an s8, so offset 144 bytes into the cpuRegs
// This will allow us to reach r1-r16 with a shorter encoding
// TODO: Actually figure out what things are used most often, maybe rearrange the cpuRegs struct, and point at that
#define R5900_TEXTPTR (&cpuRegs.GPR.r[9])
//////////////////////////////////////////////////////////////////////////////////////////
//

View File

@ -445,6 +445,8 @@ static const void* _DynGen_EnterRecompiledCode()
xSUB(rsp, stack_size);
#endif
if (u8* ptr = xGetTextPtr())
xMOV64(RTEXTPTR, (sptr)ptr);
if (CHECK_FASTMEM)
xMOV(RFASTMEMBASE, ptrNative[&vtlb_private::vtlbdata.fastmem_base]);
@ -585,6 +587,7 @@ static void recResetRaw()
EE::Profiler.Reset();
xSetTextPtr(R5900_TEXTPTR);
xSetPtr(SysMemory::GetEERec());
_DynGen_Dispatchers();
vtlb_DynGenDispatchers();
@ -897,6 +900,7 @@ u8* recBeginThunk()
if (recPtr >= recPtrEnd)
eeRecNeedsReset = true;
xSetTextPtr(R5900_TEXTPTR);
xSetPtr(recPtr);
recPtr = xGetAlignedCallTarget();
@ -2191,6 +2195,7 @@ static void recRecompile(const u32 startpc)
recResetRaw();
}
xSetTextPtr(R5900_TEXTPTR);
xSetPtr(recPtr);
recPtr = xGetAlignedCallTarget();

View File

@ -345,6 +345,7 @@ void vtlb_DynGenDispatchers()
for (int sign = 0; sign < (!mode && bits < 3 ? 2 : 1); sign++)
{
xSetPtr(GetIndirectDispatcherPtr(mode, bits, !!sign));
xSetTextPtr(R5900_TEXTPTR);
DynGen_IndirectTlbDispatcher(mode, bits, !!sign);
}

View File

@ -42,6 +42,7 @@ void mVUreset(microVU& mVU, bool resetReserve)
VU0.VI[REG_VPU_STAT].UL &= ~0x100;
}
xSetTextPtr(mVU.textPtr());
xSetPtr(mVU.cache);
mVUdispatcherAB(mVU);
mVUdispatcherCD(mVU);

View File

@ -123,6 +123,7 @@ struct microVU
s32 cycles; // Cycles Counter
VURegs& regs() const { return ::vuRegs[index]; }
// Text base address loaded into RTEXTPTR for this VU's generated code.
// VU1 running on the MTVU thread anchors into its own register file;
// everything else shares the EE anchor (R5900_TEXTPTR).
void* textPtr() const
{
	if (index && THREAD_VU1)
		return (void*)&regs().VF[9];
	return (void*)R5900_TEXTPTR;
}
__fi REG_VI& getVI(uint reg) const { return regs().VI[reg]; }
__fi VECTOR& getVF(uint reg) const { return regs().VF[reg]; }

View File

@ -207,15 +207,17 @@ static void mVUGenerateCopyPipelineState(mV)
{
mVU.copyPLState = xGetAlignedCallTarget();
xLoadFarAddr(rdx, reinterpret_cast<u8*>(&mVU.prog.lpState));
if (cpuinfo_has_x86_avx())
{
xVMOVAPS(ymm0, ptr[rax]);
xVMOVAPS(ymm1, ptr[rax + 32u]);
xVMOVAPS(ymm2, ptr[rax + 64u]);
xVMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState)], ymm0);
xVMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 32u], ymm1);
xVMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 64u], ymm2);
xVMOVUPS(ptr[rdx], ymm0);
xVMOVUPS(ptr[rdx + 32u], ymm1);
xVMOVUPS(ptr[rdx + 64u], ymm2);
xVZEROUPPER();
}
@ -228,12 +230,12 @@ static void mVUGenerateCopyPipelineState(mV)
xMOVAPS(xmm4, ptr[rax + 64u]);
xMOVAPS(xmm5, ptr[rax + 80u]);
xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState)], xmm0);
xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 16u], xmm1);
xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 32u], xmm2);
xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 48u], xmm3);
xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 64u], xmm4);
xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 80u], xmm5);
xMOVUPS(ptr[rdx], xmm0);
xMOVUPS(ptr[rdx + 16u], xmm1);
xMOVUPS(ptr[rdx + 32u], xmm2);
xMOVUPS(ptr[rdx + 48u], xmm3);
xMOVUPS(ptr[rdx + 64u], xmm4);
xMOVUPS(ptr[rdx + 80u], xmm5);
}
xRET();
@ -326,6 +328,7 @@ _mVUt void* mVUexecute(u32 startPC, u32 cycles)
mVU.cycles = cycles;
mVU.totalCycles = cycles;
xSetTextPtr(mVU.textPtr());
xSetPtr(mVU.prog.x86ptr); // Set x86ptr to where last program left off
return mVUsearchProg<vuIndex>(startPC & vuLimit, (uptr)&mVU.prog.lpState); // Find and set correct program
}

View File

@ -411,6 +411,7 @@ public:
}
}
gprMap[RTEXTPTR.GetId()].usable = !xGetTextPtr();
gprMap[RFASTMEMBASE.GetId()].usable = !cop2mode || !CHECK_FASTMEM;
}

View File

@ -1106,7 +1106,7 @@ mVUop(mVU_ILW)
mVU.regAlloc->moveVIToGPR(gprT1, _Is_);
if (_Imm11_ != 0)
xADD(gprT1, _Imm11_);
mVUaddrFix(mVU, gprT1q);
mVUaddrFix(mVU, gprT1q, gprT2q);
}
const xRegister32& regT = mVU.regAlloc->allocGPR(-1, _It_, mVUlow.backupVI);
@ -1133,7 +1133,7 @@ mVUop(mVU_ILWR)
if (_Is_)
{
mVU.regAlloc->moveVIToGPR(gprT1, _Is_);
mVUaddrFix (mVU, gprT1q);
mVUaddrFix (mVU, gprT1q, gprT2q);
const xRegister32& regT = mVU.regAlloc->allocGPR(-1, _It_, mVUlow.backupVI);
xMOVZX(regT, ptr16[xComplexAddress(gprT2q, ptr, gprT1q)]);
@ -1170,7 +1170,7 @@ mVUop(mVU_ISW)
mVU.regAlloc->moveVIToGPR(gprT1, _Is_);
if (_Imm11_ != 0)
xADD(gprT1, _Imm11_);
mVUaddrFix(mVU, gprT1q);
mVUaddrFix(mVU, gprT1q, gprT2q);
}
// If regT is dirty, the high bits might not be zero.
@ -1201,7 +1201,7 @@ mVUop(mVU_ISWR)
if (_Is_)
{
mVU.regAlloc->moveVIToGPR(gprT1, _Is_);
mVUaddrFix(mVU, gprT1q);
mVUaddrFix(mVU, gprT1q, gprT2q);
is = gprT1q;
}
const xRegister32& regT = mVU.regAlloc->allocGPR(_It_, -1, false, true);
@ -1257,7 +1257,7 @@ mVUop(mVU_LQ)
mVU.regAlloc->moveVIToGPR(gprT1, _Is_);
if (_Imm11_ != 0)
xADD(gprT1, _Imm11_);
mVUaddrFix(mVU, gprT1q);
mVUaddrFix(mVU, gprT1q, gprT2q);
}
const xmm& Ft = mVU.regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W);
@ -1281,7 +1281,7 @@ mVUop(mVU_LQD)
xDEC(regS);
xMOVSX(gprT1, xRegister16(regS)); // TODO: Confirm
mVU.regAlloc->clearNeeded(regS);
mVUaddrFix(mVU, gprT1q);
mVUaddrFix(mVU, gprT1q, gprT2q);
is = gprT1q;
}
else
@ -1319,7 +1319,7 @@ mVUop(mVU_LQI)
xMOVSX(gprT1, xRegister16(regS)); // TODO: Confirm
xINC(regS);
mVU.regAlloc->clearNeeded(regS);
mVUaddrFix(mVU, gprT1q);
mVUaddrFix(mVU, gprT1q, gprT2q);
is = gprT1q;
}
if (!mVUlow.noWriteVF)
@ -1351,7 +1351,7 @@ mVUop(mVU_SQ)
mVU.regAlloc->moveVIToGPR(gprT1, _It_);
if (_Imm11_ != 0)
xADD(gprT1, _Imm11_);
mVUaddrFix(mVU, gprT1q);
mVUaddrFix(mVU, gprT1q, gprT2q);
}
const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, _XYZW_PS ? -1 : 0, _X_Y_Z_W);
@ -1375,7 +1375,7 @@ mVUop(mVU_SQD)
xDEC(regT);
xMOVZX(gprT1, xRegister16(regT));
mVU.regAlloc->clearNeeded(regT);
mVUaddrFix(mVU, gprT1q);
mVUaddrFix(mVU, gprT1q, gprT2q);
it = gprT1q;
}
else
@ -1405,7 +1405,7 @@ mVUop(mVU_SQI)
xMOVZX(gprT1, xRegister16(regT));
xINC(regT);
mVU.regAlloc->clearNeeded(regT);
mVUaddrFix(mVU, gprT1q);
mVUaddrFix(mVU, gprT1q, gprT2q);
}
const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, _XYZW_PS ? -1 : 0, _X_Y_Z_W);
if (_It_)

View File

@ -295,7 +295,7 @@ static void mVUwaitMTVU()
}
// Transforms the Address in gprReg to valid VU0/VU1 Address
__fi void mVUaddrFix(mV, const xAddressReg& gprReg)
__fi void mVUaddrFix(mV, const xAddressReg& gprReg, const xAddressReg& tmpReg)
{
if (isVU1)
{
@ -324,7 +324,16 @@ __fi void mVUaddrFix(mV, const xAddressReg& gprReg)
xFastCall((void*)mVU.waitMTVU);
}
xAND(xRegister32(gprReg.Id), 0x3f); // ToDo: theres a potential problem if VU0 overrides VU1's VF0/VI0 regs!
xADD(gprReg, (u128*)VU1.VF - (u128*)VU0.Mem);
sptr offset = (u128*)VU1.VF - (u128*)VU0.Mem;
if (offset == (s32)offset)
{
xADD(gprReg, offset);
}
else
{
xMOV64(tmpReg, offset);
xADD(gprReg, tmpReg);
}
jmpB.SetTarget();
xSHL(gprReg, 4); // multiply by 16 (shift left by 4)
}

View File

@ -23,7 +23,8 @@ void dVifRelease(int idx)
}
VifUnpackSSE_Dynarec::VifUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlock& vifBlock_)
: v(vif_)
: vifPtr(rax)
, v(vif_)
, vB(vifBlock_)
{
const int wl = vB.wl ? vB.wl : 256; //0 is taken as 256 (KH2)
@ -42,9 +43,6 @@ __fi void makeMergeMask(u32& x)
__fi void VifUnpackSSE_Dynarec::SetMasks(int cS) const
{
const int idx = v.idx;
const vifStruct& vif = MTVU_VifX;
//This could have ended up copying the row when there was no row to write.
u32 m0 = vB.mask; //The actual mask example 0x03020100
u32 m3 = ((m0 & 0xaaaaaaaa) >> 1) & ~m0; //all the upper bits, so our example 0x01010000 & 0xFCFDFEFF = 0x00010000 just the cols (shifted right for maskmerge)
@ -52,14 +50,14 @@ __fi void VifUnpackSSE_Dynarec::SetMasks(int cS) const
if ((doMask && m2) || doMode)
{
xMOVAPS(xmmRow, ptr128[&vif.MaskRow]);
xMOVAPS(xmmRow, ptr128[vifPtr + (sptr)offsetof(vifStruct, MaskRow)]);
MSKPATH3_LOG("Moving row");
}
if (doMask && m3)
{
VIF_LOG("Merging Cols");
xMOVAPS(xmmCol0, ptr128[&vif.MaskCol]);
xMOVAPS(xmmCol0, ptr128[vifPtr + (sptr)offsetof(vifStruct, MaskCol)]);
if ((cS >= 2) && (m3 & 0x0000ff00)) xPSHUF.D(xmmCol1, xmmCol0, _v1);
if ((cS >= 3) && (m3 & 0x00ff0000)) xPSHUF.D(xmmCol2, xmmCol0, _v2);
if ((cS >= 4) && (m3 & 0xff000000)) xPSHUF.D(xmmCol3, xmmCol0, _v3);
@ -137,8 +135,7 @@ void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const
void VifUnpackSSE_Dynarec::writeBackRow() const
{
const int idx = v.idx;
xMOVAPS(ptr128[&(MTVU_VifX.MaskRow)], xmmRow);
xMOVAPS(ptr128[vifPtr + (sptr)offsetof(vifStruct, MaskRow)], xmmRow);
VIF_LOG("nVif: writing back row reg! [doMode = %d]", doMode);
}
@ -239,6 +236,7 @@ void VifUnpackSSE_Dynarec::ProcessMasks()
void VifUnpackSSE_Dynarec::CompileRoutine()
{
const int idx = v.idx;
const int wl = vB.wl ? vB.wl : 256; // 0 is taken as 256 (KH2)
const int upkNum = vB.upkType & 0xf;
const u8& vift = nVifT[upkNum];
@ -252,6 +250,7 @@ void VifUnpackSSE_Dynarec::CompileRoutine()
VIF_LOG("Compiling new block, unpack number %x, mode %x, masking %x, vNum %x", upkNum, doMode, doMask, vNum);
pxAssume(vCL == 0);
xLoadFarAddr(vifPtr, &MTVU_VifX);
// Value passed determines # of col regs we need to load
SetMasks(isFill ? blockSize : cycleSize);
@ -336,6 +335,7 @@ _vifT __fi nVifBlock* dVifCompile(nVifBlock& block, bool isFill)
}
// Compile the block now
xSetTextPtr(nullptr);
xSetPtr(v.recWritePtr);
block.startPtr = (uptr)xGetAlignedCallTarget();

View File

@ -329,9 +329,11 @@ void VifUnpackSSE_Simple::doMaskWrite(const xRegisterSSE& regX) const
{
xMOVAPS(xmm7, ptr[dstIndirect]);
int offX = std::min(curCycle, 3);
xPAND(regX, ptr32[nVifMask[0][offX]]);
xPAND(xmm7, ptr32[nVifMask[1][offX]]);
xPOR (regX, ptr32[nVifMask[2][offX]]);
sptr base = reinterpret_cast<sptr>(nVifMask[2]);
xLoadFarAddr(rax, nVifMask);
xPAND(regX, ptr128[rax + (reinterpret_cast<sptr>(nVifMask[0][offX]) - base)]);
xPAND(xmm7, ptr128[rax + (reinterpret_cast<sptr>(nVifMask[1][offX]) - base)]);
xPOR (regX, ptr128[rax + (reinterpret_cast<sptr>(nVifMask[2][offX]) - base)]);
xPOR (regX, xmm7);
xMOVAPS(ptr[dstIndirect], regX);
}
@ -362,6 +364,7 @@ void VifUnpackSSE_Init()
{
DevCon.WriteLn("Generating SSE-optimized unpacking functions for VIF interpreters...");
xSetTextPtr(nullptr);
xSetPtr(SysMemory::GetVIFUnpackRec());
for (int a = 0; a < 2; a++)

View File

@ -98,6 +98,7 @@ public:
bool inputMasked;
protected:
xAddressReg vifPtr;
const nVifStruct& v; // vif0 or vif1
const nVifBlock& vB; // some pre-collected data from VifStruct
int vCL; // internal copy of vif->cl