diff --git a/common/emitter/x86emitter.cpp b/common/emitter/x86emitter.cpp index 157d25e4b4..f34a375e76 100644 --- a/common/emitter/x86emitter.cpp +++ b/common/emitter/x86emitter.cpp @@ -49,6 +49,7 @@ thread_local u8* x86Ptr; +thread_local u8* xTextPtr; thread_local XMMSSEType g_xmmtypes[iREGCNT_XMM] = {XMMT_INT}; namespace x86Emitter @@ -295,13 +296,27 @@ const xRegister32 void EmitSibMagic(uint regfield, const void* address, int extraRIPOffset) { sptr displacement = (sptr)address; + sptr textRelative = (sptr)address - (sptr)xTextPtr; sptr ripRelative = (sptr)address - ((sptr)x86Ptr + sizeof(s8) + sizeof(s32) + extraRIPOffset); + // Can we use an 8-bit offset from the text pointer? + if (textRelative == (s8)textRelative && xTextPtr) + { + ModRM(1, regfield, RTEXTPTR.GetId()); + xWrite((s8)textRelative); + return; + } // Can we use a rip-relative address? (Prefer this over eiz because it's a byte shorter) - if (ripRelative == (s32)ripRelative) + else if (ripRelative == (s32)ripRelative) { ModRM(0, regfield, ModRm_UseDisp32); displacement = ripRelative; } + // How about from the text pointer? + else if (textRelative == (s32)textRelative && xTextPtr) + { + ModRM(2, regfield, RTEXTPTR.GetId()); + displacement = textRelative; + } else { pxAssertMsg(displacement == (s32)displacement, "SIB target is too far away, needs an indirect register"); @@ -539,6 +554,12 @@ const xRegister32 x86Ptr = (u8*)ptr; } + // Assigns the current emitter text base address. + __emitinline void xSetTextPtr(void* ptr) + { + xTextPtr = (u8*)ptr; + } + // Retrieves the current emitter buffer target address. // This is provided instead of using x86Ptr directly, since we may in the future find // a need to change the storage class system for the x86Ptr 'under the hood.' @@ -547,6 +568,12 @@ const xRegister32 return x86Ptr; } + // Retrieves the current emitter text base address. 
+ __emitinline u8* xGetTextPtr() + { + return xTextPtr; + } + __emitinline void xAlignPtr(uint bytes) { // forward align @@ -1229,6 +1256,9 @@ const xRegister32 #endif stackAlign(m_offset, true); + + if (u8* ptr = xGetTextPtr()) + xMOV64(RTEXTPTR, (sptr)ptr); } xScopedStackFrame::~xScopedStackFrame() @@ -1285,11 +1315,14 @@ const xRegister32 { return offset + base; } - else + if (u8* ptr = xGetTextPtr()) { - xLEA(tmpRegister, ptr[base]); - return offset + tmpRegister; + sptr tbase = (sptr)base - (sptr)ptr; + if (tbase == (s32)tbase) + return offset + RTEXTPTR + tbase; } + xLEA(tmpRegister, ptr[base]); + return offset + tmpRegister; } void xLoadFarAddr(const xAddressReg& dst, void* addr) diff --git a/common/emitter/x86types.h b/common/emitter/x86types.h index e898b9400a..98788421c7 100644 --- a/common/emitter/x86types.h +++ b/common/emitter/x86types.h @@ -149,11 +149,13 @@ namespace x86Emitter static const int Sib_UseDisp32 = 5; // same index value as EBP (used in Base field) extern void xSetPtr(void* ptr); + extern void xSetTextPtr(void* ptr); extern void xAlignPtr(uint bytes); extern void xAdvancePtr(uint bytes); extern void xAlignCallTarget(); extern u8* xGetPtr(); + extern u8* xGetTextPtr(); extern u8* xGetAlignedCallTarget(); extern JccComparisonType xInvertCond(JccComparisonType src); @@ -646,6 +648,8 @@ extern const xRegister32 calleeSavedReg1d, calleeSavedReg2d; +/// Holds a pointer to program text at all times so we don't need to be within 2GB of text +static constexpr const xAddressReg& RTEXTPTR = rbx; // clang-format on diff --git a/pcsx2/x86/iR3000A.cpp b/pcsx2/x86/iR3000A.cpp index 0a5856390b..1bdb3956e0 100644 --- a/pcsx2/x86/iR3000A.cpp +++ b/pcsx2/x86/iR3000A.cpp @@ -890,10 +890,13 @@ static void recReserve() pxFailRel("Failed to allocate R3000 InstCache array."); } +#define R3000A_TEXTPTR (&psxRegs.GPR.r[33]) + void recResetIOP() { DevCon.WriteLn("iR3000A Recompiler reset."); + xSetTextPtr(R3000A_TEXTPTR); xSetPtr(SysMemory::GetIOPRec()); 
_DynGen_Dispatchers(); recPtr = xGetPtr(); @@ -1565,6 +1568,7 @@ static void iopRecRecompile(const u32 startpc) recResetIOP(); } + xSetTextPtr(R3000A_TEXTPTR); xSetPtr(recPtr); recPtr = xGetAlignedCallTarget(); diff --git a/pcsx2/x86/iR5900.h b/pcsx2/x86/iR5900.h index 1c0bfe4a4b..f5fc4e3226 100644 --- a/pcsx2/x86/iR5900.h +++ b/pcsx2/x86/iR5900.h @@ -21,6 +21,11 @@ extern u32 target; // branch target extern u32 s_nBlockCycles; // cycles of current block recompiling extern bool s_nBlockInterlocked; // Current block has VU0 interlocking +// x86 can use shorter displacement if it fits in an s8, so offset 144 bytes into the cpuRegs +// This will allow us to reach r1-r16 with a shorter encoding +// TODO: Actually figure out what things are used most often, maybe rearrange the cpuRegs struct, and point at that +#define R5900_TEXTPTR (&cpuRegs.GPR.r[9]) + ////////////////////////////////////////////////////////////////////////////////////////// // diff --git a/pcsx2/x86/ix86-32/iR5900.cpp b/pcsx2/x86/ix86-32/iR5900.cpp index a716d1f07f..26c71edc3d 100644 --- a/pcsx2/x86/ix86-32/iR5900.cpp +++ b/pcsx2/x86/ix86-32/iR5900.cpp @@ -445,6 +445,8 @@ static const void* _DynGen_EnterRecompiledCode() xSUB(rsp, stack_size); #endif + if (u8* ptr = xGetTextPtr()) + xMOV64(RTEXTPTR, (sptr)ptr); if (CHECK_FASTMEM) xMOV(RFASTMEMBASE, ptrNative[&vtlb_private::vtlbdata.fastmem_base]); @@ -585,6 +587,7 @@ static void recResetRaw() EE::Profiler.Reset(); + xSetTextPtr(R5900_TEXTPTR); xSetPtr(SysMemory::GetEERec()); _DynGen_Dispatchers(); vtlb_DynGenDispatchers(); @@ -897,6 +900,7 @@ u8* recBeginThunk() if (recPtr >= recPtrEnd) eeRecNeedsReset = true; + xSetTextPtr(R5900_TEXTPTR); xSetPtr(recPtr); recPtr = xGetAlignedCallTarget(); @@ -2191,6 +2195,7 @@ static void recRecompile(const u32 startpc) recResetRaw(); } + xSetTextPtr(R5900_TEXTPTR); xSetPtr(recPtr); recPtr = xGetAlignedCallTarget(); diff --git a/pcsx2/x86/ix86-32/recVTLB.cpp b/pcsx2/x86/ix86-32/recVTLB.cpp index 
1a95793de0..4884d31442 100644 --- a/pcsx2/x86/ix86-32/recVTLB.cpp +++ b/pcsx2/x86/ix86-32/recVTLB.cpp @@ -345,6 +345,7 @@ void vtlb_DynGenDispatchers() for (int sign = 0; sign < (!mode && bits < 3 ? 2 : 1); sign++) { xSetPtr(GetIndirectDispatcherPtr(mode, bits, !!sign)); + xSetTextPtr(R5900_TEXTPTR); DynGen_IndirectTlbDispatcher(mode, bits, !!sign); } diff --git a/pcsx2/x86/microVU.cpp b/pcsx2/x86/microVU.cpp index 6c2c4b2a2e..a2b7b39310 100644 --- a/pcsx2/x86/microVU.cpp +++ b/pcsx2/x86/microVU.cpp @@ -42,6 +42,7 @@ void mVUreset(microVU& mVU, bool resetReserve) VU0.VI[REG_VPU_STAT].UL &= ~0x100; } + xSetTextPtr(mVU.textPtr()); xSetPtr(mVU.cache); mVUdispatcherAB(mVU); mVUdispatcherCD(mVU); diff --git a/pcsx2/x86/microVU.h b/pcsx2/x86/microVU.h index 2fd22f678d..db42203e51 100644 --- a/pcsx2/x86/microVU.h +++ b/pcsx2/x86/microVU.h @@ -123,6 +123,7 @@ struct microVU s32 cycles; // Cycles Counter VURegs& regs() const { return ::vuRegs[index]; } + void* textPtr() const { return (index && THREAD_VU1) ? 
(void*)&regs().VF[9] : (void*)R5900_TEXTPTR; } __fi REG_VI& getVI(uint reg) const { return regs().VI[reg]; } __fi VECTOR& getVF(uint reg) const { return regs().VF[reg]; } diff --git a/pcsx2/x86/microVU_Execute.inl b/pcsx2/x86/microVU_Execute.inl index f59910988a..cf15fd019d 100644 --- a/pcsx2/x86/microVU_Execute.inl +++ b/pcsx2/x86/microVU_Execute.inl @@ -207,15 +207,17 @@ static void mVUGenerateCopyPipelineState(mV) { mVU.copyPLState = xGetAlignedCallTarget(); + xLoadFarAddr(rdx, reinterpret_cast<u8*>(&mVU.prog.lpState)); + if (cpuinfo_has_x86_avx()) { xVMOVAPS(ymm0, ptr[rax]); xVMOVAPS(ymm1, ptr[rax + 32u]); xVMOVAPS(ymm2, ptr[rax + 64u]); - xVMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState)], ymm0); - xVMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 32u], ymm1); - xVMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 64u], ymm2); + xVMOVUPS(ptr[rdx], ymm0); + xVMOVUPS(ptr[rdx + 32u], ymm1); + xVMOVUPS(ptr[rdx + 64u], ymm2); xVZEROUPPER(); } @@ -228,12 +230,12 @@ static void mVUGenerateCopyPipelineState(mV) xMOVAPS(xmm4, ptr[rax + 64u]); xMOVAPS(xmm5, ptr[rax + 80u]); - xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState)], xmm0); - xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 16u], xmm1); - xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 32u], xmm2); - xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 48u], xmm3); - xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 64u], xmm4); - xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 80u], xmm5); + xMOVUPS(ptr[rdx], xmm0); + xMOVUPS(ptr[rdx + 16u], xmm1); + xMOVUPS(ptr[rdx + 32u], xmm2); + xMOVUPS(ptr[rdx + 48u], xmm3); + xMOVUPS(ptr[rdx + 64u], xmm4); + xMOVUPS(ptr[rdx + 80u], xmm5); } xRET(); @@ -326,6 +328,7 @@ _mVUt void* mVUexecute(u32 startPC, u32 cycles) mVU.cycles = cycles; mVU.totalCycles = cycles; + xSetTextPtr(mVU.textPtr()); xSetPtr(mVU.prog.x86ptr); // Set x86ptr to where last program left off return mVUsearchProg(startPC & vuLimit, (uptr)&mVU.prog.lpState); // Find and set correct program } diff --git 
a/pcsx2/x86/microVU_IR.h b/pcsx2/x86/microVU_IR.h index 855e064570..0091639f64 100644 --- a/pcsx2/x86/microVU_IR.h +++ b/pcsx2/x86/microVU_IR.h @@ -411,6 +411,7 @@ public: } } + gprMap[RTEXTPTR.GetId()].usable = !xGetTextPtr(); gprMap[RFASTMEMBASE.GetId()].usable = !cop2mode || !CHECK_FASTMEM; } diff --git a/pcsx2/x86/microVU_Lower.inl b/pcsx2/x86/microVU_Lower.inl index 916f6cb43c..ce2b179fef 100644 --- a/pcsx2/x86/microVU_Lower.inl +++ b/pcsx2/x86/microVU_Lower.inl @@ -1106,7 +1106,7 @@ mVUop(mVU_ILW) mVU.regAlloc->moveVIToGPR(gprT1, _Is_); if (_Imm11_ != 0) xADD(gprT1, _Imm11_); - mVUaddrFix(mVU, gprT1q); + mVUaddrFix(mVU, gprT1q, gprT2q); } const xRegister32& regT = mVU.regAlloc->allocGPR(-1, _It_, mVUlow.backupVI); @@ -1133,7 +1133,7 @@ mVUop(mVU_ILWR) if (_Is_) { mVU.regAlloc->moveVIToGPR(gprT1, _Is_); - mVUaddrFix (mVU, gprT1q); + mVUaddrFix (mVU, gprT1q, gprT2q); const xRegister32& regT = mVU.regAlloc->allocGPR(-1, _It_, mVUlow.backupVI); xMOVZX(regT, ptr16[xComplexAddress(gprT2q, ptr, gprT1q)]); @@ -1170,7 +1170,7 @@ mVUop(mVU_ISW) mVU.regAlloc->moveVIToGPR(gprT1, _Is_); if (_Imm11_ != 0) xADD(gprT1, _Imm11_); - mVUaddrFix(mVU, gprT1q); + mVUaddrFix(mVU, gprT1q, gprT2q); } // If regT is dirty, the high bits might not be zero. 
@@ -1201,7 +1201,7 @@ mVUop(mVU_ISWR) if (_Is_) { mVU.regAlloc->moveVIToGPR(gprT1, _Is_); - mVUaddrFix(mVU, gprT1q); + mVUaddrFix(mVU, gprT1q, gprT2q); is = gprT1q; } const xRegister32& regT = mVU.regAlloc->allocGPR(_It_, -1, false, true); @@ -1257,7 +1257,7 @@ mVUop(mVU_LQ) mVU.regAlloc->moveVIToGPR(gprT1, _Is_); if (_Imm11_ != 0) xADD(gprT1, _Imm11_); - mVUaddrFix(mVU, gprT1q); + mVUaddrFix(mVU, gprT1q, gprT2q); } const xmm& Ft = mVU.regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W); @@ -1281,7 +1281,7 @@ mVUop(mVU_LQD) xDEC(regS); xMOVSX(gprT1, xRegister16(regS)); // TODO: Confirm mVU.regAlloc->clearNeeded(regS); - mVUaddrFix(mVU, gprT1q); + mVUaddrFix(mVU, gprT1q, gprT2q); is = gprT1q; } else @@ -1319,7 +1319,7 @@ mVUop(mVU_LQI) xMOVSX(gprT1, xRegister16(regS)); // TODO: Confirm xINC(regS); mVU.regAlloc->clearNeeded(regS); - mVUaddrFix(mVU, gprT1q); + mVUaddrFix(mVU, gprT1q, gprT2q); is = gprT1q; } if (!mVUlow.noWriteVF) @@ -1351,7 +1351,7 @@ mVUop(mVU_SQ) mVU.regAlloc->moveVIToGPR(gprT1, _It_); if (_Imm11_ != 0) xADD(gprT1, _Imm11_); - mVUaddrFix(mVU, gprT1q); + mVUaddrFix(mVU, gprT1q, gprT2q); } const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, _XYZW_PS ? -1 : 0, _X_Y_Z_W); @@ -1375,7 +1375,7 @@ mVUop(mVU_SQD) xDEC(regT); xMOVZX(gprT1, xRegister16(regT)); mVU.regAlloc->clearNeeded(regT); - mVUaddrFix(mVU, gprT1q); + mVUaddrFix(mVU, gprT1q, gprT2q); it = gprT1q; } else @@ -1405,7 +1405,7 @@ mVUop(mVU_SQI) xMOVZX(gprT1, xRegister16(regT)); xINC(regT); mVU.regAlloc->clearNeeded(regT); - mVUaddrFix(mVU, gprT1q); + mVUaddrFix(mVU, gprT1q, gprT2q); } const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, _XYZW_PS ? 
-1 : 0, _X_Y_Z_W); if (_It_) diff --git a/pcsx2/x86/microVU_Misc.inl b/pcsx2/x86/microVU_Misc.inl index f9e9663a56..021bb90834 100644 --- a/pcsx2/x86/microVU_Misc.inl +++ b/pcsx2/x86/microVU_Misc.inl @@ -295,7 +295,7 @@ static void mVUwaitMTVU() } // Transforms the Address in gprReg to valid VU0/VU1 Address -__fi void mVUaddrFix(mV, const xAddressReg& gprReg) +__fi void mVUaddrFix(mV, const xAddressReg& gprReg, const xAddressReg& tmpReg) { if (isVU1) { @@ -324,7 +324,16 @@ __fi void mVUaddrFix(mV, const xAddressReg& gprReg) xFastCall((void*)mVU.waitMTVU); } xAND(xRegister32(gprReg.Id), 0x3f); // ToDo: theres a potential problem if VU0 overrides VU1's VF0/VI0 regs! - xADD(gprReg, (u128*)VU1.VF - (u128*)VU0.Mem); + sptr offset = (u128*)VU1.VF - (u128*)VU0.Mem; + if (offset == (s32)offset) + { + xADD(gprReg, offset); + } + else + { + xMOV64(tmpReg, offset); + xADD(gprReg, tmpReg); + } jmpB.SetTarget(); xSHL(gprReg, 4); // multiply by 16 (shift left by 4) } diff --git a/pcsx2/x86/newVif_Dynarec.cpp b/pcsx2/x86/newVif_Dynarec.cpp index ed8e548538..ec157c79cd 100644 --- a/pcsx2/x86/newVif_Dynarec.cpp +++ b/pcsx2/x86/newVif_Dynarec.cpp @@ -23,7 +23,8 @@ void dVifRelease(int idx) } VifUnpackSSE_Dynarec::VifUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlock& vifBlock_) - : v(vif_) + : vifPtr(rax) + , v(vif_) , vB(vifBlock_) { const int wl = vB.wl ? 
vB.wl : 256; //0 is taken as 256 (KH2) @@ -42,9 +43,6 @@ __fi void makeMergeMask(u32& x) __fi void VifUnpackSSE_Dynarec::SetMasks(int cS) const { - const int idx = v.idx; - const vifStruct& vif = MTVU_VifX; - //This could have ended up copying the row when there was no row to write.1810080 u32 m0 = vB.mask; //The actual mask example 0x03020100 u32 m3 = ((m0 & 0xaaaaaaaa) >> 1) & ~m0; //all the upper bits, so our example 0x01010000 & 0xFCFDFEFF = 0x00010000 just the cols (shifted right for maskmerge) @@ -52,14 +50,14 @@ __fi void VifUnpackSSE_Dynarec::SetMasks(int cS) const if ((doMask && m2) || doMode) { - xMOVAPS(xmmRow, ptr128[&vif.MaskRow]); + xMOVAPS(xmmRow, ptr128[vifPtr + (sptr)offsetof(vifStruct, MaskRow)]); MSKPATH3_LOG("Moving row"); } if (doMask && m3) { VIF_LOG("Merging Cols"); - xMOVAPS(xmmCol0, ptr128[&vif.MaskCol]); + xMOVAPS(xmmCol0, ptr128[vifPtr + (sptr)offsetof(vifStruct, MaskCol)]); if ((cS >= 2) && (m3 & 0x0000ff00)) xPSHUF.D(xmmCol1, xmmCol0, _v1); if ((cS >= 3) && (m3 & 0x00ff0000)) xPSHUF.D(xmmCol2, xmmCol0, _v2); if ((cS >= 4) && (m3 & 0xff000000)) xPSHUF.D(xmmCol3, xmmCol0, _v3); @@ -137,8 +135,7 @@ void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const void VifUnpackSSE_Dynarec::writeBackRow() const { - const int idx = v.idx; - xMOVAPS(ptr128[&(MTVU_VifX.MaskRow)], xmmRow); + xMOVAPS(ptr128[vifPtr + (sptr)offsetof(vifStruct, MaskRow)], xmmRow); VIF_LOG("nVif: writing back row reg! [doMode = %d]", doMode); } @@ -239,6 +236,7 @@ void VifUnpackSSE_Dynarec::ProcessMasks() void VifUnpackSSE_Dynarec::CompileRoutine() { + const int idx = v.idx; const int wl = vB.wl ? 
vB.wl : 256; // 0 is taken as 256 (KH2) const int upkNum = vB.upkType & 0xf; const u8& vift = nVifT[upkNum]; @@ -252,6 +250,7 @@ void VifUnpackSSE_Dynarec::CompileRoutine() VIF_LOG("Compiling new block, unpack number %x, mode %x, masking %x, vNum %x", upkNum, doMode, doMask, vNum); pxAssume(vCL == 0); + xLoadFarAddr(vifPtr, &MTVU_VifX); // Value passed determines # of col regs we need to load SetMasks(isFill ? blockSize : cycleSize); @@ -336,6 +335,7 @@ _vifT __fi nVifBlock* dVifCompile(nVifBlock& block, bool isFill) } // Compile the block now + xSetTextPtr(nullptr); xSetPtr(v.recWritePtr); block.startPtr = (uptr)xGetAlignedCallTarget(); diff --git a/pcsx2/x86/newVif_UnpackSSE.cpp b/pcsx2/x86/newVif_UnpackSSE.cpp index 68b52ce997..5a124cf2b3 100644 --- a/pcsx2/x86/newVif_UnpackSSE.cpp +++ b/pcsx2/x86/newVif_UnpackSSE.cpp @@ -329,9 +329,11 @@ void VifUnpackSSE_Simple::doMaskWrite(const xRegisterSSE& regX) const { xMOVAPS(xmm7, ptr[dstIndirect]); int offX = std::min(curCycle, 3); - xPAND(regX, ptr32[nVifMask[0][offX]]); - xPAND(xmm7, ptr32[nVifMask[1][offX]]); - xPOR (regX, ptr32[nVifMask[2][offX]]); + sptr base = reinterpret_cast<sptr>(nVifMask[2]); + xLoadFarAddr(rax, nVifMask); + xPAND(regX, ptr128[rax + (reinterpret_cast<sptr>(nVifMask[0][offX]) - base)]); + xPAND(xmm7, ptr128[rax + (reinterpret_cast<sptr>(nVifMask[1][offX]) - base)]); + xPOR (regX, ptr128[rax + (reinterpret_cast<sptr>(nVifMask[2][offX]) - base)]); xPOR (regX, xmm7); xMOVAPS(ptr[dstIndirect], regX); } @@ -362,6 +364,7 @@ void VifUnpackSSE_Init() { DevCon.WriteLn("Generating SSE-optimized unpacking functions for VIF interpreters..."); + xSetTextPtr(nullptr); xSetPtr(SysMemory::GetVIFUnpackRec()); for (int a = 0; a < 2; a++) diff --git a/pcsx2/x86/newVif_UnpackSSE.h b/pcsx2/x86/newVif_UnpackSSE.h index 866a5ce1a7..381cb61bac 100644 --- a/pcsx2/x86/newVif_UnpackSSE.h +++ b/pcsx2/x86/newVif_UnpackSSE.h @@ -98,6 +98,7 @@ public: bool inputMasked; protected: + xAddressReg vifPtr; const nVifStruct& v; // vif0 or vif1 const 
nVifBlock& vB; // some pre-collected data from VifStruct int vCL; // internal copy of vif->cl