x86: Move dispatchers to recompiler code space

Authored by Stenzek on 2023-10-09 21:32:18 +10:00; committed by Connor McLaughlin
parent 5f11ba0445
commit 377746f155
22 changed files with 330 additions and 451 deletions
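Before this change, each dispatcher set lived in a page-aligned static array (iopRecDispatchers, eeRecDispatchers, the vtlb m_IndirectDispatchers page, the per-VU dispCache buffers, mVUsearchXMM) whose protection was flipped between read-write and execute-only around generation. After it, the dispatchers are emitted straight into each recompiler's code reserve at reset time, and block recompilation continues right behind them. A minimal sketch of the new pattern, using names from the EE-side changes below (the wrapper function name is illustrative, not part of the diff):

// Sketch: dispatcher generation now happens inside the recompiler's own
// code space at reset, instead of in a separate statically allocated page.
static void recReset_sketch()
{
    recMem->Reset();           // clear the code reserve (fills with 0xCC)
    xSetPtr(*recMem);          // start emitting at the base of the reserve

    _DynGen_Dispatchers();     // event test, DispatcherReg, JITCompile, ...
    vtlb_DynGenDispatchers();  // indirect memory read/write dispatchers

    recPtr = xGetPtr();        // recompiled blocks begin right after them
}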

View File

@ -140,12 +140,6 @@ namespace HostSys
extern void MemProtect(void* baseaddr, size_t size, const PageProtectionMode& mode);
template <uint size>
void MemProtectStatic(u8 (&arr)[size], const PageProtectionMode& mode)
{
MemProtect(arr, size, mode);
}
extern std::string GetFileMappingName(const char* prefix);
extern void* CreateSharedMemory(const char* name, size_t size);
extern void DestroySharedMemory(void* ptr);

View File

@ -32,10 +32,10 @@ namespace x86Emitter
// Special form for calling functions. This form automatically resolves the
// correct displacement based on the size of the instruction being generated.
void operator()(void* func) const
void operator()(const void* func) const
{
if (isJmp)
xJccKnownTarget(Jcc_Unconditional, (void*)(uptr)func, false); // double cast to/from (uptr) needed to appease GCC
xJccKnownTarget(Jcc_Unconditional, (const void*)(uptr)func, false); // double cast to/from (uptr) needed to appease GCC
else
{
// calls are relative to the instruction after this one, and length is
@ -58,32 +58,32 @@ namespace x86Emitter
// FIXME: current 64 bits is mostly a copy/past potentially it would require to push/pop
// some registers. But I think it is enough to handle the first call.
void operator()(void* f, const xRegister32& a1 = xEmptyReg, const xRegister32& a2 = xEmptyReg) const;
void operator()(const void* f, const xRegister32& a1 = xEmptyReg, const xRegister32& a2 = xEmptyReg) const;
void operator()(void* f, u32 a1, const xRegister32& a2) const;
void operator()(void* f, const xIndirect32& a1) const;
void operator()(void* f, u32 a1, u32 a2) const;
void operator()(void* f, void* a1) const;
void operator()(const void* f, u32 a1, const xRegister32& a2) const;
void operator()(const void* f, const xIndirect32& a1) const;
void operator()(const void* f, u32 a1, u32 a2) const;
void operator()(const void* f, void* a1) const;
void operator()(void* f, const xRegisterLong& a1, const xRegisterLong& a2 = xEmptyReg) const;
void operator()(void* f, u32 a1, const xRegisterLong& a2) const;
void operator()(const void* f, const xRegisterLong& a1, const xRegisterLong& a2 = xEmptyReg) const;
void operator()(const void* f, u32 a1, const xRegisterLong& a2) const;
template <typename T>
__fi void operator()(T* func, u32 a1, const xRegisterLong& a2 = xEmptyReg) const
{
(*this)((void*)func, a1, a2);
(*this)((const void*)func, a1, a2);
}
template <typename T>
__fi void operator()(T* func, const xIndirect32& a1) const
{
(*this)((void*)func, a1);
(*this)((const void*)func, a1);
}
template <typename T>
__fi void operator()(T* func, u32 a1, u32 a2) const
{
(*this)((void*)func, a1, a2);
(*this)((const void*)func, a1, a2);
}
void operator()(const xIndirectNative& f, const xRegisterLong& a1 = xEmptyReg, const xRegisterLong& a2 = xEmptyReg) const;

View File

@ -78,7 +78,7 @@ namespace x86Emitter
}
}
void xImpl_FastCall::operator()(void* f, const xRegister32& a1, const xRegister32& a2) const
void xImpl_FastCall::operator()(const void* f, const xRegister32& a1, const xRegister32& a2) const
{
prepareRegsForFastcall(a1, a2);
uptr disp = ((uptr)xGetPtr() + 5) - (uptr)f;
@ -93,7 +93,7 @@ namespace x86Emitter
}
}
void xImpl_FastCall::operator()(void* f, const xRegisterLong& a1, const xRegisterLong& a2) const
void xImpl_FastCall::operator()(const void* f, const xRegisterLong& a1, const xRegisterLong& a2) const
{
prepareRegsForFastcall(a1, a2);
uptr disp = ((uptr)xGetPtr() + 5) - (uptr)f;
@ -108,7 +108,7 @@ namespace x86Emitter
}
}
void xImpl_FastCall::operator()(void* f, u32 a1, const xRegisterLong& a2) const
void xImpl_FastCall::operator()(const void* f, u32 a1, const xRegisterLong& a2) const
{
if (!a2.IsEmpty())
{
@ -118,13 +118,13 @@ namespace x86Emitter
(*this)(f, arg1reg, arg2reg);
}
void xImpl_FastCall::operator()(void* f, void* a1) const
void xImpl_FastCall::operator()(const void* f, void* a1) const
{
xLEA(arg1reg, ptr[a1]);
(*this)(f, arg1reg, arg2reg);
}
void xImpl_FastCall::operator()(void* f, u32 a1, const xRegister32& a2) const
void xImpl_FastCall::operator()(const void* f, u32 a1, const xRegister32& a2) const
{
if (!a2.IsEmpty())
{
@ -134,13 +134,13 @@ namespace x86Emitter
(*this)(f, arg1regd, arg2regd);
}
void xImpl_FastCall::operator()(void* f, const xIndirect32& a1) const
void xImpl_FastCall::operator()(const void* f, const xIndirect32& a1) const
{
xMOV(arg1regd, a1);
(*this)(f, arg1regd);
}
void xImpl_FastCall::operator()(void* f, u32 a1, u32 a2) const
void xImpl_FastCall::operator()(const void* f, u32 a1, u32 a2) const
{
xMOV(arg1regd, a1);
xMOV(arg2regd, a2);

View File

@ -144,13 +144,6 @@ emitterT void x86SetJ32A(u32* j32)
x86SetJ32(j32);
}
////////////////////////////////////////////////////
emitterT void x86Align(int bytes)
{
// forward align
x86Ptr = (u8*)(((uptr)x86Ptr + bytes - 1) & ~(bytes - 1));
}
/********************/
/* IX86 instructions */
/********************/

View File

@ -29,15 +29,12 @@
//------------------------------------------------------------------
// legacy jump/align functions
//------------------------------------------------------------------
ATTR_DEP extern void x86SetPtr(u8* ptr);
ATTR_DEP extern void x86SetJ8(u8* j8);
ATTR_DEP extern void x86SetJ8A(u8* j8);
ATTR_DEP extern void x86SetJ16(u16* j16);
ATTR_DEP extern void x86SetJ16A(u16* j16);
ATTR_DEP extern void x86SetJ32(u32* j32);
ATTR_DEP extern void x86SetJ32A(u32* j32);
ATTR_DEP extern void x86Align(int bytes);
ATTR_DEP extern void x86AlignExecutable(int align);
//------------------------------------------------------------------
////////////////////////////////////

View File

@ -38,7 +38,6 @@ GSDrawScanline::GSDrawScanline()
: m_sp_map("GSSetupPrim")
, m_ds_map("GSDrawScanline")
{
GSCodeReserve::GetInstance().AllowModification();
GSCodeReserve::GetInstance().Reset();
}
@ -46,8 +45,6 @@ GSDrawScanline::~GSDrawScanline()
{
if (const size_t used = GSCodeReserve::GetInstance().GetMemoryUsed(); used > 0)
DevCon.WriteLn("SW JIT generated %zu bytes of code", used);
GSCodeReserve::GetInstance().ForbidModification();
}
void GSDrawScanline::BeginDraw(const GSRasterizerData& data, GSScanlineLocalData& local)

View File

@ -334,13 +334,3 @@ void RecompiledCodeReserve::Reset()
std::memset(m_baseptr, 0xCC, m_size);
}
}
void RecompiledCodeReserve::AllowModification()
{
HostSys::MemProtect(m_baseptr, m_size, PageAccess_Any());
}
void RecompiledCodeReserve::ForbidModification()
{
HostSys::MemProtect(m_baseptr, m_size, PageProtectionMode().Read().Execute());
}

View File

@ -161,9 +161,6 @@ public:
void Assign(VirtualMemoryManagerPtr allocator, size_t offset, size_t size);
void Reset();
void ForbidModification();
void AllowModification();
operator u8*() { return m_baseptr; }
operator const u8*() const { return m_baseptr; }
};

View File

@ -1245,9 +1245,6 @@ void vtlb_Init()
// The LUT is only used for 1 game so we allocate it only when the gamefix is enabled (save 4MB)
if (EmuConfig.Gamefixes.GoemonTlbHack)
vtlb_Alloc_Ppmap();
extern void vtlb_dynarec_init();
vtlb_dynarec_init();
}
// vtlb_Reset -- Performs a COP0-level reset of the PS2's TLB.

View File

@ -124,6 +124,8 @@ extern int vtlb_DynGenReadQuad_Const(u32 bits, u32 addr_const, vtlb_ReadRegAlloc
extern void vtlb_DynGenWrite(u32 sz, bool xmm, int addr_reg, int value_reg);
extern void vtlb_DynGenWrite_Const(u32 bits, bool xmm, u32 addr_const, int value_reg);
extern void vtlb_DynGenDispatchers();
// --------------------------------------------------------------------------------------
// VtlbMemoryReserve
// --------------------------------------------------------------------------------------

View File

@ -165,9 +165,9 @@ public:
{
}
void SetJITCompile(void (*recompiler_)())
void SetJITCompile(const void *recompiler_)
{
recompiler = (uptr)recompiler_;
recompiler = reinterpret_cast<uptr>(recompiler_);
}
BASEBLOCKEX* New(u32 startpc, uptr fnptr);

View File

@ -171,19 +171,14 @@ static ZyanStatus ZydisFormatterPrintAddressAbsolute(const ZydisFormatter* forma
// Dynamically Compiled Dispatchers - R3000A style
// =====================================================================================================
static void iopRecRecompile(const u32 startpc);
static void iopRecRecompile(u32 startpc);
// Recompiled code buffer for EE recompiler dispatchers!
alignas(__pagesize) static u8 iopRecDispatchers[__pagesize];
typedef void DynGenFunc();
static DynGenFunc* iopDispatcherEvent = NULL;
static DynGenFunc* iopDispatcherReg = NULL;
static DynGenFunc* iopJITCompile = NULL;
static DynGenFunc* iopJITCompileInBlock = NULL;
static DynGenFunc* iopEnterRecompiledCode = NULL;
static DynGenFunc* iopExitRecompiledCode = NULL;
static const void* iopDispatcherEvent = nullptr;
static const void* iopDispatcherReg = nullptr;
static const void* iopJITCompile = nullptr;
static const void* iopJITCompileInBlock = nullptr;
static const void* iopEnterRecompiledCode = nullptr;
static const void* iopExitRecompiledCode = nullptr;
static void recEventTest()
{
@ -192,7 +187,7 @@ static void recEventTest()
// The address for all cleared blocks. It recompiles the current pc and then
// dispatches to the recompiled block address.
static DynGenFunc* _DynGen_JITCompile()
static const void* _DynGen_JITCompile()
{
pxAssertMsg(iopDispatcherReg != NULL, "Please compile the DispatcherReg subroutine *before* JITComple. Thanks.");
@ -206,18 +201,18 @@ static DynGenFunc* _DynGen_JITCompile()
xMOV(rcx, ptrNative[xComplexAddress(rcx, psxRecLUT, rax * wordsize)]);
xJMP(ptrNative[rbx * (wordsize / 4) + rcx]);
return (DynGenFunc*)retval;
return retval;
}
static DynGenFunc* _DynGen_JITCompileInBlock()
static const void* _DynGen_JITCompileInBlock()
{
u8* retval = xGetPtr();
xJMP((void*)iopJITCompile);
return (DynGenFunc*)retval;
return retval;
}
// called when jumping to variable pc address
static DynGenFunc* _DynGen_DispatcherReg()
static const void* _DynGen_DispatcherReg()
{
u8* retval = xGetPtr();
@ -227,13 +222,13 @@ static DynGenFunc* _DynGen_DispatcherReg()
xMOV(rcx, ptrNative[xComplexAddress(rcx, psxRecLUT, rax * wordsize)]);
xJMP(ptrNative[rbx * (wordsize / 4) + rcx]);
return (DynGenFunc*)retval;
return retval;
}
// --------------------------------------------------------------------------------------
// EnterRecompiledCode - dynamic compilation stub!
// --------------------------------------------------------------------------------------
static DynGenFunc* _DynGen_EnterRecompiledCode()
static const void* _DynGen_EnterRecompiledCode()
{
// Optimization: The IOP never uses stack-based parameter invocation, so we can avoid
// allocating any room on the stack for it (which is important since the IOP's entry
@ -251,27 +246,21 @@ static DynGenFunc* _DynGen_EnterRecompiledCode()
xJMP((void*)iopDispatcherReg);
// Save an exit point
iopExitRecompiledCode = (DynGenFunc*)xGetPtr();
iopExitRecompiledCode = xGetPtr();
}
xRET();
return (DynGenFunc*)retval;
return retval;
}
static void _DynGen_Dispatchers()
{
// In case init gets called multiple times:
HostSys::MemProtectStatic(iopRecDispatchers, PageAccess_ReadWrite());
// clear the buffer to 0xcc (easier debugging).
memset(iopRecDispatchers, 0xcc, __pagesize);
xSetPtr(iopRecDispatchers);
const u8* start = xGetAlignedCallTarget();
// Place the EventTest and DispatcherReg stuff at the top, because they get called the
// most and stand to benefit from strong alignment and direct referencing.
iopDispatcherEvent = (DynGenFunc*)xGetPtr();
iopDispatcherEvent = xGetPtr();
xFastCall((void*)recEventTest);
iopDispatcherReg = _DynGen_DispatcherReg();
@ -279,11 +268,9 @@ static void _DynGen_Dispatchers()
iopJITCompileInBlock = _DynGen_JITCompileInBlock();
iopEnterRecompiledCode = _DynGen_EnterRecompiledCode();
HostSys::MemProtectStatic(iopRecDispatchers, PageAccess_ExecOnly());
recBlocks.SetJITCompile(iopJITCompile);
Perf::any.Register((void*)iopRecDispatchers, 4096, "IOP Dispatcher");
Perf::any.Register(start, xGetPtr() - start, "IOP Dispatcher");
}
////////////////////////////////////////////////////
@ -931,8 +918,6 @@ static void recAlloc()
if (!s_pInstCache)
pxFailRel("Failed to allocate R3000 InstCache array.");
}
_DynGen_Dispatchers();
}
void recResetIOP()
@ -941,6 +926,9 @@ void recResetIOP()
recAlloc();
recMem->Reset();
xSetPtr(*recMem);
_DynGen_Dispatchers();
recPtr = xGetPtr();
iopClearRecLUT((BASEBLOCK*)m_recBlockAlloc,
(((Ps2MemSize::IopRam + Ps2MemSize::Rom + Ps2MemSize::Rom1 + Ps2MemSize::Rom2) / 4)));
@ -990,7 +978,6 @@ void recResetIOP()
recBlocks.Reset();
g_psxMaxRecMem = 0;
recPtr = *recMem;
psxbranch = 0;
}
@ -1036,7 +1023,7 @@ static __noinline s32 recExecuteBlock(s32 eeCycles)
// mov edx,dword ptr [iopCycleEE (832A84h)]
// lea eax,[edx+ecx]
iopEnterRecompiledCode();
((void(*)())iopEnterRecompiledCode)();
return psxRegs.iopBreak + psxRegs.iopCycleEE;
}
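The dispatcher entry points are now stored as plain const void* rather than through the old DynGenFunc typedef, so invoking one means casting back to a function type at the call site, as above. A minimal sketch of the idiom (the helper name is illustrative; the pointer name mirrors the IOP-side code):

// iopEnterRecompiledCode holds the address of JIT-generated code; the code is
// never written through this pointer, hence const void*. Calling it requires
// an explicit cast to a function pointer type.
static const void* iopEnterRecompiledCode = nullptr; // filled in by _DynGen_Dispatchers()

static void invokeDispatcher_sketch()
{
    ((void (*)())iopEnterRecompiledCode)();
}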
@ -1579,9 +1566,8 @@ static void iopRecRecompile(const u32 startpc)
recResetIOP();
}
x86SetPtr(recPtr);
x86Align(16);
recPtr = x86Ptr;
xSetPtr(recPtr);
recPtr = xGetAlignedCallTarget();
s_pCurBlock = PSX_GETBLOCK(startpc);

View File

@ -360,19 +360,14 @@ static void recRecompile(const u32 startpc);
static void dyna_block_discard(u32 start, u32 sz);
static void dyna_page_reset(u32 start, u32 sz);
// Recompiled code buffer for EE recompiler dispatchers!
alignas(__pagesize) static u8 eeRecDispatchers[__pagesize];
typedef void DynGenFunc();
static DynGenFunc* DispatcherEvent = NULL;
static DynGenFunc* DispatcherReg = NULL;
static DynGenFunc* JITCompile = NULL;
static DynGenFunc* JITCompileInBlock = NULL;
static DynGenFunc* EnterRecompiledCode = NULL;
static DynGenFunc* ExitRecompiledCode = NULL;
static DynGenFunc* DispatchBlockDiscard = NULL;
static DynGenFunc* DispatchPageReset = NULL;
static const void* DispatcherEvent = nullptr;
static const void* DispatcherReg = nullptr;
static const void* JITCompile = nullptr;
static const void* JITCompileInBlock = nullptr;
static const void* EnterRecompiledCode = nullptr;
static const void* ExitRecompiledCode = nullptr;
static const void* DispatchBlockDiscard = nullptr;
static const void* DispatchPageReset = nullptr;
static void recEventTest()
{
@ -387,13 +382,13 @@ static void recEventTest()
// The address for all cleared blocks. It recompiles the current pc and then
// dispatches to the recompiled block address.
static DynGenFunc* _DynGen_JITCompile()
static const void* _DynGen_JITCompile()
{
pxAssertMsg(DispatcherReg != NULL, "Please compile the DispatcherReg subroutine *before* JITComple. Thanks.");
u8* retval = xGetAlignedCallTarget();
xFastCall((void*)recRecompile, ptr32[&cpuRegs.pc]);
xFastCall((const void*)recRecompile, ptr32[&cpuRegs.pc]);
// C equivalent:
// u32 addr = cpuRegs.pc;
@ -405,18 +400,18 @@ static DynGenFunc* _DynGen_JITCompile()
xMOV(rcx, ptrNative[xComplexAddress(rcx, recLUT, rax * wordsize)]);
xJMP(ptrNative[rbx * (wordsize / 4) + rcx]);
return (DynGenFunc*)retval;
return retval;
}
static DynGenFunc* _DynGen_JITCompileInBlock()
static const void* _DynGen_JITCompileInBlock()
{
u8* retval = xGetAlignedCallTarget();
xJMP((void*)JITCompile);
return (DynGenFunc*)retval;
xJMP(JITCompile);
return retval;
}
// called when jumping to variable pc address
static DynGenFunc* _DynGen_DispatcherReg()
static const void* _DynGen_DispatcherReg()
{
u8* retval = xGetPtr(); // fallthrough target, can't align it!
@ -430,19 +425,19 @@ static DynGenFunc* _DynGen_DispatcherReg()
xMOV(rcx, ptrNative[xComplexAddress(rcx, recLUT, rax * wordsize)]);
xJMP(ptrNative[rbx * (wordsize / 4) + rcx]);
return (DynGenFunc*)retval;
return retval;
}
static DynGenFunc* _DynGen_DispatcherEvent()
static const void* _DynGen_DispatcherEvent()
{
u8* retval = xGetPtr();
xFastCall((void*)recEventTest);
xFastCall((const void*)recEventTest);
return (DynGenFunc*)retval;
return retval;
}
static DynGenFunc* _DynGen_EnterRecompiledCode()
static const void* _DynGen_EnterRecompiledCode()
{
pxAssertDev(DispatcherReg != NULL, "Dynamically generated dispatchers are required prior to generating EnterRecompiledCode!");
@ -461,39 +456,33 @@ static DynGenFunc* _DynGen_EnterRecompiledCode()
xJMP((void*)DispatcherReg);
// Save an exit point
ExitRecompiledCode = (DynGenFunc*)xGetPtr();
ExitRecompiledCode = xGetPtr();
}
xRET();
return (DynGenFunc*)retval;
return retval;
}
static DynGenFunc* _DynGen_DispatchBlockDiscard()
static const void* _DynGen_DispatchBlockDiscard()
{
u8* retval = xGetPtr();
xFastCall((void*)dyna_block_discard);
xJMP((void*)ExitRecompiledCode);
return (DynGenFunc*)retval;
xFastCall((const void*)dyna_block_discard);
xJMP((const void*)ExitRecompiledCode);
return retval;
}
static DynGenFunc* _DynGen_DispatchPageReset()
static const void* _DynGen_DispatchPageReset()
{
u8* retval = xGetPtr();
xFastCall((void*)dyna_page_reset);
xJMP((void*)ExitRecompiledCode);
return (DynGenFunc*)retval;
xFastCall((const void*)dyna_page_reset);
xJMP((const void*)ExitRecompiledCode);
return retval;
}
static void _DynGen_Dispatchers()
{
// In case init gets called multiple times:
HostSys::MemProtectStatic(eeRecDispatchers, PageAccess_ReadWrite());
// clear the buffer to 0xcc (easier debugging).
memset(eeRecDispatchers, 0xcc, __pagesize);
xSetPtr(eeRecDispatchers);
const u8* start = xGetAlignedCallTarget();
// Place the EventTest and DispatcherReg stuff at the top, because they get called the
// most and stand to benefit from strong alignment and direct referencing.
@ -506,11 +495,9 @@ static void _DynGen_Dispatchers()
DispatchBlockDiscard = _DynGen_DispatchBlockDiscard();
DispatchPageReset = _DynGen_DispatchPageReset();
HostSys::MemProtectStatic(eeRecDispatchers, PageAccess_ExecOnly());
recBlocks.SetJITCompile(JITCompile);
Perf::any.Register((void*)eeRecDispatchers, 4096, "EE Dispatcher");
Perf::any.Register(start, static_cast<u32>(xGetPtr() - start), "EE Dispatcher");
}
@ -597,10 +584,6 @@ static void recAlloc()
if (!s_pInstCache)
pxFailRel("Failed to allocate R5900 InstCache array");
}
// No errors.. Proceed with initialization:
_DynGen_Dispatchers();
}
alignas(16) static u16 manual_page[Ps2MemSize::MainRam >> 12];
@ -616,6 +599,11 @@ static void recResetRaw()
recAlloc();
recMem->Reset();
xSetPtr(*recMem);
_DynGen_Dispatchers();
vtlb_DynGenDispatchers();
recPtr = xGetPtr();
ClearRecLUT((BASEBLOCK*)recLutReserve_RAM, recLutSize);
memset(recRAMCopy, 0, Ps2MemSize::MainRam);
@ -628,10 +616,6 @@ static void recResetRaw()
mmap_ResetBlockTracking();
vtlb_ClearLoadStoreInfo();
x86SetPtr(*recMem);
recPtr = *recMem;
g_branch = 0;
g_resetEeScalingStats = true;
}
@ -644,7 +628,7 @@ static void recShutdown()
recBlocks.Reset();
recRAM = recROM = recROM1 = recROM2 = NULL;
recRAM = recROM = recROM1 = recROM2 = nullptr;
safe_free(s_pInstCache);
s_nInstCacheSize = 0;
@ -720,13 +704,7 @@ static void recExecute()
if (!fastjmp_set(&m_SetJmp_StateCheck))
{
eeCpuExecuting = true;
// Important! Most of the console logging and such has cancel points in it. This is great
// in Windows, where SEH lets us safely kill a thread from anywhere we want. This is bad
// in Linux, which cannot have a C++ exception cross the recompiler. Hence the changing
// of the cancelstate here!
EnterRecompiledCode();
((void(*)())EnterRecompiledCode)();
// Generally unreachable code here ...
}
@ -1636,11 +1614,17 @@ void recMemcheck(u32 op, u32 bits, bool store)
// Preserve ecx (address) and edx (address+size) because we aren't breaking
// out of this loops iteration and dynarecMemLogcheck will clobber them
// Also keep 16 byte stack alignment
if(!(checks[i].result & MEMCHECK_BREAK))
if (!(checks[i].result & MEMCHECK_BREAK))
{
xPUSH(eax); xPUSH(ebx); xPUSH(ecx); xPUSH(edx);
xPUSH(eax);
xPUSH(ebx);
xPUSH(ecx);
xPUSH(edx);
xFastCall((void*)dynarecMemLogcheck, ecx, edx);
xPOP(edx); xPOP(ecx); xPOP(ebx); xPOP(eax);
xPOP(edx);
xPOP(ecx);
xPOP(ebx);
xPOP(eax);
}
else
{
@ -1926,7 +1910,7 @@ void recompileNextInstruction(bool delayslot, bool swapped_delay_slot)
std::string disasm = "";
disR5900Fasm(disasm, memRead32(i), i, false);
Console.Warning("%x %s%08X %s", i, i == pc - 4 ? "*" : i == p ? "=" :
" ",
" ",
memRead32(i), disasm.c_str());
}
break;
@ -1952,7 +1936,7 @@ void recompileNextInstruction(bool delayslot, bool swapped_delay_slot)
disasm = "";
disR5900Fasm(disasm, memRead32(i), i, false);
Console.Warning("%x %s%08X %s", i, i == pc - 4 ? "*" : i == p ? "=" :
" ",
" ",
memRead32(i), disasm.c_str());
}
break;

View File

@ -239,13 +239,9 @@ namespace vtlb_private
}
} // namespace vtlb_private
// ------------------------------------------------------------------------
// allocate one page for our naked indirect dispatcher function.
// this *must* be a full page, since we'll give it execution permission later.
// If it were smaller than a page we'd end up allowing execution rights on some
// other vars additionally (bad!).
//
alignas(__pagesize) static u8 m_IndirectDispatchers[__pagesize];
static constexpr u32 INDIRECT_DISPATCHER_SIZE = 32;
static constexpr u32 INDIRECT_DISPATCHERS_SIZE = 2 * 5 * 2 * INDIRECT_DISPATCHER_SIZE;
static u8* m_IndirectDispatchers = nullptr;
// ------------------------------------------------------------------------
// mode - 0 for read, 1 for write!
@ -255,16 +251,8 @@ static u8* GetIndirectDispatcherPtr(int mode, int operandsize, int sign = 0)
{
assert(mode || operandsize >= 3 ? !sign : true);
// Each dispatcher is aligned to 64 bytes. The actual dispatchers are only like
// 20-some bytes each, but 64 byte alignment on functions that are called
// more frequently than a hot sex hotline at 1:15am is probably a good thing.
// 7*64? 5 widths with two sign extension modes for 8 and 16 bit reads
// Gregory: a 32 bytes alignment is likely enough and more cache friendly
const int A = 32;
return &m_IndirectDispatchers[(mode * (8 * A)) + (sign * 5 * A) + (operandsize * A)];
return &m_IndirectDispatchers[(mode * (8 * INDIRECT_DISPATCHER_SIZE)) + (sign * 5 * INDIRECT_DISPATCHER_SIZE) +
(operandsize * INDIRECT_DISPATCHER_SIZE)];
}
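Each indirect dispatcher stub occupies one 32-byte slot, and the table covers two access modes, five operand sizes, and two sign-extension variants. A worked example of the indexing above, with values taken straight from the constants and formula (the chosen stub is arbitrary):

// Total table size: 2 * 5 * 2 * 32 = 640 bytes (INDIRECT_DISPATCHERS_SIZE).
// Offset of the unsigned 32-bit write dispatcher (mode = 1, sign = 0, operandsize = 2):
//   offset = mode*(8*32) + sign*(5*32) + operandsize*32
//          = 256         + 0           + 64
//          = 320 bytes into m_IndirectDispatchers.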
// ------------------------------------------------------------------------
@ -359,18 +347,12 @@ static void DynGen_IndirectTlbDispatcher(int mode, int bits, bool sign)
// One-time initialization procedure. Multiple subsequent calls during the lifespan of the
// process will be ignored.
//
void vtlb_dynarec_init()
void vtlb_DynGenDispatchers()
{
static bool hasBeenCalled = false;
if (hasBeenCalled)
return;
hasBeenCalled = true;
// In case init gets called multiple times:
HostSys::MemProtectStatic(m_IndirectDispatchers, PageAccess_ReadWrite());
m_IndirectDispatchers = xGetAlignedCallTarget();
// clear the buffer to 0xcc (easier debugging).
memset(m_IndirectDispatchers, 0xcc, __pagesize);
std::memset(m_IndirectDispatchers, 0xcc, INDIRECT_DISPATCHERS_SIZE);
for (int mode = 0; mode < 2; ++mode)
{
@ -385,9 +367,9 @@ void vtlb_dynarec_init()
}
}
HostSys::MemProtectStatic(m_IndirectDispatchers, PageAccess_ExecOnly());
Perf::any.Register(m_IndirectDispatchers, INDIRECT_DISPATCHERS_SIZE, "TLB Dispatcher");
Perf::any.Register(m_IndirectDispatchers, __pagesize, "TLB Dispatcher");
xSetPtr(m_IndirectDispatchers + INDIRECT_DISPATCHERS_SIZE);
}
//////////////////////////////////////////////////////////////////////////////////////////

View File

@ -25,8 +25,6 @@
//------------------------------------------------------------------
// Micro VU - Main Functions
//------------------------------------------------------------------
alignas(__pagesize) static u8 vu0_RecDispatchers[mVUdispCacheSize];
alignas(__pagesize) static u8 vu1_RecDispatchers[mVUdispCacheSize];
void mVUreserveCache(microVU& mVU)
{
@ -49,18 +47,12 @@ void mVUinit(microVU& mVU, uint vuIndex)
mVU.progSize = (mVU.index ? 0x4000 : 0x1000) / 4;
mVU.progMemMask = mVU.progSize-1;
mVU.cacheSize = mVUcacheReserve;
mVU.cache = NULL;
mVU.dispCache = NULL;
mVU.startFunct = NULL;
mVU.exitFunct = NULL;
mVU.cache = nullptr;
mVU.startFunct = nullptr;
mVU.exitFunct = nullptr;
mVUreserveCache(mVU);
if (vuIndex)
mVU.dispCache = vu1_RecDispatchers;
else
mVU.dispCache = vu0_RecDispatchers;
mVU.regAlloc.reset(new microRegAlloc(mVU.index));
}
@ -82,15 +74,12 @@ void mVUreset(microVU& mVU, bool resetReserve)
if (resetReserve)
mVU.cache_reserve->Reset();
HostSys::MemProtect(mVU.dispCache, mVUdispCacheSize, PageAccess_ReadWrite());
memset(mVU.dispCache, 0xcc, mVUdispCacheSize);
x86SetPtr(mVU.dispCache);
xSetPtr(mVU.cache);
mVUdispatcherAB(mVU);
mVUdispatcherCD(mVU);
mvuGenerateWaitMTVU(mVU);
mvuGenerateCopyPipelineState(mVU);
mVUemitSearch();
mVUGenerateWaitMTVU(mVU);
mVUGenerateCopyPipelineState(mVU);
mVUGenerateCompareState(mVU);
mVU.regs().nextBlockCycles = 0;
memset(&mVU.prog.lpState, 0, sizeof(mVU.prog.lpState));
@ -104,10 +93,9 @@ void mVUreset(microVU& mVU, bool resetReserve)
mVU.prog.curFrame = 0;
// Setup Dynarec Cache Limits for Each Program
u8* z = mVU.cache;
mVU.prog.x86start = z;
mVU.prog.x86ptr = z;
mVU.prog.x86end = z + ((mVU.cacheSize - mVUcacheSafeZone) * _1mb);
mVU.prog.x86start = xGetAlignedCallTarget();
mVU.prog.x86ptr = mVU.prog.x86start;
mVU.prog.x86end = mVU.cache + ((mVU.cacheSize - mVUcacheSafeZone) * _1mb);
for (u32 i = 0; i < (mVU.progSize / 2); i++)
{
@ -125,8 +113,6 @@ void mVUreset(microVU& mVU, bool resetReserve)
mVU.prog.quick[i].block = NULL;
mVU.prog.quick[i].prog = NULL;
}
HostSys::MemProtect(mVU.dispCache, mVUdispCacheSize, PageAccess_ExecOnly());
}
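With the per-VU dispCache buffers removed, each microVU's dispatchers are regenerated at the front of its program cache on every reset, and recompiled VU programs start right behind them. A condensed sketch of the reset-time emission order shown above (all names appear in the diff):

xSetPtr(mVU.cache);                           // emit at the base of the VU code cache
mVUdispatcherAB(mVU);                         // start/exit dispatchers
mVUdispatcherCD(mVU);                         // xgkick resume/exit dispatchers
mVUGenerateWaitMTVU(mVU);
mVUGenerateCopyPipelineState(mVU);
mVUGenerateCompareState(mVU);                 // replaces the old mVUsearchXMM page
mVU.prog.x86start = xGetAlignedCallTarget();  // VU program code begins here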
// Free Allocated Resources

View File

@ -37,6 +37,8 @@ using namespace x86Emitter;
#include "microVU_Profiler.h"
#include "common/Perf.h"
class microBlockManager;
struct microBlockLink
{
microBlock block;
@ -49,135 +51,6 @@ struct microBlockLinkRef
u64 quick;
};
class microBlockManager
{
private:
microBlockLink *qBlockList, *qBlockEnd; // Quick Search
microBlockLink *fBlockList, *fBlockEnd; // Full Search
std::vector<microBlockLinkRef> quickLookup;
int qListI, fListI;
public:
inline int getFullListCount() const { return fListI; }
microBlockManager()
{
qListI = fListI = 0;
qBlockEnd = qBlockList = nullptr;
fBlockEnd = fBlockList = nullptr;
}
~microBlockManager() { reset(); }
void reset()
{
for (microBlockLink* linkI = qBlockList; linkI != nullptr;)
{
microBlockLink* freeI = linkI;
safe_delete_array(linkI->block.jumpCache);
linkI = linkI->next;
_aligned_free(freeI);
}
for (microBlockLink* linkI = fBlockList; linkI != nullptr;)
{
microBlockLink* freeI = linkI;
safe_delete_array(linkI->block.jumpCache);
linkI = linkI->next;
_aligned_free(freeI);
}
qListI = fListI = 0;
qBlockEnd = qBlockList = nullptr;
fBlockEnd = fBlockList = nullptr;
quickLookup.clear();
};
microBlock* add(microBlock* pBlock)
{
microBlock* thisBlock = search(&pBlock->pState);
if (!thisBlock)
{
u8 fullCmp = pBlock->pState.needExactMatch;
if (fullCmp)
fListI++;
else
qListI++;
microBlockLink*& blockList = fullCmp ? fBlockList : qBlockList;
microBlockLink*& blockEnd = fullCmp ? fBlockEnd : qBlockEnd;
microBlockLink* newBlock = (microBlockLink*)_aligned_malloc(sizeof(microBlockLink), 32);
newBlock->block.jumpCache = nullptr;
newBlock->next = nullptr;
if (blockEnd)
{
blockEnd->next = newBlock;
blockEnd = newBlock;
}
else
{
blockEnd = blockList = newBlock;
}
std::memcpy(&newBlock->block, pBlock, sizeof(microBlock));
thisBlock = &newBlock->block;
quickLookup.push_back({&newBlock->block, pBlock->pState.quick64[0]});
}
return thisBlock;
}
__ri microBlock* search(microRegInfo* pState)
{
if (pState->needExactMatch) // Needs Detailed Search (Exact Match of Pipeline State)
{
microBlockLink* prevI = nullptr;
for (microBlockLink* linkI = fBlockList; linkI != nullptr; prevI = linkI, linkI = linkI->next)
{
if (mVUquickSearch(pState, &linkI->block.pState, sizeof(microRegInfo)))
{
if (linkI != fBlockList)
{
prevI->next = linkI->next;
linkI->next = fBlockList;
fBlockList = linkI;
}
return &linkI->block;
}
}
}
else // Can do Simple Search (Only Matches the Important Pipeline Stuff)
{
const u64 quick64 = pState->quick64[0];
for (const microBlockLinkRef& ref : quickLookup)
{
if (ref.quick != quick64) continue;
if (doConstProp && (ref.pBlock->pState.vi15 != pState->vi15)) continue;
if (doConstProp && (ref.pBlock->pState.vi15v != pState->vi15v)) continue;
return ref.pBlock;
}
}
return nullptr;
}
void printInfo(int pc, bool printQuick)
{
int listI = printQuick ? qListI : fListI;
if (listI < 7)
return;
microBlockLink* linkI = printQuick ? qBlockList : fBlockList;
for (int i = 0; i <= listI; i++)
{
u32 viCRC = 0, vfCRC = 0, crc = 0, z = sizeof(microRegInfo) / 4;
for (u32 j = 0; j < 4; j++) viCRC -= ((u32*)linkI->block.pState.VI)[j];
for (u32 j = 0; j < 32; j++) vfCRC -= linkI->block.pState.VF[j].x + (linkI->block.pState.VF[j].y << 8) + (linkI->block.pState.VF[j].z << 16) + (linkI->block.pState.VF[j].w << 24);
for (u32 j = 0; j < z; j++) crc -= ((u32*)&linkI->block.pState)[j];
DevCon.WriteLn(Color_Green,
"[%04x][Block #%d][crc=%08x][q=%02d][p=%02d][xgkick=%d][vi15=%04x][vi15v=%d][viBackup=%02d]"
"[flags=%02x][exactMatch=%x][blockType=%d][viCRC=%08x][vfCRC=%08x]",
pc, i, crc, linkI->block.pState.q,
linkI->block.pState.p, linkI->block.pState.xgkick, linkI->block.pState.vi15, linkI->block.pState.vi15v,
linkI->block.pState.viBackUp, linkI->block.pState.flagInfo, linkI->block.pState.needExactMatch,
linkI->block.pState.blockType, viCRC, vfCRC);
linkI = linkI->next;
}
}
};
struct microRange
{
s32 start; // Start PC (The opcode the block starts at)
@ -246,11 +119,11 @@ struct microVU
RecompiledCodeReserve* cache_reserve;
u8* cache; // Dynarec Cache Start (where we will start writing the recompiled code to)
u8* dispCache; // Dispatchers Cache (where startFunct and exitFunct are written to)
u8* startFunct; // Function Ptr to the recompiler dispatcher (start)
u8* exitFunct; // Function Ptr to the recompiler dispatcher (exit)
u8* startFunctXG; // Function Ptr to the recompiler dispatcher (xgkick resume)
u8* exitFunctXG; // Function Ptr to the recompiler dispatcher (xgkick exit)
u8* compareStateF;// Function Ptr to search which compares all state.
u8* waitMTVU; // Ptr to function to save registers/sync VU1 thread
u8* copyPLState; // Ptr to function to copy pipeline state into microVU
u8* resumePtrXG; // Ptr to recompiled code position to resume xgkick
@ -275,6 +148,139 @@ struct microVU
{
return (index && THREAD_VU1) ? vu1Thread.vifRegs : regs().GetVifRegs();
}
__fi u32 compareState(microRegInfo* lhs, microRegInfo* rhs) const {
return reinterpret_cast<u32(*)(void*, void*)>(compareStateF)(lhs, rhs);
}
};
class microBlockManager
{
private:
microBlockLink *qBlockList, *qBlockEnd; // Quick Search
microBlockLink *fBlockList, *fBlockEnd; // Full Search
std::vector<microBlockLinkRef> quickLookup;
int qListI, fListI;
public:
inline int getFullListCount() const { return fListI; }
microBlockManager()
{
qListI = fListI = 0;
qBlockEnd = qBlockList = nullptr;
fBlockEnd = fBlockList = nullptr;
}
~microBlockManager() { reset(); }
void reset()
{
for (microBlockLink* linkI = qBlockList; linkI != nullptr;)
{
microBlockLink* freeI = linkI;
safe_delete_array(linkI->block.jumpCache);
linkI = linkI->next;
_aligned_free(freeI);
}
for (microBlockLink* linkI = fBlockList; linkI != nullptr;)
{
microBlockLink* freeI = linkI;
safe_delete_array(linkI->block.jumpCache);
linkI = linkI->next;
_aligned_free(freeI);
}
qListI = fListI = 0;
qBlockEnd = qBlockList = nullptr;
fBlockEnd = fBlockList = nullptr;
quickLookup.clear();
};
microBlock* add(microVU& mVU, microBlock* pBlock)
{
microBlock* thisBlock = search(mVU, &pBlock->pState);
if (!thisBlock)
{
u8 fullCmp = pBlock->pState.needExactMatch;
if (fullCmp)
fListI++;
else
qListI++;
microBlockLink*& blockList = fullCmp ? fBlockList : qBlockList;
microBlockLink*& blockEnd = fullCmp ? fBlockEnd : qBlockEnd;
microBlockLink* newBlock = (microBlockLink*)_aligned_malloc(sizeof(microBlockLink), 32);
newBlock->block.jumpCache = nullptr;
newBlock->next = nullptr;
if (blockEnd)
{
blockEnd->next = newBlock;
blockEnd = newBlock;
}
else
{
blockEnd = blockList = newBlock;
}
std::memcpy(&newBlock->block, pBlock, sizeof(microBlock));
thisBlock = &newBlock->block;
quickLookup.push_back({&newBlock->block, pBlock->pState.quick64[0]});
}
return thisBlock;
}
__ri microBlock* search(microVU& mVU, microRegInfo* pState)
{
if (pState->needExactMatch) // Needs Detailed Search (Exact Match of Pipeline State)
{
microBlockLink* prevI = nullptr;
for (microBlockLink* linkI = fBlockList; linkI != nullptr; prevI = linkI, linkI = linkI->next)
{
if (mVU.compareState(pState, &linkI->block.pState) == 0)
{
if (linkI != fBlockList)
{
prevI->next = linkI->next;
linkI->next = fBlockList;
fBlockList = linkI;
}
return &linkI->block;
}
}
}
else // Can do Simple Search (Only Matches the Important Pipeline Stuff)
{
const u64 quick64 = pState->quick64[0];
for (const microBlockLinkRef& ref : quickLookup)
{
if (ref.quick != quick64) continue;
if (doConstProp && (ref.pBlock->pState.vi15 != pState->vi15)) continue;
if (doConstProp && (ref.pBlock->pState.vi15v != pState->vi15v)) continue;
return ref.pBlock;
}
}
return nullptr;
}
void printInfo(int pc, bool printQuick)
{
int listI = printQuick ? qListI : fListI;
if (listI < 7)
return;
microBlockLink* linkI = printQuick ? qBlockList : fBlockList;
for (int i = 0; i <= listI; i++)
{
u32 viCRC = 0, vfCRC = 0, crc = 0, z = sizeof(microRegInfo) / 4;
for (u32 j = 0; j < 4; j++) viCRC -= ((u32*)linkI->block.pState.VI)[j];
for (u32 j = 0; j < 32; j++) vfCRC -= linkI->block.pState.VF[j].x + (linkI->block.pState.VF[j].y << 8) + (linkI->block.pState.VF[j].z << 16) + (linkI->block.pState.VF[j].w << 24);
for (u32 j = 0; j < z; j++) crc -= ((u32*)&linkI->block.pState)[j];
DevCon.WriteLn(Color_Green,
"[%04x][Block #%d][crc=%08x][q=%02d][p=%02d][xgkick=%d][vi15=%04x][vi15v=%d][viBackup=%02d]"
"[flags=%02x][exactMatch=%x][blockType=%d][viCRC=%08x][vfCRC=%08x]",
pc, i, crc, linkI->block.pState.q,
linkI->block.pState.p, linkI->block.pState.xgkick, linkI->block.pState.vi15, linkI->block.pState.vi15v,
linkI->block.pState.viBackUp, linkI->block.pState.flagInfo, linkI->block.pState.needExactMatch,
linkI->block.pState.blockType, viCRC, vfCRC);
linkI = linkI->next;
}
}
};
// microVU rec structs

View File

@ -290,7 +290,7 @@ void normBranchCompile(microVU& mVU, u32 branchPC)
{
microBlock* pBlock;
blockCreate(branchPC / 8);
pBlock = mVUblocks[branchPC / 8]->search((microRegInfo*)&mVUregs);
pBlock = mVUblocks[branchPC / 8]->search(mVU, (microRegInfo*)&mVUregs);
if (pBlock)
xJMP(pBlock->x86ptrStart);
else
@ -540,7 +540,7 @@ void condBranch(mV, microFlagCycles& mFC, int JMPcc)
microBlock* bBlock;
incPC2(1); // Check if Branch Non-Taken Side has already been recompiled
blockCreate(iPC / 2);
bBlock = mVUblocks[iPC / 2]->search((microRegInfo*)&mVUregs);
bBlock = mVUblocks[iPC / 2]->search(mVU, (microRegInfo*)&mVUregs);
incPC2(-1);
if (bBlock) // Branch non-taken has already been compiled
{

View File

@ -531,7 +531,7 @@ __fi void mVUinitFirstPass(microVU& mVU, uptr pState, u8* thisPtr)
memcpy((u8*)&mVU.prog.lpState, (u8*)pState, sizeof(microRegInfo));
}
mVUblock.x86ptrStart = thisPtr;
mVUpBlock = mVUblocks[mVUstartPC / 2]->add(&mVUblock); // Add this block to block manager
mVUpBlock = mVUblocks[mVUstartPC / 2]->add(mVU, &mVUblock); // Add this block to block manager
mVUregs.needExactMatch = (mVUpBlock->pState.blockType) ? 7 : 0; // ToDo: Fix 1-Op block flag linking (MGS2:Demo/Sly Cooper)
mVUregs.blockType = 0;
mVUregs.viBackUp = 0;
@ -988,7 +988,7 @@ perf_and_return:
// Returns the entry point of the block (compiles it if not found)
__fi void* mVUentryGet(microVU& mVU, microBlockManager* block, u32 startPC, uptr pState)
{
microBlock* pBlock = block->search((microRegInfo*)pState);
microBlock* pBlock = block->search(mVU, (microRegInfo*)pState);
if (pBlock)
return pBlock->x86ptrStart;
else

View File

@ -31,7 +31,7 @@ static bool mvuNeedsFPCRUpdate(mV)
// Generates the code for entering/exit recompiled blocks
void mVUdispatcherAB(mV)
{
mVU.startFunct = x86Ptr;
mVU.startFunct = xGetAlignedCallTarget();
{
xScopedStackFrame frame(false, true);
@ -92,9 +92,6 @@ void mVUdispatcherAB(mV)
xRET();
pxAssertDev(xGetPtr() < (mVU.dispCache + mVUdispCacheSize),
"microVU: Dispatcher generation exceeded reserved cache area!");
Perf::any.Register(mVU.startFunct, static_cast<u32>(xGetPtr() - mVU.startFunct),
mVU.index ? "VU1StartFunc" : "VU0StartFunc");
}
@ -102,7 +99,7 @@ void mVUdispatcherAB(mV)
// Generates the code for resuming/exit xgkick
void mVUdispatcherCD(mV)
{
mVU.startFunctXG = x86Ptr;
mVU.startFunctXG = xGetAlignedCallTarget();
{
xScopedStackFrame frame(false, true);
@ -135,17 +132,13 @@ void mVUdispatcherCD(mV)
xRET();
pxAssertDev(xGetPtr() < (mVU.dispCache + mVUdispCacheSize),
"microVU: Dispatcher generation exceeded reserved cache area!");
Perf::any.Register(mVU.startFunctXG, static_cast<u32>(xGetPtr() - mVU.startFunctXG),
mVU.index ? "VU1StartFuncXG" : "VU0StartFuncXG");
}
void mvuGenerateWaitMTVU(mV)
static void mVUGenerateWaitMTVU(mV)
{
xAlignCallTarget();
mVU.waitMTVU = x86Ptr;
mVU.waitMTVU = xGetAlignedCallTarget();
int num_xmms = 0, num_gprs = 0;
@ -215,17 +208,13 @@ void mvuGenerateWaitMTVU(mV)
xRET();
pxAssertDev(xGetPtr() < (mVU.dispCache + mVUdispCacheSize),
"microVU: Dispatcher generation exceeded reserved cache area!");
Perf::any.Register(mVU.waitMTVU, static_cast<u32>(xGetPtr() - mVU.waitMTVU),
mVU.index ? "VU1WaitMTVU" : "VU0WaitMTVU");
}
void mvuGenerateCopyPipelineState(mV)
static void mVUGenerateCopyPipelineState(mV)
{
xAlignCallTarget();
mVU.copyPLState = x86Ptr;
mVU.copyPLState = xGetAlignedCallTarget();
if (x86caps.hasAVX2)
{
@ -258,13 +247,76 @@ void mvuGenerateCopyPipelineState(mV)
xRET();
pxAssertDev(xGetPtr() < (mVU.dispCache + mVUdispCacheSize),
"microVU: Dispatcher generation exceeded reserved cache area!");
Perf::any.Register(mVU.copyPLState, static_cast<u32>(xGetPtr() - mVU.copyPLState),
mVU.index ? "VU1CopyPLState" : "VU0CopyPLState");
}
//------------------------------------------------------------------
// Micro VU - Custom Quick Search
//------------------------------------------------------------------
// Generates a custom optimized block-search function
// Note: Structs must be 16-byte aligned! (GCC doesn't guarantee this)
static void mVUGenerateCompareState(mV)
{
mVU.compareStateF = xGetAlignedCallTarget();
if (!x86caps.hasAVX2)
{
xMOVAPS (xmm0, ptr32[arg1reg]);
xPCMP.EQD(xmm0, ptr32[arg2reg]);
xMOVAPS (xmm1, ptr32[arg1reg + 0x10]);
xPCMP.EQD(xmm1, ptr32[arg2reg + 0x10]);
xPAND (xmm0, xmm1);
xMOVMSKPS(eax, xmm0);
xXOR (eax, 0xf);
xForwardJNZ8 exitPoint;
xMOVAPS (xmm0, ptr32[arg1reg + 0x20]);
xPCMP.EQD(xmm0, ptr32[arg2reg + 0x20]);
xMOVAPS (xmm1, ptr32[arg1reg + 0x30]);
xPCMP.EQD(xmm1, ptr32[arg2reg + 0x30]);
xPAND (xmm0, xmm1);
xMOVAPS (xmm1, ptr32[arg1reg + 0x40]);
xPCMP.EQD(xmm1, ptr32[arg2reg + 0x40]);
xMOVAPS (xmm2, ptr32[arg1reg + 0x50]);
xPCMP.EQD(xmm2, ptr32[arg2reg + 0x50]);
xPAND (xmm1, xmm2);
xPAND (xmm0, xmm1);
xMOVMSKPS(eax, xmm0);
xXOR(eax, 0xf);
exitPoint.SetTarget();
}
else
{
// We have to use unaligned loads here, because the blocks are only 16 byte aligned.
xVMOVUPS(ymm0, ptr[arg1reg]);
xVPCMP.EQD(ymm0, ymm0, ptr[arg2reg]);
xVPMOVMSKB(eax, ymm0);
xXOR(eax, 0xffffffff);
xForwardJNZ8 exitPoint;
xVMOVUPS(ymm0, ptr[arg1reg + 0x20]);
xVMOVUPS(ymm1, ptr[arg1reg + 0x40]);
xVPCMP.EQD(ymm0, ymm0, ptr[arg2reg + 0x20]);
xVPCMP.EQD(ymm1, ymm1, ptr[arg2reg + 0x40]);
xVPAND(ymm0, ymm0, ymm1);
xVPMOVMSKB(eax, ymm0);
xNOT(eax);
exitPoint.SetTarget();
xVZEROUPPER();
}
xRET();
}
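Functionally, the generated routine is a 96-byte equality test over two microRegInfo blocks with an early out after the first 32 bytes: it returns 0 when the states match and non-zero otherwise (the non-zero value is a lane mask, which callers never inspect; search() only tests for zero). A plain C++ reference sketch, assuming the 0x60 bytes covered by the loads above span the whole compared state:

#include <cstring>

// Reference behaviour of the JIT-generated comparator. The real code returns
// a SIMD mismatch mask instead of 1, but callers only check for zero.
static u32 compareState_reference(const void* lhs, const void* rhs)
{
    return std::memcmp(lhs, rhs, 0x60) == 0 ? 0 : 1;
}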
//------------------------------------------------------------------
// Execution Functions
//------------------------------------------------------------------

View File

@ -190,18 +190,6 @@ typedef Fntype_mVUrecInst* Fnptr_mVUrecInst;
#define opCase3 if (opCase == 3) // I Opcodes
#define opCase4 if (opCase == 4) // Q Opcodes
//------------------------------------------------------------------
// Define mVUquickSearch
//------------------------------------------------------------------
alignas(__pagesize) extern u8 mVUsearchXMM[__pagesize];
typedef u32 (*mVUCall)(void*, void*);
#define mVUquickSearch(dest, src, size) ((((mVUCall)((void*)mVUsearchXMM))(dest, src)) == 0)
#define mVUemitSearch() \
{ \
mVUcustomSearch(); \
}
//------------------------------------------------------------------
// Misc Macros...
#define mVUcurProg mVU.prog.cur[0]
#define mVUblocks mVU.prog.cur->block

View File

@ -606,73 +606,3 @@ void SSE_DIVSS(mV, const xmm& to, const xmm& from, const xmm& t1 = xEmptyReg, co
{
clampOp(xDIV.SS, false);
}
//------------------------------------------------------------------
// Micro VU - Custom Quick Search
//------------------------------------------------------------------
alignas(__pagesize) u8 mVUsearchXMM[__pagesize];
// Generates a custom optimized block-search function
// Note: Structs must be 16-byte aligned! (GCC doesn't guarantee this)
void mVUcustomSearch()
{
HostSys::MemProtectStatic(mVUsearchXMM, PageAccess_ReadWrite());
memset(mVUsearchXMM, 0xcc, __pagesize);
xSetPtr(mVUsearchXMM);
if (!x86caps.hasAVX2)
{
xMOVAPS (xmm0, ptr32[arg1reg]);
xPCMP.EQD(xmm0, ptr32[arg2reg]);
xMOVAPS (xmm1, ptr32[arg1reg + 0x10]);
xPCMP.EQD(xmm1, ptr32[arg2reg + 0x10]);
xPAND (xmm0, xmm1);
xMOVMSKPS(eax, xmm0);
xXOR (eax, 0xf);
xForwardJNZ8 exitPoint;
xMOVAPS (xmm0, ptr32[arg1reg + 0x20]);
xPCMP.EQD(xmm0, ptr32[arg2reg + 0x20]);
xMOVAPS (xmm1, ptr32[arg1reg + 0x30]);
xPCMP.EQD(xmm1, ptr32[arg2reg + 0x30]);
xPAND (xmm0, xmm1);
xMOVAPS (xmm1, ptr32[arg1reg + 0x40]);
xPCMP.EQD(xmm1, ptr32[arg2reg + 0x40]);
xMOVAPS (xmm2, ptr32[arg1reg + 0x50]);
xPCMP.EQD(xmm2, ptr32[arg2reg + 0x50]);
xPAND (xmm1, xmm2);
xPAND (xmm0, xmm1);
xMOVMSKPS(eax, xmm0);
xXOR(eax, 0xf);
exitPoint.SetTarget();
}
else
{
// We have to use unaligned loads here, because the blocks are only 16 byte aligned.
xVMOVUPS(ymm0, ptr[arg1reg]);
xVPCMP.EQD(ymm0, ymm0, ptr[arg2reg]);
xVPMOVMSKB(eax, ymm0);
xXOR(eax, 0xffffffff);
xForwardJNZ8 exitPoint;
xVMOVUPS(ymm0, ptr[arg1reg + 0x20]);
xVMOVUPS(ymm1, ptr[arg1reg + 0x40]);
xVPCMP.EQD(ymm0, ymm0, ptr[arg2reg + 0x20]);
xVPCMP.EQD(ymm1, ymm1, ptr[arg2reg + 0x40]);
xVPAND(ymm0, ymm0, ymm1);
xVPMOVMSKB(eax, ymm0);
xNOT(eax);
exitPoint.SetTarget();
xVZEROUPPER();
}
xRET();
HostSys::MemProtectStatic(mVUsearchXMM, PageAccess_ExecOnly());
}

View File

@ -390,8 +390,6 @@ void VifUnpackSSE_Init()
for (int c = 0; c < 4; c++)
nVifGen(a, b, c);
nVifUpkExec->ForbidModification();
DevCon.WriteLn("Unpack function generation complete. Generated function statistics:");
DevCon.Indent().WriteLn(
"Reserved buffer : %u bytes @ 0x%016" PRIXPTR "\n"