microVU: Choose AVX2/SSE at runtime

TellowKrinkle 2022-10-19 17:56:31 -05:00 committed by TellowKrinkle
parent 22f214c8e1
commit e0a0e0b00c
5 changed files with 69 additions and 61 deletions


@@ -68,6 +68,11 @@ namespace x86Emitter
 		{0x66, 0x66}, // VPCMPGTD
 	};
 	void xVPMOVMSKB(const xRegister32& to, const xRegisterSSE& from)
 	{
 		xOpWriteC5(0x66, 0xd7, to, xRegister32(), from);
 	}
+	void xVMOVMSKPS(const xRegister32& to, const xRegisterSSE& from)
+	{
+		xOpWriteC5(0x00, 0x50, to, xRegister32(), from);
+	}
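For context: xOpWriteC5 emits a two-byte VEX (0xC5) encoded instruction, where the first argument selects the implied legacy prefix (the VEX.pp field; 0x00 here, 0x66 for VPMOVMSKB) and the second is the opcode byte (0x50 = VMOVMSKPS, 0xd7 = VPMOVMSKB). A rough, self-contained sketch of that encoding, with hypothetical names rather than PCSX2's actual emitter internals:

#include <cstdint>
#include <vector>

// Illustrative two-byte VEX (0xC5) encoder. Byte layout after 0xC5:
// [~R | ~vvvv (4 bits) | L | pp (2 bits)], then opcode, then ModRM.
void EmitVexC5(std::vector<uint8_t>& out, uint8_t prefix, uint8_t opcode,
               int reg, int rm, bool ymm)
{
	// Map the legacy prefix byte to VEX.pp: none=0, 66=1, F3=2, F2=3.
	const uint8_t pp = (prefix == 0x66) ? 1 : (prefix == 0xF3) ? 2 : (prefix == 0xF2) ? 3 : 0;
	uint8_t vex = pp;
	vex |= (reg < 8) ? 0x80 : 0x00; // inverted VEX.R (high bit of the reg field)
	vex |= 0x78;                    // inverted VEX.vvvv = 0b1111: no second source register
	vex |= ymm ? 0x04 : 0x00;       // VEX.L = 1 selects 256-bit (ymm) width
	out.push_back(0xC5);
	out.push_back(vex);
	out.push_back(opcode);
	out.push_back(0xC0 | ((reg & 7) << 3) | (rm & 7)); // register-direct ModRM
}

With these inputs, EmitVexC5(out, 0x00, 0x50, 0 /*eax*/, 0 /*ymm0*/, true) produces C5 FC 50 C0, i.e. vmovmskps eax, ymm0.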


@@ -638,6 +638,7 @@ namespace x86Emitter
 extern const xImplAVX_ThreeArgYMM xVPXOR;
 extern const xImplAVX_CmpInt xVPCMP;
 extern void xVPMOVMSKB(const xRegister32& to, const xRegisterSSE& from);
+extern void xVMOVMSKPS(const xRegister32& to, const xRegisterSSE& from);
 extern void xVMOVMSKPD(const xRegister32& to, const xRegisterSSE& from);
 extern void xVZEROUPPER();
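As a usage sketch, this is the sequence the new declarations enable (the same one the search generator below emits): compare 32 bytes per dword lane, collapse the lanes to a byte mask in a GPR, and invert so that zero means a full match.

// Fragment in the emitter's DSL (registers as used in the diff below).
xVMOVUPS(ymm0, ptr[arg1reg]);         // unaligned 256-bit load
xVPCMP.EQD(ymm0, ymm0, ptr[arg2reg]); // dword lanes become all-ones where equal
xVPMOVMSKB(eax, ymm0);                // one bit per byte: 0xffffffff if all equal
xNOT(eax);                            // eax == 0 now signals "blocks match"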


@@ -100,7 +100,7 @@ public:
 		microBlockLink*& blockList = fullCmp ? fBlockList : qBlockList;
 		microBlockLink*& blockEnd  = fullCmp ? fBlockEnd  : qBlockEnd;
-		microBlockLink* newBlock = (microBlockLink*)_aligned_malloc(sizeof(microBlockLink), SSE_ALIGN_N);
+		microBlockLink* newBlock = (microBlockLink*)_aligned_malloc(sizeof(microBlockLink), 32);
 		newBlock->block.jumpCache = nullptr;
 		newBlock->next = nullptr;
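Since the SSE-vs-AVX2 decision now happens at runtime, the allocation can no longer pick its alignment from the _M_SSE build setting; it always requests 32 bytes, enough for either code path the stub generator can produce. For reference, a portable stand-in for the MSVC-style _aligned_malloc used here (a sketch assuming C++17's std::aligned_alloc):

#include <cstdlib>
#include <cstddef>

// 32-byte-aligned allocation; std::aligned_alloc requires the size to be a
// multiple of the alignment, so round it up first.
void* AllocAligned32(std::size_t size)
{
	constexpr std::size_t kAlign = 32;
	const std::size_t rounded = (size + kAlign - 1) & ~(kAlign - 1);
	return std::aligned_alloc(kAlign, rounded);
}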


@@ -198,11 +198,7 @@ typedef Fntype_mVUrecInst* Fnptr_mVUrecInst;
 //------------------------------------------------------------------
 alignas(__pagesize) extern u8 mVUsearchXMM[__pagesize];
 typedef u32 (*mVUCall)(void*, void*);
-#if _M_SSE >= 0x501
-#define mVUquickSearch(dest, src, size) ((((mVUCall)((void*)mVUsearchXMM))(dest, src)) == 0xff)
-#else
-#define mVUquickSearch(dest, src, size) ((((mVUCall)((void*)mVUsearchXMM))(dest, src)) == 0xf)
-#endif
+#define mVUquickSearch(dest, src, size) ((((mVUCall)((void*)mVUsearchXMM))(dest, src)) == 0)
 #define mVUemitSearch() \
 	{ \
 		mVUcustomSearch(); \
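With the generated stub now returning the same value in both variants, the macro no longer needs per-ISA expected masks (0xf from the 4-lane MOVMSKPS reduction, 0xff from the old 8-lane AVX build): zero now means "all compared bytes matched" everywhere. A scalar model of that contract, for illustration only (0xA0 bytes, judging by the offsets in the generator below; the real stub compares with SIMD and early-exits):

#include <cstdint>
#include <cstring>

// Spec-level model of mVUsearchXMM under the new convention:
// returns 0 iff the two 0xA0-byte regions are identical.
static uint32_t mVUsearchModel(const void* dest, const void* src)
{
	return std::memcmp(dest, src, 0xA0) ? 1u : 0u;
}
// mVUquickSearch(dest, src, size) then reduces to (mVUsearchModel(dest, src) == 0).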


@@ -537,74 +537,80 @@ void mVUcustomSearch()
 	memset(mVUsearchXMM, 0xcc, __pagesize);
 	xSetPtr(mVUsearchXMM);
-#if _M_SSE < 0x501
-	xMOVAPS  (xmm0, ptr32[arg1reg]);
-	xPCMP.EQD(xmm0, ptr32[arg2reg]);
-	xMOVAPS  (xmm1, ptr32[arg1reg + 0x10]);
-	xPCMP.EQD(xmm1, ptr32[arg2reg + 0x10]);
-	xPAND    (xmm0, xmm1);
+	if (!x86caps.hasAVX2)
+	{
+		xMOVAPS  (xmm0, ptr32[arg1reg]);
+		xPCMP.EQD(xmm0, ptr32[arg2reg]);
+		xMOVAPS  (xmm1, ptr32[arg1reg + 0x10]);
+		xPCMP.EQD(xmm1, ptr32[arg2reg + 0x10]);
+		xPAND    (xmm0, xmm1);
-	xMOVMSKPS(eax, xmm0);
-	xCMP     (eax, 0xf);
-	xForwardJL8 exitPoint;
+		xMOVMSKPS(eax, xmm0);
+		xXOR     (eax, 0xf);
+		xForwardJNZ8 exitPoint;
-	xMOVAPS  (xmm0, ptr32[arg1reg + 0x20]);
-	xPCMP.EQD(xmm0, ptr32[arg2reg + 0x20]);
-	xMOVAPS  (xmm1, ptr32[arg1reg + 0x30]);
-	xPCMP.EQD(xmm1, ptr32[arg2reg + 0x30]);
-	xPAND    (xmm0, xmm1);
+		xMOVAPS  (xmm0, ptr32[arg1reg + 0x20]);
+		xPCMP.EQD(xmm0, ptr32[arg2reg + 0x20]);
+		xMOVAPS  (xmm1, ptr32[arg1reg + 0x30]);
+		xPCMP.EQD(xmm1, ptr32[arg2reg + 0x30]);
+		xPAND    (xmm0, xmm1);
-	xMOVAPS  (xmm2, ptr32[arg1reg + 0x40]);
-	xPCMP.EQD(xmm2, ptr32[arg2reg + 0x40]);
-	xMOVAPS  (xmm3, ptr32[arg1reg + 0x50]);
-	xPCMP.EQD(xmm3, ptr32[arg2reg + 0x50]);
-	xPAND    (xmm2, xmm3);
+		xMOVAPS  (xmm2, ptr32[arg1reg + 0x40]);
+		xPCMP.EQD(xmm2, ptr32[arg2reg + 0x40]);
+		xMOVAPS  (xmm3, ptr32[arg1reg + 0x50]);
+		xPCMP.EQD(xmm3, ptr32[arg2reg + 0x50]);
+		xPAND    (xmm2, xmm3);
-	xMOVAPS  (xmm4, ptr32[arg1reg + 0x60]);
-	xPCMP.EQD(xmm4, ptr32[arg2reg + 0x60]);
-	xMOVAPS  (xmm5, ptr32[arg1reg + 0x70]);
-	xPCMP.EQD(xmm5, ptr32[arg2reg + 0x70]);
-	xPAND    (xmm4, xmm5);
+		xMOVAPS  (xmm4, ptr32[arg1reg + 0x60]);
+		xPCMP.EQD(xmm4, ptr32[arg2reg + 0x60]);
+		xMOVAPS  (xmm5, ptr32[arg1reg + 0x70]);
+		xPCMP.EQD(xmm5, ptr32[arg2reg + 0x70]);
+		xPAND    (xmm4, xmm5);
-	xMOVAPS  (xmm6, ptr32[arg1reg + 0x80]);
-	xPCMP.EQD(xmm6, ptr32[arg2reg + 0x80]);
-	xMOVAPS  (xmm7, ptr32[arg1reg + 0x90]);
-	xPCMP.EQD(xmm7, ptr32[arg2reg + 0x90]);
-	xPAND    (xmm6, xmm7);
+		xMOVAPS  (xmm6, ptr32[arg1reg + 0x80]);
+		xPCMP.EQD(xmm6, ptr32[arg2reg + 0x80]);
+		xMOVAPS  (xmm7, ptr32[arg1reg + 0x90]);
+		xPCMP.EQD(xmm7, ptr32[arg2reg + 0x90]);
+		xPAND    (xmm6, xmm7);
-	xPAND    (xmm0, xmm2);
-	xPAND    (xmm4, xmm6);
-	xPAND    (xmm0, xmm4);
-	xMOVMSKPS(eax, xmm0);
+		xPAND    (xmm0, xmm2);
+		xPAND    (xmm4, xmm6);
+		xPAND    (xmm0, xmm4);
+		xMOVMSKPS(eax, xmm0);
+		xXOR(eax, 0xf);
-#else
-	// We have to use unaligned loads here, because the blocks are only 16 byte aligned.
-	xVMOVUPS(ymm0, ptr[arg1reg]);
-	xVPCMP.EQD(ymm0, ymm0, ptr[arg2reg]);
-	xVMOVMSKPS(eax, ymm0);
-	xCMP(eax, 0xff);
-	xForwardJB8 exitPoint;
+		exitPoint.SetTarget();
+	}
+	else
+	{
+		// We have to use unaligned loads here, because the blocks are only 16 byte aligned.
+		xVMOVUPS(ymm0, ptr[arg1reg]);
+		xVPCMP.EQD(ymm0, ymm0, ptr[arg2reg]);
+		xVPMOVMSKB(eax, ymm0);
+		xNOT(eax);
+		xForwardJNZ8 exitPoint;
-	xVMOVUPS(ymm0, ptr[arg1reg + 0x20]);
-	xVPCMP.EQD(ymm0, ymm0, ptr[arg2reg + 0x20]);
+		xVMOVUPS(ymm0, ptr[arg1reg + 0x20]);
+		xVPCMP.EQD(ymm0, ymm0, ptr[arg2reg + 0x20]);
-	xVMOVUPS(ymm1, ptr[arg1reg + 0x40]);
-	xVPCMP.EQD(ymm1, ymm1, ptr[arg2reg + 0x40]);
+		xVMOVUPS(ymm1, ptr[arg1reg + 0x40]);
+		xVPCMP.EQD(ymm1, ymm1, ptr[arg2reg + 0x40]);
-	xVMOVUPS(ymm2, ptr[arg1reg + 0x60]);
-	xVPCMP.EQD(ymm2, ymm2, ptr[arg2reg + 0x60]);
-	xVPAND(ymm0, ymm0, ymm1);
+		xVMOVUPS(ymm2, ptr[arg1reg + 0x60]);
+		xVPCMP.EQD(ymm2, ymm2, ptr[arg2reg + 0x60]);
+		xVPAND(ymm0, ymm0, ymm1);
-	xVMOVUPS(ymm1, ptr[arg1reg + 0x80]);
-	xVPAND(ymm0, ymm0, ymm2);
-	xVPCMP.EQD(ymm1, ymm1, ptr[arg2reg + 0x80]);
-	xVPAND(ymm0, ymm0, ymm1);
+		xVMOVUPS(ymm1, ptr[arg1reg + 0x80]);
+		xVPAND(ymm0, ymm0, ymm2);
+		xVPCMP.EQD(ymm1, ymm1, ptr[arg2reg + 0x80]);
+		xVPAND(ymm0, ymm0, ymm1);
-	xVMOVMSKPS(eax, ymm0);
-	xVZEROUPPER();
-#endif
+		xVPMOVMSKB(eax, ymm0);
+		xNOT(eax);
-	exitPoint.SetTarget();
+		exitPoint.SetTarget();
+		xVZEROUPPER();
+	}
 	xRET();
 	HostSys::MemProtectStatic(mVUsearchXMM, PageAccess_ExecOnly());
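The overall shape is classic runtime dispatch: both compare loops can be generated, and x86caps.hasAVX2 picks one when mVUcustomSearch() emits the stub, instead of _M_SSE picking one at compile time. A self-contained sketch of the same idea using intrinsics (hypothetical names; assumes GCC/Clang for the target attribute and CPU probe, and the 0xA0-byte compare size from the code above):

#include <cstdint>
#include <cstring>
#include <immintrin.h>

using SearchFn = uint32_t (*)(const void*, const void*);

// Baseline body: valid on any x86-64 CPU. Returns 0 iff the regions match.
static uint32_t SearchBaseline(const void* a, const void* b)
{
	return std::memcmp(a, b, 0xA0) ? 1u : 0u;
}

#if defined(__GNUC__)
// AVX2 body, compiled for AVX2 regardless of the global -m flags. It
// accumulates per-dword equality over all 0xA0 bytes; the emitted stub
// additionally early-exits after the first 32 bytes.
__attribute__((target("avx2")))
static uint32_t SearchAVX2(const void* a, const void* b)
{
	__m256i acc = _mm256_set1_epi32(-1);
	for (int off = 0; off < 0xA0; off += 0x20)
	{
		__m256i x = _mm256_loadu_si256((const __m256i*)((const char*)a + off));
		__m256i y = _mm256_loadu_si256((const __m256i*)((const char*)b + off));
		acc = _mm256_and_si256(acc, _mm256_cmpeq_epi32(x, y));
	}
	// All lanes equal means the byte mask is 0xFFFFFFFF, so ~mask == 0,
	// mirroring the xVPMOVMSKB + xNOT sequence in the generated code.
	return ~(uint32_t)_mm256_movemask_epi8(acc);
}
#endif

// Chosen once at startup, as mVUcustomSearch() does when it emits the stub.
static SearchFn SelectSearch()
{
#if defined(__GNUC__)
	if (__builtin_cpu_supports("avx2"))
		return SearchAVX2;
#endif
	return SearchBaseline;
}

The xVZEROUPPER() before returning from the emitted AVX2 path avoids SSE/AVX transition penalties in the caller; the compiler inserts the equivalent automatically for the intrinsic version above.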