From e0a0e0b00c8fab3ba45f07c33bdb82cdfd29baec Mon Sep 17 00:00:00 2001 From: TellowKrinkle Date: Wed, 19 Oct 2022 17:56:31 -0500 Subject: [PATCH] microVU: Choose AVX2/SSE at runtime --- common/emitter/avx.cpp | 5 ++ common/emitter/instructions.h | 1 + pcsx2/x86/microVU.h | 2 +- pcsx2/x86/microVU_Misc.h | 6 +- pcsx2/x86/microVU_Misc.inl | 116 ++++++++++++++++++---------------- 5 files changed, 69 insertions(+), 61 deletions(-) diff --git a/common/emitter/avx.cpp b/common/emitter/avx.cpp index 272bdd9b3c..feaafe55b3 100644 --- a/common/emitter/avx.cpp +++ b/common/emitter/avx.cpp @@ -68,6 +68,11 @@ namespace x86Emitter {0x66, 0x66}, // VPCMPGTD }; + void xVPMOVMSKB(const xRegister32& to, const xRegisterSSE& from) + { + xOpWriteC5(0x66, 0xd7, to, xRegister32(), from); + } + void xVMOVMSKPS(const xRegister32& to, const xRegisterSSE& from) { xOpWriteC5(0x00, 0x50, to, xRegister32(), from); diff --git a/common/emitter/instructions.h b/common/emitter/instructions.h index 543bf67306..78807d4683 100644 --- a/common/emitter/instructions.h +++ b/common/emitter/instructions.h @@ -638,6 +638,7 @@ namespace x86Emitter extern const xImplAVX_ThreeArgYMM xVPXOR; extern const xImplAVX_CmpInt xVPCMP; + extern void xVPMOVMSKB(const xRegister32& to, const xRegisterSSE& from); extern void xVMOVMSKPS(const xRegister32& to, const xRegisterSSE& from); extern void xVMOVMSKPD(const xRegister32& to, const xRegisterSSE& from); extern void xVZEROUPPER(); diff --git a/pcsx2/x86/microVU.h b/pcsx2/x86/microVU.h index 95d4ce2fb6..20683c7f4e 100644 --- a/pcsx2/x86/microVU.h +++ b/pcsx2/x86/microVU.h @@ -100,7 +100,7 @@ public: microBlockLink*& blockList = fullCmp ? fBlockList : qBlockList; microBlockLink*& blockEnd = fullCmp ? 
fBlockEnd : qBlockEnd; - microBlockLink* newBlock = (microBlockLink*)_aligned_malloc(sizeof(microBlockLink), SSE_ALIGN_N); + microBlockLink* newBlock = (microBlockLink*)_aligned_malloc(sizeof(microBlockLink), 32); newBlock->block.jumpCache = nullptr; newBlock->next = nullptr; diff --git a/pcsx2/x86/microVU_Misc.h b/pcsx2/x86/microVU_Misc.h index 5244bcff8f..7db1615ca6 100644 --- a/pcsx2/x86/microVU_Misc.h +++ b/pcsx2/x86/microVU_Misc.h @@ -198,11 +198,7 @@ typedef Fntype_mVUrecInst* Fnptr_mVUrecInst; //------------------------------------------------------------------ alignas(__pagesize) extern u8 mVUsearchXMM[__pagesize]; typedef u32 (*mVUCall)(void*, void*); -#if _M_SSE >= 0x501 -#define mVUquickSearch(dest, src, size) ((((mVUCall)((void*)mVUsearchXMM))(dest, src)) == 0xff) -#else -#define mVUquickSearch(dest, src, size) ((((mVUCall)((void*)mVUsearchXMM))(dest, src)) == 0xf) -#endif +#define mVUquickSearch(dest, src, size) ((((mVUCall)((void*)mVUsearchXMM))(dest, src)) == 0) #define mVUemitSearch() \ { \ mVUcustomSearch(); \ diff --git a/pcsx2/x86/microVU_Misc.inl b/pcsx2/x86/microVU_Misc.inl index 0fbc01913f..bca0ecbff1 100644 --- a/pcsx2/x86/microVU_Misc.inl +++ b/pcsx2/x86/microVU_Misc.inl @@ -537,74 +537,80 @@ void mVUcustomSearch() memset(mVUsearchXMM, 0xcc, __pagesize); xSetPtr(mVUsearchXMM); -#if _M_SSE < 0x501 - xMOVAPS (xmm0, ptr32[arg1reg]); - xPCMP.EQD(xmm0, ptr32[arg2reg]); - xMOVAPS (xmm1, ptr32[arg1reg + 0x10]); - xPCMP.EQD(xmm1, ptr32[arg2reg + 0x10]); - xPAND (xmm0, xmm1); + if (!x86caps.hasAVX2) + { + xMOVAPS (xmm0, ptr32[arg1reg]); + xPCMP.EQD(xmm0, ptr32[arg2reg]); + xMOVAPS (xmm1, ptr32[arg1reg + 0x10]); + xPCMP.EQD(xmm1, ptr32[arg2reg + 0x10]); + xPAND (xmm0, xmm1); - xMOVMSKPS(eax, xmm0); - xCMP (eax, 0xf); - xForwardJL8 exitPoint; + xMOVMSKPS(eax, xmm0); + xXOR (eax, 0xf); + xForwardJNZ8 exitPoint; - xMOVAPS (xmm0, ptr32[arg1reg + 0x20]); - xPCMP.EQD(xmm0, ptr32[arg2reg + 0x20]); - xMOVAPS (xmm1, ptr32[arg1reg + 0x30]); - xPCMP.EQD(xmm1, 
ptr32[arg2reg + 0x30]); - xPAND (xmm0, xmm1); + xMOVAPS (xmm0, ptr32[arg1reg + 0x20]); + xPCMP.EQD(xmm0, ptr32[arg2reg + 0x20]); + xMOVAPS (xmm1, ptr32[arg1reg + 0x30]); + xPCMP.EQD(xmm1, ptr32[arg2reg + 0x30]); + xPAND (xmm0, xmm1); - xMOVAPS (xmm2, ptr32[arg1reg + 0x40]); - xPCMP.EQD(xmm2, ptr32[arg2reg + 0x40]); - xMOVAPS (xmm3, ptr32[arg1reg + 0x50]); - xPCMP.EQD(xmm3, ptr32[arg2reg + 0x50]); - xPAND (xmm2, xmm3); + xMOVAPS (xmm2, ptr32[arg1reg + 0x40]); + xPCMP.EQD(xmm2, ptr32[arg2reg + 0x40]); + xMOVAPS (xmm3, ptr32[arg1reg + 0x50]); + xPCMP.EQD(xmm3, ptr32[arg2reg + 0x50]); + xPAND (xmm2, xmm3); - xMOVAPS (xmm4, ptr32[arg1reg + 0x60]); - xPCMP.EQD(xmm4, ptr32[arg2reg + 0x60]); - xMOVAPS (xmm5, ptr32[arg1reg + 0x70]); - xPCMP.EQD(xmm5, ptr32[arg2reg + 0x70]); - xPAND (xmm4, xmm5); + xMOVAPS (xmm4, ptr32[arg1reg + 0x60]); + xPCMP.EQD(xmm4, ptr32[arg2reg + 0x60]); + xMOVAPS (xmm5, ptr32[arg1reg + 0x70]); + xPCMP.EQD(xmm5, ptr32[arg2reg + 0x70]); + xPAND (xmm4, xmm5); - xMOVAPS (xmm6, ptr32[arg1reg + 0x80]); - xPCMP.EQD(xmm6, ptr32[arg2reg + 0x80]); - xMOVAPS (xmm7, ptr32[arg1reg + 0x90]); - xPCMP.EQD(xmm7, ptr32[arg2reg + 0x90]); - xPAND (xmm6, xmm7); + xMOVAPS (xmm6, ptr32[arg1reg + 0x80]); + xPCMP.EQD(xmm6, ptr32[arg2reg + 0x80]); + xMOVAPS (xmm7, ptr32[arg1reg + 0x90]); + xPCMP.EQD(xmm7, ptr32[arg2reg + 0x90]); + xPAND (xmm6, xmm7); - xPAND (xmm0, xmm2); - xPAND (xmm4, xmm6); - xPAND (xmm0, xmm4); - xMOVMSKPS(eax, xmm0); + xPAND (xmm0, xmm2); + xPAND (xmm4, xmm6); + xPAND (xmm0, xmm4); + xMOVMSKPS(eax, xmm0); + xXOR(eax, 0xf); -#else - // We have to use unaligned loads here, because the blocks are only 16 byte aligned. - xVMOVUPS(ymm0, ptr[arg1reg]); - xVPCMP.EQD(ymm0, ymm0, ptr[arg2reg]); - xVMOVMSKPS(eax, ymm0); - xCMP(eax, 0xff); - xForwardJB8 exitPoint; + exitPoint.SetTarget(); + } + else + { + // We have to use unaligned loads here, because the blocks are only 16 byte aligned. 
+ xVMOVUPS(ymm0, ptr[arg1reg]);
+ xVPCMP.EQD(ymm0, ymm0, ptr[arg2reg]);
+ xVPMOVMSKB(eax, ymm0);
+ xXOR(eax, 0xffffffff); // XOR rather than NOT: x86 NOT leaves EFLAGS unchanged, so the JNZ below would test stale flags
+ xForwardJNZ8 exitPoint;
- xVMOVUPS(ymm0, ptr[arg1reg + 0x20]);
- xVPCMP.EQD(ymm0, ymm0, ptr[arg2reg + 0x20]);
+ xVMOVUPS(ymm0, ptr[arg1reg + 0x20]);
+ xVPCMP.EQD(ymm0, ymm0, ptr[arg2reg + 0x20]);
- xVMOVUPS(ymm1, ptr[arg1reg + 0x40]);
- xVPCMP.EQD(ymm1, ymm1, ptr[arg2reg + 0x40]);
+ xVMOVUPS(ymm1, ptr[arg1reg + 0x40]);
+ xVPCMP.EQD(ymm1, ymm1, ptr[arg2reg + 0x40]);
- xVMOVUPS(ymm2, ptr[arg1reg + 0x60]);
- xVPCMP.EQD(ymm2, ymm2, ptr[arg2reg + 0x60]);
- xVPAND(ymm0, ymm0, ymm1);
+ xVMOVUPS(ymm2, ptr[arg1reg + 0x60]);
+ xVPCMP.EQD(ymm2, ymm2, ptr[arg2reg + 0x60]);
+ xVPAND(ymm0, ymm0, ymm1);
- xVMOVUPS(ymm1, ptr[arg1reg + 0x80]);
- xVPAND(ymm0, ymm0, ymm2);
- xVPCMP.EQD(ymm1, ymm1, ptr[arg2reg + 0x80]);
- xVPAND(ymm0, ymm0, ymm1);
+ xVMOVUPS(ymm1, ptr[arg1reg + 0x80]);
+ xVPAND(ymm0, ymm0, ymm2);
+ xVPCMP.EQD(ymm1, ymm1, ptr[arg2reg + 0x80]);
+ xVPAND(ymm0, ymm0, ymm1);
- xVMOVMSKPS(eax, ymm0);
- xVZEROUPPER();
-#endif
+ xVPMOVMSKB(eax, ymm0);
+ xNOT(eax); // no conditional branch follows, so flags do not matter here; eax == 0 is the "match" result
- exitPoint.SetTarget();
+ exitPoint.SetTarget();
+ xVZEROUPPER();
+ }
 xRET();
 HostSys::MemProtectStatic(mVUsearchXMM, PageAccess_ExecOnly());