diff --git a/pcsx2/PCSX2Base.h b/pcsx2/PCSX2Base.h
index 96aa82ef9b..59a05bd67d 100644
--- a/pcsx2/PCSX2Base.h
+++ b/pcsx2/PCSX2Base.h
@@ -36,6 +36,14 @@
 #error PCSX2 requires compiling for at least SSE 4.1
 #endif
 
+// Require 32 byte alignment for vectors for AVX2.
+#if _M_SSE >= 0x501
+	#define SSE_ALIGN_N 32
+#else
+	#define SSE_ALIGN_N 16
+#endif
+#define SSE_ALIGN alignas(SSE_ALIGN_N)
+
 // Starting with AVX, processors have fast unaligned loads
 // Reduce code duplication by not compiling multiple versions
 #if _M_SSE >= 0x500
diff --git a/pcsx2/x86/microVU.h b/pcsx2/x86/microVU.h
index a3ce717493..2ce22cd1fd 100644
--- a/pcsx2/x86/microVU.h
+++ b/pcsx2/x86/microVU.h
@@ -92,7 +92,7 @@ public:
 		microBlockLink*& blockList = fullCmp ? fBlockList : qBlockList;
 		microBlockLink*& blockEnd  = fullCmp ? fBlockEnd  : qBlockEnd;
 
-		microBlockLink* newBlock = (microBlockLink*)_aligned_malloc(sizeof(microBlockLink), 16);
+		microBlockLink* newBlock = (microBlockLink*)_aligned_malloc(sizeof(microBlockLink), SSE_ALIGN_N);
 		newBlock->block.jumpCache = NULL;
 		newBlock->next = NULL;
diff --git a/pcsx2/x86/microVU_Misc.h b/pcsx2/x86/microVU_Misc.h
index 38daa9fd3b..41d34a203d 100644
--- a/pcsx2/x86/microVU_Misc.h
+++ b/pcsx2/x86/microVU_Misc.h
@@ -198,7 +198,11 @@ typedef Fntype_mVUrecInst* Fnptr_mVUrecInst;
 //------------------------------------------------------------------
 alignas(__pagesize) extern u8 mVUsearchXMM[__pagesize];
 typedef u32 (*mVUCall)(void*, void*);
+#if _M_SSE >= 0x501
+#define mVUquickSearch(dest, src, size) ((((mVUCall)((void*)mVUsearchXMM))(dest, src)) == 0xff)
+#else
 #define mVUquickSearch(dest, src, size) ((((mVUCall)((void*)mVUsearchXMM))(dest, src)) == 0xf)
+#endif
 #define mVUemitSearch() \
 	{ \
 		mVUcustomSearch(); \
diff --git a/pcsx2/x86/microVU_Misc.inl b/pcsx2/x86/microVU_Misc.inl
index c2cf097eb7..f5826780f2 100644
--- a/pcsx2/x86/microVU_Misc.inl
+++ b/pcsx2/x86/microVU_Misc.inl
@@ -537,6 +537,7 @@ void mVUcustomSearch()
 	memset(mVUsearchXMM, 0xcc, __pagesize);
 	xSetPtr(mVUsearchXMM);
 
+#if _M_SSE < 0x501
 	xMOVAPS  (xmm0, ptr32[arg1reg]);
 	xPCMP.EQD(xmm0, ptr32[arg2reg]);
 	xMOVAPS  (xmm1, ptr32[arg1reg + 0x10]);
@@ -576,7 +577,35 @@ void mVUcustomSearch()
 	xPAND    (xmm0, xmm4);
 	xMOVMSKPS(eax, xmm0);
 
+#else
+	// We have to use unaligned loads here, because the blocks are only 16 byte aligned.
+	xVMOVUPS(ymm0, ptr[arg1reg]);
+	xVPCMP.EQD(ymm0, ymm0, ptr[arg2reg]);
+	xVMOVMSKPS(eax, ymm0);
+	xCMP(eax, 0xff);
+	xForwardJB8 exitPoint;
+
+	xVMOVUPS(ymm0, ptr[arg1reg + 0x20]);
+	xVPCMP.EQD(ymm0, ymm0, ptr[arg2reg + 0x20]);
+
+	xVMOVUPS(ymm1, ptr[arg1reg + 0x40]);
+	xVPCMP.EQD(ymm1, ymm1, ptr[arg2reg + 0x40]);
+
+	xVMOVUPS(ymm2, ptr[arg1reg + 0x60]);
+	xVPCMP.EQD(ymm2, ymm2, ptr[arg2reg + 0x60]);
+	// Fold both pending results into the accumulator before ymm2 is
+	// reused for the 0x80 compare; otherwise the 0x60 result is lost.
+	xVPAND(ymm0, ymm0, ymm1);
+	xVPAND(ymm0, ymm0, ymm2);
+	xVMOVUPS(ymm2, ptr[arg1reg + 0x80]);
+	xVPCMP.EQD(ymm2, ymm2, ptr[arg2reg + 0x80]);
+	xVPAND(ymm0, ymm0, ymm2);
+
+	xVMOVMSKPS(eax, ymm0);
+	xVZEROUPPER();
+#endif
+
 	exitPoint.SetTarget();
+
 	xRET();
 	HostSys::MemProtectStatic(mVUsearchXMM, PageAccess_ExecOnly());
 }