From a8c2941901fd77ca0f5bec55d78a0f65bdb787ff Mon Sep 17 00:00:00 2001 From: cottonvibes Date: Thu, 21 Jan 2010 06:51:09 +0000 Subject: [PATCH] Removed the old vif unpack code since pcsx2 is now using newVif. Notes to Devs: - Linux project files probably need to be updated since I deleted some files. - In the vif0/vif1 Freeze() functions for saved states, I kept some dummy vars to keep saved state compatibility. We should remove them next time we decide to break saved state compatibility. git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2461 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/Hw.cpp | 1 - pcsx2/Vif.cpp | 2 - pcsx2/Vif.h | 73 +- pcsx2/Vif0Dma.cpp | 79 +- pcsx2/Vif1Dma.cpp | 66 +- pcsx2/VifDma.cpp | 737 +------- pcsx2/VifDma.h | 2 - pcsx2/VifDma_internal.h | 20 - pcsx2/windows/VCprojects/pcsx2_2008.vcproj | 12 - pcsx2/x86/VifUnpackSSE.cpp | 28 +- pcsx2/x86/VifUnpackSSE.h | 5 +- pcsx2/x86/VifUnpackSSE_Dynarec.cpp | 28 - pcsx2/x86/aVif.S | 1607 ---------------- pcsx2/x86/aVif.asm | 1941 -------------------- pcsx2/x86/iVif.cpp | 92 - pcsx2/x86/newVif.h | 3 - pcsx2/x86/newVif_OldUnpack.inl | 8 +- pcsx2/x86/newVif_Unpack.cpp | 4 +- 18 files changed, 69 insertions(+), 4639 deletions(-) delete mode 100644 pcsx2/x86/aVif.S delete mode 100644 pcsx2/x86/aVif.asm delete mode 100644 pcsx2/x86/iVif.cpp diff --git a/pcsx2/Hw.cpp b/pcsx2/Hw.cpp index a716de663a..fc0332ab72 100644 --- a/pcsx2/Hw.cpp +++ b/pcsx2/Hw.cpp @@ -31,7 +31,6 @@ void hwInit() gsInit(); vif0Init(); vif1Init(); - vifDmaInit(); sifInit(); sprInit(); ipuInit(); diff --git a/pcsx2/Vif.cpp b/pcsx2/Vif.cpp index 6cef9f7344..df428f4d0d 100644 --- a/pcsx2/Vif.cpp +++ b/pcsx2/Vif.cpp @@ -23,8 +23,6 @@ #include "VifDma.h" VIFregisters *vifRegs; -u32* vifRow = NULL; -u32* vifMaskRegs = NULL; vifStruct *vif; u16 vifqwc = 0; diff --git a/pcsx2/Vif.h b/pcsx2/Vif.h index 39bfd62a2e..26e7df1c96 100644 --- a/pcsx2/Vif.h +++ b/pcsx2/Vif.h @@ -58,8 +58,8 @@ enum vif1_stat_flags // for occassions where we don't neccessarily know which we are using. enum vif_stat_flags { - VIF_STAT_VPS_W = (1), - VIF_STAT_VPS_D = (2), + VIF_STAT_VPS_W = (1), + VIF_STAT_VPS_D = (2), VIF_STAT_VPS_T = (3), VIF_STAT_VPS = (3), VIF_STAT_VEW = (1<<2), @@ -75,9 +75,9 @@ enum vif_stat_flags enum vif_status { - VPS_IDLE = 0, - VPS_WAITING = 1, - VPS_DECODING = 2, + VPS_IDLE = 0, + VPS_WAITING = 1, + VPS_DECODING = 2, VPS_TRANSFERRING = 3 // And decompressing. }; @@ -96,7 +96,7 @@ union tVIF_STAT { u32 VFS : 1; // Stopped by ForceBreak u32 VIS : 1; // Vif Interrupt Stall u32 INT : 1; // Intereupt by the i bit. - u32 ER0 : 1; // DmaTag Mismatch error. + u32 ER0 : 1; // DmaTag Mismatch error. u32 ER1 : 1; // VifCode error u32 reserved2 : 9; u32 FDR : 1; // VIF/FIFO transfer direction. (false - memory -> Vif, true - Vif -> memory) @@ -104,13 +104,12 @@ union tVIF_STAT { }; u32 _u32; - tVIF_STAT(u32 val) { _u32 = val; } - - bool test(u32 flags) { return !!(_u32 & flags); } - void set_flags(u32 flags) { _u32 |= flags; } + tVIF_STAT(u32 val) { _u32 = val; } + bool test(u32 flags) { return !!(_u32 & flags); } + void set_flags (u32 flags) { _u32 |= flags; } void clear_flags(u32 flags) { _u32 &= ~flags; } - void reset() { _u32 = 0; } - wxString desc() { return wxsFormat(L"Stat: 0x%x", _u32); } + void reset() { _u32 = 0; } + wxString desc() { return wxsFormat(L"Stat: 0x%x", _u32); } }; #define VIF_STAT(value) ((tVIF_STAT)(value)) @@ -125,13 +124,12 @@ union tVIF_FBRST { }; u32 _u32; - tVIF_FBRST(u32 val) { _u32 = val; } - - bool test(u32 flags) { return !!(_u32 & flags); } - void set_flags(u32 flags) { _u32 |= flags; } + tVIF_FBRST(u32 val) { _u32 = val; } + bool test (u32 flags) { return !!(_u32 & flags); } + void set_flags (u32 flags) { _u32 |= flags; } void clear_flags(u32 flags) { _u32 &= ~flags; } - void reset() { _u32 = 0; } - wxString desc() { return wxsFormat(L"Fbrst: 0x%x", _u32); } + void reset() { _u32 = 0; } + wxString desc() { return wxsFormat(L"Fbrst: 0x%x", _u32); } }; #define FBRST(value) ((tVIF_FBRST)(value)) @@ -145,14 +143,13 @@ union tVIF_ERR { }; u32 _u32; - tVIF_ERR(u32 val) { _u32 = val; } - - void write(u32 val) { _u32 = val; } - bool test(u32 flags) { return !!(_u32 & flags); } - void set_flags(u32 flags) { _u32 |= flags; } + tVIF_ERR (u32 val) { _u32 = val; } + void write(u32 val) { _u32 = val; } + bool test (u32 flags) { return !!(_u32 & flags); } + void set_flags (u32 flags) { _u32 |= flags; } void clear_flags(u32 flags) { _u32 &= ~flags; } - void reset() { _u32 = 0; } - wxString desc() { return wxsFormat(L"Err: 0x%x", _u32); } + void reset() { _u32 = 0; } + wxString desc() { return wxsFormat(L"Err: 0x%x", _u32); } }; struct vifCycle @@ -214,14 +211,7 @@ struct VIFregisters { u32 addr; }; -extern "C" -{ - // these use cdecl for Asm code references. - extern VIFregisters *vifRegs; - extern u32* vifMaskRegs; - extern u32* vifRow; - extern u32* _vifCol; -} +extern VIFregisters *vifRegs; #define vif0RegsRef ((VIFregisters&)PS2MEM_HW[0x3800]) #define vif1RegsRef ((VIFregisters&)PS2MEM_HW[0x3c00]) @@ -236,7 +226,7 @@ extern bool VIF1transfer(u32 *data, int size, bool istag); extern void vifMFIFOInterrupt(); // -------------------------------------------------------------------------------------- -// VIF SSE-optimized Masking Mess +// newVif SSE-optimized Row/Col Structs // -------------------------------------------------------------------------------------- struct VifMaskTypes @@ -245,19 +235,6 @@ struct VifMaskTypes u32 Row1[4], Col1[4]; }; -extern __aligned16 VifMaskTypes g_vifmask; // This struct is used by newVif as well as oldVif code... - -extern void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask); - -#define XMM_R0 xmm0 -#define XMM_R1 xmm1 -#define XMM_R2 xmm2 -#define XMM_WRITEMASK xmm3 -#define XMM_ROWMASK xmm4 -#define XMM_ROWCOLMASK xmm5 -#define XMM_ROW xmm6 -#define XMM_COL xmm7 - -#define XMM_R3 XMM_COL +extern __aligned16 VifMaskTypes g_vifmask; // This struct is used by newVif #endif /* __VIF_H__ */ diff --git a/pcsx2/Vif0Dma.cpp b/pcsx2/Vif0Dma.cpp index 17661dec2e..2e0d738176 100644 --- a/pcsx2/Vif0Dma.cpp +++ b/pcsx2/Vif0Dma.cpp @@ -18,13 +18,9 @@ #include "Common.h" #include "VifDma_internal.h" - #include "VUmicro.h" #include "newVif.h" -__aligned16 u32 g_vif0Masks[64]; -u32 g_vif0HasMask3[4] = {0}; - extern int (__fastcall *Vif0TransTLB[128])(u32 *data); extern void (*Vif0CMDTLB[75])(); @@ -41,18 +37,9 @@ __forceinline void vif0FLUSH() g_vifCycles += (VU0.cycle - _cycles) * BIAS; } -void vif0Init() +void vif0Init() { - for (u32 i = 0; i < 256; ++i) - { - s_maskwrite[i] = ((i & 3) == 3) || ((i & 0xc) == 0xc) || ((i & 0x30) == 0x30) || ((i & 0xc0) == 0xc0); - } - - SetNewMask(g_vif0Masks, g_vif0HasMask3, 0, 0xffffffff); - -#if newVif0 initNewVif(0); -#endif } static __forceinline void vif0UNPACK(u32 *data) @@ -119,7 +106,6 @@ static int __fastcall Vif0TransNull(u32 *data) // Shouldnt go here static int __fastcall Vif0TransSTMask(u32 *data) // STMASK { - SetNewMask(g_vif0Masks, g_vif0HasMask3, data[0], vif0Regs->mask); vif0Regs->mask = data[0]; VIF_LOG("STMASK == %x", vif0Regs->mask); @@ -226,61 +212,7 @@ static int __fastcall Vif0TransMPG(u32 *data) // MPG static int __fastcall Vif0TransUnpack(u32 *data) // UNPACK { -#if newVif0 return nVifUnpack(0, (u8*)data); -#endif - - int ret; - - XMMRegisters::Freeze(); - if (vif0.vifpacketsize < vif0.tag.size) - { - if(vif0Regs->offset != 0 || vif0.cl != 0) - { - ret = vif0.tag.size; - vif0.tag.size -= vif0.vifpacketsize - VIFalign<0>(data, &vif0.tag, vif0.vifpacketsize); - ret = ret - vif0.tag.size; - data += ret; - - if(vif0.vifpacketsize > 0) VIFunpack<0>(data, &vif0.tag, vif0.vifpacketsize - ret); - - ProcessMemSkip<0>((vif0.vifpacketsize - ret) << 2, (vif0.cmd & 0xf)); - vif0.tag.size -= (vif0.vifpacketsize - ret); - XMMRegisters::Thaw(); - - return vif0.vifpacketsize; - } - /* size is less that the total size, transfer is 'in pieces' */ - VIFunpack<0>(data, &vif0.tag, vif0.vifpacketsize); - - ProcessMemSkip<0>(vif0.vifpacketsize << 2, (vif0.cmd & 0xf)); - - ret = vif0.vifpacketsize; - vif0.tag.size -= ret; - } - else - { - /* we got all the data, transfer it fully */ - ret = vif0.tag.size; - - //Align data after a split transfer first - if ((vif0Regs->offset != 0) || (vif0.cl != 0)) - { - vif0.tag.size = VIFalign<0>(data, &vif0.tag, vif0.tag.size); - data += ret - vif0.tag.size; - if(vif0.tag.size > 0) VIFunpack<0>(data, &vif0.tag, vif0.tag.size); - } - else - { - VIFunpack<0>(data, &vif0.tag, vif0.tag.size); - } - - vif0.tag.size = 0; - vif0.cmd = 0; - } - - XMMRegisters::Thaw(); - return ret; } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -785,7 +717,6 @@ void vif0Reset() /* Reset the whole VIF, meaning the internal pcsx2 vars and all the registers */ memzero(vif0); memzero(*vif0Regs); - SetNewMask(g_vif0Masks, g_vif0HasMask3, 0, 0xffffffff); psHu64(VIF0_FIFO) = 0; psHu64(VIF0_FIFO + 8) = 0; @@ -795,13 +726,13 @@ void vif0Reset() vif0.done = true; -#if newVif0 resetNewVif(0); -#endif } void SaveStateBase::vif0Freeze() { + static u32 g_vif0Masks[64]; // Dummy Var for saved state compatibility + static u32 g_vif0HasMask3[4]; // Dummy Var for saved state compatibility FreezeTag("VIFdma"); // Dunno if this one is needed, but whatever, it's small. :) @@ -811,6 +742,6 @@ void SaveStateBase::vif0Freeze() Freeze(g_vifmask); Freeze(vif0); - Freeze(g_vif0HasMask3); - Freeze(g_vif0Masks); + Freeze(g_vif0HasMask3); // Not Used Anymore + Freeze(g_vif0Masks); // Not Used Anymore } diff --git a/pcsx2/Vif1Dma.cpp b/pcsx2/Vif1Dma.cpp index 2c716c2841..5cfa00195e 100644 --- a/pcsx2/Vif1Dma.cpp +++ b/pcsx2/Vif1Dma.cpp @@ -24,9 +24,6 @@ #include "VUmicro.h" #include "newVif.h" -__aligned16 u32 g_vif1Masks[64]; -u32 g_vif1HasMask3[4] = {0}; - extern void (*Vif1CMDTLB[82])(); extern int (__fastcall *Vif1TransTLB[128])(u32 *data); @@ -58,10 +55,7 @@ __forceinline void vif1FLUSH() void vif1Init() { - SetNewMask(g_vif1Masks, g_vif1HasMask3, 0, 0xffffffff); -#if newVif1 initNewVif(1); -#endif } static __forceinline void vif1UNPACK(u32 *data) @@ -136,7 +130,6 @@ static int __fastcall Vif1TransNull(u32 *data) // Shouldnt go here static int __fastcall Vif1TransSTMask(u32 *data) // STMASK { - SetNewMask(g_vif1Masks, g_vif1HasMask3, data[0], vif1Regs->mask); vif1Regs->mask = data[0]; VIF_LOG("STMASK == %x", vif1Regs->mask); @@ -318,57 +311,7 @@ static int __fastcall Vif1TransDirectHL(u32 *data) } static int __fastcall Vif1TransUnpack(u32 *data) { -#if newVif1 return nVifUnpack(1, (u8*)data); -#endif - - XMMRegisters::Freeze(); - - if (vif1.vifpacketsize < vif1.tag.size) - { - int ret = vif1.tag.size; - // size is less that the total size, transfer is 'in pieces' - if (vif1Regs->offset != 0 || vif1.cl != 0) - { - vif1.tag.size -= vif1.vifpacketsize - VIFalign<1>(data, &vif1.tag, vif1.vifpacketsize); - ret = ret - vif1.tag.size; - data += ret; - if ((vif1.vifpacketsize - ret) > 0) VIFunpack<1>(data, &vif1.tag, vif1.vifpacketsize - ret); - ProcessMemSkip<1>((vif1.vifpacketsize - ret) << 2, (vif1.cmd & 0xf)); - vif1.tag.size -= (vif1.vifpacketsize - ret); - } - else - { - VIFunpack<1>(data, &vif1.tag, vif1.vifpacketsize); - - ProcessMemSkip<1>(vif1.vifpacketsize << 2, (vif1.cmd & 0xf)); - vif1.tag.size -= vif1.vifpacketsize; - } - - XMMRegisters::Thaw(); - return vif1.vifpacketsize; - } - else - { - int ret = vif1.tag.size; - - if (vif1Regs->offset != 0 || vif1.cl != 0) - { - vif1.tag.size = VIFalign<1>(data, &vif1.tag, vif1.tag.size); - data += ret - vif1.tag.size; - if (vif1.tag.size > 0) VIFunpack<1>(data, &vif1.tag, vif1.tag.size); - } - else - { - /* we got all the data, transfer it fully */ - VIFunpack<1>(data, &vif1.tag, vif1.tag.size); - } - - vif1.tag.size = 0; - vif1.cmd = 0; - XMMRegisters::Thaw(); - return ret; - } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1175,7 +1118,6 @@ void vif1Reset() /* Reset the whole VIF, meaning the internal pcsx2 vars, and all the registers */ memzero(vif1); memzero(*vif1Regs); - SetNewMask(g_vif1Masks, g_vif1HasMask3, 0, 0xffffffff); psHu64(VIF1_FIFO) = 0; psHu64(VIF1_FIFO + 8) = 0; @@ -1186,15 +1128,15 @@ void vif1Reset() vif1.done = true; cpuRegs.interrupt &= ~((1 << 1) | (1 << 10)); //Stop all vif1 DMA's -#if newVif1 resetNewVif(1); -#endif } void SaveStateBase::vif1Freeze() { + static u32 g_vif1Masks[64]; // Dummy Var for saved state compatibility + static u32 g_vif1HasMask3[4]; // Dummy Var for saved state compatibility Freeze(vif1); - Freeze(g_vif1HasMask3); - Freeze(g_vif1Masks); + Freeze(g_vif1HasMask3); // Not Used Anymore + Freeze(g_vif1Masks); // Not Used Anymore } diff --git a/pcsx2/VifDma.cpp b/pcsx2/VifDma.cpp index 865069e43b..a542b213a6 100644 --- a/pcsx2/VifDma.cpp +++ b/pcsx2/VifDma.cpp @@ -19,726 +19,7 @@ #include "VifDma_internal.h" #include "VUmicro.h" -#include -#include - -// Extern variables -extern "C" -{ - // Need cdecl on these for ASM references. - extern VIFregisters *vifRegs; - extern u32* vifMaskRegs; - extern u32* vifRow; -} - int g_vifCycles = 0; -u8 s_maskwrite[256]; - -struct VIFSSEUnpackTable -{ - // regular 0, 1, 2; mask 0, 1, 2 - UNPACKPARTFUNCTYPESSE funcU[9], funcS[9]; -}; - -#define DECL_UNPACK_TABLE_SSE(name, sign) \ -extern "C" { \ - extern int UNPACK_SkippingWrite_##name##_##sign##_Regular_0(u32* dest, u32* data, int dmasize); \ - extern int UNPACK_SkippingWrite_##name##_##sign##_Regular_1(u32* dest, u32* data, int dmasize); \ - extern int UNPACK_SkippingWrite_##name##_##sign##_Regular_2(u32* dest, u32* data, int dmasize); \ - extern int UNPACK_SkippingWrite_##name##_##sign##_Mask_0(u32* dest, u32* data, int dmasize); \ - extern int UNPACK_SkippingWrite_##name##_##sign##_Mask_1(u32* dest, u32* data, int dmasize); \ - extern int UNPACK_SkippingWrite_##name##_##sign##_Mask_2(u32* dest, u32* data, int dmasize); \ - extern int UNPACK_SkippingWrite_##name##_##sign##_WriteMask_0(u32* dest, u32* data, int dmasize); \ - extern int UNPACK_SkippingWrite_##name##_##sign##_WriteMask_1(u32* dest, u32* data, int dmasize); \ - extern int UNPACK_SkippingWrite_##name##_##sign##_WriteMask_2(u32* dest, u32* data, int dmasize); \ -} - -#define _UNPACK_TABLE_SSE(name, sign) \ - UNPACK_SkippingWrite_##name##_##sign##_Regular_0, \ - UNPACK_SkippingWrite_##name##_##sign##_Regular_1, \ - UNPACK_SkippingWrite_##name##_##sign##_Regular_2, \ - UNPACK_SkippingWrite_##name##_##sign##_Mask_0, \ - UNPACK_SkippingWrite_##name##_##sign##_Mask_1, \ - UNPACK_SkippingWrite_##name##_##sign##_Mask_2, \ - UNPACK_SkippingWrite_##name##_##sign##_WriteMask_0, \ - UNPACK_SkippingWrite_##name##_##sign##_WriteMask_1, \ - UNPACK_SkippingWrite_##name##_##sign##_WriteMask_2 \ - -#define _UNPACK_TABLE_SSE_NULL \ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL - -// Main table for function unpacking -DECL_UNPACK_TABLE_SSE(S_32, u); -DECL_UNPACK_TABLE_SSE(S_16, u); -DECL_UNPACK_TABLE_SSE(S_8, u); -DECL_UNPACK_TABLE_SSE(S_16, s); -DECL_UNPACK_TABLE_SSE(S_8, s); - -DECL_UNPACK_TABLE_SSE(V2_32, u); -DECL_UNPACK_TABLE_SSE(V2_16, u); -DECL_UNPACK_TABLE_SSE(V2_8, u); -DECL_UNPACK_TABLE_SSE(V2_16, s); -DECL_UNPACK_TABLE_SSE(V2_8, s); - -DECL_UNPACK_TABLE_SSE(V3_32, u); -DECL_UNPACK_TABLE_SSE(V3_16, u); -DECL_UNPACK_TABLE_SSE(V3_8, u); -DECL_UNPACK_TABLE_SSE(V3_16, s); -DECL_UNPACK_TABLE_SSE(V3_8, s); - -DECL_UNPACK_TABLE_SSE(V4_32, u); -DECL_UNPACK_TABLE_SSE(V4_16, u); -DECL_UNPACK_TABLE_SSE(V4_8, u); -DECL_UNPACK_TABLE_SSE(V4_16, s); -DECL_UNPACK_TABLE_SSE(V4_8, s); -DECL_UNPACK_TABLE_SSE(V4_5, u); - -static const VIFSSEUnpackTable VIFfuncTableSSE[16] = -{ - { _UNPACK_TABLE_SSE(S_32, u), _UNPACK_TABLE_SSE(S_32, u) }, - { _UNPACK_TABLE_SSE(S_16, u), _UNPACK_TABLE_SSE(S_16, s) }, - { _UNPACK_TABLE_SSE(S_8, u), _UNPACK_TABLE_SSE(S_8, s) }, - { _UNPACK_TABLE_SSE_NULL, _UNPACK_TABLE_SSE_NULL }, - - { _UNPACK_TABLE_SSE(V2_32, u), _UNPACK_TABLE_SSE(V2_32, u) }, - { _UNPACK_TABLE_SSE(V2_16, u), _UNPACK_TABLE_SSE(V2_16, s) }, - { _UNPACK_TABLE_SSE(V2_8, u), _UNPACK_TABLE_SSE(V2_8, s) }, - { _UNPACK_TABLE_SSE_NULL, _UNPACK_TABLE_SSE_NULL }, - - { _UNPACK_TABLE_SSE(V3_32, u), _UNPACK_TABLE_SSE(V3_32, u) }, - { _UNPACK_TABLE_SSE(V3_16, u), _UNPACK_TABLE_SSE(V3_16, s) }, - { _UNPACK_TABLE_SSE(V3_8, u), _UNPACK_TABLE_SSE(V3_8, s) }, - { _UNPACK_TABLE_SSE_NULL, _UNPACK_TABLE_SSE_NULL }, - - { _UNPACK_TABLE_SSE(V4_32, u), _UNPACK_TABLE_SSE(V4_32, u) }, - { _UNPACK_TABLE_SSE(V4_16, u), _UNPACK_TABLE_SSE(V4_16, s) }, - { _UNPACK_TABLE_SSE(V4_8, u), _UNPACK_TABLE_SSE(V4_8, s) }, - { _UNPACK_TABLE_SSE(V4_5, u), _UNPACK_TABLE_SSE(V4_5, u) }, -}; - -void vifDmaInit() -{ -} - -template void ProcessMemSkip<0>(u32 size, u32 unpackType); -template void ProcessMemSkip<1>(u32 size, u32 unpackType); -template void ProcessMemSkip(u32 size, u32 unpackType) -{ - const VIFUnpackFuncTable *unpack; - - // unpackType is only 0->0xf but that's ok, because the data we're using here is - // just duplicated in 0x10->0x1f. - - unpack = &VIFfuncTable[ unpackType ]; - - switch (unpackType) - { - case 0x0: - vif->tag.addr += (size / unpack->gsize) * 16; - VIFUNPACK_LOG("Processing S-32 skip, size = %d", size); - break; - case 0x1: - vif->tag.addr += (size / unpack->gsize) * 16; - VIFUNPACK_LOG("Processing S-16 skip, size = %d", size); - break; - case 0x2: - vif->tag.addr += (size / unpack->gsize) * 16; - VIFUNPACK_LOG("Processing S-8 skip, size = %d", size); - break; - case 0x4: - vif->tag.addr += (size / unpack->gsize) * 16; - VIFUNPACK_LOG("Processing V2-32 skip, size = %d", size); - break; - case 0x5: - vif->tag.addr += (size / unpack->gsize) * 16; - VIFUNPACK_LOG("Processing V2-16 skip, size = %d", size); - break; - case 0x6: - vif->tag.addr += (size / unpack->gsize) * 16; - VIFUNPACK_LOG("Processing V2-8 skip, size = %d", size); - break; - case 0x8: - vif->tag.addr += (size / unpack->gsize) * 16; - VIFUNPACK_LOG("Processing V3-32 skip, size = %d", size); - break; - case 0x9: - vif->tag.addr += (size / unpack->gsize) * 16; - VIFUNPACK_LOG("Processing V3-16 skip, size = %d", size); - break; - case 0xA: - vif->tag.addr += (size / unpack->gsize) * 16; - VIFUNPACK_LOG("Processing V3-8 skip, size = %d", size); - break; - case 0xC: - vif->tag.addr += size; - VIFUNPACK_LOG("Processing V4-32 skip, size = %d, CL = %d, WL = %d", size, vifRegs->cycle.cl, vifRegs->cycle.wl); - break; - case 0xD: - vif->tag.addr += (size / unpack->gsize) * 16; - VIFUNPACK_LOG("Processing V4-16 skip, size = %d", size); - break; - case 0xE: - vif->tag.addr += (size / unpack->gsize) * 16; - VIFUNPACK_LOG("Processing V4-8 skip, size = %d", size); - break; - case 0xF: - vif->tag.addr += (size / unpack->gsize) * 16; - VIFUNPACK_LOG("Processing V4-5 skip, size = %d", size); - break; - default: - Console.WriteLn("Invalid unpack type %x", unpackType); - break; - } - - //Append any skips in to the equation - - if (vifRegs->cycle.cl > vifRegs->cycle.wl) - { - VIFUNPACK_LOG("Old addr %x CL %x WL %x", vif->tag.addr, vifRegs->cycle.cl, vifRegs->cycle.wl); - vif->tag.addr += (size / (unpack->gsize*vifRegs->cycle.wl)) * ((vifRegs->cycle.cl - vifRegs->cycle.wl)*16); - VIFUNPACK_LOG("New addr %x CL %x WL %x", vif->tag.addr, vifRegs->cycle.cl, vifRegs->cycle.wl); - } - - //This is sorted out later - if ((vif->tag.addr & 0xf) != (vifRegs->offset * 4)) - { - VIFUNPACK_LOG("addr aligned to %x", vif->tag.addr); - vif->tag.addr = (vif->tag.addr & ~0xf) + (vifRegs->offset * 4); - } - - if (vif->tag.addr >= (u32)vif_size(VIFdmanum)) - { - vif->tag.addr &= (u32)(vif_size(VIFdmanum) - 1); - } -} - -template u32 VIFalign<0>(u32 *data, vifCode *v, u32 size); -template u32 VIFalign<1>(u32 *data, vifCode *v, u32 size); -template u32 VIFalign(u32 *data, vifCode *v, u32 size) -{ - u32 *dest; - VURegs * VU; - u8 *cdata = (u8*)data; - - u32 memsize = vif_size(VIFdmanum); - - if (VIFdmanum == 0) - { - VU = &VU0; - vifRegs = vif0Regs; - vifMaskRegs = g_vif0Masks; - vif = &vif0; - vifRow = g_vifmask.Row0; - } - else - { - VU = &VU1; - vifRegs = vif1Regs; - vifMaskRegs = g_vif1Masks; - vif = &vif1; - vifRow = g_vifmask.Row1; - } - pxAssume(v->addr < memsize); - - dest = (u32*)(VU->Mem + v->addr); - - VIF_LOG("VIF%d UNPACK Align: Mode=%x, v->size=%d, size=%d, v->addr=%x v->num=%x", - VIFdmanum, v->cmd & 0xf, v->size, size, v->addr, vifRegs->num); - - const VIFUnpackFuncTable& ft( VIFfuncTable[ v->cmd & 0x1f ] ); - UNPACKFUNCTYPE func = vif->usn ? ft.funcU : ft.funcS; - - size <<= 2; - memsize = size; - - if(vifRegs->offset != 0) - { - int unpacksize; - - //This is just to make sure the alignment isn't loopy on a split packet - if(vifRegs->offset != ((vif->tag.addr & 0xf) >> 2)) - { - DevCon.Error("Warning: Unpack alignment error"); - } - - VIFUNPACK_LOG("Aligning packet size = %d offset %d addr %x", size, vifRegs->offset, vif->tag.addr); - - if (((u32)size / (u32)ft.dsize) < ((u32)ft.qsize - vifRegs->offset)) - { - DevCon.Error("Wasn't enough left size/dsize = %x left to write %x", (size / ft.dsize), (ft.qsize - vifRegs->offset)); - } - unpacksize = min((size / ft.dsize), (ft.qsize - vifRegs->offset)); - - - VIFUNPACK_LOG("Increasing dest by %x from offset %x", (4 - ft.qsize) + unpacksize, vifRegs->offset); - - (vif->usn ? ft.oddU : ft.oddS)(dest, (u32*)cdata, unpacksize); - size -= unpacksize * ft.dsize; - - if(vifRegs->offset == 0) - { - vifRegs->num--; - ++vif->cl; - } - else - { - DevCon.Warning("Offset = %x", vifRegs->offset); - vif->tag.addr += unpacksize * 4; - return size>>2; - } - - if (vif->cl == vifRegs->cycle.wl) - { - if (vifRegs->cycle.cl != vifRegs->cycle.wl) - { - vif->tag.addr += (((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + ((4 - ft.qsize) + unpacksize)) * 4; - dest += ((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + (4 - ft.qsize) + unpacksize; - } - else - { - vif->tag.addr += ((4 - ft.qsize) + unpacksize) * 4; - dest += (4 - ft.qsize) + unpacksize; - } - - if (vif->tag.addr >= (u32)vif_size(VIFdmanum)) - { - vif->tag.addr &= (u32)(vif_size(VIFdmanum) - 1); - dest = (u32*)(VU->Mem + v->addr); - } - - cdata += unpacksize * ft.dsize; - vif->cl = 0; - VIFUNPACK_LOG("Aligning packet done size = %d offset %d addr %x", size, vifRegs->offset, vif->tag.addr); - if ((size & 0xf) == 0) return size >> 2; - - } - else - { - vif->tag.addr += ((4 - ft.qsize) + unpacksize) * 4; - dest += (4 - ft.qsize) + unpacksize; - - if (vif->tag.addr >= (u32)vif_size(VIFdmanum)) - { - vif->tag.addr &= (u32)(vif_size(VIFdmanum) - 1); - dest = (u32*)(VU->Mem + v->addr); - } - - cdata += unpacksize * ft.dsize; - VIFUNPACK_LOG("Aligning packet done size = %d offset %d addr %x", size, vifRegs->offset, vif->tag.addr); - } - } - - if (vif->cl != 0 || (size & 0xf)) //Check alignment for SSE unpacks - { - int incdest; - - if (vifRegs->cycle.cl >= vifRegs->cycle.wl) // skipping write - { - if (vif->tag.addr >= (u32)vif_size(VIFdmanum)) - { - vif->tag.addr &= (u32)(vif_size(VIFdmanum) - 1); - dest = (u32*)(VU->Mem + v->addr); - } - // continuation from last stream - VIFUNPACK_LOG("Continuing last stream size = %d offset %d addr %x", size, vifRegs->offset, vif->tag.addr); - incdest = ((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + 4; - - while ((size >= ft.gsize) && (vifRegs->num > 0)) - { - func(dest, (u32*)cdata); - cdata += ft.gsize; - size -= ft.gsize; - - vifRegs->num--; - ++vif->cl; - if (vif->cl == vifRegs->cycle.wl) - { - dest += incdest; - vif->tag.addr += incdest * 4; - - vif->cl = 0; - if ((size & 0xf) == 0) break; - } - else - { - dest += 4; - vif->tag.addr += 16; - } - - if (vif->tag.addr >= (u32)vif_size(VIFdmanum)) - { - vif->tag.addr &= (u32)(vif_size(VIFdmanum) - 1); - dest = (u32*)(VU->Mem + v->addr); - } - } - - if(vifRegs->mode == 2) - { - //Update the reg rows for SSE - vifRow = VIFdmanum ? g_vifmask.Row1 : g_vifmask.Row0; - vifRow[0] = vifRegs->r0; - vifRow[1] = vifRegs->r1; - vifRow[2] = vifRegs->r2; - vifRow[3] = vifRegs->r3; - } - - } - if (size >= ft.dsize && vifRegs->num > 0 && ((size & 0xf) != 0 || vif->cl != 0)) - { - //VIF_LOG("warning, end with size = %d", size); - /* unpack one qword */ - if(vif->tag.addr + ((size / ft.dsize) * 4) >= (u32)vif_size(VIFdmanum)) - { - //DevCon.Warning("Overflow"); - vif->tag.addr &= (u32)(vif_size(VIFdmanum) - 1); - dest = (u32*)(VU->Mem + v->addr); - } - - vif->tag.addr += (size / ft.dsize) * 4; - - (vif->usn ? ft.oddU : ft.oddS)(dest, (u32*)cdata, size / ft.dsize); - size = 0; - - if(vifRegs->mode == 2) - { - //Update the reg rows for SSE - vifRow[0] = vifRegs->r0; - vifRow[1] = vifRegs->r1; - vifRow[2] = vifRegs->r2; - vifRow[3] = vifRegs->r3; - } - VIFUNPACK_LOG("leftover done, size %d, vifnum %d, addr %x", size, vifRegs->num, vif->tag.addr); - } - } - return size>>2; -} -#include "newVif.h" -#if !newVif -template void VIFunpack<0>(u32 *data, vifCode *v, u32 size); -template void VIFunpack<1>(u32 *data, vifCode *v, u32 size); -template void VIFunpack(u32 *data, vifCode *v, u32 size) -{ - //DevCon.WriteLn("vif#%d, size = %d [%x]", VIFdmanum, size, data); - u32 *dest; - VURegs * VU; - u8 *cdata = (u8*)data; - u32 tempsize = 0; - const u32 memlimit = vif_size(VIFdmanum); - - pxDebugCode( u32 memsize = memlimit ); - - _mm_prefetch((char*)data, _MM_HINT_NTA); - - if (VIFdmanum == 0) - { - VU = &VU0; - vifRegs = vif0Regs; - vifMaskRegs = g_vif0Masks; - vif = &vif0; - vifRow = g_vifmask.Row0; - pxDebugCode( pxAssume(v->addr < memsize) ); - } - else - { - - VU = &VU1; - vifRegs = vif1Regs; - vifMaskRegs = g_vif1Masks; - vif = &vif1; - vifRow = g_vifmask.Row1; - pxDebugCode( pxAssume(v->addr < memsize) ); - } - - dest = (u32*)(VU->Mem + v->addr); - - VIF_LOG("VIF%d UNPACK: Mode=%x, v->size=%d, size=%d, v->addr=%x v->num=%x", - VIFdmanum, v->cmd & 0xf, v->size, size, v->addr, vifRegs->num); - - VIFUNPACK_LOG("USN %x Masking %x Mask %x Mode %x CL %x WL %x Offset %x", vif->usn, (vifRegs->code & 0x10000000) >> 28, vifRegs->mask, vifRegs->mode, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->offset); - - _mm_prefetch((char*)data + 128, _MM_HINT_NTA); - - const VIFUnpackFuncTable& ft( VIFfuncTable[ v->cmd & 0x1f ] ); - UNPACKFUNCTYPE func = vif->usn ? ft.funcU : ft.funcS; - - size <<= 2; - - pxDebugCode( memsize = size ); - - if (vifRegs->cycle.cl >= vifRegs->cycle.wl) // skipping write - { - if (v->addr >= memlimit) - { - //DevCon.Warning("Overflown at the start"); - v->addr &= (memlimit - 1); - dest = (u32*)(VU->Mem + v->addr); - } - - size = std::min(size, vifRegs->num * ft.gsize); //size will always be the same or smaller - - tempsize = vif->tag.addr + ((((vifRegs->num-1) / vifRegs->cycle.wl) * - (vifRegs->cycle.cl - vifRegs->cycle.wl)) * 16) + (vifRegs->num * 16); - - /*tempsize = vif->tag.addr + (((size / (ft.gsize * vifRegs->cycle.wl)) * - (vifRegs->cycle.cl - vifRegs->cycle.wl)) * 16) + (vifRegs->num * 16);*/ - - //Sanity Check (memory overflow) - if (tempsize > memlimit) - { - if (((vifRegs->cycle.cl != vifRegs->cycle.wl) && - ((memlimit + (vifRegs->cycle.cl - vifRegs->cycle.wl) * 16) == tempsize))) - { - //It's a red herring, so ignore it! SSE unpacks will be much quicker. - tempsize = 0; - } - else - { - //DevCon.Warning("VIF%x Unpack ending %x > %x", VIFdmanum, tempsize, VIFdmanum ? 0x4000 : 0x1000); - tempsize = size; - size = 0; - } - } - else - { -#ifndef NON_SSE_UNPACKS - tempsize = 0; -#else - tempsize = size; - size = 0; -#endif - } - - if (size >= ft.gsize) - { - const UNPACKPARTFUNCTYPESSE* pfn; - int writemask; - u32 oldcycle = -1; - - // yay evil .. let's just set some XMM registers in the middle of C code - // and "hope" they get preserved, in spite of the fact that x86-32 ABI specifies - // these as "clobberable" registers (so any printf or something could decide to - // clobber them, and has every right to... >_<) --air - -#ifdef _MSC_VER - if (VIFdmanum) - { - __asm movaps XMM_ROW, xmmword ptr [g_vifmask.Row1] - __asm movaps XMM_COL, xmmword ptr [g_vifmask.Col1] - } - else - { - __asm movaps XMM_ROW, xmmword ptr [g_vifmask.Row0] - __asm movaps XMM_COL, xmmword ptr [g_vifmask.Col0] - } -#else - // I'd add volatile to these, but what's the point? This code already breaks - // like 5000 coveted rules of binary interfacing regardless, and is only working by - // the miracles and graces of a profound deity (or maybe it doesn't -- linux port - // *does* have stability issues, especially in GCC 4.4). --air - if (VIFdmanum) - { - __asm__(".intel_syntax noprefix\n" - "movaps xmm6, xmmword ptr [%[Row1]]\n" - "movaps xmm7, xmmword ptr [%[Col1]]\n" - ".att_syntax\n" : : [Row1]"r"(g_vifmask.Row1), [Col1]"r"(g_vifmask.Col1)); - } - else - { - __asm__(".intel_syntax noprefix\n" - "movaps xmm6, xmmword ptr [%[Row0]]\n" - "movaps xmm7, xmmword ptr [%[Col0]]\n" - ".att_syntax\n" : : [Row0]"r"(g_vifmask.Row0), [Col0]"r"(g_vifmask.Col0)); - } -#endif - - if ((vifRegs->cycle.cl == 0) || (vifRegs->cycle.wl == 0) || - ((vifRegs->cycle.cl == vifRegs->cycle.wl) && !(vifRegs->code & 0x10000000))) - { - oldcycle = *(u32*) & vifRegs->cycle; - vifRegs->cycle.cl = vifRegs->cycle.wl = 1; - } - - pfn = vif->usn ? VIFfuncTableSSE[v->cmd & 0xf].funcU : VIFfuncTableSSE[v->cmd & 0xf].funcS; - writemask = VIFdmanum ? g_vif1HasMask3[min(vifRegs->cycle.wl,(u8)3)] : g_vif0HasMask3[min(vifRegs->cycle.wl,(u8)3)]; - writemask = pfn[(((vifRegs->code & 0x10000000)>>28)<mode](dest, (u32*)cdata, size); - - if (oldcycle != -1) *(u32*)&vifRegs->cycle = oldcycle; - - if(vifRegs->mode == 2) - { - //Update the reg rows for non SSE - vifRegs->r0 = vifRow[0]; - vifRegs->r1 = vifRow[1]; - vifRegs->r2 = vifRow[2]; - vifRegs->r3 = vifRow[3]; - } - - // if size is left over, update the src,dst pointers - if (writemask > 0) - { - int left = (size - writemask) / ft.gsize; - cdata += left * ft.gsize; - dest = (u32*)((u8*)dest + ((left / vifRegs->cycle.wl) * vifRegs->cycle.cl + left % vifRegs->cycle.wl) * 16); - vifRegs->num -= left; - vif->cl = (size % (ft.gsize * vifRegs->cycle.wl)) / ft.gsize; - size = writemask; - - if (size >= ft.dsize && vifRegs->num > 0) - { - VIF_LOG("warning, end with size = %d", size); - - /* unpack one qword */ - //vif->tag.addr += (size / ft.dsize) * 4; - (vif->usn ? ft.oddU : ft.oddS)(dest, (u32*)cdata, size / ft.dsize); - size = 0; - - if(vifRegs->mode == 2) - { - //Update the reg rows for SSE - vifRow[0] = vifRegs->r0; - vifRow[1] = vifRegs->r1; - vifRow[2] = vifRegs->r2; - vifRow[3] = vifRegs->r3; - } - VIFUNPACK_LOG("leftover done, size %d, vifnum %d, addr %x", size, vifRegs->num, vif->tag.addr); - } - } - else - { - vifRegs->num -= size / ft.gsize; - if (vifRegs->num > 0) vif->cl = (size % (ft.gsize * vifRegs->cycle.wl)) / ft.gsize; - size = 0; - } - } - else if(tempsize) - { - int incdest = ((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + 4; - size = 0; - int addrstart = v->addr; - - #ifndef NON_SSE_UNPACKS // spams pointlessly when SSE unpacks are disabled - //if((tempsize >> 2) != vif->tag.size) DevCon.Warning("split when size != tagsize"); - #endif - - VIFUNPACK_LOG("sorting tempsize :p, size %d, vifnum %d, addr %x", tempsize, vifRegs->num, vif->tag.addr); - - while ((tempsize >= ft.gsize) && (vifRegs->num > 0)) - { - if(v->addr >= memlimit) - { - DevCon.Warning("Mem limit overflow"); - v->addr &= (memlimit - 1); - dest = (u32*)(VU->Mem + v->addr); - } - - func(dest, (u32*)cdata); - cdata += ft.gsize; - tempsize -= ft.gsize; - - vifRegs->num--; - ++vif->cl; - - if (vif->cl == vifRegs->cycle.wl) - { - dest += incdest; - v->addr += (incdest * 4); - vif->cl = 0; - } - else - { - dest += 4; - v->addr += 16; - } - } - - if (vifRegs->mode == 2) - { - //Update the reg rows for SSE - vifRow[0] = vifRegs->r0; - vifRow[1] = vifRegs->r1; - vifRow[2] = vifRegs->r2; - vifRow[3] = vifRegs->r3; - } - - if (v->addr >= memlimit) - { - v->addr &= (memlimit - 1); - dest = (u32*)(VU->Mem + v->addr); - } - - v->addr = addrstart; - if(tempsize > 0) size = tempsize; - } - - if (size >= ft.dsize && vifRegs->num > 0) //Else write what we do have - { - VIF_LOG("warning, end with size = %d", size); - - /* unpack one qword */ - //vif->tag.addr += (size / ft.dsize) * 4; - (vif->usn ? ft.oddU : ft.oddS)(dest, (u32*)cdata, size / ft.dsize); - size = 0; - - if(vifRegs->mode == 2) - { - //Update the reg rows for SSE - vifRow[0] = vifRegs->r0; - vifRow[1] = vifRegs->r1; - vifRow[2] = vifRegs->r2; - vifRow[3] = vifRegs->r3; - } - VIFUNPACK_LOG("leftover done, size %d, vifnum %d, addr %x", size, vifRegs->num, vif->tag.addr); - } - } - else /* filling write */ - { - - if(vifRegs->cycle.cl > 0) // Quicker and avoids zero division :P - if((u32)(((size / ft.gsize) / vifRegs->cycle.cl) * vifRegs->cycle.wl) < vifRegs->num) - DevCon.Warning("Filling write warning! %x < %x and CL = %x WL = %x", (size / ft.gsize), vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl); - - //DevCon.Warning("filling write %d cl %d, wl %d mask %x mode %x unpacktype %x addr %x", vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->mask, vifRegs->mode, unpackType, vif->tag.addr); - while (vifRegs->num > 0) - { - if (vif->cl == vifRegs->cycle.wl) - { - vif->cl = 0; - } - - if (vif->cl < vifRegs->cycle.cl) /* unpack one qword */ - { - if(size < ft.gsize) - { - VIF_LOG("Out of Filling write data"); - break; - } - - func(dest, (u32*)cdata); - cdata += ft.gsize; - size -= ft.gsize; - - vif->cl++; - vifRegs->num--; - - if (vif->cl == vifRegs->cycle.wl) - { - vif->cl = 0; - } - } - else - { - func(dest, (u32*)cdata); - vif->tag.addr += 16; - vifRegs->num--; - ++vif->cl; - - } - dest += 4; - if (vifRegs->num == 0) break; - } - } -} -#endif // #if !newVif template void vuExecMicro<0>(u32 addr); template void vuExecMicro<1>(u32 addr); @@ -746,13 +27,11 @@ template void vuExecMicro(u32 addr) { VURegs * VU; - if (VIFdmanum == 0) - { + if (VIFdmanum == 0) { VU = &VU0; vif0FLUSH(); } - else - { + else { VU = &VU1; vif1FLUSH(); } @@ -768,22 +47,18 @@ template void vuExecMicro(u32 addr) VU->vifRegs->top = VU->vifRegs->tops & 0x3ff; /* is DBF flag set in VIF_STAT? */ - if (VU->vifRegs->stat.DBF) - { + if (VU->vifRegs->stat.DBF) { /* it is, so set tops with base, and clear the stat DBF flag */ VU->vifRegs->tops = VU->vifRegs->base; VU->vifRegs->stat.DBF = false; } - else - { + else { /* it is not, so set tops with base + offset, and set stat DBF flag */ VU->vifRegs->tops = VU->vifRegs->base + VU->vifRegs->ofst; VU->vifRegs->stat.DBF = true; } } - if (VIFdmanum == 0) - vu0ExecMicro(addr); - else - vu1ExecMicro(addr); + if (!VIFdmanum) vu0ExecMicro(addr); + else vu1ExecMicro(addr); } diff --git a/pcsx2/VifDma.h b/pcsx2/VifDma.h index 840670a322..0b6c58737a 100644 --- a/pcsx2/VifDma.h +++ b/pcsx2/VifDma.h @@ -47,8 +47,6 @@ extern vifStruct vif0, vif1; extern u8 schedulepath3msk; static const int VifCycleVoodoo = 4; -extern void vifDmaInit(); - extern void vif0Init(); extern void vif0Interrupt(); extern void vif0Write32(u32 mem, u32 value); diff --git a/pcsx2/VifDma_internal.h b/pcsx2/VifDma_internal.h index ba046dd64a..991d924780 100644 --- a/pcsx2/VifDma_internal.h +++ b/pcsx2/VifDma_internal.h @@ -65,36 +65,16 @@ struct VIFUnpackFuncTable extern const __aligned16 VIFUnpackFuncTable VIFfuncTable[32]; -extern __aligned16 u32 g_vif0Masks[64], g_vif1Masks[64]; -extern u32 g_vif0HasMask3[4], g_vif1HasMask3[4]; extern int g_vifCycles; -extern u8 s_maskwrite[256]; extern vifStruct *vif; -template void ProcessMemSkip(u32 size, u32 unpackType); -template u32 VIFalign(u32 *data, vifCode *v, u32 size); template void VIFunpack(u32 *data, vifCode *v, u32 size); template void vuExecMicro(u32 addr); extern void vif0FLUSH(); extern void vif1FLUSH(); -static __forceinline u32 vif_size(u8 num) -{ - return (num == 0) ? 0x1000 : 0x4000; -} - -// All defines are enabled with '1' or disabled with '0' - -#define newVif 1 // Enable 'newVif' Code (if the below macros are not defined, it will use old non-sse code) -#define newVif1 1 // Use New Code for Vif1 Unpacks (needs newVif defined) -#define newVif0 1 // Use New Code for Vif0 Unpacks (needs newVif defined) - -#if newVif extern int nVifUnpack (int idx, u8 *data); extern void initNewVif (int idx); extern void resetNewVif(int idx); -#else -//# define NON_SSE_UNPACKS // Turns off SSE Unpacks (slower) -#endif #endif diff --git a/pcsx2/windows/VCprojects/pcsx2_2008.vcproj b/pcsx2/windows/VCprojects/pcsx2_2008.vcproj index fe3c53837e..0fcbc28ad4 100644 --- a/pcsx2/windows/VCprojects/pcsx2_2008.vcproj +++ b/pcsx2/windows/VCprojects/pcsx2_2008.vcproj @@ -828,18 +828,6 @@ RelativePath="..\..\VIFunpack.cpp" > - - - - - - diff --git a/pcsx2/x86/VifUnpackSSE.cpp b/pcsx2/x86/VifUnpackSSE.cpp index 3117590278..075a477868 100644 --- a/pcsx2/x86/VifUnpackSSE.cpp +++ b/pcsx2/x86/VifUnpackSSE.cpp @@ -16,8 +16,6 @@ #include "PrecompiledHeader.h" #include "VifUnpackSSE.h" -#if newVif - #define xMOV8(regX, loc) xMOVSSZX(regX, loc) #define xMOV16(regX, loc) xMOVSSZX(regX, loc) #define xMOV32(regX, loc) xMOVSSZX(regX, loc) @@ -38,6 +36,30 @@ void mergeVectors(int dest, int src, int temp, int xyzw) { } } +// Loads Row/Col Data from vifRegs instead of g_vifmask +// Useful for testing vifReg and g_vifmask inconsistency. +void loadRowCol(nVifStruct& v) { + xMOVAPS(xmm0, ptr32[&v.vifRegs->r0]); + xMOVAPS(xmm1, ptr32[&v.vifRegs->r1]); + xMOVAPS(xmm2, ptr32[&v.vifRegs->r2]); + xMOVAPS(xmm6, ptr32[&v.vifRegs->r3]); + xPSHUF.D(xmm0, xmm0, _v0); + xPSHUF.D(xmm1, xmm1, _v0); + xPSHUF.D(xmm2, xmm2, _v0); + xPSHUF.D(xmm6, xmm6, _v0); + mVUmergeRegs(XMM6, XMM0, 8); + mVUmergeRegs(XMM6, XMM1, 4); + mVUmergeRegs(XMM6, XMM2, 2); + xMOVAPS(xmm2, ptr32[&v.vifRegs->c0]); + xMOVAPS(xmm3, ptr32[&v.vifRegs->c1]); + xMOVAPS(xmm4, ptr32[&v.vifRegs->c2]); + xMOVAPS(xmm5, ptr32[&v.vifRegs->c3]); + xPSHUF.D(xmm2, xmm2, _v0); + xPSHUF.D(xmm3, xmm3, _v0); + xPSHUF.D(xmm4, xmm4, _v0); + xPSHUF.D(xmm5, xmm5, _v0); +} + // ===================================================================================================== // VifUnpackSSE_Base Section // ===================================================================================================== @@ -286,5 +308,3 @@ void VifUnpackSSE_Init() HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadOnly, true); } - -#endif diff --git a/pcsx2/x86/VifUnpackSSE.h b/pcsx2/x86/VifUnpackSSE.h index a3f59a61f6..d8ea2b38fe 100644 --- a/pcsx2/x86/VifUnpackSSE.h +++ b/pcsx2/x86/VifUnpackSSE.h @@ -24,9 +24,8 @@ using namespace x86Emitter; -#if newVif - extern void mergeVectors(int dest, int src, int temp, int xyzw); +extern void loadRowCol(nVifStruct& v); // -------------------------------------------------------------------------------------- // VifUnpackSSE_Base @@ -143,4 +142,4 @@ protected: return fillingWrite; } }; -#endif + diff --git a/pcsx2/x86/VifUnpackSSE_Dynarec.cpp b/pcsx2/x86/VifUnpackSSE_Dynarec.cpp index b16b87a9f1..d27b153413 100644 --- a/pcsx2/x86/VifUnpackSSE_Dynarec.cpp +++ b/pcsx2/x86/VifUnpackSSE_Dynarec.cpp @@ -20,8 +20,6 @@ #include "PrecompiledHeader.h" #include "VifUnpackSSE.h" -#if newVif - static __aligned16 nVifBlock _vBlock = {0}; static __pagealigned u8 nVifMemCmp[__pagesize]; @@ -39,30 +37,6 @@ void dVifClose(int idx) { safe_delete(nVif[idx].vifBlocks); } -// Loads Row/Col Data from vifRegs instead of g_vifmask -// Useful for testing vifReg and g_vifmask inconsistency. -static void loadRowCol(nVifStruct& v) { - xMOVAPS(xmm0, ptr32[&v.vifRegs->r0]); - xMOVAPS(xmm1, ptr32[&v.vifRegs->r1]); - xMOVAPS(xmm2, ptr32[&v.vifRegs->r2]); - xMOVAPS(xmm6, ptr32[&v.vifRegs->r3]); - xPSHUF.D(xmm0, xmm0, _v0); - xPSHUF.D(xmm1, xmm1, _v0); - xPSHUF.D(xmm2, xmm2, _v0); - xPSHUF.D(xmm6, xmm6, _v0); - mVUmergeRegs(XMM6, XMM0, 8); - mVUmergeRegs(XMM6, XMM1, 4); - mVUmergeRegs(XMM6, XMM2, 2); - xMOVAPS(xmm2, ptr32[&v.vifRegs->c0]); - xMOVAPS(xmm3, ptr32[&v.vifRegs->c1]); - xMOVAPS(xmm4, ptr32[&v.vifRegs->c2]); - xMOVAPS(xmm5, ptr32[&v.vifRegs->c3]); - xPSHUF.D(xmm2, xmm2, _v0); - xPSHUF.D(xmm3, xmm3, _v0); - xPSHUF.D(xmm4, xmm4, _v0); - xPSHUF.D(xmm5, xmm5, _v0); -} - VifUnpackSSE_Dynarec::VifUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlock& vifBlock_) : v(vif_) , vB(vifBlock_) @@ -291,5 +265,3 @@ _f void dVifUnpack(int idx, u8 *data, u32 size, bool isFill) { // the interpreter unpacker though, so a recursive call is the safest way here... dVifUnpack(idx, data, size, isFill); } - -#endif diff --git a/pcsx2/x86/aVif.S b/pcsx2/x86/aVif.S deleted file mode 100644 index 3263d0bd29..0000000000 --- a/pcsx2/x86/aVif.S +++ /dev/null @@ -1,1607 +0,0 @@ -/* PCSX2 - PS2 Emulator for PCs - * Copyright (C) 2002-2009 PCSX2 Dev Team - * - * PCSX2 is free software: you can redistribute it and/or modify it under the terms - * of the GNU Lesser General Public License as published by the Free Software Found- - * ation, either version 3 of the License, or (at your option) any later version. - * - * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with PCSX2. - * If not, see . - */ - -.intel_syntax noprefix - -.extern vifRegs -.extern vifMaskRegs -.extern vifRow - -#define VIF_ESP esp -#define VIF_SRC esi -#define VIF_INC ecx -#define VIF_DST edi -#define VIF_SIZE edx -#define VIF_TMPADDR eax -#define VIF_SAVEEBX ebx -#define VIF_SAVEEBXd ebx - -#define XMM_R0 xmm0 -#define XMM_R1 xmm1 -#define XMM_R2 xmm2 -#define XMM_WRITEMASK xmm3 -#define XMM_ROWMASK xmm4 -#define XMM_ROWCOLMASK xmm5 -#define XMM_ROW xmm6 -#define XMM_COL xmm7 - -#define XMM_R3 XMM_COL - -// writing masks -#define UNPACK_Write0_Regular(r0, CL, DEST_OFFSET, MOVDQA) \ - MOVDQA xmmword ptr [VIF_DST+DEST_OFFSET], r0; - -#define UNPACK_Write1_Regular(r0, CL, DEST_OFFSET, MOVDQA) \ - MOVDQA xmmword ptr [VIF_DST], r0; \ - add VIF_DST, VIF_INC; \ - -#define UNPACK_Write0_Mask UNPACK_Write0_Regular -#define UNPACK_Write1_Mask UNPACK_Write1_Regular - -// masked write (dest needs to be in edi) -#define UNPACK_Write0_WriteMask(r0, CL, DEST_OFFSET, MOVDQA) \ - movdqa XMM_WRITEMASK, xmmword ptr [VIF_TMPADDR + 64*(CL) + 48]; \ - pand r0, XMM_WRITEMASK; \ - pandn XMM_WRITEMASK, xmmword ptr [VIF_DST]; \ - por r0, XMM_WRITEMASK; \ - MOVDQA xmmword ptr [VIF_DST], r0; \ - add VIF_DST, 16; \ - -// masked write (dest needs to be in edi) -#define UNPACK_Write1_WriteMask(r0, CL, DEST_OFFSET, MOVDQA) \ - movdqa XMM_WRITEMASK, xmmword ptr [VIF_TMPADDR + 64*(0) + 48]; \ - pand r0, XMM_WRITEMASK; \ - pandn XMM_WRITEMASK, xmmword ptr [VIF_DST]; \ - por r0, XMM_WRITEMASK; \ - MOVDQA xmmword ptr [VIF_DST], r0; \ - add VIF_DST, VIF_INC; \ - -#define UNPACK_Mask_SSE_0(r0) \ - pand r0, XMM_WRITEMASK; \ - por r0, XMM_ROWCOLMASK; \ - -// once a xmmword is uncomprssed, applies masks and saves -// note: modifying XMM_WRITEMASK -// dest = row + write (only when mask=0), otherwise write -#define UNPACK_Mask_SSE_1(r0) \ - pand r0, XMM_WRITEMASK; \ - por r0, XMM_ROWCOLMASK; \ - pand XMM_WRITEMASK, XMM_ROW; \ - paddd r0, XMM_WRITEMASK; \ - -// dest = row + write (only when mask=0), otherwise write -// row = row + write (only when mask = 0), otherwise row -#define UNPACK_Mask_SSE_2(r0) \ - pand r0, XMM_WRITEMASK; \ - pand XMM_WRITEMASK, XMM_ROW; \ - paddd XMM_ROW, r0; \ - por r0, XMM_ROWCOLMASK; \ - paddd r0, XMM_WRITEMASK; \ - -#define UNPACK_WriteMask_SSE_0 UNPACK_Mask_SSE_0 -#define UNPACK_WriteMask_SSE_1 UNPACK_Mask_SSE_1 -#define UNPACK_WriteMask_SSE_2 UNPACK_Mask_SSE_2 - -#define UNPACK_Regular_SSE_0(r0) - -#define UNPACK_Regular_SSE_1(r0) \ - paddd r0, XMM_ROW; \ - -#define UNPACK_Regular_SSE_2(r0) \ - paddd r0, XMM_ROW; \ - movdqa XMM_ROW, r0; \ - -// setting up masks -#define UNPACK_Setup_Mask_SSE(CL) \ - mov VIF_TMPADDR, vifMaskRegs; \ - movdqa XMM_ROWMASK, xmmword ptr [VIF_TMPADDR + 64*(CL) + 16]; \ - movdqa XMM_ROWCOLMASK, xmmword ptr [VIF_TMPADDR + 64*(CL) + 32]; \ - movdqa XMM_WRITEMASK, xmmword ptr [VIF_TMPADDR + 64*(CL)]; \ - pand XMM_ROWMASK, XMM_ROW; \ - pand XMM_ROWCOLMASK, XMM_COL; \ - por XMM_ROWCOLMASK, XMM_ROWMASK; \ - -#define UNPACK_Start_Setup_Mask_SSE_0(CL) UNPACK_Setup_Mask_SSE(CL) -#define UNPACK_Start_Setup_Mask_SSE_1(CL) \ - mov VIF_TMPADDR, vifMaskRegs; \ - movdqa XMM_ROWMASK, xmmword ptr [VIF_TMPADDR + 64*(CL) + 16]; \ - movdqa XMM_ROWCOLMASK, xmmword ptr [VIF_TMPADDR + 64*(CL) + 32]; \ - pand XMM_ROWMASK, XMM_ROW; \ - pand XMM_ROWCOLMASK, XMM_COL; \ - por XMM_ROWCOLMASK, XMM_ROWMASK; \ - -#define UNPACK_Start_Setup_Mask_SSE_2(CL) - -#define UNPACK_Setup_Mask_SSE_0_1(CL) -#define UNPACK_Setup_Mask_SSE_1_1(CL) \ - mov VIF_TMPADDR, vifMaskRegs; \ - movdqa XMM_WRITEMASK, xmmword ptr [VIF_TMPADDR + 64*(0)]; \ - -// ignore CL, since vif.cycle.wl == 1 -#define UNPACK_Setup_Mask_SSE_2_1(CL) \ - mov VIF_TMPADDR, vifMaskRegs; \ - movdqa XMM_ROWMASK, xmmword ptr [VIF_TMPADDR + 64*(0) + 16]; \ - movdqa XMM_ROWCOLMASK, xmmword ptr [VIF_TMPADDR + 64*(0) + 32]; \ - movdqa XMM_WRITEMASK, xmmword ptr [VIF_TMPADDR + 64*(0)]; \ - pand XMM_ROWMASK, XMM_ROW; \ - pand XMM_ROWCOLMASK, XMM_COL; \ - por XMM_ROWCOLMASK, XMM_ROWMASK; \ - -#define UNPACK_Setup_Mask_SSE_0_0(CL) UNPACK_Setup_Mask_SSE(CL) -#define UNPACK_Setup_Mask_SSE_1_0(CL) UNPACK_Setup_Mask_SSE(CL) -#define UNPACK_Setup_Mask_SSE_2_0(CL) UNPACK_Setup_Mask_SSE(CL) - -// write mask always destroys XMM_WRITEMASK, so 0_0 = 1_0 -#define UNPACK_Setup_WriteMask_SSE_0_0(CL) UNPACK_Setup_Mask_SSE(CL) -#define UNPACK_Setup_WriteMask_SSE_1_0(CL) UNPACK_Setup_Mask_SSE(CL) -#define UNPACK_Setup_WriteMask_SSE_2_0(CL) UNPACK_Setup_Mask_SSE(CL) -#define UNPACK_Setup_WriteMask_SSE_0_1(CL) UNPACK_Setup_Mask_SSE_1_1(CL) -#define UNPACK_Setup_WriteMask_SSE_1_1(CL) UNPACK_Setup_Mask_SSE_1_1(CL) -#define UNPACK_Setup_WriteMask_SSE_2_1(CL) UNPACK_Setup_Mask_SSE_2_1(CL) - -#define UNPACK_Start_Setup_WriteMask_SSE_0(CL) UNPACK_Start_Setup_Mask_SSE_1(CL) -#define UNPACK_Start_Setup_WriteMask_SSE_1(CL) UNPACK_Start_Setup_Mask_SSE_1(CL) -#define UNPACK_Start_Setup_WriteMask_SSE_2(CL) UNPACK_Start_Setup_Mask_SSE_2(CL) - -#define UNPACK_Start_Setup_Regular_SSE_0(CL) -#define UNPACK_Start_Setup_Regular_SSE_1(CL) -#define UNPACK_Start_Setup_Regular_SSE_2(CL) -#define UNPACK_Setup_Regular_SSE_0_0(CL) -#define UNPACK_Setup_Regular_SSE_1_0(CL) -#define UNPACK_Setup_Regular_SSE_2_0(CL) -#define UNPACK_Setup_Regular_SSE_0_1(CL) -#define UNPACK_Setup_Regular_SSE_1_1(CL) -#define UNPACK_Setup_Regular_SSE_2_1(CL) - -#define UNPACK_INC_DST_0_Regular(qw) add VIF_DST, (16*qw) -#define UNPACK_INC_DST_1_Regular(qw) -#define UNPACK_INC_DST_0_Mask(qw) add VIF_DST, (16*qw) -#define UNPACK_INC_DST_1_Mask(qw) -#define UNPACK_INC_DST_0_WriteMask(qw) -#define UNPACK_INC_DST_1_WriteMask(qw) - -// unpacks for 1,2,3,4 elements (V3 uses this directly) -#define UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType) \ - UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+0); \ - UNPACK_##MaskType##_SSE_##ModeType##(XMM_R0); \ - UNPACK_Write##TOTALCL##_##MaskType##(XMM_R0, CL, 0, movdqa); \ - \ - UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+1); \ - UNPACK_##MaskType##_SSE_##ModeType##(XMM_R1); \ - UNPACK_Write##TOTALCL##_##MaskType##(XMM_R1, CL+1, 16, movdqa); \ - \ - UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+2); \ - UNPACK_##MaskType##_SSE_##ModeType##(XMM_R2); \ - UNPACK_Write##TOTALCL##_##MaskType##(XMM_R2, CL+2, 32, movdqa); \ - \ - UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+3); \ - UNPACK_##MaskType##_SSE_##ModeType##(XMM_R3); \ - UNPACK_Write##TOTALCL##_##MaskType##(XMM_R3, CL+3, 48, movdqa); \ - \ - UNPACK_INC_DST_##TOTALCL##_##MaskType##(4) - -// V3 uses this directly -#define UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType) \ - UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL); \ - UNPACK_##MaskType##_SSE_##ModeType##(XMM_R0); \ - UNPACK_Write##TOTALCL##_##MaskType##(XMM_R0, CL, 0, movdqa); \ - \ - UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+1); \ - UNPACK_##MaskType##_SSE_##ModeType##(XMM_R1); \ - UNPACK_Write##TOTALCL##_##MaskType##(XMM_R1, CL+1, 16, movdqa); \ - \ - UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+2); \ - UNPACK_##MaskType##_SSE_##ModeType##(XMM_R2); \ - UNPACK_Write##TOTALCL##_##MaskType##(XMM_R2, CL+2, 32, movdqa); \ - \ - UNPACK_INC_DST_##TOTALCL##_##MaskType##(3); \ - -#define UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType) \ - UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL); \ - UNPACK_##MaskType##_SSE_##ModeType##(XMM_R0); \ - UNPACK_Write##TOTALCL##_##MaskType##(XMM_R0, CL, 0, movdqa); \ - \ - UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+1); \ - UNPACK_##MaskType##_SSE_##ModeType##(XMM_R1); \ - UNPACK_Write##TOTALCL##_##MaskType##(XMM_R1, CL+1, 16, movdqa); \ - \ - UNPACK_INC_DST_##TOTALCL##_##MaskType##(2); \ - -#define UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType) \ - UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL); \ - UNPACK_##MaskType##_SSE_##ModeType##(XMM_R0); \ - UNPACK_Write##TOTALCL##_##MaskType##(XMM_R0, CL, 0, movdqa); \ - \ - UNPACK_INC_DST_##TOTALCL##_##MaskType##(1); \ - -// S-32 -// only when cl==1 -#define UNPACK_S_32SSE_4x(CL, TOTALCL, MaskType, ModeType, MOVDQA) \ - MOVDQA XMM_R3, xmmword ptr [VIF_SRC]; \ - \ - pshufd XMM_R0, XMM_R3, 0; \ - pshufd XMM_R1, XMM_R3, 0x55; \ - pshufd XMM_R2, XMM_R3, 0xaa; \ - pshufd XMM_R3, XMM_R3, 0xff; \ - \ - UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 16; \ - -#define UNPACK_S_32SSE_4A(CL, TOTALCL, MaskType, ModeType) UNPACK_S_32SSE_4x(CL, TOTALCL, MaskType, ModeType, movdqa) -#define UNPACK_S_32SSE_4(CL, TOTALCL, MaskType, ModeType) UNPACK_S_32SSE_4x(CL, TOTALCL, MaskType, ModeType, movdqu) - -#define UNPACK_S_32SSE_3x(CL, TOTALCL, MaskType, ModeType, MOVDQA) \ - MOVDQA XMM_R2, xmmword ptr [VIF_SRC]; \ - \ - pshufd XMM_R0, XMM_R2, 0; \ - pshufd XMM_R1, XMM_R2, 0x55; \ - pshufd XMM_R2, XMM_R2, 0xaa; \ - \ - UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 12; \ - -#define UNPACK_S_32SSE_3A(CL, TOTALCL, MaskType, ModeType) UNPACK_S_32SSE_3x(CL, TOTALCL, MaskType, ModeType, movdqa) -#define UNPACK_S_32SSE_3(CL, TOTALCL, MaskType, ModeType) UNPACK_S_32SSE_3x(CL, TOTALCL, MaskType, ModeType, movdqu) - -#define UNPACK_S_32SSE_2(CL, TOTALCL, MaskType, ModeType) \ - movq XMM_R1, qword ptr [VIF_SRC]; \ - \ - pshufd XMM_R0, XMM_R1, 0; \ - pshufd XMM_R1, XMM_R1, 0x55; \ - \ - UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 8; \ - -#define UNPACK_S_32SSE_2A UNPACK_S_32SSE_2 - -#define UNPACK_S_32SSE_1(CL, TOTALCL, MaskType, ModeType) \ - movd XMM_R0, dword ptr [VIF_SRC]; \ - pshufd XMM_R0, XMM_R0, 0; \ - \ - UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 4; \ - -#define UNPACK_S_32SSE_1A UNPACK_S_32SSE_1 - -// S-16 -#define UNPACK_S_16SSE_4(CL, TOTALCL, MaskType, ModeType) \ - movq XMM_R3, qword ptr [VIF_SRC]; \ - punpcklwd XMM_R3, XMM_R3; \ - UNPACK_RIGHTSHIFT XMM_R3, 16; \ - \ - pshufd XMM_R0, XMM_R3, 0; \ - pshufd XMM_R1, XMM_R3, 0x55; \ - pshufd XMM_R2, XMM_R3, 0xaa; \ - pshufd XMM_R3, XMM_R3, 0xff; \ - \ - UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 8; \ - -#define UNPACK_S_16SSE_4A UNPACK_S_16SSE_4 - -#define UNPACK_S_16SSE_3(CL, TOTALCL, MaskType, ModeType) \ - movq XMM_R2, qword ptr [VIF_SRC]; \ - punpcklwd XMM_R2, XMM_R2; \ - UNPACK_RIGHTSHIFT XMM_R2, 16; \ - \ - pshufd XMM_R0, XMM_R2, 0; \ - pshufd XMM_R1, XMM_R2, 0x55; \ - pshufd XMM_R2, XMM_R2, 0xaa; \ - \ - UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ - add VIF_SRC, 6; \ - -#define UNPACK_S_16SSE_3A UNPACK_S_16SSE_3 - -#define UNPACK_S_16SSE_2(CL, TOTALCL, MaskType, ModeType) \ - movd XMM_R1, dword ptr [VIF_SRC]; \ - punpcklwd XMM_R1, XMM_R1; \ - UNPACK_RIGHTSHIFT XMM_R1, 16; \ - \ - pshufd XMM_R0, XMM_R1, 0; \ - pshufd XMM_R1, XMM_R1, 0x55; \ - \ - UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 4; \ - -#define UNPACK_S_16SSE_2A UNPACK_S_16SSE_2 - -#define UNPACK_S_16SSE_1(CL, TOTALCL, MaskType, ModeType) \ - movd XMM_R0, dword ptr [VIF_SRC]; \ - punpcklwd XMM_R0, XMM_R0; \ - UNPACK_RIGHTSHIFT XMM_R0, 16; \ - pshufd XMM_R0, XMM_R0, 0; \ - \ - UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 2; \ - -#define UNPACK_S_16SSE_1A UNPACK_S_16SSE_1 - -// S-8 -#define UNPACK_S_8SSE_4(CL, TOTALCL, MaskType, ModeType) \ - movd XMM_R3, dword ptr [VIF_SRC]; \ - punpcklbw XMM_R3, XMM_R3; \ - punpcklwd XMM_R3, XMM_R3; \ - UNPACK_RIGHTSHIFT XMM_R3, 24; \ - \ - pshufd XMM_R0, XMM_R3, 0; \ - pshufd XMM_R1, XMM_R3, 0x55; \ - pshufd XMM_R2, XMM_R3, 0xaa; \ - pshufd XMM_R3, XMM_R3, 0xff; \ - \ - UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 4; \ - -#define UNPACK_S_8SSE_4A UNPACK_S_8SSE_4 - -#define UNPACK_S_8SSE_3(CL, TOTALCL, MaskType, ModeType) \ - movd XMM_R2, dword ptr [VIF_SRC]; \ - punpcklbw XMM_R2, XMM_R2; \ - punpcklwd XMM_R2, XMM_R2; \ - UNPACK_RIGHTSHIFT XMM_R2, 24; \ - \ - pshufd XMM_R0, XMM_R2, 0; \ - pshufd XMM_R1, XMM_R2, 0x55; \ - pshufd XMM_R2, XMM_R2, 0xaa; \ - \ - UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 3; \ - -#define UNPACK_S_8SSE_3A UNPACK_S_8SSE_3 - -#define UNPACK_S_8SSE_2(CL, TOTALCL, MaskType, ModeType) \ - movd XMM_R1, dword ptr [VIF_SRC]; \ - punpcklbw XMM_R1, XMM_R1; \ - punpcklwd XMM_R1, XMM_R1; \ - UNPACK_RIGHTSHIFT XMM_R1, 24; \ - \ - pshufd XMM_R0, XMM_R1, 0; \ - pshufd XMM_R1, XMM_R1, 0x55; \ - \ - UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 2; \ - -#define UNPACK_S_8SSE_2A UNPACK_S_8SSE_2 - -#define UNPACK_S_8SSE_1(CL, TOTALCL, MaskType, ModeType) \ - movd XMM_R0, dword ptr [VIF_SRC]; \ - punpcklbw XMM_R0, XMM_R0; \ - punpcklwd XMM_R0, XMM_R0; \ - UNPACK_RIGHTSHIFT XMM_R0, 24; \ - pshufd XMM_R0, XMM_R0, 0; \ - \ - UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - inc VIF_SRC; \ - -#define UNPACK_S_8SSE_1A UNPACK_S_8SSE_1 - -// V2-32 -#define UNPACK_V2_32SSE_4A(CL, TOTALCL, MaskType, ModeType) \ - MOVDQA XMM_R0, xmmword ptr [VIF_SRC]; \ - MOVDQA XMM_R2, xmmword ptr [VIF_SRC+16]; \ - \ - pshufd XMM_R1, XMM_R0, 0xee; \ - pshufd XMM_R3, XMM_R2, 0xee; \ - \ - UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 32; \ - -#define UNPACK_V2_32SSE_4(CL, TOTALCL, MaskType, ModeType) \ - movq XMM_R0, qword ptr [VIF_SRC]; \ - movq XMM_R1, qword ptr [VIF_SRC+8]; \ - movq XMM_R2, qword ptr [VIF_SRC+16]; \ - movq XMM_R3, qword ptr [VIF_SRC+24]; \ - \ - UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 32; \ - -#define UNPACK_V2_32SSE_3A(CL, TOTALCL, MaskType, ModeType) \ - MOVDQA XMM_R0, xmmword ptr [VIF_SRC]; \ - movq XMM_R2, qword ptr [VIF_SRC+16]; \ - pshufd XMM_R1, XMM_R0, 0xee; \ - \ - UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 24; \ - -#define UNPACK_V2_32SSE_3(CL, TOTALCL, MaskType, ModeType) \ - movq XMM_R0, qword ptr [VIF_SRC]; \ - movq XMM_R1, qword ptr [VIF_SRC+8]; \ - movq XMM_R2, qword ptr [VIF_SRC+16]; \ - \ - UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 24; \ - -#define UNPACK_V2_32SSE_2(CL, TOTALCL, MaskType, ModeType) \ - movq XMM_R0, qword ptr [VIF_SRC]; \ - movq XMM_R1, qword ptr [VIF_SRC+8]; \ - \ - UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 16; \ - -#define UNPACK_V2_32SSE_2A UNPACK_V2_32SSE_2 - -#define UNPACK_V2_32SSE_1(CL, TOTALCL, MaskType, ModeType) \ - movq XMM_R0, qword ptr [VIF_SRC]; \ - \ - UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 8; \ - -#define UNPACK_V2_32SSE_1A UNPACK_V2_32SSE_1 - -// V2-16 -// due to lemmings, have to copy lower xmmword to the upper xmmword of every reg -#define UNPACK_V2_16SSE_4A(CL, TOTALCL, MaskType, ModeType) \ - punpcklwd XMM_R0, xmmword ptr [VIF_SRC]; \ - punpckhwd XMM_R2, xmmword ptr [VIF_SRC]; \ - \ - UNPACK_RIGHTSHIFT XMM_R0, 16; \ - UNPACK_RIGHTSHIFT XMM_R2, 16; \ - \ - punpckhqdq XMM_R1, XMM_R0; \ - punpckhqdq XMM_R3, XMM_R2; \ - \ - punpcklqdq XMM_R0, XMM_R0; \ - punpcklqdq XMM_R2, XMM_R2; \ - punpckhqdq XMM_R1, XMM_R1; \ - punpckhqdq XMM_R3, XMM_R3; \ - \ - UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ - add VIF_SRC, 16; \ - -#define UNPACK_V2_16SSE_4(CL, TOTALCL, MaskType, ModeType) \ - movdqu XMM_R0, xmmword ptr [VIF_SRC]; \ - \ - punpckhwd XMM_R2, XMM_R0; \ - punpcklwd XMM_R0, XMM_R0; \ - \ - UNPACK_RIGHTSHIFT XMM_R0, 16; \ - UNPACK_RIGHTSHIFT XMM_R2, 16; \ - \ - punpckhqdq XMM_R1, XMM_R0; \ - punpckhqdq XMM_R3, XMM_R2; \ - \ - punpcklqdq XMM_R0, XMM_R0; \ - punpcklqdq XMM_R2, XMM_R2; \ - punpckhqdq XMM_R1, XMM_R1; \ - punpckhqdq XMM_R3, XMM_R3; \ - \ - UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 16; \ - -#define UNPACK_V2_16SSE_3A(CL, TOTALCL, MaskType, ModeType) \ - punpcklwd XMM_R0, xmmword ptr [VIF_SRC]; \ - punpckhwd XMM_R2, xmmword ptr [VIF_SRC]; \ - \ - UNPACK_RIGHTSHIFT XMM_R0, 16; \ - UNPACK_RIGHTSHIFT XMM_R2, 16; \ - \ - punpckhqdq XMM_R1, XMM_R0; \ - \ - punpcklqdq XMM_R0, XMM_R0; \ - punpcklqdq XMM_R2, XMM_R2; \ - punpckhqdq XMM_R1, XMM_R1; \ - \ - UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 12; \ - -#define UNPACK_V2_16SSE_3(CL, TOTALCL, MaskType, ModeType) \ - movdqu XMM_R0, xmmword ptr [VIF_SRC]; \ - \ - punpckhwd XMM_R2, XMM_R0; \ - punpcklwd XMM_R0, XMM_R0; \ - \ - UNPACK_RIGHTSHIFT XMM_R0, 16; \ - UNPACK_RIGHTSHIFT XMM_R2, 16; \ - \ - punpckhqdq XMM_R1, XMM_R0; \ - \ - punpcklqdq XMM_R0, XMM_R0; \ - punpcklqdq XMM_R2, XMM_R2; \ - punpckhqdq XMM_R1, XMM_R1; \ - \ - UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 12; \ - -#define UNPACK_V2_16SSE_2A(CL, TOTALCL, MaskType, ModeType) \ - punpcklwd XMM_R0, xmmword ptr [VIF_SRC]; \ - UNPACK_RIGHTSHIFT XMM_R0, 16; \ - \ - punpckhqdq XMM_R1, XMM_R0; \ - \ - punpcklqdq XMM_R0, XMM_R0; \ - punpckhqdq XMM_R1, XMM_R1; \ - \ - UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 8; \ - -#define UNPACK_V2_16SSE_2(CL, TOTALCL, MaskType, ModeType) \ - movq XMM_R0, qword ptr [VIF_SRC]; \ - punpcklwd XMM_R0, XMM_R0; \ - UNPACK_RIGHTSHIFT XMM_R0, 16; \ - \ - punpckhqdq XMM_R1, XMM_R0; \ - \ - punpcklqdq XMM_R0, XMM_R0; \ - punpckhqdq XMM_R1, XMM_R1; \ - \ - UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 8; \ - -#define UNPACK_V2_16SSE_1A(CL, TOTALCL, MaskType, ModeType) \ - punpcklwd XMM_R0, xmmword ptr [VIF_SRC]; \ - UNPACK_RIGHTSHIFT XMM_R0, 16; \ - punpcklqdq XMM_R0, XMM_R0; \ - \ - UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 4; \ - -#define UNPACK_V2_16SSE_1(CL, TOTALCL, MaskType, ModeType) \ - movd XMM_R0, dword ptr [VIF_SRC]; \ - punpcklwd XMM_R0, XMM_R0; \ - UNPACK_RIGHTSHIFT XMM_R0, 16; \ - punpcklqdq XMM_R0, XMM_R0; \ - \ - UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 4; \ - -// V2-8 -// and1 streetball needs to copy lower xmmword to the upper xmmword of every reg -#define UNPACK_V2_8SSE_4(CL, TOTALCL, MaskType, ModeType) \ - movq XMM_R0, qword ptr [VIF_SRC]; \ - \ - punpcklbw XMM_R0, XMM_R0; \ - punpckhwd XMM_R2, XMM_R0; \ - punpcklwd XMM_R0, XMM_R0; \ - \ - UNPACK_RIGHTSHIFT XMM_R0, 24; \ - UNPACK_RIGHTSHIFT XMM_R2, 24; \ - \ - punpckhqdq XMM_R1, XMM_R0; \ - punpckhqdq XMM_R3, XMM_R2; \ - \ - punpcklqdq XMM_R0, XMM_R0; \ - punpcklqdq XMM_R2, XMM_R2; \ - punpckhqdq XMM_R1, XMM_R1; \ - punpckhqdq XMM_R3, XMM_R3; \ - \ - UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 8; \ - -#define UNPACK_V2_8SSE_4A UNPACK_V2_8SSE_4 - -#define UNPACK_V2_8SSE_3(CL, TOTALCL, MaskType, ModeType) \ - movq XMM_R0, qword ptr [VIF_SRC]; \ - \ - punpcklbw XMM_R0, XMM_R0; \ - punpckhwd XMM_R2, XMM_R0; \ - punpcklwd XMM_R0, XMM_R0; \ - \ - UNPACK_RIGHTSHIFT XMM_R0, 24; \ - UNPACK_RIGHTSHIFT XMM_R2, 24; \ - \ - punpckhqdq XMM_R1, XMM_R0; \ - \ - punpcklqdq XMM_R0, XMM_R0; \ - punpcklqdq XMM_R2, XMM_R2; \ - punpckhqdq XMM_R1, XMM_R1; \ - \ - UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 6; \ - -#define UNPACK_V2_8SSE_3A UNPACK_V2_8SSE_3 - -#define UNPACK_V2_8SSE_2(CL, TOTALCL, MaskType, ModeType) \ - movd XMM_R0, dword ptr [VIF_SRC]; \ - punpcklbw XMM_R0, XMM_R0; \ - punpcklwd XMM_R0, XMM_R0; \ - UNPACK_RIGHTSHIFT XMM_R0, 24; \ - \ - punpckhqdq XMM_R1, XMM_R0; \ - \ - punpcklqdq XMM_R0, XMM_R0; \ - punpckhqdq XMM_R1, XMM_R1; \ - \ - UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 4; \ - -#define UNPACK_V2_8SSE_2A UNPACK_V2_8SSE_2 - -#define UNPACK_V2_8SSE_1(CL, TOTALCL, MaskType, ModeType) \ - movd XMM_R0, dword ptr [VIF_SRC]; \ - punpcklbw XMM_R0, XMM_R0; \ - punpcklwd XMM_R0, XMM_R0; \ - UNPACK_RIGHTSHIFT XMM_R0, 24; \ - punpcklqdq XMM_R0, XMM_R0; \ - \ - UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 2; \ - -#define UNPACK_V2_8SSE_1A UNPACK_V2_8SSE_1 - -// V3-32 -// midnight club 2 crashes because reading a qw at +36 is out of bounds -#define UNPACK_V3_32SSE_4x(CL, TOTALCL, MaskType, ModeType, MOVDQA) \ - MOVDQA XMM_R0, xmmword ptr [VIF_SRC]; \ - movdqu XMM_R1, xmmword ptr [VIF_SRC+12]; \ - \ - UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+0); \ - UNPACK_##MaskType##_SSE_##ModeType##(XMM_R0); \ - UNPACK_Write##TOTALCL##_##MaskType##(XMM_R0, CL, 0, movdqa); \ - \ - UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+1); \ - UNPACK_##MaskType##_SSE_##ModeType##(XMM_R1); \ - UNPACK_Write##TOTALCL##_##MaskType##(XMM_R1, CL+1, 16, movdqa); \ - \ - MOVDQA XMM_R3, xmmword ptr [VIF_SRC+32]; \ - movdqu XMM_R2, xmmword ptr [VIF_SRC+24]; \ - psrldq XMM_R3, 4; \ - \ - UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+2); \ - UNPACK_##MaskType##_SSE_##ModeType##(XMM_R2); \ - UNPACK_Write##TOTALCL##_##MaskType##(XMM_R2, CL+2, 32, movdqa); \ - \ - UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+3); \ - UNPACK_##MaskType##_SSE_##ModeType##(XMM_R3); \ - UNPACK_Write##TOTALCL##_##MaskType##(XMM_R3, CL+3, 48, movdqa); \ - \ - UNPACK_INC_DST_##TOTALCL##_##MaskType##(4); \ - \ - add VIF_SRC, 48; \ - -#define UNPACK_V3_32SSE_4A(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_4x(CL, TOTALCL, MaskType, ModeType, movdqa) -#define UNPACK_V3_32SSE_4(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_4x(CL, TOTALCL, MaskType, ModeType, movdqu) - -#define UNPACK_V3_32SSE_3x(CL, TOTALCL, MaskType, ModeType, MOVDQA) \ - MOVDQA XMM_R0, xmmword ptr [VIF_SRC]; \ - movdqu XMM_R1, xmmword ptr [VIF_SRC+12]; \ - \ - UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL); \ - UNPACK_##MaskType##_SSE_##ModeType##(XMM_R0); \ - UNPACK_Write##TOTALCL##_##MaskType##(XMM_R0, CL, 0, movdqa); \ - \ - UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+1); \ - UNPACK_##MaskType##_SSE_##ModeType##(XMM_R1); \ - UNPACK_Write##TOTALCL##_##MaskType##(XMM_R1, CL+1, 16, movdqa); \ - \ - movdqu XMM_R2, xmmword ptr [VIF_SRC+24]; \ - \ - UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+2); \ - UNPACK_##MaskType##_SSE_##ModeType##(XMM_R2); \ - UNPACK_Write##TOTALCL##_##MaskType##(XMM_R2, CL+2, 32, movdqa); \ - \ - UNPACK_INC_DST_##TOTALCL##_##MaskType##(3); \ - \ - add VIF_SRC, 36; \ - -#define UNPACK_V3_32SSE_3A(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_3x(CL, TOTALCL, MaskType, ModeType, movdqa) -#define UNPACK_V3_32SSE_3(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_3x(CL, TOTALCL, MaskType, ModeType, movdqu) - -#define UNPACK_V3_32SSE_2x(CL, TOTALCL, MaskType, ModeType, MOVDQA) \ - MOVDQA XMM_R0, xmmword ptr [VIF_SRC]; \ - movdqu XMM_R1, xmmword ptr [VIF_SRC+12]; \ - \ - UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 24; \ - -#define UNPACK_V3_32SSE_2A(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_2x(CL, TOTALCL, MaskType, ModeType, movdqa) -#define UNPACK_V3_32SSE_2(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_2x(CL, TOTALCL, MaskType, ModeType, movdqu) - -#define UNPACK_V3_32SSE_1x(CL, TOTALCL, MaskType, ModeType, MOVDQA) \ - MOVDQA XMM_R0, xmmword ptr [VIF_SRC]; \ - \ - UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 12; \ - -#define UNPACK_V3_32SSE_1A(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_1x(CL, TOTALCL, MaskType, ModeType, movdqa) -#define UNPACK_V3_32SSE_1(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_1x(CL, TOTALCL, MaskType, ModeType, movdqu) - -// V3-16 -#define UNPACK_V3_16SSE_4(CL, TOTALCL, MaskType, ModeType) \ - movq XMM_R0, qword ptr [VIF_SRC]; \ - movq XMM_R1, qword ptr [VIF_SRC+6]; \ - \ - punpcklwd XMM_R0, XMM_R0; \ - movq XMM_R2, qword ptr [VIF_SRC+12]; \ - punpcklwd XMM_R1, XMM_R1; \ - UNPACK_RIGHTSHIFT XMM_R0, 16; \ - movq XMM_R3, qword ptr [VIF_SRC+18]; \ - UNPACK_RIGHTSHIFT XMM_R1, 16; \ - punpcklwd XMM_R2, XMM_R2; \ - punpcklwd XMM_R3, XMM_R3; \ - \ - UNPACK_RIGHTSHIFT XMM_R2, 16; \ - UNPACK_RIGHTSHIFT XMM_R3, 16; \ - \ - UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 24; \ - -#define UNPACK_V3_16SSE_4A UNPACK_V3_16SSE_4 - -#define UNPACK_V3_16SSE_3(CL, TOTALCL, MaskType, ModeType) \ - movq XMM_R0, qword ptr [VIF_SRC]; \ - movq XMM_R1, qword ptr [VIF_SRC+6]; \ - \ - punpcklwd XMM_R0, XMM_R0; \ - movq XMM_R2, qword ptr [VIF_SRC+12]; \ - punpcklwd XMM_R1, XMM_R1; \ - UNPACK_RIGHTSHIFT XMM_R0, 16; \ - punpcklwd XMM_R2, XMM_R2; \ - \ - UNPACK_RIGHTSHIFT XMM_R1, 16; \ - UNPACK_RIGHTSHIFT XMM_R2, 16; \ - \ - UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 18; \ - -#define UNPACK_V3_16SSE_3A UNPACK_V3_16SSE_3 - -#define UNPACK_V3_16SSE_2(CL, TOTALCL, MaskType, ModeType) \ - movq XMM_R0, qword ptr [VIF_SRC]; \ - movq XMM_R1, qword ptr [VIF_SRC+6]; \ - \ - punpcklwd XMM_R0, XMM_R0; \ - punpcklwd XMM_R1, XMM_R1; \ - \ - UNPACK_RIGHTSHIFT XMM_R0, 16; \ - UNPACK_RIGHTSHIFT XMM_R1, 16; \ - \ - UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 12; \ - -#define UNPACK_V3_16SSE_2A UNPACK_V3_16SSE_2 - -#define UNPACK_V3_16SSE_1(CL, TOTALCL, MaskType, ModeType) \ - movq XMM_R0, qword ptr [VIF_SRC]; \ - punpcklwd XMM_R0, XMM_R0; \ - UNPACK_RIGHTSHIFT XMM_R0, 16; \ - \ - UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 6; \ - -#define UNPACK_V3_16SSE_1A UNPACK_V3_16SSE_1 - -// V3-8 -#define UNPACK_V3_8SSE_4(CL, TOTALCL, MaskType, ModeType) \ - movq XMM_R1, qword ptr [VIF_SRC]; \ - movq XMM_R3, qword ptr [VIF_SRC+6]; \ - \ - punpcklbw XMM_R1, XMM_R1; \ - punpcklbw XMM_R3, XMM_R3; \ - punpcklwd XMM_R0, XMM_R1; \ - psrldq XMM_R1, 6; \ - punpcklwd XMM_R2, XMM_R3; \ - psrldq XMM_R3, 6; \ - punpcklwd XMM_R1, XMM_R1; \ - UNPACK_RIGHTSHIFT XMM_R0, 24; \ - punpcklwd XMM_R3, XMM_R3; \ - \ - UNPACK_RIGHTSHIFT XMM_R2, 24; \ - UNPACK_RIGHTSHIFT XMM_R1, 24; \ - UNPACK_RIGHTSHIFT XMM_R3, 24; \ - \ - UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 12; \ - -#define UNPACK_V3_8SSE_4A UNPACK_V3_8SSE_4 - -#define UNPACK_V3_8SSE_3(CL, TOTALCL, MaskType, ModeType) \ - movd XMM_R0, dword ptr [VIF_SRC]; \ - movd XMM_R1, dword ptr [VIF_SRC+3]; \ - \ - punpcklbw XMM_R0, XMM_R0; \ - movd XMM_R2, dword ptr [VIF_SRC+6]; \ - punpcklbw XMM_R1, XMM_R1; \ - punpcklwd XMM_R0, XMM_R0; \ - punpcklbw XMM_R2, XMM_R2; \ - \ - punpcklwd XMM_R1, XMM_R1; \ - punpcklwd XMM_R2, XMM_R2; \ - \ - UNPACK_RIGHTSHIFT XMM_R0, 24; \ - UNPACK_RIGHTSHIFT XMM_R1, 24; \ - UNPACK_RIGHTSHIFT XMM_R2, 24; \ - \ - UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 9 \ - -#define UNPACK_V3_8SSE_3A UNPACK_V3_8SSE_3 - -#define UNPACK_V3_8SSE_2(CL, TOTALCL, MaskType, ModeType) \ - movd XMM_R0, dword ptr [VIF_SRC]; \ - movd XMM_R1, dword ptr [VIF_SRC+3]; \ - \ - punpcklbw XMM_R0, XMM_R0; \ - punpcklbw XMM_R1, XMM_R1; \ - \ - punpcklwd XMM_R0, XMM_R0; \ - punpcklwd XMM_R1, XMM_R1; \ - \ - UNPACK_RIGHTSHIFT XMM_R0, 24; \ - UNPACK_RIGHTSHIFT XMM_R1, 24; \ - \ - UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 6; \ - -#define UNPACK_V3_8SSE_2A UNPACK_V3_8SSE_2 - -#define UNPACK_V3_8SSE_1(CL, TOTALCL, MaskType, ModeType) \ - movd XMM_R0, dword ptr [VIF_SRC]; \ - punpcklbw XMM_R0, XMM_R0; \ - punpcklwd XMM_R0, XMM_R0; \ - UNPACK_RIGHTSHIFT XMM_R0, 24; \ - \ - UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 3; \ - -#define UNPACK_V3_8SSE_1A UNPACK_V3_8SSE_1 - -// V4-32 -#define UNPACK_V4_32SSE_4A(CL, TOTALCL, MaskType, ModeType) \ - movdqa XMM_R0, xmmword ptr [VIF_SRC]; \ - movdqa XMM_R1, xmmword ptr [VIF_SRC+16]; \ - movdqa XMM_R2, xmmword ptr [VIF_SRC+32]; \ - movdqa XMM_R3, xmmword ptr [VIF_SRC+48]; \ - \ - UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 64; \ - -#define UNPACK_V4_32SSE_4(CL, TOTALCL, MaskType, ModeType) \ - movdqu XMM_R0, xmmword ptr [VIF_SRC]; \ - movdqu XMM_R1, xmmword ptr [VIF_SRC+16]; \ - movdqu XMM_R2, xmmword ptr [VIF_SRC+32]; \ - movdqu XMM_R3, xmmword ptr [VIF_SRC+48]; \ - \ - UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 64; \ - -#define UNPACK_V4_32SSE_3A(CL, TOTALCL, MaskType, ModeType) \ - movdqa XMM_R0, xmmword ptr [VIF_SRC]; \ - movdqa XMM_R1, xmmword ptr [VIF_SRC+16]; \ - movdqa XMM_R2, xmmword ptr [VIF_SRC+32]; \ - \ - UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 48; \ - -#define UNPACK_V4_32SSE_3(CL, TOTALCL, MaskType, ModeType) \ - movdqu XMM_R0, xmmword ptr [VIF_SRC]; \ - movdqu XMM_R1, xmmword ptr [VIF_SRC+16]; \ - movdqu XMM_R2, xmmword ptr [VIF_SRC+32]; \ - \ - UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 48; \ - -#define UNPACK_V4_32SSE_2A(CL, TOTALCL, MaskType, ModeType) \ - movdqa XMM_R0, xmmword ptr [VIF_SRC]; \ - movdqa XMM_R1, xmmword ptr [VIF_SRC+16]; \ - \ - UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 32; \ - -#define UNPACK_V4_32SSE_2(CL, TOTALCL, MaskType, ModeType) \ - movdqu XMM_R0, xmmword ptr [VIF_SRC]; \ - movdqu XMM_R1, xmmword ptr [VIF_SRC+16]; \ - \ - UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 32; \ - -#define UNPACK_V4_32SSE_1A(CL, TOTALCL, MaskType, ModeType) \ - movdqa XMM_R0, xmmword ptr [VIF_SRC]; \ - \ - UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 16; \ - -#define UNPACK_V4_32SSE_1(CL, TOTALCL, MaskType, ModeType) \ - movdqu XMM_R0, xmmword ptr [VIF_SRC]; \ - \ - UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 16; \ - -// V4-16 -#define UNPACK_V4_16SSE_4A(CL, TOTALCL, MaskType, ModeType) \ - \ - punpcklwd XMM_R0, xmmword ptr [VIF_SRC]; \ - punpckhwd XMM_R1, xmmword ptr [VIF_SRC]; \ - punpcklwd XMM_R2, xmmword ptr [VIF_SRC+16]; \ - punpckhwd XMM_R3, xmmword ptr [VIF_SRC+16]; \ - \ - UNPACK_RIGHTSHIFT XMM_R1, 16; \ - UNPACK_RIGHTSHIFT XMM_R3, 16; \ - UNPACK_RIGHTSHIFT XMM_R0, 16; \ - UNPACK_RIGHTSHIFT XMM_R2, 16; \ - \ - UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 32; \ - -#define UNPACK_V4_16SSE_4(CL, TOTALCL, MaskType, ModeType) \ - movdqu XMM_R0, xmmword ptr [VIF_SRC]; \ - movdqu XMM_R2, xmmword ptr [VIF_SRC+16]; \ - \ - punpckhwd XMM_R1, XMM_R0; \ - punpckhwd XMM_R3, XMM_R2; \ - punpcklwd XMM_R0, XMM_R0; \ - punpcklwd XMM_R2, XMM_R2; \ - \ - UNPACK_RIGHTSHIFT XMM_R1, 16; \ - UNPACK_RIGHTSHIFT XMM_R3, 16; \ - UNPACK_RIGHTSHIFT XMM_R0, 16; \ - UNPACK_RIGHTSHIFT XMM_R2, 16; \ - \ - UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 32; \ - -#define UNPACK_V4_16SSE_3A(CL, TOTALCL, MaskType, ModeType) \ - punpcklwd XMM_R0, xmmword ptr [VIF_SRC]; \ - punpckhwd XMM_R1, xmmword ptr [VIF_SRC]; \ - punpcklwd XMM_R2, xmmword ptr [VIF_SRC+16]; \ - \ - UNPACK_RIGHTSHIFT XMM_R0, 16; \ - UNPACK_RIGHTSHIFT XMM_R1, 16; \ - UNPACK_RIGHTSHIFT XMM_R2, 16; \ - \ - UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 24; \ - -#define UNPACK_V4_16SSE_3(CL, TOTALCL, MaskType, ModeType) \ - movdqu XMM_R0, xmmword ptr [VIF_SRC]; \ - movq XMM_R2, qword ptr [VIF_SRC+16]; \ - \ - punpckhwd XMM_R1, XMM_R0; \ - punpcklwd XMM_R0, XMM_R0; \ - punpcklwd XMM_R2, XMM_R2; \ - \ - UNPACK_RIGHTSHIFT XMM_R0, 16; \ - UNPACK_RIGHTSHIFT XMM_R1, 16; \ - UNPACK_RIGHTSHIFT XMM_R2, 16; \ - \ - UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 24; \ - -#define UNPACK_V4_16SSE_2A(CL, TOTALCL, MaskType, ModeType) \ - punpcklwd XMM_R0, xmmword ptr [VIF_SRC]; \ - punpckhwd XMM_R1, xmmword ptr [VIF_SRC]; \ - \ - UNPACK_RIGHTSHIFT XMM_R0, 16; \ - UNPACK_RIGHTSHIFT XMM_R1, 16; \ - \ - UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 16; \ - -#define UNPACK_V4_16SSE_2(CL, TOTALCL, MaskType, ModeType) \ - movq XMM_R0, qword ptr [VIF_SRC]; \ - movq XMM_R1, qword ptr [VIF_SRC+8]; \ - \ - punpcklwd XMM_R0, XMM_R0; \ - punpcklwd XMM_R1, XMM_R1; \ - \ - UNPACK_RIGHTSHIFT XMM_R0, 16; \ - UNPACK_RIGHTSHIFT XMM_R1, 16; \ - \ - UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 16; \ - -#define UNPACK_V4_16SSE_1A(CL, TOTALCL, MaskType, ModeType) \ - punpcklwd XMM_R0, xmmword ptr [VIF_SRC]; \ - UNPACK_RIGHTSHIFT XMM_R0, 16; \ - \ - UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 8; \ - -#define UNPACK_V4_16SSE_1(CL, TOTALCL, MaskType, ModeType) \ - movq XMM_R0, qword ptr [VIF_SRC]; \ - punpcklwd XMM_R0, XMM_R0; \ - UNPACK_RIGHTSHIFT XMM_R0, 16; \ - \ - UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 8; \ - -// V4-8 -#define UNPACK_V4_8SSE_4A(CL, TOTALCL, MaskType, ModeType) \ - punpcklbw XMM_R0, xmmword ptr [VIF_SRC]; \ - punpckhbw XMM_R2, xmmword ptr [VIF_SRC]; \ - \ - punpckhwd XMM_R1, XMM_R0; \ - punpckhwd XMM_R3, XMM_R2; \ - punpcklwd XMM_R0, XMM_R0; \ - punpcklwd XMM_R2, XMM_R2; \ - \ - UNPACK_RIGHTSHIFT XMM_R1, 24; \ - UNPACK_RIGHTSHIFT XMM_R3, 24; \ - UNPACK_RIGHTSHIFT XMM_R0, 24; \ - UNPACK_RIGHTSHIFT XMM_R2, 24; \ - \ - UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 16; \ - -#define UNPACK_V4_8SSE_4(CL, TOTALCL, MaskType, ModeType) \ - movdqu XMM_R0, xmmword ptr [VIF_SRC]; \ - \ - punpckhbw XMM_R2, XMM_R0; \ - punpcklbw XMM_R0, XMM_R0; \ - \ - punpckhwd XMM_R3, XMM_R2; \ - punpckhwd XMM_R1, XMM_R0; \ - punpcklwd XMM_R2, XMM_R2; \ - punpcklwd XMM_R0, XMM_R0; \ - \ - UNPACK_RIGHTSHIFT XMM_R3, 24; \ - UNPACK_RIGHTSHIFT XMM_R2, 24; \ - \ - UNPACK_RIGHTSHIFT XMM_R0, 24; \ - UNPACK_RIGHTSHIFT XMM_R1, 24; \ - \ - UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 16; \ - -#define UNPACK_V4_8SSE_3A(CL, TOTALCL, MaskType, ModeType) \ - punpcklbw XMM_R0, xmmword ptr [VIF_SRC]; \ - punpckhbw XMM_R2, xmmword ptr [VIF_SRC]; \ - \ - punpckhwd XMM_R1, XMM_R0; \ - punpcklwd XMM_R0, XMM_R0; \ - punpcklwd XMM_R2, XMM_R2; \ - \ - UNPACK_RIGHTSHIFT XMM_R1, 24; \ - UNPACK_RIGHTSHIFT XMM_R0, 24; \ - UNPACK_RIGHTSHIFT XMM_R2, 24; \ - \ - UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 12; \ - -#define UNPACK_V4_8SSE_3(CL, TOTALCL, MaskType, ModeType) \ - movq XMM_R0, qword ptr [VIF_SRC]; \ - movd XMM_R2, dword ptr [VIF_SRC+8]; \ - \ - punpcklbw XMM_R0, XMM_R0; \ - punpcklbw XMM_R2, XMM_R2; \ - \ - punpckhwd XMM_R1, XMM_R0; \ - punpcklwd XMM_R2, XMM_R2; \ - punpcklwd XMM_R0, XMM_R0; \ - \ - UNPACK_RIGHTSHIFT XMM_R1, 24; \ - UNPACK_RIGHTSHIFT XMM_R0, 24; \ - UNPACK_RIGHTSHIFT XMM_R2, 24; \ - \ - UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 12; \ - -#define UNPACK_V4_8SSE_2A(CL, TOTALCL, MaskType, ModeType) \ - punpcklbw XMM_R0, xmmword ptr [VIF_SRC]; \ - \ - punpckhwd XMM_R1, XMM_R0; \ - punpcklwd XMM_R0, XMM_R0; \ - \ - UNPACK_RIGHTSHIFT XMM_R1, 24; \ - UNPACK_RIGHTSHIFT XMM_R0, 24; \ - \ - UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 8; \ - -#define UNPACK_V4_8SSE_2(CL, TOTALCL, MaskType, ModeType) \ - movq XMM_R0, qword ptr [VIF_SRC]; \ - \ - punpcklbw XMM_R0, XMM_R0; \ - \ - punpckhwd XMM_R1, XMM_R0; \ - punpcklwd XMM_R0, XMM_R0; \ - \ - UNPACK_RIGHTSHIFT XMM_R1, 24; \ - UNPACK_RIGHTSHIFT XMM_R0, 24; \ - \ - UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 8; \ - -#define UNPACK_V4_8SSE_1A(CL, TOTALCL, MaskType, ModeType) \ - punpcklbw XMM_R0, xmmword ptr [VIF_SRC]; \ - punpcklwd XMM_R0, XMM_R0; \ - UNPACK_RIGHTSHIFT XMM_R0, 24; \ - \ - UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 4; \ - -#define UNPACK_V4_8SSE_1(CL, TOTALCL, MaskType, ModeType) \ - movd XMM_R0, dword ptr [VIF_SRC]; \ - punpcklbw XMM_R0, XMM_R0; \ - punpcklwd XMM_R0, XMM_R0; \ - UNPACK_RIGHTSHIFT XMM_R0, 24; \ - \ - UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 4; \ - -// V4-5 -.extern s_TempDecompress - -#define DECOMPRESS_RGBA(OFFSET) \ - mov bl, al; \ - shl bl, 3; \ - mov byte ptr [s_TempDecompress+OFFSET], bl; \ - \ - mov bx, ax; \ - shr bx, 2; \ - and bx, 0xf8; \ - mov byte ptr [s_TempDecompress+OFFSET+1], bl; \ - \ - mov bx, ax; \ - shr bx, 7; \ - and bx, 0xf8; \ - mov byte ptr [s_TempDecompress+OFFSET+2], bl; \ - mov bx, ax; \ - shr bx, 8; \ - and bx, 0x80; \ - mov byte ptr [s_TempDecompress+OFFSET+3], bl; \ - -#define UNPACK_V4_5SSE_4(CL, TOTALCL, MaskType, ModeType) \ - mov eax, dword ptr [VIF_SRC]; \ - DECOMPRESS_RGBA(0); \ - \ - shr eax, 16; \ - DECOMPRESS_RGBA(4); \ - \ - mov eax, dword ptr [VIF_SRC+4]; \ - DECOMPRESS_RGBA(8); \ - \ - shr eax, 16; \ - DECOMPRESS_RGBA(12); \ - \ - movdqa XMM_R0, xmmword ptr [s_TempDecompress]; \ - \ - punpckhbw XMM_R2, XMM_R0; \ - punpcklbw XMM_R0, XMM_R0; \ - \ - punpckhwd XMM_R3, XMM_R2; \ - punpckhwd XMM_R1, XMM_R0; \ - punpcklwd XMM_R0, XMM_R0; \ - punpcklwd XMM_R2, XMM_R2; \ - \ - psrld XMM_R0, 24; \ - psrld XMM_R1, 24; \ - psrld XMM_R2, 24; \ - psrld XMM_R3, 24; \ - \ - UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 8; \ - -#define UNPACK_V4_5SSE_4A UNPACK_V4_5SSE_4 - -#define UNPACK_V4_5SSE_3(CL, TOTALCL, MaskType, ModeType) \ - mov eax, dword ptr [VIF_SRC]; \ - DECOMPRESS_RGBA(0); \ - \ - shr eax, 16; \ - DECOMPRESS_RGBA(4); \ - \ - mov eax, dword ptr [VIF_SRC]; \ - DECOMPRESS_RGBA(8); \ - \ - movdqa XMM_R0, xmmword ptr [s_TempDecompress]; \ - \ - punpckhbw XMM_R2, XMM_R0; \ - punpcklbw XMM_R0, XMM_R0; \ - \ - punpckhwd XMM_R1, XMM_R0; \ - punpcklwd XMM_R0, XMM_R0; \ - punpcklwd XMM_R2, XMM_R2; \ - \ - psrld XMM_R0, 24; \ - psrld XMM_R1, 24; \ - psrld XMM_R2, 24; \ - \ - UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 6; \ - -#define UNPACK_V4_5SSE_3A UNPACK_V4_5SSE_3 - -#define UNPACK_V4_5SSE_2(CL, TOTALCL, MaskType, ModeType) \ - mov eax, dword ptr [VIF_SRC]; \ - DECOMPRESS_RGBA(0); \ - \ - shr eax, 16; \ - DECOMPRESS_RGBA(4); \ - \ - movq XMM_R0, qword ptr [s_TempDecompress]; \ - \ - punpcklbw XMM_R0, XMM_R0; \ - \ - punpckhwd XMM_R1, XMM_R0; \ - punpcklwd XMM_R0, XMM_R0; \ - \ - psrld XMM_R0, 24; \ - psrld XMM_R1, 24; \ - \ - UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 4; \ - -#define UNPACK_V4_5SSE_2A UNPACK_V4_5SSE_2 - -#define UNPACK_V4_5SSE_1(CL, TOTALCL, MaskType, ModeType) \ - mov ax, word ptr [VIF_SRC]; \ - DECOMPRESS_RGBA(0) \ - \ - movd XMM_R0, dword ptr [s_TempDecompress]; \ - punpcklbw XMM_R0, XMM_R0; \ - punpcklwd XMM_R0, XMM_R0; \ - \ - psrld XMM_R0, 24; \ - \ - UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \ - \ - add VIF_SRC, 2; \ - -#define UNPACK_V4_5SSE_1A UNPACK_V4_5SSE_1 - -#pragma warning(disable:4731) - -#define SAVE_ROW_REG_BASE \ - mov VIF_TMPADDR, vifRow; \ - movdqa xmmword ptr [VIF_TMPADDR], XMM_ROW; \ - mov VIF_TMPADDR, vifRegs; \ - movss dword ptr [VIF_TMPADDR+0x100], XMM_ROW; \ - psrldq XMM_ROW, 4; \ - movss dword ptr [VIF_TMPADDR+0x110], XMM_ROW; \ - psrldq XMM_ROW, 4; \ - movss dword ptr [VIF_TMPADDR+0x120], XMM_ROW; \ - psrldq XMM_ROW, 4; \ - movss dword ptr [VIF_TMPADDR+0x130], XMM_ROW; \ - -#define SAVE_NO_REG - -// 32 bit versions have the args on the stack -#define INIT_ARGS() \ - push edi; \ - push esi; \ - push ebx; \ - mov VIF_DST, dword ptr [esp+4+12]; \ - mov VIF_SRC, dword ptr [esp+8+12]; \ - mov VIF_SIZE, dword ptr [esp+12+12]; \ - - -#define POP_REGS() \ - pop ebx; \ - pop esi; \ - pop edi; \ - -#define INC_STACK(reg) add esp, 4; - -// qsize - bytes of compressed size of 1 decompressed xmmword -// int UNPACK_SkippingWrite_##name##_##sign##_##MaskType##_##ModeType(u32* dest, u32* data, int dmasize) - -#define defUNPACK_SkippingWrite(name, MaskType, ModeType, qsize, sign, SAVE_ROW_REG) \ -.globl UNPACK_SkippingWrite_##name##_##sign##_##MaskType##_##ModeType; \ -UNPACK_SkippingWrite_##name##_##sign##_##MaskType##_##ModeType: \ - INIT_ARGS(); \ - mov VIF_TMPADDR, vifRegs; \ - movzx VIF_INC, byte ptr [VIF_TMPADDR + 0x40]; \ - movzx VIF_SAVEEBX, byte ptr [VIF_TMPADDR + 0x41]; \ - sub VIF_INC, VIF_SAVEEBX; \ - shl VIF_INC, 4; \ - \ - cmp VIF_SAVEEBXd, 1; \ - je name##_##sign##_##MaskType##_##ModeType##_WL1; \ - cmp VIF_SAVEEBXd, 2; \ - je name##_##sign##_##MaskType##_##ModeType##_WL2; \ - cmp VIF_SAVEEBXd, 3; \ - je name##_##sign##_##MaskType##_##ModeType##_WL3; \ - jmp name##_##sign##_##MaskType##_##ModeType##_WL4; \ - \ -name##_##sign##_##MaskType##_##ModeType##_WL1: \ - UNPACK_Start_Setup_##MaskType##_SSE_##ModeType##(0); \ - \ - cmp VIF_SIZE, qsize; \ - jl name##_##sign##_##MaskType##_##ModeType##_C1_Done3; \ - \ - add VIF_INC, 16; \ - \ - /* first align VIF_SRC to 16 bytes */ \ -name##_##sign##_##MaskType##_##ModeType##_C1_Align16: \ - \ - test VIF_SRC, 15; \ - jz name##_##sign##_##MaskType##_##ModeType##_C1_UnpackAligned; \ - \ - UNPACK_##name##SSE_1(0, 1, MaskType, ModeType); \ - \ - cmp VIF_SIZE, (2*qsize); \ - jl name##_##sign##_##MaskType##_##ModeType##_C1_DoneWithDec; \ - sub VIF_SIZE, qsize; \ - jmp name##_##sign##_##MaskType##_##ModeType##_C1_Align16; \ - \ -name##_##sign##_##MaskType##_##ModeType##_C1_UnpackAligned: \ - \ - cmp VIF_SIZE, (2*qsize); \ - jl name##_##sign##_##MaskType##_##ModeType##_C1_Unpack1; \ - cmp VIF_SIZE, (3*qsize); \ - jl name##_##sign##_##MaskType##_##ModeType##_C1_Unpack2; \ - cmp VIF_SIZE, (4*qsize); \ - jl name##_##sign##_##MaskType##_##ModeType##_C1_Unpack3; \ - prefetchnta [VIF_SRC + 64]; \ - \ -name##_##sign##_##MaskType##_##ModeType##_C1_Unpack4: \ - UNPACK_##name##SSE_4A(0, 1, MaskType, ModeType); \ - \ - cmp VIF_SIZE, (8*qsize); \ - jl name##_##sign##_##MaskType##_##ModeType##_C1_DoneUnpack4; \ - sub VIF_SIZE, (4*qsize); \ - jmp name##_##sign##_##MaskType##_##ModeType##_C1_Unpack4; \ - \ -name##_##sign##_##MaskType##_##ModeType##_C1_DoneUnpack4: \ - \ - sub VIF_SIZE, (4*qsize); \ - cmp VIF_SIZE, qsize; \ - jl name##_##sign##_##MaskType##_##ModeType##_C1_Done3; \ - cmp VIF_SIZE, (2*qsize); \ - jl name##_##sign##_##MaskType##_##ModeType##_C1_Unpack1; \ - cmp VIF_SIZE, (3*qsize); \ - jl name##_##sign##_##MaskType##_##ModeType##_C1_Unpack2; \ - /* fall through */ \ - \ -name##_##sign##_##MaskType##_##ModeType##_C1_Unpack3: \ - UNPACK_##name##SSE_3A(0, 1, MaskType, ModeType); \ - \ - sub VIF_SIZE, (3*qsize); \ - jmp name##_##sign##_##MaskType##_##ModeType##_C1_Done3; \ - \ -name##_##sign##_##MaskType##_##ModeType##_C1_Unpack2: \ - UNPACK_##name##SSE_2A(0, 1, MaskType, ModeType); \ - \ - sub VIF_SIZE, (2*qsize); \ - jmp name##_##sign##_##MaskType##_##ModeType##_C1_Done3; \ - \ -name##_##sign##_##MaskType##_##ModeType##_C1_Unpack1: \ - UNPACK_##name##SSE_1A(0, 1, MaskType, ModeType); \ -name##_##sign##_##MaskType##_##ModeType##_C1_DoneWithDec: \ - sub VIF_SIZE, qsize; \ -name##_##sign##_##MaskType##_##ModeType##_C1_Done3: \ - SAVE_ROW_REG; \ - mov eax, VIF_SIZE; \ - POP_REGS(); \ - ret; \ - \ -name##_##sign##_##MaskType##_##ModeType##_WL2: \ - cmp VIF_SIZE, (2*qsize); \ - \ - jl name##_##sign##_##MaskType##_##ModeType##_C2_Done3; \ -name##_##sign##_##MaskType##_##ModeType##_C2_Unpack: \ - UNPACK_##name##SSE_2(0, 0, MaskType, ModeType); \ - \ - add VIF_DST, VIF_INC; /* take into account wl */ \ - cmp VIF_SIZE, (4*qsize); \ - jl name##_##sign##_##MaskType##_##ModeType##_C2_Done2; \ - sub VIF_SIZE, (2*qsize); \ - jmp name##_##sign##_##MaskType##_##ModeType##_C2_Unpack; /* unpack next */ \ - \ -name##_##sign##_##MaskType##_##ModeType##_C2_Done2: \ - sub VIF_SIZE, (2*qsize); \ -name##_##sign##_##MaskType##_##ModeType##_C2_Done3: \ - cmp VIF_SIZE, qsize; \ - /* execute left over qw */ \ - jl name##_##sign##_##MaskType##_##ModeType##_C2_Done4; \ - UNPACK_##name##SSE_1(0, 0, MaskType, ModeType); \ - \ - sub VIF_SIZE, qsize; \ -name##_##sign##_##MaskType##_##ModeType##_C2_Done4: \ - \ - SAVE_ROW_REG; \ - mov eax, VIF_SIZE; \ - POP_REGS(); \ - ret; \ - \ -name##_##sign##_##MaskType##_##ModeType##_WL3: \ - cmp VIF_SIZE, (3*qsize); \ - \ - jl name##_##sign##_##MaskType##_##ModeType##_C3_Done5; \ -name##_##sign##_##MaskType##_##ModeType##_C3_Unpack: \ - UNPACK_##name##SSE_3(0, 0, MaskType, ModeType); \ - \ - add VIF_DST, VIF_INC; /* take into account wl */ \ - cmp VIF_SIZE, (6*qsize); \ - jl name##_##sign##_##MaskType##_##ModeType##_C3_Done2; \ - sub VIF_SIZE, (3*qsize); \ - jmp name##_##sign##_##MaskType##_##ModeType##_C3_Unpack; /* unpack next */ \ -name##_##sign##_##MaskType##_##ModeType##_C3_Done2: \ - sub VIF_SIZE, (3*qsize); \ -name##_##sign##_##MaskType##_##ModeType##_C3_Done5: \ - cmp VIF_SIZE, qsize; \ - jl name##_##sign##_##MaskType##_##ModeType##_C3_Done4; \ - \ - /* execute left over qw */ \ - cmp VIF_SIZE, (2*qsize); \ - jl name##_##sign##_##MaskType##_##ModeType##_C3_Done3; \ - \ - /* process 2 qws */ \ - UNPACK_##name##SSE_2(0, 0, MaskType, ModeType); \ - \ - sub VIF_SIZE, (2*qsize); \ - jmp name##_##sign##_##MaskType##_##ModeType##_C3_Done4; \ -name##_##sign##_##MaskType##_##ModeType##_C3_Done3: \ - /* process 1 qw */ \ - sub VIF_SIZE, qsize; \ - UNPACK_##name##SSE_1(0, 0, MaskType, ModeType); \ -name##_##sign##_##MaskType##_##ModeType##_C3_Done4: \ - SAVE_ROW_REG; \ - mov eax, VIF_SIZE; \ - POP_REGS(); \ - ret; \ - \ -name##_##sign##_##MaskType##_##ModeType##_WL4: /* >= 4 */ \ - sub VIF_SAVEEBX, 3; \ - push VIF_INC; \ - cmp VIF_SIZE, qsize; \ - jl name##_##sign##_##MaskType##_##ModeType##_C4_Done; \ - \ -name##_##sign##_##MaskType##_##ModeType##_C4_Unpack: \ - cmp VIF_SIZE, (3*qsize); \ - jge name##_##sign##_##MaskType##_##ModeType##_C4_Unpack3; \ - cmp VIF_SIZE, (2*qsize); \ - jge name##_##sign##_##MaskType##_##ModeType##_C4_Unpack2; \ - \ - UNPACK_##name##SSE_1(0, 0, MaskType, ModeType) \ - \ - /* not enough data left */ \ - sub VIF_SIZE, qsize; \ - jmp name##_##sign##_##MaskType##_##ModeType##_C4_Done; \ -name##_##sign##_##MaskType##_##ModeType##_C4_Unpack2: \ - UNPACK_##name##SSE_2(0, 0, MaskType, ModeType); \ - \ - /* not enough data left */ \ - sub VIF_SIZE, (2*qsize); \ - jmp name##_##sign##_##MaskType##_##ModeType##_C4_Done; \ -name##_##sign##_##MaskType##_##ModeType##_C4_Unpack3: \ - UNPACK_##name##SSE_3(0, 0, MaskType, ModeType); \ - \ - sub VIF_SIZE, (3*qsize); \ - /* more data left, process 1qw at a time */ \ - mov VIF_INC, VIF_SAVEEBX; \ - \ -name##_##sign##_##MaskType##_##ModeType##_C4_UnpackX: \ - /* check if any data left */ \ - cmp VIF_SIZE, qsize; \ - jl name##_##sign##_##MaskType##_##ModeType##_C4_Done; \ - \ - UNPACK_##name##SSE_1(3, 0, MaskType, ModeType); \ - \ - sub VIF_SIZE, qsize; \ - cmp VIF_INC, 1; \ - je name##_##sign##_##MaskType##_##ModeType##_C4_DoneLoop; \ - sub VIF_INC, 1; \ - jmp name##_##sign##_##MaskType##_##ModeType##_C4_UnpackX; \ -name##_##sign##_##MaskType##_##ModeType##_C4_DoneLoop: \ - add VIF_DST, [VIF_ESP]; /* take into account wl */ \ - cmp VIF_SIZE, qsize; \ - jl name##_##sign##_##MaskType##_##ModeType##_C4_Done; \ - jmp name##_##sign##_##MaskType##_##ModeType##_C4_Unpack; /* unpack next */ \ -name##_##sign##_##MaskType##_##ModeType##_C4_Done: \ - \ - SAVE_ROW_REG; \ - INC_STACK(); \ - mov eax, VIF_SIZE; \ - POP_REGS(); \ - ret; \ - -#define UNPACK_RIGHTSHIFT psrld -#define defUNPACK_SkippingWrite2(name, qsize) \ - defUNPACK_SkippingWrite(name, Regular, 0, qsize, u, SAVE_NO_REG) \ - defUNPACK_SkippingWrite(name, Regular, 1, qsize, u, SAVE_NO_REG) \ - defUNPACK_SkippingWrite(name, Regular, 2, qsize, u, SAVE_ROW_REG_BASE) \ - defUNPACK_SkippingWrite(name, Mask, 0, qsize, u, SAVE_NO_REG) \ - defUNPACK_SkippingWrite(name, Mask, 1, qsize, u, SAVE_NO_REG) \ - defUNPACK_SkippingWrite(name, Mask, 2, qsize, u, SAVE_ROW_REG_BASE) \ - defUNPACK_SkippingWrite(name, WriteMask, 0, qsize, u, SAVE_NO_REG) \ - defUNPACK_SkippingWrite(name, WriteMask, 1, qsize, u, SAVE_NO_REG) \ - defUNPACK_SkippingWrite(name, WriteMask, 2, qsize, u, SAVE_ROW_REG_BASE) \ - -defUNPACK_SkippingWrite2(S_32, 4) -defUNPACK_SkippingWrite2(S_16, 2) -defUNPACK_SkippingWrite2(S_8, 1) -defUNPACK_SkippingWrite2(V2_32, 8) -defUNPACK_SkippingWrite2(V2_16, 4) -defUNPACK_SkippingWrite2(V2_8, 2) -defUNPACK_SkippingWrite2(V3_32, 12) -defUNPACK_SkippingWrite2(V3_16, 6) -defUNPACK_SkippingWrite2(V3_8, 3) -defUNPACK_SkippingWrite2(V4_32, 16) -defUNPACK_SkippingWrite2(V4_16, 8) -defUNPACK_SkippingWrite2(V4_8, 4) -defUNPACK_SkippingWrite2(V4_5, 2) - -#undef UNPACK_RIGHTSHIFT -#undef defUNPACK_SkippingWrite2 - -#define UNPACK_RIGHTSHIFT psrad -#define defUNPACK_SkippingWrite2(name, qsize) \ - defUNPACK_SkippingWrite(name, Mask, 0, qsize, s, SAVE_NO_REG) \ - defUNPACK_SkippingWrite(name, Regular, 0, qsize, s, SAVE_NO_REG) \ - defUNPACK_SkippingWrite(name, Regular, 1, qsize, s, SAVE_NO_REG) \ - defUNPACK_SkippingWrite(name, Regular, 2, qsize, s, SAVE_ROW_REG_BASE) \ - defUNPACK_SkippingWrite(name, Mask, 1, qsize, s, SAVE_NO_REG) \ - defUNPACK_SkippingWrite(name, Mask, 2, qsize, s, SAVE_ROW_REG_BASE) \ - defUNPACK_SkippingWrite(name, WriteMask, 0, qsize, s, SAVE_NO_REG) \ - defUNPACK_SkippingWrite(name, WriteMask, 1, qsize, s, SAVE_NO_REG) \ - defUNPACK_SkippingWrite(name, WriteMask, 2, qsize, s, SAVE_ROW_REG_BASE) \ - -defUNPACK_SkippingWrite2(S_16, 2) -defUNPACK_SkippingWrite2(S_8, 1) -defUNPACK_SkippingWrite2(V2_16, 4) -defUNPACK_SkippingWrite2(V2_8, 2) -defUNPACK_SkippingWrite2(V3_16, 6) -defUNPACK_SkippingWrite2(V3_8, 3) -defUNPACK_SkippingWrite2(V4_16, 8) -defUNPACK_SkippingWrite2(V4_8, 4) - -#undef UNPACK_RIGHTSHIFT -#undef defUNPACK_SkippingWrite2 diff --git a/pcsx2/x86/aVif.asm b/pcsx2/x86/aVif.asm deleted file mode 100644 index d23423de40..0000000000 --- a/pcsx2/x86/aVif.asm +++ /dev/null @@ -1,1941 +0,0 @@ -; Pcsx2 - Pc Ps2 Emulator -; Copyright (C) 2002-2008 Pcsx2 Team -; -; This program is free software; you can redistribute it and/or modify -; it under the terms of the GNU General Public License as published by -; the Free Software Foundation; either version 2 of the License, or -; (at your option) any later version. - -; This program is distributed in the hope that it will be useful, -; but WITHOUT ANY WARRANTY; without even the implied warranty of -; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -; GNU General Public License for more details. -; -; You should have received a copy of the GNU General Public License -; along with this program; if not, write to the Free Software -; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA - -;; Fast VIF assembly routines for UNPACK zerofrog(@gmail.com) -;; NOTE: This file is used to build aVif_proc-[32/64].asm because ml has a very -;; weak preprocessor. To generate the files, install nasm and run the following command: -;; aVif_proc-32.asm: nasmw -e aVif.asm > aVif_proc-32.asm -;; aVif_proc-64.asm: nasmw -e -D__x86_64__ aVif.asm > aVif_proc-64.asm -;; once the files are built, remove all lines starting with %line -;; and remove the brackets from the exports - -%ifndef __x86_64__ -.686 -.model flat, c -.mmx -.xmm -%endif - -extern _vifRegs:abs -extern _vifMaskRegs:abs -extern _vifRow:abs -extern _vifCol:abs -extern s_TempDecompress:abs - - -.code - - -%ifdef __x86_64__ -%define VIF_ESP rsp -%define VIF_SRC rdx -%define VIF_INC rdi -%define VIF_DST rcx -%define VIF_SIZE r8d -%define VIF_TMPADDR rax -%define VIF_SAVEEBX r9 -%define VIF_SAVEEBXd r9d -%else -%define VIF_ESP esp -%define VIF_SRC esi -%define VIF_INC ecx -%define VIF_DST edi -%define VIF_SIZE edx -%define VIF_TMPADDR eax -%define VIF_SAVEEBX ebx -%define VIF_SAVEEBXd ebx -%endif - -%define XMM_R0 xmm0 -%define XMM_R1 xmm1 -%define XMM_R2 xmm2 -%define XMM_WRITEMASK xmm3 -%define XMM_ROWMASK xmm4 -%define XMM_ROWCOLMASK xmm5 -%define XMM_ROW xmm6 -%define XMM_COL xmm7 -%define XMM_R3 XMM_COL - -;; writing masks -UNPACK_Write0_Regular macro r0, CL, DEST_OFFSET, MOVDQA - MOVDQA [VIF_DST+DEST_OFFSET], r0 - endm - -UNPACK_Write1_Regular macro r0, CL, DEST_OFFSET, MOVDQA - MOVDQA [VIF_DST], r0 - add VIF_DST, VIF_INC - endm - -UNPACK_Write0_Mask macro r0, CL, DEST_OFFSET, MOVDQA - UNPACK_Write0_Regular r0, CL, DEST_OFFSET, MOVDQA - endm - -UNPACK_Write1_Mask macro r0, CL, DEST_OFFSET, MOVDQA - UNPACK_Write1_Regular r0, CL, DEST_OFFSET, MOVDQA - endm - -;; masked write (dest needs to be in edi) -UNPACK_Write0_WriteMask macro r0, CL, DEST_OFFSET, MOVDQA - ;; masked write (dest needs to be in edi) - movdqa XMM_WRITEMASK, [VIF_TMPADDR + 64*(CL) + 48] - pand r0, XMM_WRITEMASK - pandn XMM_WRITEMASK, [VIF_DST] - por r0, XMM_WRITEMASK - MOVDQA [VIF_DST], r0 - add VIF_DST, 16 - endm - -;; masked write (dest needs to be in edi) -UNPACK_Write1_WriteMask macro r0, CL, DEST_OFFSET, MOVDQA - ;; masked write (dest needs to be in edi) - movdqa XMM_WRITEMASK, [VIF_TMPADDR + 64*(0) + 48] - pand r0, XMM_WRITEMASK - pandn XMM_WRITEMASK, [VIF_DST] - por r0, XMM_WRITEMASK - MOVDQA [VIF_DST], r0 - add VIF_DST, VIF_INC - endm - -UNPACK_Mask_SSE_0 macro r0 - pand r0, XMM_WRITEMASK - por r0, XMM_ROWCOLMASK - endm - -;; once a qword is uncomprssed, applies masks and saves -;; note: modifying XMM_WRITEMASK -;; dest = row + write (only when mask=0), otherwise write -UNPACK_Mask_SSE_1 macro r0 - ;; dest = row + write (only when mask=0), otherwise write - pand r0, XMM_WRITEMASK - por r0, XMM_ROWCOLMASK - pand XMM_WRITEMASK, XMM_ROW - paddd r0, XMM_WRITEMASK - endm - -;; dest = row + write (only when mask=0), otherwise write -;; row = row + write (only when mask = 0), otherwise row -UNPACK_Mask_SSE_2 macro r0 - ;; dest = row + write (only when mask=0), otherwise write - ;; row = row + write (only when mask = 0), otherwise row - pand r0, XMM_WRITEMASK - pand XMM_WRITEMASK, XMM_ROW - paddd XMM_ROW, r0 - por r0, XMM_ROWCOLMASK - paddd r0, XMM_WRITEMASK - endm - -UNPACK_WriteMask_SSE_0 macro r0 - UNPACK_Mask_SSE_0 r0 - endm -UNPACK_WriteMask_SSE_1 macro r0 - UNPACK_Mask_SSE_1 r0 - endm -UNPACK_WriteMask_SSE_2 macro r0 - UNPACK_Mask_SSE_2 r0 - endm - -UNPACK_Regular_SSE_0 macro r0 - endm - -UNPACK_Regular_SSE_1 macro r0 - paddd r0, XMM_ROW - endm - -UNPACK_Regular_SSE_2 macro r0 - paddd r0, XMM_ROW - movdqa XMM_ROW, r0 - endm - -;; setting up masks -UNPACK_Setup_Mask_SSE macro CL - mov VIF_TMPADDR, [_vifMaskRegs] - movdqa XMM_ROWMASK, [VIF_TMPADDR + 64*(CL) + 16] - movdqa XMM_ROWCOLMASK, [VIF_TMPADDR + 64*(CL) + 32] - movdqa XMM_WRITEMASK, [VIF_TMPADDR + 64*(CL)] - pand XMM_ROWMASK, XMM_ROW - pand XMM_ROWCOLMASK, XMM_COL - por XMM_ROWCOLMASK, XMM_ROWMASK - endm - -UNPACK_Start_Setup_Mask_SSE_0 macro CL - UNPACK_Setup_Mask_SSE CL - endm - -UNPACK_Start_Setup_Mask_SSE_1 macro CL - mov VIF_TMPADDR, [_vifMaskRegs] - movdqa XMM_ROWMASK, [VIF_TMPADDR + 64*(CL) + 16] - movdqa XMM_ROWCOLMASK, [VIF_TMPADDR + 64*(CL) + 32] - pand XMM_ROWMASK, XMM_ROW - pand XMM_ROWCOLMASK, XMM_COL - por XMM_ROWCOLMASK, XMM_ROWMASK - endm - -UNPACK_Start_Setup_Mask_SSE_2 macro CL - endm - -UNPACK_Setup_Mask_SSE_0_1 macro CL - endm -UNPACK_Setup_Mask_SSE_1_1 macro CL - mov VIF_TMPADDR, [_vifMaskRegs] - movdqa XMM_WRITEMASK, [VIF_TMPADDR + 64*(0)] - endm - -;; ignore CL, since vif.cycle.wl == 1 -UNPACK_Setup_Mask_SSE_2_1 macro CL - ;; ignore CL, since vif.cycle.wl == 1 - mov VIF_TMPADDR, [_vifMaskRegs] - movdqa XMM_ROWMASK, [VIF_TMPADDR + 64*(0) + 16] - movdqa XMM_ROWCOLMASK, [VIF_TMPADDR + 64*(0) + 32] - movdqa XMM_WRITEMASK, [VIF_TMPADDR + 64*(0)] - pand XMM_ROWMASK, XMM_ROW - pand XMM_ROWCOLMASK, XMM_COL - por XMM_ROWCOLMASK, XMM_ROWMASK - endm - -UNPACK_Setup_Mask_SSE_0_0 macro CL - UNPACK_Setup_Mask_SSE CL - endm -UNPACK_Setup_Mask_SSE_1_0 macro CL - UNPACK_Setup_Mask_SSE CL - endm -UNPACK_Setup_Mask_SSE_2_0 macro CL - UNPACK_Setup_Mask_SSE CL - endm - -;; write mask always destroys XMM_WRITEMASK, so 0_0 = 1_0 -UNPACK_Setup_WriteMask_SSE_0_0 macro CL - UNPACK_Setup_Mask_SSE CL - endm -UNPACK_Setup_WriteMask_SSE_1_0 macro CL - UNPACK_Setup_Mask_SSE CL - endm -UNPACK_Setup_WriteMask_SSE_2_0 macro CL - UNPACK_Setup_Mask_SSE CL - endm -UNPACK_Setup_WriteMask_SSE_0_1 macro CL - UNPACK_Setup_Mask_SSE_1_1 CL - endm - -UNPACK_Setup_WriteMask_SSE_1_1 macro CL - UNPACK_Setup_Mask_SSE_1_1 CL - endm - -UNPACK_Setup_WriteMask_SSE_2_1 macro CL - UNPACK_Setup_Mask_SSE_2_1 CL - endm - -UNPACK_Start_Setup_WriteMask_SSE_0 macro CL - UNPACK_Start_Setup_Mask_SSE_1 CL - endm -UNPACK_Start_Setup_WriteMask_SSE_1 macro CL - UNPACK_Start_Setup_Mask_SSE_1 CL - endm -UNPACK_Start_Setup_WriteMask_SSE_2 macro CL - UNPACK_Start_Setup_Mask_SSE_2 CL - endm - -UNPACK_Start_Setup_Regular_SSE_0 macro CL - endm -UNPACK_Start_Setup_Regular_SSE_1 macro CL - endm -UNPACK_Start_Setup_Regular_SSE_2 macro CL - endm -UNPACK_Setup_Regular_SSE_0_0 macro CL - endm -UNPACK_Setup_Regular_SSE_1_0 macro CL - endm -UNPACK_Setup_Regular_SSE_2_0 macro CL - endm -UNPACK_Setup_Regular_SSE_0_1 macro CL - endm -UNPACK_Setup_Regular_SSE_1_1 macro CL - endm -UNPACK_Setup_Regular_SSE_2_1 macro CL - endm - -UNPACK_INC_DST_0_Regular macro qw - add VIF_DST, (16*qw) - endm -UNPACK_INC_DST_1_Regular macro qw - endm -UNPACK_INC_DST_0_Mask macro qw - add VIF_DST, (16*qw) - endm -UNPACK_INC_DST_1_Mask macro qw - endm -UNPACK_INC_DST_0_WriteMask macro qw - endm -UNPACK_INC_DST_1_WriteMask macro qw - endm - -;; unpacks for 1,2,3,4 elements (V3 uses this directly) -UNPACK4_SSE macro CL, TOTALCL, MaskType, ModeType - @CatStr(UNPACK_Setup_, MaskType, _SSE_, ModeType, _, TOTALCL) CL+0 - @CatStr(UNPACK_, MaskType, _SSE_, ModeType) XMM_R0 - @CatStr(UNPACK_Write, TOTALCL, _, MaskType) XMM_R0, CL, 0, movdqa - - @CatStr(UNPACK_Setup_, MaskType, _SSE_, ModeType, _, TOTALCL) CL+1 - @CatStr(UNPACK_, MaskType, _SSE_, ModeType) XMM_R1 - @CatStr(UNPACK_Write, TOTALCL, _, MaskType) XMM_R1, CL+1, 16, movdqa - - @CatStr(UNPACK_Setup_, MaskType, _SSE_, ModeType, _, TOTALCL) CL+2 - @CatStr(UNPACK_, MaskType, _SSE_, ModeType) XMM_R2 - @CatStr(UNPACK_Write, TOTALCL, _, MaskType) XMM_R2, CL+2, 32, movdqa - - @CatStr(UNPACK_Setup_, MaskType, _SSE_, ModeType, _, TOTALCL) CL+3 - @CatStr(UNPACK_, MaskType, _SSE_, ModeType) XMM_R3 - @CatStr(UNPACK_Write, TOTALCL, _, MaskType) XMM_R3, CL+3, 48, movdqa - - @CatStr(UNPACK_INC_DST_, TOTALCL, _, MaskType) 4 - endm - -;; V3 uses this directly -UNPACK3_SSE macro CL, TOTALCL, MaskType, ModeType - @CatStr(UNPACK_Setup_, MaskType, _SSE_, ModeType, _, TOTALCL) CL - @CatStr(UNPACK_, MaskType, _SSE_, ModeType) XMM_R0 - @CatStr(UNPACK_Write, TOTALCL, _, MaskType) XMM_R0, CL, 0, movdqa - - @CatStr(UNPACK_Setup_, MaskType, _SSE_, ModeType, _, TOTALCL) CL+1 - @CatStr(UNPACK_, MaskType, _SSE_, ModeType) XMM_R1 - @CatStr(UNPACK_Write, TOTALCL, _, MaskType) XMM_R1, CL+1, 16, movdqa - - @CatStr(UNPACK_Setup_, MaskType, _SSE_, ModeType, _, TOTALCL) CL+2 - @CatStr(UNPACK_, MaskType, _SSE_, ModeType) XMM_R2 - @CatStr(UNPACK_Write, TOTALCL, _, MaskType) XMM_R2, CL+2, 32, movdqa - - @CatStr(UNPACK_INC_DST_, TOTALCL, _, MaskType) 3 - endm - -UNPACK2_SSE macro CL, TOTALCL, MaskType, ModeType - @CatStr(UNPACK_Setup_, MaskType, _SSE_, ModeType, _, TOTALCL) CL - @CatStr(UNPACK_, MaskType, _SSE_, ModeType) XMM_R0 - @CatStr(UNPACK_Write, TOTALCL, _, MaskType) XMM_R0, CL, 0, movdqa - - @CatStr(UNPACK_Setup_, MaskType, _SSE_, ModeType, _, TOTALCL) CL+1 - @CatStr(UNPACK_, MaskType, _SSE_, ModeType) XMM_R1 - @CatStr(UNPACK_Write, TOTALCL, _, MaskType) XMM_R1, CL+1, 16, movdqa - - @CatStr(UNPACK_INC_DST_, TOTALCL, _, MaskType) 2 - endm - -UNPACK1_SSE macro CL, TOTALCL, MaskType, ModeType - @CatStr(UNPACK_Setup_, MaskType, _SSE_, ModeType, _, TOTALCL) CL - @CatStr(UNPACK_, MaskType, _SSE_, ModeType) XMM_R0 - @CatStr(UNPACK_Write, TOTALCL, _, MaskType) XMM_R0, CL, 0, movdqa - - @CatStr(UNPACK_INC_DST_, TOTALCL, _, MaskType) 1 - endm - -;; S-32 -;; only when cl==1 -UNPACK_S_32SSE_4x macro CL, TOTALCL, MaskType, ModeType, MOVDQA - MOVDQA XMM_R3, [VIF_SRC] - - pshufd XMM_R0, XMM_R3, 0 - pshufd XMM_R1, XMM_R3, 055h - pshufd XMM_R2, XMM_R3, 0aah - pshufd XMM_R3, XMM_R3, 0ffh - - UNPACK4_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 16 - endm - -UNPACK_S_32SSE_4A macro CL, TOTALCL, MaskType, ModeType - UNPACK_S_32SSE_4x CL, TOTALCL, MaskType, ModeType, movdqa - endm -UNPACK_S_32SSE_4 macro CL, TOTALCL, MaskType, ModeType - UNPACK_S_32SSE_4x CL, TOTALCL, MaskType, ModeType, movdqu - endm - -UNPACK_S_32SSE_3x macro CL, TOTALCL, MaskType, ModeType, MOVDQA - MOVDQA XMM_R2, [VIF_SRC] - - pshufd XMM_R0, XMM_R2, 0 - pshufd XMM_R1, XMM_R2, 055h - pshufd XMM_R2, XMM_R2, 0aah - - UNPACK3_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 12 - endm - -UNPACK_S_32SSE_3A macro CL, TOTALCL, MaskType, ModeType - UNPACK_S_32SSE_3x CL, TOTALCL, MaskType, ModeType, movdqa - endm -UNPACK_S_32SSE_3 macro CL, TOTALCL, MaskType, ModeType - UNPACK_S_32SSE_3x CL, TOTALCL, MaskType, ModeType, movdqu - endm - -UNPACK_S_32SSE_2 macro CL, TOTALCL, MaskType, ModeType - movq XMM_R1, QWORD PTR [VIF_SRC] - - pshufd XMM_R0, XMM_R1, 0 - pshufd XMM_R1, XMM_R1, 055h - - UNPACK2_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 8 - endm - -UNPACK_S_32SSE_2A macro CL, TOTALCL, MaskType, ModeType - UNPACK_S_32SSE_2 CL, TOTALCL, MaskType, ModeType - endm - -UNPACK_S_32SSE_1 macro CL, TOTALCL, MaskType, ModeType - movd XMM_R0, dword ptr [VIF_SRC] - pshufd XMM_R0, XMM_R0, 0 - - UNPACK1_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 4 - endm - -UNPACK_S_32SSE_1A macro CL, TOTALCL, MaskType, ModeType - UNPACK_S_32SSE_1 CL, TOTALCL, MaskType, ModeType - endm - -;; S-16 -UNPACK_S_16SSE_4 macro CL, TOTALCL, MaskType, ModeType - movq XMM_R3, QWORD PTR [VIF_SRC] - punpcklwd XMM_R3, XMM_R3 - UNPACK_RIGHTSHIFT XMM_R3, 16 - - pshufd XMM_R0, XMM_R3, 0 - pshufd XMM_R1, XMM_R3, 055h - pshufd XMM_R2, XMM_R3, 0aah - pshufd XMM_R3, XMM_R3, 0ffh - - UNPACK4_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 8 - endm - -UNPACK_S_16SSE_4A macro CL, TOTALCL, MaskType, ModeType - UNPACK_S_16SSE_4 CL, TOTALCL, MaskType, ModeType - endm - -UNPACK_S_16SSE_3 macro CL, TOTALCL, MaskType, ModeType - movq XMM_R2, QWORD PTR [VIF_SRC] - punpcklwd XMM_R2, XMM_R2 - UNPACK_RIGHTSHIFT XMM_R2, 16 - - pshufd XMM_R0, XMM_R2, 0 - pshufd XMM_R1, XMM_R2, 055h - pshufd XMM_R2, XMM_R2, 0aah - - UNPACK3_SSE CL, TOTALCL, MaskType, ModeType - add VIF_SRC, 6 - endm - -UNPACK_S_16SSE_3A macro CL, TOTALCL, MaskType, ModeType - UNPACK_S_16SSE_3 CL, TOTALCL, MaskType, ModeType - endm - -UNPACK_S_16SSE_2 macro CL, TOTALCL, MaskType, ModeType - movd XMM_R1, dword ptr [VIF_SRC] - punpcklwd XMM_R1, XMM_R1 - UNPACK_RIGHTSHIFT XMM_R1, 16 - - pshufd XMM_R0, XMM_R1, 0 - pshufd XMM_R1, XMM_R1, 055h - - UNPACK2_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 4 - endm - -UNPACK_S_16SSE_2A macro CL, TOTALCL, MaskType, ModeType - UNPACK_S_16SSE_2 CL, TOTALCL, MaskType, ModeType - endm - -UNPACK_S_16SSE_1 macro CL, TOTALCL, MaskType, ModeType - movd XMM_R0, dword ptr [VIF_SRC] - punpcklwd XMM_R0, XMM_R0 - UNPACK_RIGHTSHIFT XMM_R0, 16 - pshufd XMM_R0, XMM_R0, 0 - - UNPACK1_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 2 - endm - -UNPACK_S_16SSE_1A macro CL, TOTALCL, MaskType, ModeType - UNPACK_S_16SSE_1 CL, TOTALCL, MaskType, ModeType - endm - -;; S-8 -UNPACK_S_8SSE_4 macro CL, TOTALCL, MaskType, ModeType - movd XMM_R3, dword ptr [VIF_SRC] - punpcklbw XMM_R3, XMM_R3 - punpcklwd XMM_R3, XMM_R3 - UNPACK_RIGHTSHIFT XMM_R3, 24 - - pshufd XMM_R0, XMM_R3, 0 - pshufd XMM_R1, XMM_R3, 055h - pshufd XMM_R2, XMM_R3, 0aah - pshufd XMM_R3, XMM_R3, 0ffh - - UNPACK4_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 4 - endm - -UNPACK_S_8SSE_4A macro CL, TOTALCL, MaskType, ModeType - UNPACK_S_8SSE_4 CL, TOTALCL, MaskType, ModeType - endm - -UNPACK_S_8SSE_3 macro CL, TOTALCL, MaskType, ModeType - movd XMM_R2, dword ptr [VIF_SRC] - punpcklbw XMM_R2, XMM_R2 - punpcklwd XMM_R2, XMM_R2 - UNPACK_RIGHTSHIFT XMM_R2, 24 - - pshufd XMM_R0, XMM_R2, 0 - pshufd XMM_R1, XMM_R2, 055h - pshufd XMM_R2, XMM_R2, 0aah - - UNPACK3_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 3 - endm - -UNPACK_S_8SSE_3A macro CL, TOTALCL, MaskType, ModeType - UNPACK_S_8SSE_3 CL, TOTALCL, MaskType, ModeType - endm - -UNPACK_S_8SSE_2 macro CL, TOTALCL, MaskType, ModeType - movd XMM_R1, dword ptr [VIF_SRC] - punpcklbw XMM_R1, XMM_R1 - punpcklwd XMM_R1, XMM_R1 - UNPACK_RIGHTSHIFT XMM_R1, 24 - - pshufd XMM_R0, XMM_R1, 0 - pshufd XMM_R1, XMM_R1, 055h - - UNPACK2_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 2 - endm - -UNPACK_S_8SSE_2A macro CL, TOTALCL, MaskType, ModeType - UNPACK_S_8SSE_2 CL, TOTALCL, MaskType, ModeType - endm - -UNPACK_S_8SSE_1 macro CL, TOTALCL, MaskType, ModeType - movd XMM_R0, dword ptr [VIF_SRC] - punpcklbw XMM_R0, XMM_R0 - punpcklwd XMM_R0, XMM_R0 - UNPACK_RIGHTSHIFT XMM_R0, 24 - pshufd XMM_R0, XMM_R0, 0 - - UNPACK1_SSE CL, TOTALCL, MaskType, ModeType - - inc VIF_SRC - endm - -UNPACK_S_8SSE_1A macro CL, TOTALCL, MaskType, ModeType - UNPACK_S_8SSE_1 CL, TOTALCL, MaskType, ModeType - endm - -;; V2-32 -UNPACK_V2_32SSE_4A macro CL, TOTALCL, MaskType, ModeType - MOVDQA XMM_R0, [VIF_SRC] - MOVDQA XMM_R2, [VIF_SRC+16] - - pshufd XMM_R1, XMM_R0, 0eeh - pshufd XMM_R3, XMM_R2, 0eeh - - UNPACK4_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 32 - endm - -UNPACK_V2_32SSE_4 macro CL, TOTALCL, MaskType, ModeType - movq XMM_R0, QWORD PTR [VIF_SRC] - movq XMM_R1, QWORD PTR [VIF_SRC+8] - movq XMM_R2, QWORD PTR [VIF_SRC+16] - movq XMM_R3, QWORD PTR [VIF_SRC+24] - - UNPACK4_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 32 - endm - -UNPACK_V2_32SSE_3A macro CL, TOTALCL, MaskType, ModeType - MOVDQA XMM_R0, [VIF_SRC] - movq XMM_R2, QWORD PTR [VIF_SRC+16] - pshufd XMM_R1, XMM_R0, 0eeh - - UNPACK3_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 24 - endm - -UNPACK_V2_32SSE_3 macro CL, TOTALCL, MaskType, ModeType - movq XMM_R0, QWORD PTR [VIF_SRC] - movq XMM_R1, QWORD PTR [VIF_SRC+8] - movq XMM_R2, QWORD PTR [VIF_SRC+16] - - UNPACK3_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 24 - endm - -UNPACK_V2_32SSE_2 macro CL, TOTALCL, MaskType, ModeType - movq XMM_R0, QWORD PTR [VIF_SRC] - movq XMM_R1, QWORD PTR [VIF_SRC+8] - - UNPACK2_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 16 - endm - -UNPACK_V2_32SSE_2A macro CL, TOTALCL, MaskType, ModeType - UNPACK_V2_32SSE_2 CL, TOTALCL, MaskType, ModeType - endm - -UNPACK_V2_32SSE_1 macro CL, TOTALCL, MaskType, ModeType - movq XMM_R0, QWORD PTR [VIF_SRC] - - UNPACK1_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 8 - endm - -UNPACK_V2_32SSE_1A macro CL, TOTALCL, MaskType, ModeType - UNPACK_V2_32SSE_1 CL, TOTALCL, MaskType, ModeType - endm - -;; V2-16 -;; due to lemmings, have to copy lower qword to the upper qword of every reg -UNPACK_V2_16SSE_4A macro CL, TOTALCL, MaskType, ModeType - punpcklwd XMM_R0, [VIF_SRC] - punpckhwd XMM_R2, [VIF_SRC] - - UNPACK_RIGHTSHIFT XMM_R0, 16 - UNPACK_RIGHTSHIFT XMM_R2, 16 - ;; move the lower 64 bits down - punpckhqdq XMM_R1, XMM_R0 - punpckhqdq XMM_R3, XMM_R2 - - punpcklqdq XMM_R0, XMM_R0 - punpcklqdq XMM_R2, XMM_R2 - punpckhqdq XMM_R1, XMM_R1 - punpckhqdq XMM_R3, XMM_R3 - - UNPACK4_SSE CL, TOTALCL, MaskType, ModeType - add VIF_SRC, 16 - endm - -UNPACK_V2_16SSE_4 macro CL, TOTALCL, MaskType, ModeType - movdqu XMM_R0, [VIF_SRC] - - punpckhwd XMM_R2, XMM_R0 - punpcklwd XMM_R0, XMM_R0 - - UNPACK_RIGHTSHIFT XMM_R0, 16 - UNPACK_RIGHTSHIFT XMM_R2, 16 - - ;; move the lower 64 bits down - punpckhqdq XMM_R1, XMM_R0 - punpckhqdq XMM_R3, XMM_R2 - - punpcklqdq XMM_R0, XMM_R0 - punpcklqdq XMM_R2, XMM_R2 - punpckhqdq XMM_R1, XMM_R1 - punpckhqdq XMM_R3, XMM_R3 - - UNPACK4_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 16 - endm - -UNPACK_V2_16SSE_3A macro CL, TOTALCL, MaskType, ModeType - punpcklwd XMM_R0, [VIF_SRC] - punpckhwd XMM_R2, [VIF_SRC] - - UNPACK_RIGHTSHIFT XMM_R0, 16 - UNPACK_RIGHTSHIFT XMM_R2, 16 - - ;; move the lower 64 bits down - punpckhqdq XMM_R1, XMM_R0 - - punpcklqdq XMM_R0, XMM_R0 - punpcklqdq XMM_R2, XMM_R2 - punpckhqdq XMM_R1, XMM_R1 - - UNPACK3_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 12 - endm - -UNPACK_V2_16SSE_3 macro CL, TOTALCL, MaskType, ModeType - movdqu XMM_R0, [VIF_SRC] - - punpckhwd XMM_R2, XMM_R0 - punpcklwd XMM_R0, XMM_R0 - - UNPACK_RIGHTSHIFT XMM_R0, 16 - UNPACK_RIGHTSHIFT XMM_R2, 16 - - ;; move the lower 64 bits down - punpckhqdq XMM_R1, XMM_R0 - - punpcklqdq XMM_R0, XMM_R0 - punpcklqdq XMM_R2, XMM_R2 - punpckhqdq XMM_R1, XMM_R1 - - UNPACK3_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 12 - endm - -UNPACK_V2_16SSE_2A macro CL, TOTALCL, MaskType, ModeType - punpcklwd XMM_R0, [VIF_SRC] - UNPACK_RIGHTSHIFT XMM_R0, 16 - - ;; move the lower 64 bits down - punpckhqdq XMM_R1, XMM_R0 - - punpcklqdq XMM_R0, XMM_R0 - punpckhqdq XMM_R1, XMM_R1 - - UNPACK2_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 8 - endm - -UNPACK_V2_16SSE_2 macro CL, TOTALCL, MaskType, ModeType - movq XMM_R0, QWORD PTR [VIF_SRC] - punpcklwd XMM_R0, XMM_R0 - UNPACK_RIGHTSHIFT XMM_R0, 16 - - ;; move the lower 64 bits down - punpckhqdq XMM_R1, XMM_R0 - - punpcklqdq XMM_R0, XMM_R0 - punpckhqdq XMM_R1, XMM_R1 - - UNPACK2_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 8 - endm - -UNPACK_V2_16SSE_1A macro CL, TOTALCL, MaskType, ModeType - punpcklwd XMM_R0, [VIF_SRC] - UNPACK_RIGHTSHIFT XMM_R0, 16 - punpcklqdq XMM_R0, XMM_R0 - - UNPACK1_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 4 - endm - -UNPACK_V2_16SSE_1 macro CL, TOTALCL, MaskType, ModeType - movd XMM_R0, dword ptr [VIF_SRC] - punpcklwd XMM_R0, XMM_R0 - UNPACK_RIGHTSHIFT XMM_R0, 16 - punpcklqdq XMM_R0, XMM_R0 - - UNPACK1_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 4 - endm - -;; V2-8 -;; and1 streetball needs to copy lower qword to the upper qword of every reg -UNPACK_V2_8SSE_4 macro CL, TOTALCL, MaskType, ModeType - movq XMM_R0, QWORD PTR [VIF_SRC] - - punpcklbw XMM_R0, XMM_R0 - punpckhwd XMM_R2, XMM_R0 - punpcklwd XMM_R0, XMM_R0 - - UNPACK_RIGHTSHIFT XMM_R0, 24 - UNPACK_RIGHTSHIFT XMM_R2, 24 - - ;; move the lower 64 bits down - punpckhqdq XMM_R1, XMM_R0 - punpckhqdq XMM_R3, XMM_R2 - - punpcklqdq XMM_R0, XMM_R0 - punpcklqdq XMM_R2, XMM_R2 - punpckhqdq XMM_R1, XMM_R1 - punpckhqdq XMM_R3, XMM_R3 - - UNPACK4_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 8 - endm - -UNPACK_V2_8SSE_4A macro CL, TOTALCL, MaskType, ModeType - UNPACK_V2_8SSE_4 CL, TOTALCL, MaskType, ModeType - endm - -UNPACK_V2_8SSE_3 macro CL, TOTALCL, MaskType, ModeType - movq XMM_R0, QWORD PTR [VIF_SRC] - - punpcklbw XMM_R0, XMM_R0 - punpckhwd XMM_R2, XMM_R0 - punpcklwd XMM_R0, XMM_R0 - - UNPACK_RIGHTSHIFT XMM_R0, 24 - UNPACK_RIGHTSHIFT XMM_R2, 24 - - ;; move the lower 64 bits down - punpckhqdq XMM_R1, XMM_R0 - - punpcklqdq XMM_R0, XMM_R0 - punpcklqdq XMM_R2, XMM_R2 - punpckhqdq XMM_R1, XMM_R1 - - UNPACK3_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 6 - endm - -UNPACK_V2_8SSE_3A macro CL, TOTALCL, MaskType, ModeType - UNPACK_V2_8SSE_3 CL, TOTALCL, MaskType, ModeType - endm - -UNPACK_V2_8SSE_2 macro CL, TOTALCL, MaskType, ModeType - movd XMM_R0, dword ptr [VIF_SRC] - punpcklbw XMM_R0, XMM_R0 - punpcklwd XMM_R0, XMM_R0 - UNPACK_RIGHTSHIFT XMM_R0, 24 - - ;; move the lower 64 bits down - punpckhqdq XMM_R1, XMM_R0 - - punpcklqdq XMM_R0, XMM_R0 - punpckhqdq XMM_R1, XMM_R1 - - UNPACK2_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 4 - endm - -UNPACK_V2_8SSE_2A macro CL, TOTALCL, MaskType, ModeType - UNPACK_V2_8SSE_2 CL, TOTALCL, MaskType, ModeType - endm - -UNPACK_V2_8SSE_1 macro CL, TOTALCL, MaskType, ModeType - movd XMM_R0, dword ptr [VIF_SRC] - punpcklbw XMM_R0, XMM_R0 - punpcklwd XMM_R0, XMM_R0 - UNPACK_RIGHTSHIFT XMM_R0, 24 - punpcklqdq XMM_R0, XMM_R0 - - UNPACK1_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 2 - endm - -UNPACK_V2_8SSE_1A macro CL, TOTALCL, MaskType, ModeType - UNPACK_V2_8SSE_1 CL, TOTALCL, MaskType, ModeType - endm - -;; V3-32 -UNPACK_V3_32SSE_4x macro CL, TOTALCL, MaskType, ModeType, MOVDQA - MOVDQA XMM_R0, [VIF_SRC] - movdqu XMM_R1, [VIF_SRC+12] - - @CatStr(UNPACK_Setup_, MaskType, _SSE_, ModeType, _, TOTALCL) CL+0 - @CatStr(UNPACK_, MaskType, _SSE_, ModeType) XMM_R0 - @CatStr(UNPACK_Write, TOTALCL, _, MaskType) XMM_R0, CL, 0, movdqa - - @CatStr(UNPACK_Setup_, MaskType, _SSE_, ModeType, _, TOTALCL) CL+1 - @CatStr(UNPACK_, MaskType, _SSE_, ModeType) XMM_R1 - @CatStr(UNPACK_Write, TOTALCL, _, MaskType) XMM_R1, CL+1, 16, movdqa - - ;; midnight club 2 crashes because reading a qw at +36 is out of bounds - MOVDQA XMM_R3, [VIF_SRC+32] - movdqu XMM_R2, [VIF_SRC+24] - psrldq XMM_R3, 4 - - @CatStr(UNPACK_Setup_, MaskType, _SSE_, ModeType, _, TOTALCL) CL+2 - @CatStr(UNPACK_, MaskType, _SSE_, ModeType) XMM_R2 - @CatStr(UNPACK_Write, TOTALCL, _, MaskType) XMM_R2, CL+2, 32, movdqa - - @CatStr(UNPACK_Setup_, MaskType, _SSE_, ModeType, _, TOTALCL) CL+3 - @CatStr(UNPACK_, MaskType, _SSE_, ModeType) XMM_R3 - @CatStr(UNPACK_Write, TOTALCL, _, MaskType) XMM_R3, CL+3, 48, movdqa - - @CatStr(UNPACK_INC_DST_, TOTALCL, _, MaskType) 4 - - add VIF_SRC, 48 - endm - -UNPACK_V3_32SSE_4A macro CL, TOTALCL, MaskType, ModeType - UNPACK_V3_32SSE_4x CL, TOTALCL, MaskType, ModeType, movdqa - endm -UNPACK_V3_32SSE_4 macro CL, TOTALCL, MaskType, ModeType - UNPACK_V3_32SSE_4x CL, TOTALCL, MaskType, ModeType, movdqu - endm - -UNPACK_V3_32SSE_3x macro CL, TOTALCL, MaskType, ModeType, MOVDQA - MOVDQA XMM_R0, [VIF_SRC] - movdqu XMM_R1, [VIF_SRC+12] - - @CatStr(UNPACK_Setup_, MaskType, _SSE_, ModeType, _, TOTALCL) CL - @CatStr(UNPACK_, MaskType, _SSE_, ModeType) XMM_R0 - @CatStr(UNPACK_Write, TOTALCL, _, MaskType) XMM_R0, CL, 0, movdqa - - @CatStr(UNPACK_Setup_, MaskType, _SSE_, ModeType, _, TOTALCL) CL+1 - @CatStr(UNPACK_, MaskType, _SSE_, ModeType) XMM_R1 - @CatStr(UNPACK_Write, TOTALCL, _, MaskType) XMM_R1, CL+1, 16, movdqa - - movdqu XMM_R2, [VIF_SRC+24] - - @CatStr(UNPACK_Setup_, MaskType, _SSE_, ModeType, _, TOTALCL) CL+2 - @CatStr(UNPACK_, MaskType, _SSE_, ModeType) XMM_R2 - @CatStr(UNPACK_Write, TOTALCL, _, MaskType) XMM_R2, CL+2, 32, movdqa - - @CatStr(UNPACK_INC_DST_, TOTALCL, _, MaskType) 3 - - add VIF_SRC, 36 - endm - -UNPACK_V3_32SSE_3A macro CL, TOTALCL, MaskType, ModeType - UNPACK_V3_32SSE_3x CL, TOTALCL, MaskType, ModeType, movdqa - endm -UNPACK_V3_32SSE_3 macro CL, TOTALCL, MaskType, ModeType - UNPACK_V3_32SSE_3x CL, TOTALCL, MaskType, ModeType, movdqu - endm - -UNPACK_V3_32SSE_2x macro CL, TOTALCL, MaskType, ModeType, MOVDQA - MOVDQA XMM_R0, [VIF_SRC] - movdqu XMM_R1, [VIF_SRC+12] - - UNPACK2_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 24 - endm - -UNPACK_V3_32SSE_2A macro CL, TOTALCL, MaskType, ModeType - UNPACK_V3_32SSE_2x CL, TOTALCL, MaskType, ModeType, movdqa - endm -UNPACK_V3_32SSE_2 macro CL, TOTALCL, MaskType, ModeType - UNPACK_V3_32SSE_2x CL, TOTALCL, MaskType, ModeType, movdqu - endm - -UNPACK_V3_32SSE_1x macro CL, TOTALCL, MaskType, ModeType, MOVDQA - MOVDQA XMM_R0, [VIF_SRC] - - UNPACK1_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 12 - endm - -UNPACK_V3_32SSE_1A macro CL, TOTALCL, MaskType, ModeType - UNPACK_V3_32SSE_1x CL, TOTALCL, MaskType, ModeType, movdqa - endm -UNPACK_V3_32SSE_1 macro CL, TOTALCL, MaskType, ModeType - UNPACK_V3_32SSE_1x CL, TOTALCL, MaskType, ModeType, movdqu - endm - -;; V3-16 -UNPACK_V3_16SSE_4 macro CL, TOTALCL, MaskType, ModeType - movq XMM_R0, QWORD PTR [VIF_SRC] - movq XMM_R1, QWORD PTR [VIF_SRC+6] - - punpcklwd XMM_R0, XMM_R0 - movq XMM_R2, QWORD PTR [VIF_SRC+12] - punpcklwd XMM_R1, XMM_R1 - UNPACK_RIGHTSHIFT XMM_R0, 16 - movq XMM_R3, QWORD PTR [VIF_SRC+18] - UNPACK_RIGHTSHIFT XMM_R1, 16 - punpcklwd XMM_R2, XMM_R2 - punpcklwd XMM_R3, XMM_R3 - - UNPACK_RIGHTSHIFT XMM_R2, 16 - UNPACK_RIGHTSHIFT XMM_R3, 16 - - UNPACK4_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 24 - endm - -UNPACK_V3_16SSE_4A macro CL, TOTALCL, MaskType, ModeType - UNPACK_V3_16SSE_4 CL, TOTALCL, MaskType, ModeType - endm - -UNPACK_V3_16SSE_3 macro CL, TOTALCL, MaskType, ModeType - movq XMM_R0, QWORD PTR [VIF_SRC] - movq XMM_R1, QWORD PTR [VIF_SRC+6] - - punpcklwd XMM_R0, XMM_R0 - movq XMM_R2, QWORD PTR [VIF_SRC+12] - punpcklwd XMM_R1, XMM_R1 - UNPACK_RIGHTSHIFT XMM_R0, 16 - punpcklwd XMM_R2, XMM_R2 - - UNPACK_RIGHTSHIFT XMM_R1, 16 - UNPACK_RIGHTSHIFT XMM_R2, 16 - - UNPACK3_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 18 - endm - -UNPACK_V3_16SSE_3A macro CL, TOTALCL, MaskType, ModeType - UNPACK_V3_16SSE_3 CL, TOTALCL, MaskType, ModeType - endm - -UNPACK_V3_16SSE_2 macro CL, TOTALCL, MaskType, ModeType - movq XMM_R0, QWORD PTR [VIF_SRC] - movq XMM_R1, QWORD PTR [VIF_SRC+6] - - punpcklwd XMM_R0, XMM_R0 - punpcklwd XMM_R1, XMM_R1 - - UNPACK_RIGHTSHIFT XMM_R0, 16 - UNPACK_RIGHTSHIFT XMM_R1, 16 - - UNPACK2_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 12 - endm - -UNPACK_V3_16SSE_2A macro CL, TOTALCL, MaskType, ModeType - UNPACK_V3_16SSE_2 CL, TOTALCL, MaskType, ModeType - endm - -UNPACK_V3_16SSE_1 macro CL, TOTALCL, MaskType, ModeType - movq XMM_R0, QWORD PTR [VIF_SRC] - punpcklwd XMM_R0, XMM_R0 - UNPACK_RIGHTSHIFT XMM_R0, 16 - - UNPACK1_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 6 - endm - -UNPACK_V3_16SSE_1A macro CL, TOTALCL, MaskType, ModeType - UNPACK_V3_16SSE_1 CL, TOTALCL, MaskType, ModeType - endm - -;; V3-8 -UNPACK_V3_8SSE_4 macro CL, TOTALCL, MaskType, ModeType - movq XMM_R1, QWORD PTR [VIF_SRC] - movq XMM_R3, QWORD PTR [VIF_SRC+6] - - punpcklbw XMM_R1, XMM_R1 - punpcklbw XMM_R3, XMM_R3 - punpcklwd XMM_R0, XMM_R1 - psrldq XMM_R1, 6 - punpcklwd XMM_R2, XMM_R3 - psrldq XMM_R3, 6 - punpcklwd XMM_R1, XMM_R1 - UNPACK_RIGHTSHIFT XMM_R0, 24 - punpcklwd XMM_R3, XMM_R3 - - UNPACK_RIGHTSHIFT XMM_R2, 24 - UNPACK_RIGHTSHIFT XMM_R1, 24 - UNPACK_RIGHTSHIFT XMM_R3, 24 - - UNPACK4_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 12 - endm - -UNPACK_V3_8SSE_4A macro CL, TOTALCL, MaskType, ModeType - UNPACK_V3_8SSE_4 CL, TOTALCL, MaskType, ModeType - endm - -UNPACK_V3_8SSE_3 macro CL, TOTALCL, MaskType, ModeType - movd XMM_R0, dword ptr [VIF_SRC] - movd XMM_R1, dword ptr [VIF_SRC+3] - - punpcklbw XMM_R0, XMM_R0 - movd XMM_R2, dword ptr [VIF_SRC+6] - punpcklbw XMM_R1, XMM_R1 - punpcklwd XMM_R0, XMM_R0 - punpcklbw XMM_R2, XMM_R2 - - punpcklwd XMM_R1, XMM_R1 - punpcklwd XMM_R2, XMM_R2 - - UNPACK_RIGHTSHIFT XMM_R0, 24 - UNPACK_RIGHTSHIFT XMM_R1, 24 - UNPACK_RIGHTSHIFT XMM_R2, 24 - - UNPACK3_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 9 - endm - -UNPACK_V3_8SSE_3A macro CL, TOTALCL, MaskType, ModeType - UNPACK_V3_8SSE_3 CL, TOTALCL, MaskType, ModeType - endm - -UNPACK_V3_8SSE_2 macro CL, TOTALCL, MaskType, ModeType - movd XMM_R0, dword ptr [VIF_SRC] - movd XMM_R1, dword ptr [VIF_SRC+3] - - punpcklbw XMM_R0, XMM_R0 - punpcklbw XMM_R1, XMM_R1 - - punpcklwd XMM_R0, XMM_R0 - punpcklwd XMM_R1, XMM_R1 - - UNPACK_RIGHTSHIFT XMM_R0, 24 - UNPACK_RIGHTSHIFT XMM_R1, 24 - - UNPACK2_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 6 - endm - -UNPACK_V3_8SSE_2A macro CL, TOTALCL, MaskType, ModeType - UNPACK_V3_8SSE_2 CL, TOTALCL, MaskType, ModeType - endm - -UNPACK_V3_8SSE_1 macro CL, TOTALCL, MaskType, ModeType - movd XMM_R0, dword ptr [VIF_SRC] - punpcklbw XMM_R0, XMM_R0 - punpcklwd XMM_R0, XMM_R0 - UNPACK_RIGHTSHIFT XMM_R0, 24 - - UNPACK1_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 3 - endm - -UNPACK_V3_8SSE_1A macro CL, TOTALCL, MaskType, ModeType - UNPACK_V3_8SSE_1 CL, TOTALCL, MaskType, ModeType - endm - -;; V4-32 -UNPACK_V4_32SSE_4A macro CL, TOTALCL, MaskType, ModeType - movdqa XMM_R0, [VIF_SRC] - movdqa XMM_R1, [VIF_SRC+16] - movdqa XMM_R2, [VIF_SRC+32] - movdqa XMM_R3, [VIF_SRC+48] - - UNPACK4_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 64 - endm - -UNPACK_V4_32SSE_4 macro CL, TOTALCL, MaskType, ModeType - movdqu XMM_R0, [VIF_SRC] - movdqu XMM_R1, [VIF_SRC+16] - movdqu XMM_R2, [VIF_SRC+32] - movdqu XMM_R3, [VIF_SRC+48] - - UNPACK4_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 64 - endm - -UNPACK_V4_32SSE_3A macro CL, TOTALCL, MaskType, ModeType - movdqa XMM_R0, [VIF_SRC] - movdqa XMM_R1, [VIF_SRC+16] - movdqa XMM_R2, [VIF_SRC+32] - - UNPACK3_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 48 - endm - -UNPACK_V4_32SSE_3 macro CL, TOTALCL, MaskType, ModeType - movdqu XMM_R0, [VIF_SRC] - movdqu XMM_R1, [VIF_SRC+16] - movdqu XMM_R2, [VIF_SRC+32] - - UNPACK3_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 48 - endm - -UNPACK_V4_32SSE_2A macro CL, TOTALCL, MaskType, ModeType - movdqa XMM_R0, [VIF_SRC] - movdqa XMM_R1, [VIF_SRC+16] - - UNPACK2_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 32 - endm - -UNPACK_V4_32SSE_2 macro CL, TOTALCL, MaskType, ModeType - movdqu XMM_R0, [VIF_SRC] - movdqu XMM_R1, [VIF_SRC+16] - - UNPACK2_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 32 - endm - -UNPACK_V4_32SSE_1A macro CL, TOTALCL, MaskType, ModeType - movdqa XMM_R0, [VIF_SRC] - - UNPACK1_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 16 - endm - -UNPACK_V4_32SSE_1 macro CL, TOTALCL, MaskType, ModeType - movdqu XMM_R0, [VIF_SRC] - - UNPACK1_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 16 - endm - -;; V4-16 -UNPACK_V4_16SSE_4A macro CL, TOTALCL, MaskType, ModeType - - punpcklwd XMM_R0, [VIF_SRC] - punpckhwd XMM_R1, [VIF_SRC] - punpcklwd XMM_R2, [VIF_SRC+16] - punpckhwd XMM_R3, [VIF_SRC+16] - - UNPACK_RIGHTSHIFT XMM_R1, 16 - UNPACK_RIGHTSHIFT XMM_R3, 16 - UNPACK_RIGHTSHIFT XMM_R0, 16 - UNPACK_RIGHTSHIFT XMM_R2, 16 - - UNPACK4_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 32 - endm - -UNPACK_V4_16SSE_4 macro CL, TOTALCL, MaskType, ModeType - movdqu XMM_R0, [VIF_SRC] - movdqu XMM_R2, [VIF_SRC+16] - - punpckhwd XMM_R1, XMM_R0 - punpckhwd XMM_R3, XMM_R2 - punpcklwd XMM_R0, XMM_R0 - punpcklwd XMM_R2, XMM_R2 - - UNPACK_RIGHTSHIFT XMM_R1, 16 - UNPACK_RIGHTSHIFT XMM_R3, 16 - UNPACK_RIGHTSHIFT XMM_R0, 16 - UNPACK_RIGHTSHIFT XMM_R2, 16 - - UNPACK4_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 32 - endm - -UNPACK_V4_16SSE_3A macro CL, TOTALCL, MaskType, ModeType - punpcklwd XMM_R0, [VIF_SRC] - punpckhwd XMM_R1, [VIF_SRC] - punpcklwd XMM_R2, [VIF_SRC+16] - - UNPACK_RIGHTSHIFT XMM_R0, 16 - UNPACK_RIGHTSHIFT XMM_R1, 16 - UNPACK_RIGHTSHIFT XMM_R2, 16 - - UNPACK3_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 24 - endm - -UNPACK_V4_16SSE_3 macro CL, TOTALCL, MaskType, ModeType - movdqu XMM_R0, [VIF_SRC] - movq XMM_R2, QWORD PTR [VIF_SRC+16] - - punpckhwd XMM_R1, XMM_R0 - punpcklwd XMM_R0, XMM_R0 - punpcklwd XMM_R2, XMM_R2 - - UNPACK_RIGHTSHIFT XMM_R0, 16 - UNPACK_RIGHTSHIFT XMM_R1, 16 - UNPACK_RIGHTSHIFT XMM_R2, 16 - - UNPACK3_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 24 - endm - -UNPACK_V4_16SSE_2A macro CL, TOTALCL, MaskType, ModeType - punpcklwd XMM_R0, [VIF_SRC] - punpckhwd XMM_R1, [VIF_SRC] - - UNPACK_RIGHTSHIFT XMM_R0, 16 - UNPACK_RIGHTSHIFT XMM_R1, 16 - - UNPACK2_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 16 - endm - -UNPACK_V4_16SSE_2 macro CL, TOTALCL, MaskType, ModeType - movq XMM_R0, QWORD PTR [VIF_SRC] - movq XMM_R1, QWORD PTR [VIF_SRC+8] - - punpcklwd XMM_R0, XMM_R0 - punpcklwd XMM_R1, XMM_R1 - - UNPACK_RIGHTSHIFT XMM_R0, 16 - UNPACK_RIGHTSHIFT XMM_R1, 16 - - UNPACK2_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 16 - endm - -UNPACK_V4_16SSE_1A macro CL, TOTALCL, MaskType, ModeType - punpcklwd XMM_R0, [VIF_SRC] - UNPACK_RIGHTSHIFT XMM_R0, 16 - - UNPACK1_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 8 - endm - -UNPACK_V4_16SSE_1 macro CL, TOTALCL, MaskType, ModeType - movq XMM_R0, QWORD PTR [VIF_SRC] - punpcklwd XMM_R0, XMM_R0 - UNPACK_RIGHTSHIFT XMM_R0, 16 - - UNPACK1_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 8 - endm - -;; V4-8 -UNPACK_V4_8SSE_4A macro CL, TOTALCL, MaskType, ModeType - punpcklbw XMM_R0, [VIF_SRC] - punpckhbw XMM_R2, [VIF_SRC] - - punpckhwd XMM_R1, XMM_R0 - punpckhwd XMM_R3, XMM_R2 - punpcklwd XMM_R0, XMM_R0 - punpcklwd XMM_R2, XMM_R2 - - UNPACK_RIGHTSHIFT XMM_R1, 24 - UNPACK_RIGHTSHIFT XMM_R3, 24 - UNPACK_RIGHTSHIFT XMM_R0, 24 - UNPACK_RIGHTSHIFT XMM_R2, 24 - - UNPACK4_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 16 - endm - -UNPACK_V4_8SSE_4 macro CL, TOTALCL, MaskType, ModeType - movdqu XMM_R0, [VIF_SRC] - - punpckhbw XMM_R2, XMM_R0 - punpcklbw XMM_R0, XMM_R0 - - punpckhwd XMM_R3, XMM_R2 - punpckhwd XMM_R1, XMM_R0 - punpcklwd XMM_R2, XMM_R2 - punpcklwd XMM_R0, XMM_R0 - - UNPACK_RIGHTSHIFT XMM_R3, 24 - UNPACK_RIGHTSHIFT XMM_R2, 24 - - UNPACK_RIGHTSHIFT XMM_R0, 24 - UNPACK_RIGHTSHIFT XMM_R1, 24 - - UNPACK4_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 16 - endm - -UNPACK_V4_8SSE_3A macro CL, TOTALCL, MaskType, ModeType - punpcklbw XMM_R0, [VIF_SRC] - punpckhbw XMM_R2, [VIF_SRC] - - punpckhwd XMM_R1, XMM_R0 - punpcklwd XMM_R0, XMM_R0 - punpcklwd XMM_R2, XMM_R2 - - UNPACK_RIGHTSHIFT XMM_R1, 24 - UNPACK_RIGHTSHIFT XMM_R0, 24 - UNPACK_RIGHTSHIFT XMM_R2, 24 - - UNPACK3_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 12 - endm - -UNPACK_V4_8SSE_3 macro CL, TOTALCL, MaskType, ModeType - movq XMM_R0, QWORD PTR [VIF_SRC] - movd XMM_R2, dword ptr [VIF_SRC+8] - - punpcklbw XMM_R0, XMM_R0 - punpcklbw XMM_R2, XMM_R2 - - punpckhwd XMM_R1, XMM_R0 - punpcklwd XMM_R2, XMM_R2 - punpcklwd XMM_R0, XMM_R0 - - UNPACK_RIGHTSHIFT XMM_R1, 24 - UNPACK_RIGHTSHIFT XMM_R0, 24 - UNPACK_RIGHTSHIFT XMM_R2, 24 - - UNPACK3_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 12 - endm - -UNPACK_V4_8SSE_2A macro CL, TOTALCL, MaskType, ModeType - punpcklbw XMM_R0, [VIF_SRC] - - punpckhwd XMM_R1, XMM_R0 - punpcklwd XMM_R0, XMM_R0 - - UNPACK_RIGHTSHIFT XMM_R1, 24 - UNPACK_RIGHTSHIFT XMM_R0, 24 - - UNPACK2_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 8 - endm - -UNPACK_V4_8SSE_2 macro CL, TOTALCL, MaskType, ModeType - movq XMM_R0, QWORD PTR [VIF_SRC] - - punpcklbw XMM_R0, XMM_R0 - - punpckhwd XMM_R1, XMM_R0 - punpcklwd XMM_R0, XMM_R0 - - UNPACK_RIGHTSHIFT XMM_R1, 24 - UNPACK_RIGHTSHIFT XMM_R0, 24 - - UNPACK2_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 8 - endm - -UNPACK_V4_8SSE_1A macro CL, TOTALCL, MaskType, ModeType - punpcklbw XMM_R0, [VIF_SRC] - punpcklwd XMM_R0, XMM_R0 - UNPACK_RIGHTSHIFT XMM_R0, 24 - - UNPACK1_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 4 - endm - -UNPACK_V4_8SSE_1 macro CL, TOTALCL, MaskType, ModeType - movd XMM_R0, dword ptr [VIF_SRC] - punpcklbw XMM_R0, XMM_R0 - punpcklwd XMM_R0, XMM_R0 - UNPACK_RIGHTSHIFT XMM_R0, 24 - - UNPACK1_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 4 - endm - -;; V4-5 -DECOMPRESS_RGBA macro OFFSET - mov bl, al - shl bl, 3 - mov byte ptr [s_TempDecompress+OFFSET], bl - - mov bx, ax - shr bx, 2 - and bx, 0f8h - mov byte ptr [s_TempDecompress+OFFSET+1], bl - - mov bx, ax - shr bx, 7 - and bx, 0f8h - mov byte ptr [s_TempDecompress+OFFSET+2], bl - mov bx, ax - shr bx, 8 - and bx, 080h - mov byte ptr [s_TempDecompress+OFFSET+3], bl - endm - -UNPACK_V4_5SSE_4 macro CL, TOTALCL, MaskType, ModeType - mov eax, dword ptr [VIF_SRC] - DECOMPRESS_RGBA 0 - - shr eax, 16 - DECOMPRESS_RGBA 4 - - mov eax, dword ptr [VIF_SRC+4] - DECOMPRESS_RGBA 8 - - shr eax, 16 - DECOMPRESS_RGBA 12 - - ;; have to use movaps instead of movdqa -%ifdef __x86_64__ - movdqa XMM_R0, XMMWORD PTR [s_TempDecompress] -%else - movaps XMM_R0, [s_TempDecompress] -%endif - - punpckhbw XMM_R2, XMM_R0 - punpcklbw XMM_R0, XMM_R0 - - punpckhwd XMM_R3, XMM_R2 - punpckhwd XMM_R1, XMM_R0 - punpcklwd XMM_R0, XMM_R0 - punpcklwd XMM_R2, XMM_R2 - - psrld XMM_R0, 24 - psrld XMM_R1, 24 - psrld XMM_R2, 24 - psrld XMM_R3, 24 - - UNPACK4_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 8 - endm - -UNPACK_V4_5SSE_4A macro CL, TOTALCL, MaskType, ModeType - UNPACK_V4_5SSE_4 CL, TOTALCL, MaskType, ModeType - endm - -UNPACK_V4_5SSE_3 macro CL, TOTALCL, MaskType, ModeType - mov eax, dword ptr [VIF_SRC] - DECOMPRESS_RGBA 0 - - shr eax, 16 - DECOMPRESS_RGBA 4 - - mov eax, dword ptr [VIF_SRC] - DECOMPRESS_RGBA 8 - - ;; have to use movaps instead of movdqa -%ifdef __x86_64__ - movdqa XMM_R0, XMMWORD PTR [s_TempDecompress] -%else - movaps XMM_R0, [s_TempDecompress] -%endif - - punpckhbw XMM_R2, XMM_R0 - punpcklbw XMM_R0, XMM_R0 - - punpckhwd XMM_R1, XMM_R0 - punpcklwd XMM_R0, XMM_R0 - punpcklwd XMM_R2, XMM_R2 - - psrld XMM_R0, 24 - psrld XMM_R1, 24 - psrld XMM_R2, 24 - - UNPACK3_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 6 - endm - -UNPACK_V4_5SSE_3A macro CL, TOTALCL, MaskType, ModeType - UNPACK_V4_5SSE_3 CL, TOTALCL, MaskType, ModeType - endm - -UNPACK_V4_5SSE_2 macro CL, TOTALCL, MaskType, ModeType - mov eax, dword ptr [VIF_SRC] - DECOMPRESS_RGBA 0 - - shr eax, 16 - DECOMPRESS_RGBA 4 - - movq XMM_R0, QWORD PTR [s_TempDecompress] - - punpcklbw XMM_R0, XMM_R0 - - punpckhwd XMM_R1, XMM_R0 - punpcklwd XMM_R0, XMM_R0 - - psrld XMM_R0, 24 - psrld XMM_R1, 24 - - UNPACK2_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 4 - endm - -UNPACK_V4_5SSE_2A macro CL, TOTALCL, MaskType, ModeType - UNPACK_V4_5SSE_2 CL, TOTALCL, MaskType, ModeType - endm - -UNPACK_V4_5SSE_1 macro CL, TOTALCL, MaskType, ModeType - mov ax, word ptr [VIF_SRC] - DECOMPRESS_RGBA 0 - - movd XMM_R0, DWORD PTR [s_TempDecompress] - punpcklbw XMM_R0, XMM_R0 - punpcklwd XMM_R0, XMM_R0 - - psrld XMM_R0, 24 - - UNPACK1_SSE CL, TOTALCL, MaskType, ModeType - - add VIF_SRC, 2 - endm - -UNPACK_V4_5SSE_1A macro CL, TOTALCL, MaskType, ModeType - UNPACK_V4_5SSE_1 CL, TOTALCL, MaskType, ModeType - endm - -;; save the row reg -SAVE_ROW_REG_BASE macro - mov VIF_TMPADDR, [_vifRow] - movdqa [VIF_TMPADDR], XMM_ROW - mov VIF_TMPADDR, [_vifRegs] - movss dword ptr [VIF_TMPADDR+0100h], XMM_ROW - psrldq XMM_ROW, 4 - movss dword ptr [VIF_TMPADDR+0110h], XMM_ROW - psrldq XMM_ROW, 4 - movss dword ptr [VIF_TMPADDR+0120h], XMM_ROW - psrldq XMM_ROW, 4 - movss dword ptr [VIF_TMPADDR+0130h], XMM_ROW - endm - -SAVE_NO_REG macro - endm - -%ifdef __x86_64__ - -INIT_ARGS macro - mov rax, qword ptr [_vifRow] - mov r9, qword ptr [_vifCol] - movaps xmm6, XMMWORD PTR [rax] - movaps xmm7, XMMWORD PTR [r9] - endm - -INC_STACK macro reg - add rsp, 8 - endm - -%else - -%define STACKOFFSET 12 - -;; 32 bit versions have the args on the stack -INIT_ARGS macro - mov VIF_DST, dword ptr [esp+4+STACKOFFSET] - mov VIF_SRC, dword ptr [esp+8+STACKOFFSET] - mov VIF_SIZE, dword ptr [esp+12+STACKOFFSET] - endm - -INC_STACK macro reg - add esp, 4 - endm - -%endif - -;; qsize - bytes of compressed size of 1 decompressed qword -;; int UNPACK_SkippingWrite_##name##_##sign##_##MaskType##_##ModeType(u32* dest, u32* data, int dmasize) -defUNPACK_SkippingWrite macro name, MaskType, ModeType, qsize, sign, SAVE_ROW_REG -@CatStr(UNPACK_SkippingWrite_, name, _, sign, _, MaskType, _, ModeType) proc public -%ifdef __x86_64__ - push rdi -%else - push edi - push esi - push ebx -%endif - INIT_ARGS - mov VIF_TMPADDR, [_vifRegs] - movzx VIF_INC, byte ptr [VIF_TMPADDR + 040h] - movzx VIF_SAVEEBX, byte ptr [VIF_TMPADDR + 041h] - sub VIF_INC, VIF_SAVEEBX - shl VIF_INC, 4 - - cmp VIF_SAVEEBXd, 1 - je @CatStr(name, _, sign, _, MaskType, _, ModeType, _WL1) - cmp VIF_SAVEEBXd, 2 - je @CatStr(name, _, sign, _, MaskType, _, ModeType, _WL2) - cmp VIF_SAVEEBXd, 3 - je @CatStr(name, _, sign, _, MaskType, _, ModeType, _WL3) - jmp @CatStr(name, _, sign, _, MaskType, _, ModeType, _WL4) - -@CatStr(name, _, sign, _, MaskType, _, ModeType, _WL1): - @CatStr(UNPACK_Start_Setup_, MaskType, _SSE_, ModeType) 0 - - cmp VIF_SIZE, qsize - jl @CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_Done3) - - add VIF_INC, 16 - - ;; first align VIF_SRC to 16 bytes -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_Align16): - - test VIF_SRC, 15 - jz @CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_UnpackAligned) - - @CatStr(UNPACK_, name, SSE_1) 0, 1, MaskType, ModeType - - cmp VIF_SIZE, (2*qsize) - jl @CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_DoneWithDec) - sub VIF_SIZE, qsize - jmp @CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_Align16) - -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_UnpackAligned): - - cmp VIF_SIZE, (2*qsize) - jl @CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_Unpack1) - cmp VIF_SIZE, (3*qsize) - jl @CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_Unpack2) - cmp VIF_SIZE, (4*qsize) - jl @CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_Unpack3) - prefetchnta [VIF_SRC + 64] - -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_Unpack4): - @CatStr(UNPACK_, name, SSE_4A) 0, 1, MaskType, ModeType - - cmp VIF_SIZE, (8*qsize) - jl @CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_DoneUnpack4) - sub VIF_SIZE, (4*qsize) - jmp @CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_Unpack4) - -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_DoneUnpack4): - - sub VIF_SIZE, (4*qsize) - cmp VIF_SIZE, qsize - jl @CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_Done3) - cmp VIF_SIZE, (2*qsize) - jl @CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_Unpack1) - cmp VIF_SIZE, (3*qsize) - jl @CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_Unpack2) - ;; fall through - -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_Unpack3): - @CatStr(UNPACK_, name, SSE_3A) 0, 1, MaskType, ModeType - - sub VIF_SIZE, (3*qsize) - jmp @CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_Done3) - -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_Unpack2): - @CatStr(UNPACK_, name, SSE_2A) 0, 1, MaskType, ModeType - - sub VIF_SIZE, (2*qsize) - jmp @CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_Done3) - -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_Unpack1): - @CatStr(UNPACK_, name, SSE_1A) 0, 1, MaskType, ModeType -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_DoneWithDec): - sub VIF_SIZE, qsize -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C1_Done3): - SAVE_ROW_REG - mov eax, VIF_SIZE -%ifdef __x86_64__ - pop rdi -%else - pop ebx - pop esi - pop edi -%endif - ret - -@CatStr(name, _, sign, _, MaskType, _, ModeType, _WL2): - cmp VIF_SIZE, (2*qsize) - - jl @CatStr(name, _, sign, _, MaskType, _, ModeType, _C2_Done3) -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C2_Unpack): - @CatStr(UNPACK_, name, SSE_2) 0, 0, MaskType, ModeType - - ;; take into account wl - add VIF_DST, VIF_INC - cmp VIF_SIZE, (4*qsize) - jl @CatStr(name, _, sign, _, MaskType, _, ModeType, _C2_Done2) - sub VIF_SIZE, (2*qsize) - ;; unpack next - jmp @CatStr(name, _, sign, _, MaskType, _, ModeType, _C2_Unpack) - -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C2_Done2): - sub VIF_SIZE, (2*qsize) -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C2_Done3): - cmp VIF_SIZE, qsize - - jl @CatStr(name, _, sign, _, MaskType, _, ModeType, _C2_Done4) - - ;; execute left over qw - @CatStr(UNPACK_, name, SSE_1) 0, 0, MaskType, ModeType - - sub VIF_SIZE, qsize -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C2_Done4): - - SAVE_ROW_REG - mov eax, VIF_SIZE -%ifdef __x86_64__ - pop rdi -%else - pop ebx - pop esi - pop edi -%endif - ret - -@CatStr(name, _, sign, _, MaskType, _, ModeType, _WL3): - cmp VIF_SIZE, (3*qsize) - - jl @CatStr(name, _, sign, _, MaskType, _, ModeType, _C3_Done5) -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C3_Unpack): - @CatStr(UNPACK_, name, SSE_3) 0, 0, MaskType, ModeType - - add VIF_DST, VIF_INC - cmp VIF_SIZE, (6*qsize) - jl @CatStr(name, _, sign, _, MaskType, _, ModeType, _C3_Done2) - sub VIF_SIZE, (3*qsize) - jmp @CatStr(name, _, sign, _, MaskType, _, ModeType, _C3_Unpack) -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C3_Done2): - sub VIF_SIZE, (3*qsize) -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C3_Done5): - cmp VIF_SIZE, qsize - jl @CatStr(name, _, sign, _, MaskType, _, ModeType, _C3_Done4) - - cmp VIF_SIZE, (2*qsize) - jl @CatStr(name, _, sign, _, MaskType, _, ModeType, _C3_Done3) - - @CatStr(UNPACK_, name, SSE_2) 0, 0, MaskType, ModeType - - sub VIF_SIZE, (2*qsize) - jmp @CatStr(name, _, sign, _, MaskType, _, ModeType, _C3_Done4) -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C3_Done3): - sub VIF_SIZE, qsize - @CatStr(UNPACK_, name, SSE_1) 0, 0, MaskType, ModeType -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C3_Done4): - SAVE_ROW_REG - mov eax, VIF_SIZE -%ifdef __x86_64__ - pop rdi -%else - pop ebx - pop esi - pop edi -%endif - ret - -@CatStr(name, _, sign, _, MaskType, _, ModeType, _WL4): - sub VIF_SAVEEBX, 3 - push VIF_INC - cmp VIF_SIZE, qsize - jl @CatStr(name, _, sign, _, MaskType, _, ModeType, _C4_Done) - -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C4_Unpack): - cmp VIF_SIZE, (3*qsize) - jge @CatStr(name, _, sign, _, MaskType, _, ModeType, _C4_Unpack3) - cmp VIF_SIZE, (2*qsize) - jge @CatStr(name, _, sign, _, MaskType, _, ModeType, _C4_Unpack2) - - @CatStr(UNPACK_, name, SSE_1) 0, 0, MaskType, ModeType - - ;; not enough data left - sub VIF_SIZE, qsize - jmp @CatStr(name, _, sign, _, MaskType, _, ModeType, _C4_Done) -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C4_Unpack2): - @CatStr(UNPACK_, name, SSE_2) 0, 0, MaskType, ModeType - - ;; not enough data left - sub VIF_SIZE, (2*qsize) - jmp @CatStr(name, _, sign, _, MaskType, _, ModeType, _C4_Done) -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C4_Unpack3): - @CatStr(UNPACK_, name, SSE_3) 0, 0, MaskType, ModeType - - ;; more data left, process 1qw at a time - sub VIF_SIZE, (3*qsize) - mov VIF_INC, VIF_SAVEEBX - -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C4_UnpackX): - - ;; check if any data left - cmp VIF_SIZE, qsize - jl @CatStr(name, _, sign, _, MaskType, _, ModeType, _C4_Done) - - @CatStr(UNPACK_, name, SSE_1) 3, 0, MaskType, ModeType - - sub VIF_SIZE, qsize - cmp VIF_INC, 1 - je @CatStr(name, _, sign, _, MaskType, _, ModeType, _C4_DoneLoop) - sub VIF_INC, 1 - jmp @CatStr(name, _, sign, _, MaskType, _, ModeType, _C4_UnpackX) -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C4_DoneLoop): - add VIF_DST, [VIF_ESP] - cmp VIF_SIZE, qsize - jl @CatStr(name, _, sign, _, MaskType, _, ModeType, _C4_Done) - jmp @CatStr(name, _, sign, _, MaskType, _, ModeType, _C4_Unpack) -@CatStr(name, _, sign, _, MaskType, _, ModeType, _C4_Done): - - SAVE_ROW_REG - INC_STACK() - mov eax, VIF_SIZE - -%ifdef __x86_64__ - pop rdi -%else - pop ebx - pop esi - pop edi -%endif - ret -@CatStr(UNPACK_SkippingWrite_, name, _, sign, _, MaskType, _, ModeType endp) -endm - -UNPACK_RIGHTSHIFT macro reg, shift - psrld reg, shift - endm - -defUNPACK_SkippingWrite2 macro name, qsize - defUNPACK_SkippingWrite name, Regular, 0, qsize, u, SAVE_NO_REG - defUNPACK_SkippingWrite name, Regular, 1, qsize, u, SAVE_NO_REG - defUNPACK_SkippingWrite name, Regular, 2, qsize, u, SAVE_ROW_REG_BASE - defUNPACK_SkippingWrite name, Mask, 0, qsize, u, SAVE_NO_REG - defUNPACK_SkippingWrite name, Mask, 1, qsize, u, SAVE_NO_REG - defUNPACK_SkippingWrite name, Mask, 2, qsize, u, SAVE_ROW_REG_BASE - defUNPACK_SkippingWrite name, WriteMask, 0, qsize, u, SAVE_NO_REG - defUNPACK_SkippingWrite name, WriteMask, 1, qsize, u, SAVE_NO_REG - defUNPACK_SkippingWrite name, WriteMask, 2, qsize, u, SAVE_ROW_REG_BASE - endm - -defUNPACK_SkippingWrite2 S_32, 4 -defUNPACK_SkippingWrite2 S_16, 2 -defUNPACK_SkippingWrite2 S_8, 1 -defUNPACK_SkippingWrite2 V2_32, 8 -defUNPACK_SkippingWrite2 V2_16, 4 -defUNPACK_SkippingWrite2 V2_8, 2 -defUNPACK_SkippingWrite2 V3_32, 12 -defUNPACK_SkippingWrite2 V3_16, 6 -defUNPACK_SkippingWrite2 V3_8, 3 -defUNPACK_SkippingWrite2 V4_32, 16 -defUNPACK_SkippingWrite2 V4_16, 8 -defUNPACK_SkippingWrite2 V4_8, 4 -defUNPACK_SkippingWrite2 V4_5, 2 - -UNPACK_RIGHTSHIFT macro reg, shift - psrad reg, shift - endm - - -defUNPACK_SkippingWrite2a macro name, qsize - defUNPACK_SkippingWrite name, Mask, 0, qsize, s, SAVE_NO_REG - defUNPACK_SkippingWrite name, Regular, 0, qsize, s, SAVE_NO_REG - defUNPACK_SkippingWrite name, Regular, 1, qsize, s, SAVE_NO_REG - defUNPACK_SkippingWrite name, Regular, 2, qsize, s, SAVE_ROW_REG_BASE - defUNPACK_SkippingWrite name, Mask, 1, qsize, s, SAVE_NO_REG - defUNPACK_SkippingWrite name, Mask, 2, qsize, s, SAVE_ROW_REG_BASE - defUNPACK_SkippingWrite name, WriteMask, 0, qsize, s, SAVE_NO_REG - defUNPACK_SkippingWrite name, WriteMask, 1, qsize, s, SAVE_NO_REG - defUNPACK_SkippingWrite name, WriteMask, 2, qsize, s, SAVE_ROW_REG_BASE - endm - -defUNPACK_SkippingWrite2a S_16, 2 -defUNPACK_SkippingWrite2a S_8, 1 -defUNPACK_SkippingWrite2a V2_16, 4 -defUNPACK_SkippingWrite2a V2_8, 2 -defUNPACK_SkippingWrite2a V3_16, 6 -defUNPACK_SkippingWrite2a V3_8, 3 -defUNPACK_SkippingWrite2a V4_16, 8 -defUNPACK_SkippingWrite2a V4_8, 4 - -end diff --git a/pcsx2/x86/iVif.cpp b/pcsx2/x86/iVif.cpp deleted file mode 100644 index 624fb3bcbe..0000000000 --- a/pcsx2/x86/iVif.cpp +++ /dev/null @@ -1,92 +0,0 @@ -/* PCSX2 - PS2 Emulator for PCs - * Copyright (C) 2002-2009 PCSX2 Dev Team - * - * PCSX2 is free software: you can redistribute it and/or modify it under the terms - * of the GNU Lesser General Public License as published by the Free Software Found- - * ation, either version 3 of the License, or (at your option) any later version. - * - * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with PCSX2. - * If not, see . - */ - - -#include "PrecompiledHeader.h" - -#include "Common.h" -#include "Vif.h" -#include "VUmicro.h" - -#include -#include - -// sse2 highly optimized vif (~200 separate functions are built) zerofrog(@gmail.com) -extern u32 g_vif1Masks[48], g_vif0Masks[48]; -extern u32 g_vif1HasMask3[4], g_vif0HasMask3[4]; - -// arranged in writearr, rowarr, colarr, updatearr -static const __aligned16 u32 s_maskarr[16][4] = { - {0xffffffff, 0x00000000, 0x00000000, 0xffffffff}, - {0xffff0000, 0x0000ffff, 0x00000000, 0xffffffff}, - {0xffff0000, 0x00000000, 0x0000ffff, 0xffffffff}, - {0xffff0000, 0x00000000, 0x00000000, 0xffff0000}, - {0x0000ffff, 0xffff0000, 0x00000000, 0xffffffff}, - {0x00000000, 0xffffffff, 0x00000000, 0xffffffff}, - {0x00000000, 0xffff0000, 0x0000ffff, 0xffffffff}, - {0x00000000, 0xffff0000, 0x00000000, 0xffff0000}, - {0x0000ffff, 0x00000000, 0xffff0000, 0xffffffff}, - {0x00000000, 0x0000ffff, 0xffff0000, 0xffffffff}, - {0x00000000, 0x00000000, 0xffffffff, 0xffffffff}, - {0x00000000, 0x00000000, 0xffff0000, 0xffff0000}, - {0x0000ffff, 0x00000000, 0x00000000, 0x0000ffff}, - {0x00000000, 0x0000ffff, 0x00000000, 0x0000ffff}, - {0x00000000, 0x00000000, 0x0000ffff, 0x0000ffff}, - {0x00000000, 0x00000000, 0x00000000, 0x00000000} -}; - -extern u8 s_maskwrite[256]; - -// Dear C++: Please don't mangle this name, thanks! -extern "C" __aligned16 u32 s_TempDecompress[4]; -__aligned16 u32 s_TempDecompress[4] = {0}; - -// Note: this function used to break regularly on Linux due to stack alignment. -// Refer to old revisions of this code if it breaks again for workarounds. -void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask) -{ - u32 i; - u32 prev = 0; - - XMMRegisters::Freeze(); - for(i = 0; i < 4; ++i, mask >>= 8, oldmask >>= 8, vif1masks += 16) { - - prev |= s_maskwrite[mask&0xff]; - hasmask[i] = prev; - - if ((mask&0xff) != (oldmask&0xff)) - { - __m128i r0, r1, r2, r3; - r0 = _mm_load_si128((__m128i*)&s_maskarr[mask&15][0]); // Tends to crash Linux, - r2 = _mm_unpackhi_epi16(r0, r0); - r0 = _mm_unpacklo_epi16(r0, r0); - - r1 = _mm_load_si128((__m128i*)&s_maskarr[(mask>>4)&15][0]); - r3 = _mm_unpackhi_epi16(r1, r1); - r1 = _mm_unpacklo_epi16(r1, r1); - - _mm_storel_pi((__m64*)&vif1masks[0], *(__m128*)&r0); - _mm_storel_pi((__m64*)&vif1masks[2], *(__m128*)&r1); - _mm_storeh_pi((__m64*)&vif1masks[4], *(__m128*)&r0); - _mm_storeh_pi((__m64*)&vif1masks[6], *(__m128*)&r1); - - _mm_storel_pi((__m64*)&vif1masks[8], *(__m128*)&r2); - _mm_storel_pi((__m64*)&vif1masks[10], *(__m128*)&r3); - _mm_storeh_pi((__m64*)&vif1masks[12], *(__m128*)&r2); - _mm_storeh_pi((__m64*)&vif1masks[14], *(__m128*)&r3); - } - } - XMMRegisters::Thaw(); -} diff --git a/pcsx2/x86/newVif.h b/pcsx2/x86/newVif.h index 8a6a431577..06635804f1 100644 --- a/pcsx2/x86/newVif.h +++ b/pcsx2/x86/newVif.h @@ -21,8 +21,6 @@ #include "x86emitter/x86emitter.h" using namespace x86Emitter; -#if newVif - // newVif_HashBucket.h uses this typedef, so it has to be decared first. typedef u32 (__fastcall *nVifCall)(void*, void*); typedef void (__fastcall *nVifrecCall)(uptr dest, uptr src); @@ -104,4 +102,3 @@ extern __aligned16 u32 nVifMask[3][4][4]; // [MaskNumber][CycleNumber][Vector] static const bool useOldUnpack = 0; // Use code in newVif_OldUnpack.inl static const bool newVifDynaRec = 1; // Use code in newVif_Dynarec.inl -#endif diff --git a/pcsx2/x86/newVif_OldUnpack.inl b/pcsx2/x86/newVif_OldUnpack.inl index 8bfb1b0c99..19ddcbd081 100644 --- a/pcsx2/x86/newVif_OldUnpack.inl +++ b/pcsx2/x86/newVif_OldUnpack.inl @@ -24,24 +24,20 @@ template void VIFunpack(u32 *data, vifCode *v, u32 size) { VURegs * VU; u8 *cdata = (u8*)data; u32 tempsize = 0; - const u32 memlimit = vif_size(VIFdmanum); + const u32 memlimit = (VIFdmanum == 0) ? 0x1000 : 0x4000; if (VIFdmanum == 0) { VU = &VU0; vifRegs = vif0Regs; - vifMaskRegs = g_vif0Masks; vif = &vif0; - vifRow = g_vifmask.Row0; } else { VU = &VU1; vifRegs = vif1Regs; - vifMaskRegs = g_vif1Masks; vif = &vif1; - vifRow = g_vifmask.Row1; } - u32 *dest = (u32*)(VU->Mem + v->addr); + u32 *dest = (u32*)(VU->Mem + v->addr); const VIFUnpackFuncTable& ft( VIFfuncTable[ v->cmd & 0x1f ] ); UNPACKFUNCTYPE func = vif->usn ? ft.funcU : ft.funcS; diff --git a/pcsx2/x86/newVif_Unpack.cpp b/pcsx2/x86/newVif_Unpack.cpp index 391c89118e..f4e4d09143 100644 --- a/pcsx2/x86/newVif_Unpack.cpp +++ b/pcsx2/x86/newVif_Unpack.cpp @@ -21,8 +21,6 @@ #include "Common.h" #include "VifDma_internal.h" #include "newVif.h" - -#if newVif #include "newVif_OldUnpack.inl" __aligned16 nVifStruct nVif[2]; @@ -271,4 +269,4 @@ _f void _nVifUnpack(int idx, u8 *data, u32 size, bool isFill) { const bool doMode = !!vifRegs->mode; UnpackLoopTable[idx][doMode][isFill]( data, size ); } -#endif +