diff --git a/pcsx2/Config.h b/pcsx2/Config.h index 721593dc84..ec1c60b3f0 100644 --- a/pcsx2/Config.h +++ b/pcsx2/Config.h @@ -618,7 +618,6 @@ TraceLogFilters& SetTraceConfig(); #endif #define EE_CONST_PROP // rec2 - enables constant propagation (faster) -//#define NON_SSE_UNPACKS // Turns off SSE Unpacks (slower) // Uncomment this if working on getting PS1 emulation working. // This disables the exception normally caused by trying to load PS1 diff --git a/pcsx2/Linux/pcsx2.cbp b/pcsx2/Linux/pcsx2.cbp index d1768d95b1..116011c92d 100644 --- a/pcsx2/Linux/pcsx2.cbp +++ b/pcsx2/Linux/pcsx2.cbp @@ -1,548 +1,549 @@ - - - - - - + + + + + + diff --git a/pcsx2/VIFunpack.cpp b/pcsx2/VIFunpack.cpp new file mode 100644 index 0000000000..331c9153c7 --- /dev/null +++ b/pcsx2/VIFunpack.cpp @@ -0,0 +1,385 @@ +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2009 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see . 
+ */ + + +#include "PrecompiledHeader.h" +#include "Common.h" + +#include + +#include "Vif.h" +#include "VifDma_internal.h" + +enum UnpackOffset +{ + OFFSET_X = 0, + OFFSET_Y = 1, + OFFSET_Z = 2, + OFFSET_W = 3 +}; + +static __forceinline u32 setVifRowRegs(u32 reg, u32 data) +{ + switch (reg) + { + case 0: + vifRegs->r0 = data; + break; + case 1: + vifRegs->r1 = data; + break; + case 2: + vifRegs->r2 = data; + break; + case 3: + vifRegs->r3 = data; + break; + jNO_DEFAULT; + } + return data; +} + +static __forceinline u32 getVifRowRegs(u32 reg) +{ + switch (reg) + { + case 0: + return vifRegs->r0; + break; + case 1: + return vifRegs->r1; + break; + case 2: + return vifRegs->r2; + break; + case 3: + return vifRegs->r3; + break; + jNO_DEFAULT; + } + return 0; // unreachable... +} + +static __forceinline u32 setVifColRegs(u32 reg, u32 data) +{ + switch (reg) + { + case 0: + vifRegs->c0 = data; + break; + case 1: + vifRegs->c1 = data; + break; + case 2: + vifRegs->c2 = data; + break; + case 3: + vifRegs->c3 = data; + break; + jNO_DEFAULT; + } + return data; +} + +static __forceinline u32 getVifColRegs(u32 reg) +{ + switch (reg) + { + case 0: + return vifRegs->c0; + break; + case 1: + return vifRegs->c1; + break; + case 2: + return vifRegs->c2; + break; + case 3: + return vifRegs->c3; + break; + jNO_DEFAULT; + } + return 0; // unreachable... 
+} + +template< bool doMask > +static __releaseinline void writeXYZW(u32 offnum, u32 &dest, u32 data) +{ + int n; + u32 vifRowReg = getVifRowRegs(offnum); + + if (doMask) + { + switch (vif->cl) + { + case 0: + if (offnum == OFFSET_X) + n = (vifRegs->mask) & 0x3; + else + n = (vifRegs->mask >> (offnum * 2)) & 0x3; + break; + case 1: + n = (vifRegs->mask >> ( 8 + (offnum * 2))) & 0x3; + break; + case 2: + n = (vifRegs->mask >> (16 + (offnum * 2))) & 0x3; + break; + default: + n = (vifRegs->mask >> (24 + (offnum * 2))) & 0x3; + break; + } + } + else n = 0; + + switch (n) + { + case 0: + if ((vif->cmd & 0x6F) == 0x6f) + { + dest = data; + } + else switch (vifRegs->mode) + { + case 1: + dest = data + vifRowReg; + break; + case 2: + // vifRowReg isn't used after this, or I would make it equal to dest here. + dest = setVifRowRegs(offnum, vifRowReg + data); + break; + default: + dest = data; + break; + } + break; + case 1: + dest = vifRowReg; + break; + case 2: + dest = getVifColRegs((vif->cl > 2) ? 3 : vif->cl); + break; + case 3: + break; + } +// VIF_LOG("writeX %8.8x : Mode %d, r0 = %x, data %8.8x", *dest,vifRegs->mode,vifRegs->r0,data); +} + +template < bool doMask, class T > +static __forceinline void __fastcall UNPACK_S(u32 *dest, T *data, int size) +{ + //S-# will always be a complete packet, no matter what. 
So we can skip the offset bits + writeXYZW<doMask>(OFFSET_X, *dest++, *data); + writeXYZW<doMask>(OFFSET_Y, *dest++, *data); + writeXYZW<doMask>(OFFSET_Z, *dest++, *data); + writeXYZW<doMask>(OFFSET_W, *dest , *data); +} + +template <bool doMask, class T> +static __forceinline void __fastcall UNPACK_V2(u32 *dest, T *data, int size) +{ + if (vifRegs->offset == OFFSET_X) + { + if (size > 0) + { + writeXYZW<doMask>(vifRegs->offset, *dest++, *data++); + vifRegs->offset = OFFSET_Y; + size--; + } + } + + if (vifRegs->offset == OFFSET_Y) + { + if (size > 0) + { + writeXYZW<doMask>(vifRegs->offset, *dest++, *data); + vifRegs->offset = OFFSET_Z; + size--; + } + } + + if (vifRegs->offset == OFFSET_Z) + { + writeXYZW<doMask>(vifRegs->offset, dest[0], dest[-2]); dest++; // Z copies the X slot two words back; the old '*dest-2' parsed as (*dest)-2 and read dest in the same argument list as dest++. NOTE(review): confirm X-as-written (post mask/mode) is wanted rather than the raw input X. + vifRegs->offset = OFFSET_W; + } + + if (vifRegs->offset == OFFSET_W) + { + writeXYZW<doMask>(vifRegs->offset, *dest, *data); // data is not advanced after the Y write above, so W repeats the Y input + vifRegs->offset = OFFSET_X; + } +} + +template <bool doMask, class T> +static __forceinline void __fastcall UNPACK_V3(u32 *dest, T *data, int size) +{ + if(vifRegs->offset == OFFSET_X) + { + if (size > 0) + { + writeXYZW<doMask>(vifRegs->offset, *dest++, *data++); + vifRegs->offset = OFFSET_Y; + size--; + } + } + + if(vifRegs->offset == OFFSET_Y) + { + if (size > 0) + { + writeXYZW<doMask>(vifRegs->offset, *dest++, *data++); + vifRegs->offset = OFFSET_Z; + size--; + } + } + + if(vifRegs->offset == OFFSET_Z) + { + if (size > 0) + { + writeXYZW<doMask>(vifRegs->offset, *dest++, *data++); + vifRegs->offset = OFFSET_W; + size--; + } + } + + if(vifRegs->offset == OFFSET_W) + { + //V3-# does some bizarre thing with alignment, every 6qw of data the W becomes 0 (strange console!) + //Ape Escape doesn't seem to like it tho (what the hell?)
gonna have to investigate + writeXYZW<doMask>(vifRegs->offset, *dest, *data); + vifRegs->offset = OFFSET_X; + } +} + +template <bool doMask, class T> +static __forceinline void __fastcall UNPACK_V4(u32 *dest, T *data , int size) +{ + while (size > 0) + { + writeXYZW<doMask>(vifRegs->offset, *dest++, *data++); + vifRegs->offset++; + size--; + } + + if (vifRegs->offset > OFFSET_W) vifRegs->offset = OFFSET_X; +} + +template< bool doMask > +static __releaseinline void __fastcall UNPACK_V4_5(u32 *dest, u32 *data, int size) +{ + //As with S-#, this will always be a complete packet + writeXYZW<doMask>(OFFSET_X, *dest++, ((*data & 0x001f) << 3)); + writeXYZW<doMask>(OFFSET_Y, *dest++, ((*data & 0x03e0) >> 2)); + writeXYZW<doMask>(OFFSET_Z, *dest++, ((*data & 0x7c00) >> 7)); + writeXYZW<doMask>(OFFSET_W, *dest, ((*data & 0x8000) >> 8)); +} + +// ===================================================================================================== + +template < bool doMask, int size, class T > +static void __fastcall fUNPACK_S(u32 *dest, T *data) +{ + UNPACK_S<doMask>( dest, data, size ); +} + +template <bool doMask, int size, class T> +static void __fastcall fUNPACK_V2(u32 *dest, T *data) +{ + UNPACK_V2<doMask>( dest, data, size ); +} + +template <bool doMask, int size, class T> +static void __fastcall fUNPACK_V3(u32 *dest, T *data) +{ + UNPACK_V3<doMask>( dest, data, size ); +} + +template <bool doMask, int size, class T> +static void __fastcall fUNPACK_V4(u32 *dest, T *data) +{ + UNPACK_V4<doMask>( dest, data, size ); +} + +template< bool doMask > +static void __fastcall fUNPACK_V4_5(u32 *dest, u32 *data) +{ + UNPACK_V4_5<doMask>(dest, data, 0); // size is ignored. +} + +#define _upk (UNPACKFUNCTYPE) +#define _odd (UNPACKFUNCTYPE_ODD) + +// -------------------------------------------------------------------------------------- +// Main table for function unpacking. +// -------------------------------------------------------------------------------------- +// The extra data bsize/dsize/etc are all duplicated between the doMask enabled and +// disabled versions. This is probably simpler and more efficient than bothering +// to generate separate tables.
+ +// 32-bits versions are unsigned-only!! +#define UnpackFuncPair32( sizefac, vt, doMask ) \ + _upk fUNPACK_##vt, \ + _upk fUNPACK_##vt, \ + _odd UNPACK_##vt, \ + _odd UNPACK_##vt, + +#define UnpackFuncPair( sizefac, vt, bits, doMask ) \ + _upk fUNPACK_##vt, \ + _upk fUNPACK_##vt, \ + _odd UNPACK_##vt, \ + _odd UNPACK_##vt, + +#define UnpackFuncSet( doMask ) \ + { UnpackFuncPair32( 4, S, doMask ) /* 0x0 - S-32 */ \ + 1, 4, 4, 4 }, \ + { UnpackFuncPair ( 4, S, 16, doMask ) /* 0x1 - S-16 */ \ + 2, 2, 2, 4 }, \ + { UnpackFuncPair ( 4, S, 8, doMask ) /* 0x2 - S-8 */ \ + 4, 1, 1, 4 }, \ + { NULL, NULL, NULL, NULL, 0, 0, 0, 0 }, /* 0x3 (NULL) */ \ + \ + { UnpackFuncPair32( 2, V2, doMask ) /* 0x4 - V2-32 */ \ + 24, 4, 8, 2 }, \ + { UnpackFuncPair ( 2, V2, 16, doMask ) /* 0x5 - V2-16 */ \ + 12, 2, 4, 2 }, \ + { UnpackFuncPair ( 2, V2, 8, doMask ) /* 0x6 - V2-8 */ \ + 6, 1, 2, 2 }, \ + { NULL, NULL, NULL, NULL,0, 0, 0, 0 }, /* 0x7 (NULL) */ \ + \ + { UnpackFuncPair32( 3, V3, doMask ) /* 0x8 - V3-32 */ \ + 36, 4, 12, 3 }, \ + { UnpackFuncPair ( 3, V3, 16, doMask ) /* 0x9 - V3-16 */ \ + 18, 2, 6, 3 }, \ + { UnpackFuncPair ( 3, V3, 8, doMask ) /* 0xA - V3-8 */ \ + 9, 1, 3, 3 }, \ + { NULL, NULL, NULL, NULL,0, 0, 0, 0 }, /* 0xB (NULL) */ \ + \ + { UnpackFuncPair32( 4, V4, doMask ) /* 0xC - V4-32 */ \ + 48, 4, 16, 4 }, \ + { UnpackFuncPair ( 4, V4, 16, doMask ) /* 0xD - V4-16 */ \ + 24, 2, 8, 4 }, \ + { UnpackFuncPair ( 4, V4, 8, doMask ) /* 0xE - V4-8 */ \ + 12, 1, 4, 4 }, \ + { /* 0xF - V4-5 */ \ + _upk fUNPACK_V4_5, _upk fUNPACK_V4_5, \ + _odd UNPACK_V4_5, _odd UNPACK_V4_5, \ + 6, 2, 2, 4 }, + +const __aligned16 VIFUnpackFuncTable VIFfuncTable[32] = +{ + UnpackFuncSet( false ) + UnpackFuncSet( true ) +}; diff --git a/pcsx2/Vif.cpp b/pcsx2/Vif.cpp index 91e22ab558..6cef9f7344 100644 --- a/pcsx2/Vif.cpp +++ b/pcsx2/Vif.cpp @@ -18,7 +18,6 @@ #include "Common.h" #include -#include #include "Vif.h" #include "VifDma.h" @@ -33,386 +32,6 @@ __aligned16 VifMaskTypes g_vifmask; extern 
int g_vifCycles; -enum UnpackOffset -{ - OFFSET_X = 0, - OFFSET_Y = 1, - OFFSET_Z = 2, - OFFSET_W = 3 -}; - -static __forceinline u32 setVifRowRegs(u32 reg, u32 data) -{ - switch (reg) - { - case 0: - vifRegs->r0 = data; - break; - case 1: - vifRegs->r1 = data; - break; - case 2: - vifRegs->r2 = data; - break; - case 3: - vifRegs->r3 = data; - break; - jNO_DEFAULT; - } - return data; -} - -static __forceinline u32 getVifRowRegs(u32 reg) -{ - switch (reg) - { - case 0: - return vifRegs->r0; - break; - case 1: - return vifRegs->r1; - break; - case 2: - return vifRegs->r2; - break; - case 3: - return vifRegs->r3; - break; - jNO_DEFAULT; - } - return 0; // unreachable... -} - -static __forceinline u32 setVifColRegs(u32 reg, u32 data) -{ - switch (reg) - { - case 0: - vifRegs->c0 = data; - break; - case 1: - vifRegs->c1 = data; - break; - case 2: - vifRegs->c2 = data; - break; - case 3: - vifRegs->c3 = data; - break; - jNO_DEFAULT; - } - return data; -} - -static __forceinline u32 getVifColRegs(u32 reg) -{ - switch (reg) - { - case 0: - return vifRegs->c0; - break; - case 1: - return vifRegs->c1; - break; - case 2: - return vifRegs->c2; - break; - case 3: - return vifRegs->c3; - break; - jNO_DEFAULT; - } - return 0; // unreachable... 
-} - - -static __releaseinline void writeXYZW(u32 offnum, u32 &dest, u32 data) -{ - int n; - u32 vifRowReg = getVifRowRegs(offnum); - - if (vifRegs->code & 0x10000000) - { - switch (vif->cl) - { - case 0: - if (offnum == OFFSET_X) - n = (vifRegs->mask) & 0x3; - else - n = (vifRegs->mask >> (offnum * 2)) & 0x3; - break; - case 1: - n = (vifRegs->mask >> ( 8 + (offnum * 2))) & 0x3; - break; - case 2: - n = (vifRegs->mask >> (16 + (offnum * 2))) & 0x3; - break; - default: - n = (vifRegs->mask >> (24 + (offnum * 2))) & 0x3; - break; - } - } - else n = 0; - - switch (n) - { - case 0: - if ((vif->cmd & 0x6F) == 0x6f) - { - dest = data; - } - else switch (vifRegs->mode) - { - case 1: - dest = data + vifRowReg; - break; - case 2: - // vifRowReg isn't used after this, or I would make it equal to dest here. - dest = setVifRowRegs(offnum, vifRowReg + data); - break; - default: - dest = data; - break; - } - break; - case 1: - dest = vifRowReg; - break; - case 2: - dest = getVifColRegs((vif->cl > 2) ? 3 : vif->cl); - break; - case 3: - break; - } -// VIF_LOG("writeX %8.8x : Mode %d, r0 = %x, data %8.8x", *dest,vifRegs->mode,vifRegs->r0,data); -} - -template -void __fastcall UNPACK_S(u32 *dest, T *data, int size) -{ - //S-# will always be a complete packet, no matter what. 
So we can skip the offset bits - writeXYZW(OFFSET_X, *dest++, *data); - writeXYZW(OFFSET_Y, *dest++, *data); - writeXYZW(OFFSET_Z, *dest++, *data); - writeXYZW(OFFSET_W, *dest , *data); -} - -template -void __fastcall UNPACK_V2(u32 *dest, T *data, int size) -{ - if (vifRegs->offset == OFFSET_X) - { - if (size > 0) - { - writeXYZW(vifRegs->offset, *dest++, *data++); - vifRegs->offset = OFFSET_Y; - size--; - } - } - - if (vifRegs->offset == OFFSET_Y) - { - if (size > 0) - { - writeXYZW(vifRegs->offset, *dest++, *data); - vifRegs->offset = OFFSET_Z; - size--; - } - } - - if (vifRegs->offset == OFFSET_Z) - { - writeXYZW(vifRegs->offset, *dest++, *dest-2); - vifRegs->offset = OFFSET_W; - } - - if (vifRegs->offset == OFFSET_W) - { - writeXYZW(vifRegs->offset, *dest, *data); - vifRegs->offset = OFFSET_X; - } -} - -template -void __fastcall UNPACK_V3(u32 *dest, T *data, int size) -{ - if(vifRegs->offset == OFFSET_X) - { - if (size > 0) - { - writeXYZW(vifRegs->offset, *dest++, *data++); - vifRegs->offset = OFFSET_Y; - size--; - } - } - - if(vifRegs->offset == OFFSET_Y) - { - if (size > 0) - { - writeXYZW(vifRegs->offset, *dest++, *data++); - vifRegs->offset = OFFSET_Z; - size--; - } - } - - if(vifRegs->offset == OFFSET_Z) - { - if (size > 0) - { - writeXYZW(vifRegs->offset, *dest++, *data++); - vifRegs->offset = OFFSET_W; - size--; - } - } - - if(vifRegs->offset == OFFSET_W) - { - //V3-# does some bizzare thing with alignment, every 6qw of data the W becomes 0 (strange console!) - //Ape Escape doesnt seem to like it tho (what the hell?) 
gonna have to investigate - writeXYZW(vifRegs->offset, *dest, *data); - vifRegs->offset = OFFSET_X; - } -} - -template -void __fastcall UNPACK_V4(u32 *dest, T *data , int size) -{ - while (size > 0) - { - writeXYZW(vifRegs->offset, *dest++, *data++); - vifRegs->offset++; - size--; - } - - if (vifRegs->offset > OFFSET_W) vifRegs->offset = OFFSET_X; -} - -void __fastcall UNPACK_V4_5(u32 *dest, u32 *data, int size) -{ - //As with S-#, this will always be a complete packet - writeXYZW(OFFSET_X, *dest++, ((*data & 0x001f) << 3)); - writeXYZW(OFFSET_Y, *dest++, ((*data & 0x03e0) >> 2)); - writeXYZW(OFFSET_Z, *dest++, ((*data & 0x7c00) >> 7)); - writeXYZW(OFFSET_W, *dest, ((*data & 0x8000) >> 8)); -} - -void __fastcall UNPACK_S_32(u32 *dest, u32 *data, int size) -{ - UNPACK_S(dest, data, size); -} - -void __fastcall UNPACK_S_16s(u32 *dest, u32 *data, int size) -{ - s16 *sdata = (s16*)data; - UNPACK_S(dest, sdata, size); -} - -void __fastcall UNPACK_S_16u(u32 *dest, u32 *data, int size) -{ - u16 *sdata = (u16*)data; - UNPACK_S(dest, sdata, size); -} - -void __fastcall UNPACK_S_8s(u32 *dest, u32 *data, int size) -{ - s8 *cdata = (s8*)data; - UNPACK_S(dest, cdata, size); -} - -void __fastcall UNPACK_S_8u(u32 *dest, u32 *data, int size) -{ - u8 *cdata = (u8*)data; - UNPACK_S(dest, cdata, size); -} - -void __fastcall UNPACK_V2_32(u32 *dest, u32 *data, int size) -{ - UNPACK_V2(dest, data, size); -} - -void __fastcall UNPACK_V2_16s(u32 *dest, u32 *data, int size) -{ - s16 *sdata = (s16*)data; - UNPACK_V2(dest, sdata, size); -} - -void __fastcall UNPACK_V2_16u(u32 *dest, u32 *data, int size) -{ - u16 *sdata = (u16*)data; - UNPACK_V2(dest, sdata, size); -} - -void __fastcall UNPACK_V2_8s(u32 *dest, u32 *data, int size) -{ - s8 *cdata = (s8*)data; - UNPACK_V2(dest, cdata, size); -} - -void __fastcall UNPACK_V2_8u(u32 *dest, u32 *data, int size) -{ - u8 *cdata = (u8*)data; - UNPACK_V2(dest, cdata, size); -} - -void __fastcall UNPACK_V3_32(u32 *dest, u32 *data, int size) -{ - 
UNPACK_V3(dest, data, size); -} - -void __fastcall UNPACK_V3_16s(u32 *dest, u32 *data, int size) -{ - s16 *sdata = (s16*)data; - UNPACK_V3(dest, sdata, size); -} - -void __fastcall UNPACK_V3_16u(u32 *dest, u32 *data, int size) -{ - u16 *sdata = (u16*)data; - UNPACK_V3(dest, sdata, size); -} - -void __fastcall UNPACK_V3_8s(u32 *dest, u32 *data, int size) -{ - s8 *cdata = (s8*)data; - UNPACK_V3(dest, cdata, size); -} - -void __fastcall UNPACK_V3_8u(u32 *dest, u32 *data, int size) -{ - u8 *cdata = (u8*)data; - UNPACK_V3(dest, cdata, size); -} - -void __fastcall UNPACK_V4_32(u32 *dest, u32 *data , int size) -{ - UNPACK_V4(dest, data, size); -} - -void __fastcall UNPACK_V4_16s(u32 *dest, u32 *data, int size) -{ - s16 *sdata = (s16*)data; - UNPACK_V4(dest, sdata, size); -} - -void __fastcall UNPACK_V4_16u(u32 *dest, u32 *data, int size) -{ - u16 *sdata = (u16*)data; - UNPACK_V4(dest, sdata, size); -} - -void __fastcall UNPACK_V4_8s(u32 *dest, u32 *data, int size) -{ - s8 *cdata = (s8*)data; - UNPACK_V4(dest, cdata, size); -} - -void __fastcall UNPACK_V4_8u(u32 *dest, u32 *data, int size) -{ - u8 *cdata = (u8*)data; - UNPACK_V4(dest, cdata, size); -} - static __forceinline bool mfifoVIF1rbTransfer() { u32 maddr = dmacRegs->rbor.ADDR; diff --git a/pcsx2/Vif0Dma.cpp b/pcsx2/Vif0Dma.cpp index 53c5316352..38b059bd61 100644 --- a/pcsx2/Vif0Dma.cpp +++ b/pcsx2/Vif0Dma.cpp @@ -17,7 +17,6 @@ #include "PrecompiledHeader.h" #include "Common.h" -#include "VifDma.h" #include "VifDma_internal.h" #include "VUmicro.h" diff --git a/pcsx2/Vif1Dma.cpp b/pcsx2/Vif1Dma.cpp index 873ffd8c33..2931b80829 100644 --- a/pcsx2/Vif1Dma.cpp +++ b/pcsx2/Vif1Dma.cpp @@ -17,7 +17,6 @@ #include "PrecompiledHeader.h" #include "Common.h" -#include "VifDma.h" #include "VifDma_internal.h" #include "GS.h" diff --git a/pcsx2/VifDma.cpp b/pcsx2/VifDma.cpp index a27a88f0bb..16d6aa0dbd 100644 --- a/pcsx2/VifDma.cpp +++ b/pcsx2/VifDma.cpp @@ -16,7 +16,6 @@ #include "PrecompiledHeader.h" #include "Common.h" 
-#include "VifDma.h" #include "VifDma_internal.h" #include "VUmicro.h" @@ -32,53 +31,9 @@ extern "C" extern u32* vifRow; } -extern vifStruct *vif; - int g_vifCycles = 0; u8 s_maskwrite[256]; -/* block size; data size; group size; qword size; */ -#define _UNPACK_TABLE32(name, bsize, dsize, gsize, qsize) \ - { UNPACK_##name, UNPACK_##name, \ - bsize, dsize, gsize, qsize }, - -#define _UNPACK_TABLE(name, bsize, dsize, gsize, qsize) \ - { UNPACK_##name##u, UNPACK_##name##s, \ - bsize, dsize, gsize, qsize }, - -// Main table for function unpacking -const VIFUnpackFuncTable VIFfuncTable[16] = -{ - _UNPACK_TABLE32(S_32, 1, 4, 4, 4) // 0x0 - S-32 - _UNPACK_TABLE(S_16, 2, 2, 2, 4) // 0x1 - S-16 - _UNPACK_TABLE(S_8, 4, 1, 1, 4) // 0x2 - S-8 - { - NULL, NULL, 0, 0, 0, 0 - } - , // 0x3 - - _UNPACK_TABLE32(V2_32, 24, 4, 8, 2) // 0x4 - V2-32 - _UNPACK_TABLE(V2_16, 12, 2, 4, 2) // 0x5 - V2-16 - _UNPACK_TABLE(V2_8, 6, 1, 2, 2) // 0x6 - V2-8 - { - NULL, NULL, 0, 0, 0, 0 - } - , // 0x7 - - _UNPACK_TABLE32(V3_32, 36, 4, 12, 3) // 0x8 - V3-32 - _UNPACK_TABLE(V3_16, 18, 2, 6, 3) // 0x9 - V3-16 - _UNPACK_TABLE(V3_8, 9, 1, 3, 3) // 0xA - V3-8 - { - NULL, NULL, 0, 0, 0, 0 - } - , // 0xB - - _UNPACK_TABLE32(V4_32, 48, 4, 16, 4) // 0xC - V4-32 - _UNPACK_TABLE(V4_16, 24, 2, 8, 4) // 0xD - V4-16 - _UNPACK_TABLE(V4_8, 12, 1, 4, 4) // 0xE - V4-8 - _UNPACK_TABLE32(V4_5, 6, 2, 2, 4) // 0xF - V4-5 -}; - struct VIFSSEUnpackTable { // regular 0, 1, 2; mask 0, 1, 2 @@ -171,6 +126,9 @@ template void ProcessMemSkip(u32 size, u32 unpackType) { const VIFUnpackFuncTable *unpack; + // unpackType is only 0->0xf but that's ok, because the data we're using here is + // just duplicated in 0x10->0x1f. 
+ unpack = &VIFfuncTable[ unpackType ]; switch (unpackType) @@ -259,9 +217,6 @@ template u32 VIFalign<1>(u32 *data, vifCode *v, u32 size); template u32 VIFalign(u32 *data, vifCode *v, u32 size) { u32 *dest; - u32 unpackType; - UNPACKFUNCTYPE func; - const VIFUnpackFuncTable *ft; VURegs * VU; u8 *cdata = (u8*)data; @@ -290,11 +245,8 @@ template u32 VIFalign(u32 *data, vifCode *v, u32 size) VIF_LOG("VIF%d UNPACK Align: Mode=%x, v->size=%d, size=%d, v->addr=%x v->num=%x", VIFdmanum, v->cmd & 0xf, v->size, size, v->addr, vifRegs->num); - // The unpack type - unpackType = v->cmd & 0xf; - - ft = &VIFfuncTable[ unpackType ]; - func = vif->usn ? ft->funcU : ft->funcS; + const VIFUnpackFuncTable& ft( VIFfuncTable[ v->cmd & 0x1f ] ); + UNPACKFUNCTYPE func = vif->usn ? ft.funcU : ft.funcS; size <<= 2; memsize = size; @@ -311,17 +263,17 @@ template u32 VIFalign(u32 *data, vifCode *v, u32 size) VIFUNPACK_LOG("Aligning packet size = %d offset %d addr %x", size, vifRegs->offset, vif->tag.addr); - if (((u32)size / (u32)ft->dsize) < ((u32)ft->qsize - vifRegs->offset)) + if (((u32)size / (u32)ft.dsize) < ((u32)ft.qsize - vifRegs->offset)) { - DevCon.Error("Wasn't enough left size/dsize = %x left to write %x", (size / ft->dsize), (ft->qsize - vifRegs->offset)); + DevCon.Error("Wasn't enough left size/dsize = %x left to write %x", (size / ft.dsize), (ft.qsize - vifRegs->offset)); } - unpacksize = min((size / ft->dsize), (ft->qsize - vifRegs->offset)); + unpacksize = min((size / ft.dsize), (ft.qsize - vifRegs->offset)); - VIFUNPACK_LOG("Increasing dest by %x from offset %x", (4 - ft->qsize) + unpacksize, vifRegs->offset); + VIFUNPACK_LOG("Increasing dest by %x from offset %x", (4 - ft.qsize) + unpacksize, vifRegs->offset); - func(dest, (u32*)cdata, unpacksize); - size -= unpacksize * ft->dsize; + (vif->usn ? 
ft.oddU : ft.oddS)(dest, (u32*)cdata, unpacksize); + size -= unpacksize * ft.dsize; if(vifRegs->offset == 0) { @@ -339,13 +291,13 @@ template u32 VIFalign(u32 *data, vifCode *v, u32 size) { if (vifRegs->cycle.cl != vifRegs->cycle.wl) { - vif->tag.addr += (((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + ((4 - ft->qsize) + unpacksize)) * 4; - dest += ((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + (4 - ft->qsize) + unpacksize; + vif->tag.addr += (((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + ((4 - ft.qsize) + unpacksize)) * 4; + dest += ((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + (4 - ft.qsize) + unpacksize; } else { - vif->tag.addr += ((4 - ft->qsize) + unpacksize) * 4; - dest += (4 - ft->qsize) + unpacksize; + vif->tag.addr += ((4 - ft.qsize) + unpacksize) * 4; + dest += (4 - ft.qsize) + unpacksize; } if (vif->tag.addr >= (u32)vif_size(VIFdmanum)) @@ -354,7 +306,7 @@ template u32 VIFalign(u32 *data, vifCode *v, u32 size) dest = (u32*)(VU->Mem + v->addr); } - cdata += unpacksize * ft->dsize; + cdata += unpacksize * ft.dsize; vif->cl = 0; VIFUNPACK_LOG("Aligning packet done size = %d offset %d addr %x", size, vifRegs->offset, vif->tag.addr); if ((size & 0xf) == 0) return size >> 2; @@ -362,8 +314,8 @@ template u32 VIFalign(u32 *data, vifCode *v, u32 size) } else { - vif->tag.addr += ((4 - ft->qsize) + unpacksize) * 4; - dest += (4 - ft->qsize) + unpacksize; + vif->tag.addr += ((4 - ft.qsize) + unpacksize) * 4; + dest += (4 - ft.qsize) + unpacksize; if (vif->tag.addr >= (u32)vif_size(VIFdmanum)) { @@ -371,7 +323,7 @@ template u32 VIFalign(u32 *data, vifCode *v, u32 size) dest = (u32*)(VU->Mem + v->addr); } - cdata += unpacksize * ft->dsize; + cdata += unpacksize * ft.dsize; VIFUNPACK_LOG("Aligning packet done size = %d offset %d addr %x", size, vifRegs->offset, vif->tag.addr); } } @@ -391,11 +343,11 @@ template u32 VIFalign(u32 *data, vifCode *v, u32 size) VIFUNPACK_LOG("Continuing last stream size = %d offset %d addr %x", size, vifRegs->offset, 
vif->tag.addr); incdest = ((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + 4; - while ((size >= ft->gsize) && (vifRegs->num > 0)) + while ((size >= ft.gsize) && (vifRegs->num > 0)) { - func(dest, (u32*)cdata, ft->qsize); - cdata += ft->gsize; - size -= ft->gsize; + func(dest, (u32*)cdata); + cdata += ft.gsize; + size -= ft.gsize; vifRegs->num--; ++vif->cl; @@ -431,20 +383,20 @@ template u32 VIFalign(u32 *data, vifCode *v, u32 size) } } - if (size >= ft->dsize && vifRegs->num > 0 && ((size & 0xf) != 0 || vif->cl != 0)) + if (size >= ft.dsize && vifRegs->num > 0 && ((size & 0xf) != 0 || vif->cl != 0)) { //VIF_LOG("warning, end with size = %d", size); /* unpack one qword */ - if(vif->tag.addr + ((size / ft->dsize) * 4) >= (u32)vif_size(VIFdmanum)) + if(vif->tag.addr + ((size / ft.dsize) * 4) >= (u32)vif_size(VIFdmanum)) { //DevCon.Warning("Overflow"); vif->tag.addr &= (u32)(vif_size(VIFdmanum) - 1); dest = (u32*)(VU->Mem + v->addr); } - vif->tag.addr += (size / ft->dsize) * 4; + vif->tag.addr += (size / ft.dsize) * 4; - func(dest, (u32*)cdata, size / ft->dsize); + (vif->usn ? ft.oddU : ft.oddS)(dest, (u32*)cdata, size / ft.dsize); size = 0; if(vifRegs->mode == 2) @@ -468,9 +420,6 @@ template void VIFunpack(u32 *data, vifCode *v, u32 size) { //DevCon.WriteLn("vif#%d, size = %d [%x]", VIFdmanum, size, data); u32 *dest; - u32 unpackType; - UNPACKFUNCTYPE func; - const VIFUnpackFuncTable *ft; VURegs * VU; u8 *cdata = (u8*)data; u32 tempsize = 0; @@ -507,13 +456,10 @@ template void VIFunpack(u32 *data, vifCode *v, u32 size) VIFUNPACK_LOG("USN %x Masking %x Mask %x Mode %x CL %x WL %x Offset %x", vif->usn, (vifRegs->code & 0x10000000) >> 28, vifRegs->mask, vifRegs->mode, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->offset); - // The unpack type - unpackType = v->cmd & 0xf; - _mm_prefetch((char*)data + 128, _MM_HINT_NTA); - ft = &VIFfuncTable[ unpackType ]; - func = vif->usn ? 
ft->funcU : ft->funcS; + const VIFUnpackFuncTable& ft( VIFfuncTable[ v->cmd & 0x1f ] ); + UNPACKFUNCTYPE func = vif->usn ? ft.funcU : ft.funcS; size <<= 2; @@ -528,12 +474,12 @@ template void VIFunpack(u32 *data, vifCode *v, u32 size) dest = (u32*)(VU->Mem + v->addr); } - size = min(size, (int)vifRegs->num * ft->gsize); //size will always be the same or smaller + size = std::min(size, vifRegs->num * ft.gsize); //size will always be the same or smaller tempsize = vif->tag.addr + ((((vifRegs->num-1) / vifRegs->cycle.wl) * (vifRegs->cycle.cl - vifRegs->cycle.wl)) * 16) + (vifRegs->num * 16); - /*tempsize = vif->tag.addr + (((size / (ft->gsize * vifRegs->cycle.wl)) * + /*tempsize = vif->tag.addr + (((size / (ft.gsize * vifRegs->cycle.wl)) * (vifRegs->cycle.cl - vifRegs->cycle.wl)) * 16) + (vifRegs->num * 16);*/ //Sanity Check (memory overflow) @@ -562,7 +508,7 @@ template void VIFunpack(u32 *data, vifCode *v, u32 size) #endif } - if (size >= ft->gsize) + if (size >= ft.gsize) { const UNPACKPARTFUNCTYPESSE* pfn; int writemask; @@ -612,7 +558,7 @@ template void VIFunpack(u32 *data, vifCode *v, u32 size) vifRegs->cycle.cl = vifRegs->cycle.wl = 1; } - pfn = vif->usn ? VIFfuncTableSSE[unpackType].funcU : VIFfuncTableSSE[unpackType].funcS; + pfn = vif->usn ? VIFfuncTableSSE[v->cmd & 0xf].funcU : VIFfuncTableSSE[v->cmd & 0xf].funcS; writemask = VIFdmanum ? 
g_vif1HasMask3[min(vifRegs->cycle.wl,(u8)3)] : g_vif0HasMask3[min(vifRegs->cycle.wl,(u8)3)]; writemask = pfn[(((vifRegs->code & 0x10000000)>>28)<mode](dest, (u32*)cdata, size); @@ -630,20 +576,20 @@ template void VIFunpack(u32 *data, vifCode *v, u32 size) // if size is left over, update the src,dst pointers if (writemask > 0) { - int left = (size - writemask) / ft->gsize; - cdata += left * ft->gsize; + int left = (size - writemask) / ft.gsize; + cdata += left * ft.gsize; dest = (u32*)((u8*)dest + ((left / vifRegs->cycle.wl) * vifRegs->cycle.cl + left % vifRegs->cycle.wl) * 16); vifRegs->num -= left; - vif->cl = (size % (ft->gsize * vifRegs->cycle.wl)) / ft->gsize; + vif->cl = (size % (ft.gsize * vifRegs->cycle.wl)) / ft.gsize; size = writemask; - if (size >= ft->dsize && vifRegs->num > 0) + if (size >= ft.dsize && vifRegs->num > 0) { VIF_LOG("warning, end with size = %d", size); /* unpack one qword */ - //vif->tag.addr += (size / ft->dsize) * 4; - func(dest, (u32*)cdata, size / ft->dsize); + //vif->tag.addr += (size / ft.dsize) * 4; + (vif->usn ? 
ft.oddU : ft.oddS)(dest, (u32*)cdata, size / ft.dsize); size = 0; if(vifRegs->mode == 2) @@ -659,8 +605,8 @@ template void VIFunpack(u32 *data, vifCode *v, u32 size) } else { - vifRegs->num -= size / ft->gsize; - if (vifRegs->num > 0) vif->cl = (size % (ft->gsize * vifRegs->cycle.wl)) / ft->gsize; + vifRegs->num -= size / ft.gsize; + if (vifRegs->num > 0) vif->cl = (size % (ft.gsize * vifRegs->cycle.wl)) / ft.gsize; size = 0; } } @@ -669,11 +615,14 @@ template void VIFunpack(u32 *data, vifCode *v, u32 size) int incdest = ((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + 4; size = 0; int addrstart = v->addr; - if((tempsize >> 2) != vif->tag.size) DevCon.Warning("split when size != tagsize"); + + #ifndef NON_SSE_UNPACKS // spams pointlessly when SSE unpacks are disabled + //if((tempsize >> 2) != vif->tag.size) DevCon.Warning("split when size != tagsize"); + #endif VIFUNPACK_LOG("sorting tempsize :p, size %d, vifnum %d, addr %x", tempsize, vifRegs->num, vif->tag.addr); - while ((tempsize >= ft->gsize) && (vifRegs->num > 0)) + while ((tempsize >= ft.gsize) && (vifRegs->num > 0)) { if(v->addr >= memlimit) { @@ -682,9 +631,9 @@ template void VIFunpack(u32 *data, vifCode *v, u32 size) dest = (u32*)(VU->Mem + v->addr); } - func(dest, (u32*)cdata, ft->qsize); - cdata += ft->gsize; - tempsize -= ft->gsize; + func(dest, (u32*)cdata); + cdata += ft.gsize; + tempsize -= ft.gsize; vifRegs->num--; ++vif->cl; @@ -721,13 +670,13 @@ template void VIFunpack(u32 *data, vifCode *v, u32 size) if(tempsize > 0) size = tempsize; } - if (size >= ft->dsize && vifRegs->num > 0) //Else write what we do have + if (size >= ft.dsize && vifRegs->num > 0) //Else write what we do have { VIF_LOG("warning, end with size = %d", size); /* unpack one qword */ - //vif->tag.addr += (size / ft->dsize) * 4; - func(dest, (u32*)cdata, size / ft->dsize); + //vif->tag.addr += (size / ft.dsize) * 4; + (vif->usn ? 
ft.oddU : ft.oddS)(dest, (u32*)cdata, size / ft.dsize); size = 0; if(vifRegs->mode == 2) @@ -745,8 +694,8 @@ template void VIFunpack(u32 *data, vifCode *v, u32 size) { if(vifRegs->cycle.cl > 0) // Quicker and avoids zero division :P - if((u32)(((size / ft->gsize) / vifRegs->cycle.cl) * vifRegs->cycle.wl) < vifRegs->num) - DevCon.Warning("Filling write warning! %x < %x and CL = %x WL = %x", (size / ft->gsize), vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl); + if((u32)(((size / ft.gsize) / vifRegs->cycle.cl) * vifRegs->cycle.wl) < vifRegs->num) + DevCon.Warning("Filling write warning! %x < %x and CL = %x WL = %x", (size / ft.gsize), vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl); //DevCon.Warning("filling write %d cl %d, wl %d mask %x mode %x unpacktype %x addr %x", vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->mask, vifRegs->mode, unpackType, vif->tag.addr); while (vifRegs->num > 0) @@ -758,15 +707,16 @@ template void VIFunpack(u32 *data, vifCode *v, u32 size) if (vif->cl < vifRegs->cycle.cl) /* unpack one qword */ { - if(size < ft->gsize) + if(size < ft.gsize) { VIF_LOG("Out of Filling write data"); break; } - func(dest, (u32*)cdata, ft->qsize); - cdata += ft->gsize; - size -= ft->gsize; + func(dest, (u32*)cdata); + cdata += ft.gsize; + size -= ft.gsize; + vif->cl++; vifRegs->num--; @@ -777,7 +727,7 @@ template void VIFunpack(u32 *data, vifCode *v, u32 size) } else { - func(dest, (u32*)cdata, ft->qsize); + func(dest, (u32*)cdata); vif->tag.addr += 16; vifRegs->num--; ++vif->cl; diff --git a/pcsx2/VifDma.h b/pcsx2/VifDma.h index 0882ac91e6..840670a322 100644 --- a/pcsx2/VifDma.h +++ b/pcsx2/VifDma.h @@ -47,40 +47,6 @@ extern vifStruct vif0, vif1; extern u8 schedulepath3msk; static const int VifCycleVoodoo = 4; -void __fastcall UNPACK_S_32( u32 *dest, u32 *data, int size ); - -void __fastcall UNPACK_S_16u( u32 *dest, u32 *data, int size ); -void __fastcall UNPACK_S_16s( u32 *dest, u32 *data, int size ); - -void __fastcall UNPACK_S_8u( u32 
*dest, u32 *data, int size ); -void __fastcall UNPACK_S_8s( u32 *dest, u32 *data, int size ); - -void __fastcall UNPACK_V2_32( u32 *dest, u32 *data, int size ); - -void __fastcall UNPACK_V2_16u( u32 *dest, u32 *data, int size ); -void __fastcall UNPACK_V2_16s( u32 *dest, u32 *data, int size ); - -void __fastcall UNPACK_V2_8u( u32 *dest, u32 *data, int size ); -void __fastcall UNPACK_V2_8s( u32 *dest, u32 *data, int size ); - -void __fastcall UNPACK_V3_32( u32 *dest, u32 *data, int size ); - -void __fastcall UNPACK_V3_16u( u32 *dest, u32 *data, int size ); -void __fastcall UNPACK_V3_16s( u32 *dest, u32 *data, int size ); - -void __fastcall UNPACK_V3_8u( u32 *dest, u32 *data, int size ); -void __fastcall UNPACK_V3_8s( u32 *dest, u32 *data, int size ); - -void __fastcall UNPACK_V4_32( u32 *dest, u32 *data, int size ); - -void __fastcall UNPACK_V4_16u( u32 *dest, u32 *data, int size ); -void __fastcall UNPACK_V4_16s( u32 *dest, u32 *data, int size ); - -void __fastcall UNPACK_V4_8u( u32 *dest, u32 *data, int size ); -void __fastcall UNPACK_V4_8s( u32 *dest, u32 *data, int size ); - -void __fastcall UNPACK_V4_5( u32 *dest, u32 *data, int size ); - extern void vifDmaInit(); extern void vif0Init(); diff --git a/pcsx2/VifDma_internal.h b/pcsx2/VifDma_internal.h index 7ce8556ebf..af0651e607 100644 --- a/pcsx2/VifDma_internal.h +++ b/pcsx2/VifDma_internal.h @@ -16,6 +16,8 @@ #ifndef __VIFDMA_INTERNAL_H__ #define __VIFDMA_INTERNAL_H__ +#include "VifDma.h" + enum VifModes { VIF_NORMAL_TO_MEM_MODE = 0, @@ -27,7 +29,8 @@ enum VifModes static const unsigned int VIF0intc = 4; static const unsigned int VIF1intc = 5; -typedef void (__fastcall *UNPACKFUNCTYPE)(u32 *dest, u32 *data, int size); +typedef void (__fastcall *UNPACKFUNCTYPE)(u32 *dest, u32 *data); +typedef void (__fastcall *UNPACKFUNCTYPE_ODD)(u32 *dest, u32 *data, int size); typedef int (*UNPACKPARTFUNCTYPESSE)(u32 *dest, u32 *data, int size); struct VIFUnpackFuncTable @@ -35,18 +38,23 @@ struct VIFUnpackFuncTable 
UNPACKFUNCTYPE funcU; UNPACKFUNCTYPE funcS; - u32 bsize; // currently unused - u32 dsize; // byte size of one channel - u32 gsize; // size of data in bytes used for each write cycle - u32 qsize; // used for unpack parts, num of vectors that + UNPACKFUNCTYPE_ODD oddU; // needed for old-style vif only, remove when old vif is removed. + UNPACKFUNCTYPE_ODD oddS; // needed for old-style vif only, remove when old vif is removed. + + u8 bsize; // currently unused + u8 dsize; // byte size of one channel + u8 gsize; // size of data in bytes used for each write cycle + u8 qsize; // used for unpack parts, num of vectors that // will be decompressed from data for 1 cycle }; -extern const VIFUnpackFuncTable VIFfuncTable[16]; +extern const __aligned16 VIFUnpackFuncTable VIFfuncTable[32]; + extern __aligned16 u32 g_vif0Masks[64], g_vif1Masks[64]; extern u32 g_vif0HasMask3[4], g_vif1HasMask3[4]; extern int g_vifCycles; extern u8 s_maskwrite[256]; +extern vifStruct *vif; template void ProcessMemSkip(u32 size, u32 unpackType); template u32 VIFalign(u32 *data, vifCode *v, u32 size); @@ -63,4 +71,9 @@ static __forceinline u32 vif_size(u8 num) //#define newVif // Enable 'newVif' Code (if the below macros are not defined, it will use old non-sse code) //#define newVif1 // Use New Code for Vif1 Unpacks (needs newVif defined) //#define newVif0 // Use New Code for Vif0 Unpacks (not implemented) + +#ifndef newVif +//# define NON_SSE_UNPACKS // Turns off SSE Unpacks (slower) +#endif + #endif diff --git a/pcsx2/windows/VCprojects/pcsx2_2008.vcproj b/pcsx2/windows/VCprojects/pcsx2_2008.vcproj index 7baa9e8d78..7fec7f62fd 100644 --- a/pcsx2/windows/VCprojects/pcsx2_2008.vcproj +++ b/pcsx2/windows/VCprojects/pcsx2_2008.vcproj @@ -824,6 +824,10 @@ RelativePath="..\..\VifDma_internal.h" > + + diff --git a/pcsx2/x86/newVif.h b/pcsx2/x86/newVif.h index db1ad9fed0..07ff4a176d 100644 --- a/pcsx2/x86/newVif.h +++ b/pcsx2/x86/newVif.h @@ -24,8 +24,8 @@ extern void _nVifUnpack(int idx, u8 *data, u32 
size); typedef u32 (__fastcall *nVifCall)(void*, void*); -static __pagealigned u8 nVifUpkExec[__pagesize*16]; -static __aligned16 nVifCall nVifUpk[(2*2*16)*4]; // ([USN][Masking][Unpack Type]) [curCycle] +static __pagealigned u8 nVifUpkExec[__pagesize*4]; +static __aligned16 nVifCall nVifUpk[(2*2*16) *4 ]; // ([USN][Masking][Unpack Type]) [curCycle] static __aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][Vector] #define _1mb (0x100000) @@ -57,7 +57,30 @@ struct nVifStruct { BlockBuffer* vifCache; // Block Buffer }; -static const u32 nVifT[16] = { +// Contents of this table are doubled up for doMask(false) and doMask(true) lookups. +// (note: currently unused, I'm using gsize in the interp tables instead since it +// seems to be faster for now, which may change when nVif isn't reliant on interpreted +// unpackers anymore --air) +static const u32 nVifT[32] = { + 4, // S-32 + 2, // S-16 + 1, // S-8 + 0, // ---- + 8, // V2-32 + 4, // V2-16 + 2, // V2-8 + 0, // ---- + 12,// V3-32 + 6, // V3-16 + 3, // V3-8 + 0, // ---- + 16,// V4-32 + 8, // V4-16 + 4, // V4-8 + 2, // V4-5 + + // Second verse, same as the first! 
+ 4, // S-32 2, // S-16 1, // S-8 @@ -77,8 +100,8 @@ static const u32 nVifT[16] = { }; #include "newVif_OldUnpack.inl" -#include "newVif_UnpackGen.inl" #include "newVif_Unpack.inl" +#include "newVif_UnpackGen.inl" //#include "newVif_Dynarec.inl" diff --git a/pcsx2/x86/newVif_OldUnpack.inl b/pcsx2/x86/newVif_OldUnpack.inl index 6dcaebf2b1..8bfb1b0c99 100644 --- a/pcsx2/x86/newVif_OldUnpack.inl +++ b/pcsx2/x86/newVif_OldUnpack.inl @@ -21,8 +21,6 @@ template void VIFunpack<0>(u32 *data, vifCode *v, u32 size); template void VIFunpack<1>(u32 *data, vifCode *v, u32 size); template void VIFunpack(u32 *data, vifCode *v, u32 size) { //if (!VIFdmanum) DevCon.WriteLn("vif#%d, size = %d [%x]", VIFdmanum, size, data); - UNPACKFUNCTYPE func; - const VIFUnpackFuncTable *ft; VURegs * VU; u8 *cdata = (u8*)data; u32 tempsize = 0; @@ -44,10 +42,10 @@ template void VIFunpack(u32 *data, vifCode *v, u32 size) { } u32 *dest = (u32*)(VU->Mem + v->addr); - u32 unpackType = v->cmd & 0xf; - ft = &VIFfuncTable[ unpackType ]; - func = vif->usn ? ft->funcU : ft->funcS; + const VIFUnpackFuncTable& ft( VIFfuncTable[ v->cmd & 0x1f ] ); + UNPACKFUNCTYPE func = vif->usn ? 
ft.funcU : ft.funcS; + size <<= 2; if (vifRegs->cycle.cl >= vifRegs->cycle.wl) { // skipping write @@ -57,7 +55,7 @@ template void VIFunpack(u32 *data, vifCode *v, u32 size) { dest = (u32*)(VU->Mem + v->addr); } - size = min(size, (int)vifRegs->num * ft->gsize); //size will always be the same or smaller + size = std::min(size, vifRegs->num * ft.gsize); //size will always be the same or smaller tempsize = v->addr + ((((vifRegs->num-1) / vifRegs->cycle.wl) * (vifRegs->cycle.cl - vifRegs->cycle.wl)) * 16) + (vifRegs->num * 16); @@ -90,16 +88,16 @@ template void VIFunpack(u32 *data, vifCode *v, u32 size) { VIFUNPACK_LOG("sorting tempsize :p, size %d, vifnum %d, addr %x", tempsize, vifRegs->num, v->addr); - while ((tempsize >= ft->gsize) && (vifRegs->num > 0)) { + while ((tempsize >= ft.gsize) && (vifRegs->num > 0)) { if(v->addr >= memlimit) { DevCon.Warning("Mem limit overflow"); v->addr &= (memlimit - 1); dest = (u32*)(VU->Mem + v->addr); } - func(dest, (u32*)cdata, ft->qsize); - cdata += ft->gsize; - tempsize -= ft->gsize; + func(dest, (u32*)cdata); + cdata += ft.gsize; + tempsize -= ft.gsize; vifRegs->num--; vif->cl++; @@ -122,32 +120,32 @@ template void VIFunpack(u32 *data, vifCode *v, u32 size) { if(tempsize > 0) size = tempsize; } - if (size >= ft->dsize && vifRegs->num > 0) { //Else write what we do have + if (size >= ft.dsize && vifRegs->num > 0) { //Else write what we do have DevCon.Warning("huh!!!!!!!!!!!!!!!!!!!!!!"); VIF_LOG("warning, end with size = %d", size); // unpack one qword - //v->addr += (size / ft->dsize) * 4; - func(dest, (u32*)cdata, size / ft->dsize); + //v->addr += (size / ft.dsize) * 4; + (vif->usn ? 
ft.oddU : ft.oddS)(dest, (u32*)cdata, size / ft.dsize); size = 0; VIFUNPACK_LOG("leftover done, size %d, vifnum %d, addr %x", size, vifRegs->num, v->addr); } } else { // filling write if(vifRegs->cycle.cl > 0) // Quicker and avoids zero division :P - if((u32)(((size / ft->gsize) / vifRegs->cycle.cl) * vifRegs->cycle.wl) < vifRegs->num) - DevCon.Warning("Filling write warning! %x < %x and CL = %x WL = %x", (size / ft->gsize), vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl); + if((u32)(((size / ft.gsize) / vifRegs->cycle.cl) * vifRegs->cycle.wl) < vifRegs->num) + DevCon.Warning("Filling write warning! %x < %x and CL = %x WL = %x", (size / ft.gsize), vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl); - DevCon.Warning("filling write %d cl %d, wl %d mask %x mode %x unpacktype %x addr %x", vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->mask, vifRegs->mode, unpackType, vif->tag.addr); + DevCon.Warning("filling write %d cl %d, wl %d mask %x mode %x unpacktype %x addr %x", vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->mask, vifRegs->mode, v->cmd & 0xf, vif->tag.addr); while (vifRegs->num > 0) { if (vif->cl == vifRegs->cycle.wl) { vif->cl = 0; } // unpack one qword if (vif->cl < vifRegs->cycle.cl) { - if(size < ft->gsize) { DevCon.WriteLn("Out of Filling write data!"); break; } - func(dest, (u32*)cdata, ft->qsize); - cdata += ft->gsize; - size -= ft->gsize; + if(size < ft.gsize) { DevCon.WriteLn("Out of Filling write data!"); break; } + func(dest, (u32*)cdata); + cdata += ft.gsize; + size -= ft.gsize; vif->cl++; vifRegs->num--; if (vif->cl == vifRegs->cycle.wl) { @@ -155,7 +153,7 @@ template void VIFunpack(u32 *data, vifCode *v, u32 size) { } } else { - func(dest, (u32*)cdata, ft->qsize); + func(dest, (u32*)cdata); v->addr += 16; vifRegs->num--; vif->cl++; diff --git a/pcsx2/x86/newVif_Unpack.inl b/pcsx2/x86/newVif_Unpack.inl index 8ae66f019b..fad9bf3ce0 100644 --- a/pcsx2/x86/newVif_Unpack.inl +++ b/pcsx2/x86/newVif_Unpack.inl @@ -21,29 
+21,6 @@ static __aligned16 nVifStruct nVif[2]; -void initNewVif(int idx) { - nVif[idx].idx = idx; - nVif[idx].VU = idx ? &VU1 : &VU0; - nVif[idx].vif = idx ? &vif1 : &vif0; - nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs; - nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000)); - nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0; - nVif[idx].vifCache = NULL; - - HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadWrite, false); - memset8<0xcc>( nVifUpkExec ); - - xSetPtr( nVifUpkExec ); - - for (int a = 0; a < 2; a++) { - for (int b = 0; b < 2; b++) { - for (int c = 0; c < 4; c++) { - nVifGen(a, b, c); - }}} - - HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadOnly, true); -} - int nVifUnpack(int idx, u32 *data) { XMMRegisters::Freeze(); int ret = aMin(vif1.vifpacketsize, vif1.tag.size); @@ -108,65 +85,76 @@ static void setMasks(int idx, const VIFregisters& v) { // ---------------------------------------------------------------------------- // Unpacking Optimization notes: // ---------------------------------------------------------------------------- -// Some games send a LOT of small packets. This is a problem because the new VIF unpacker -// has a lot of setup code to establish which unpack function to call. The best way to -// optimize this is to cache the unpack function's base (see fnbase below) and update it -// when the variables it depends on are modified: writes to vif->tag.cmd and vif->usn. -// Problem: vif->tag.cmd is modified a lot. Like, constantly. So won't work. +// Some games send a LOT of single-cycle packets (God of War, SotC, TriAce games, etc), +// so we always need to be wary of keeping loop setup code optimized. It's not always +// a "win" to move code outside the loop, like normally in most other loop scenarios. // -// A secondary optimization would be adding special handlers for packets where vifRegs->num==1. -// (which would remove the loop, simplify the incVUptr code, etc). 
But checking for it has -// to be simple enough that it doesn't offset the benefits (which I'm not sure is possible). -// -- air +// The biggest bottleneck of the current code is the call/ret needed to invoke the SSE +// unpackers. A better option is to generate the entire vifRegs->num loop code as part +// of the SSE template, and inline the SSE code into the heart of it. This both avoids +// the call/ret and opens the door for resolving some register dependency chains in the +// current emitted functions. (this is what zero's SSE does to get its final bit of +// speed advantage over the new vif). --air +// +// As a secondary optimization to above, special handlers could be generated for the +// cycleSize==1 case, which is used frequently enough, and results in enough code +// elimination that it would probably be a win in most cases (and for sure in many +// "slow" games that need it most). --air template< int idx, bool doMode, bool isFill > __releaseinline void __fastcall _nVifUnpackLoop(u8 *data, u32 size) { - const int usn = !!(vif->usn); - const int doMask = !!(vif->tag.cmd & 0x10); - const int upkNum = vif->tag.cmd & 0xf; - const u32& vift = nVifT[upkNum]; - - u8* dest = setVUptr(idx, vif->tag.addr); - const VIFUnpackFuncTable& ft = VIFfuncTable[upkNum]; - UNPACKFUNCTYPE func = usn ? ft.funcU : ft.funcS; - - // Did a bunch of work to make it so I could optimize this index lookup to outside - // the main loop but it was for naught -- too often the loop is only 1-2 iterations, - // so this setup code ends up being slower (1 iter) or same speed (2 iters). - const nVifCall* fnbase = &nVifUpk[ ((usn*2*16) + (doMask*16) + (upkNum)) * (4*1) ]; - const int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl; const int blockSize = isFill ? 
vifRegs->cycle.wl : vifRegs->cycle.cl; const int skipSize = blockSize - cycleSize; + //if (skipSize > 2) //DevCon.WriteLn("[num = %d][cl = %d][bl = %d][diff = %d]", vifRegs->num, vif->cl, blockSize, skipSize); - // This condition doesn't appear to ever occur, and really it never should. - // Normally it wouldn't matter, but even simple setup code matters here (see - // optimization notes above) >_< + if (vif->cmd & 0x10) setMasks(idx, *vifRegs); + + const int usn = !!(vif->usn); + const int upkNum = vif->cmd & 0x1f; + //const s8& vift = nVifT[upkNum]; // might be useful later when other SSE paths are finished. + + // Recompiled Unpacker, used when doMode is false. + // Did a bunch of work to make it so I could optimize this index lookup to outside + // the main loop but it was for naught -- too often the loop is only 1-2 iterations, + // so this setup code ends up being slower (1 iter) or same speed (2 iters). + const nVifCall* fnbase = &nVifUpk[ ((usn*2*16) + upkNum) * (4*1) ]; + + // Interpreted Unpacker, used if doMode is true OR if isFill is true. Lookup is + // always performed for now, due to ft.gsize reference (seems faster than using + // nVifT for now) + const VIFUnpackFuncTable& ft = VIFfuncTable[upkNum]; + UNPACKFUNCTYPE func = usn ? ft.funcU : ft.funcS; + + u8* dest = setVUptr(idx, vif->tag.addr); + if (vif->cl >= blockSize) vif->cl = 0; - if (doMask) setMasks(idx, *vifRegs); while (vifRegs->num /*&& size*/) { if (vif->cl < cycleSize) { if (doMode /*|| doMask*/) { //if (doMask) //DevCon.WriteLn("Non SSE; unpackNum = %d", upkNum); - func((u32*)dest, (u32*)data, ft.qsize); + func((u32*)dest, (u32*)data); } else { //DevCon.WriteLn("SSE Unpack!"); - fnbase[aMin(vif->cl, 4)](dest, data); + + // Opt note: removing this min check (which isn't needed right now?) is +1% + // or more. Just something to keep in mind. 
:) --air + fnbase[0/*aMin(vif->cl, 4)*/](dest, data); } - data += vift; - size -= vift; + data += ft.gsize; + size -= ft.gsize; vifRegs->num--; incVUptr(idx, dest, 16); if (++vif->cl == blockSize) vif->cl = 0; } else if (isFill) { - func((u32*)dest, (u32*)data, ft.qsize); + func((u32*)dest, (u32*)data); vifRegs->num--; incVUptr(idx, dest, 16); if (++vif->cl == blockSize) vif->cl = 0; @@ -179,7 +167,24 @@ __releaseinline void __fastcall _nVifUnpackLoop(u8 *data, u32 size) { //if (size > 0) DevCon.WriteLn("size = %d", size); } -void _nVifUnpack(int idx, u8 *data, u32 size) { +typedef void (__fastcall* Fnptr_VifUnpackLoop)(u8 *data, u32 size); + +static const __aligned16 Fnptr_VifUnpackLoop UnpackLoopTable[2][2][2] = +{ + { + { _nVifUnpackLoop<0,false,false>, _nVifUnpackLoop<0,false,true> }, + { _nVifUnpackLoop<0,true,false>, _nVifUnpackLoop<0,true,true> }, + }, + + { + { _nVifUnpackLoop<1,false,false>, _nVifUnpackLoop<1,false,true> }, + { _nVifUnpackLoop<1,true,false>, _nVifUnpackLoop<1,true,true> }, + }, + +}; + + +static _f void _nVifUnpack(int idx, u8 *data, u32 size) { /*if (nVif[idx].vifRegs->cycle.cl >= nVif[idx].vifRegs->cycle.wl) { // skipping write if (!idx) VIFunpack<0>((u32*)data, &vif0.tag, size>>2); else VIFunpack<1>((u32*)data, &vif1.tag, size>>2); @@ -192,19 +197,7 @@ void _nVifUnpack(int idx, u8 *data, u32 size) { const bool doMode = vifRegs->mode && !(vif->tag.cmd & 0x10); const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl); - //UnpackLoopTable[idx][doMode][isFill]( data, size ); - - if (idx) { - if (doMode) { - if (isFill) _nVifUnpackLoop<1,true,true> (data, size); - else _nVifUnpackLoop<1,true,false> (data, size); - } - else { - if (isFill) _nVifUnpackLoop<1,false,true> (data, size); - else _nVifUnpackLoop<1,false,false>(data, size); - } - } - else pxFailDev( "No VIF0 support yet, sorry!" ); + UnpackLoopTable[idx][doMode][isFill]( data, size ); //if (isFill) //DevCon.WriteLn("%s Write! 
[num = %d][%s]", (isFill?"Filling":"Skipping"), vifRegs->num, (vifRegs->num%3 ? "bad!" : "ok")); diff --git a/pcsx2/x86/newVif_UnpackGen.inl b/pcsx2/x86/newVif_UnpackGen.inl index 6f24a7aea5..89e39d1c96 100644 --- a/pcsx2/x86/newVif_UnpackGen.inl +++ b/pcsx2/x86/newVif_UnpackGen.inl @@ -43,7 +43,7 @@ struct VifUnpackIndexer { int packpart = packType; int curpart = curCycle; - return nVifUpk[((usnpart+maskpart+packpart)*4) + (curpart)]; + return nVifUpk[((usnpart+maskpart+packpart) * 4) + (curpart)]; } void xSetCall(int packType) const { @@ -158,6 +158,12 @@ void nVifGen(int usn, int mask, int curCycle) { // A | B5 | G5 | R5 // ..0.. A 0000000 | ..0.. B 000 | ..0.. G 000 | ..0.. R 000 + + // Optimization: This function has a *really* long dependency chain. + // It would be better if the [edx] is loaded into multiple regs and + // then the regs are shifted each independently, instead of using the + // progressive shift->move pattern below. --air + indexer.xSetCall(0xf); // V4-5 xMOV16 (xmm0, ptr32[edx]); xMOVAPS (xmm1, xmm0); @@ -184,3 +190,27 @@ void nVifGen(int usn, int mask, int curCycle) { pxAssert( ((uptr)xGetPtr() - (uptr)nVifUpkExec) < sizeof(nVifUpkExec) ); } + +void initNewVif(int idx) { + nVif[idx].idx = idx; + nVif[idx].VU = idx ? &VU1 : &VU0; + nVif[idx].vif = idx ? &vif1 : &vif0; + nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs; + nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000)); + nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0; + nVif[idx].vifCache = NULL; + + HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadWrite, false); + memset8<0xcc>( nVifUpkExec ); + + xSetPtr( nVifUpkExec ); + + for (int a = 0; a < 2; a++) { + for (int b = 0; b < 2; b++) { + for (int c = 0; c < 4; c++) { + nVifGen(a, b, c); + } + }} + + HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadOnly, true); +}