Significant VIFunpack retooling. Interpreters are considerably more efficient, and Recompilers are slightly more efficient. Details:

* All remaining code for handling partial/fragmented unpacks removed.
 * vifRegs.NUM is now accurately simulated when queuing data from fragmented unpacks.
 * Reduced the VIFunpack fragment buffer from 1MB to 4KB (max size of an unpack due to NUM being limited to 8 bits).
 * Removed vif/vifRegs globals formerly used by VIF interpreters (everything relies on the templated vifIdx now -- simpler and faster!)
 * g_vifMask vars are integrated into vifStruct.
 * All VIF mask register stuff uses the SSE-friendly vifStruct.MaskRow/Col vars now.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3762 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
Jake.Stine 2010-09-13 21:13:10 +00:00
parent e2cb52becf
commit 2f8f86a3eb
13 changed files with 374 additions and 388 deletions

View File

@ -48,7 +48,15 @@ mem32_t __fastcall _hwRead32(u32 mem)
case 0x02: return ipuRead32( mem );
case 0x03: return dmacRead32<0x03>( mem );
case 0x03:
if (mem >= EEMemoryMap::VIF0_Start)
{
if(mem >= EEMemoryMap::VIF1_Start)
return vifRead32<1>(mem);
else
return vifRead32<0>(mem);
}
return dmacRead32<0x03>( mem );
case 0x04:
case 0x05:

View File

@ -21,8 +21,8 @@
#include "GS.h"
#include "Gif.h"
vifStruct vif0;
vifStruct vif1;
__aligned16 vifStruct vif0, vif1;
tGSTransferStatus GSTransferStatus((STOPPED_MODE<<8) | (STOPPED_MODE<<4) | STOPPED_MODE);
void vif0Reset()
@ -31,10 +31,7 @@ void vif0Reset()
memzero(vif0);
memzero(vif0Regs);
vif0Regs.stat.VPS = VPS_IDLE;
vif0Regs.stat.FQC = 0;
vif0.done = false;
vif0.regs = &vif0Regs;
resetNewVif(0);
}
@ -45,11 +42,7 @@ void vif1Reset()
memzero(vif1);
memzero(vif1Regs);
vif1Regs.stat.VPS = VPS_IDLE;
vif1Regs.stat.FQC = 0; // FQC=0
vif1.done = false;
cpuRegs.interrupt &= ~((1 << 1) | (1 << 10)); //Stop all vif1 DMA's
vif1.regs = &vif1Regs;
resetNewVif(1);
}
@ -58,7 +51,6 @@ void SaveStateBase::vif0Freeze()
{
FreezeTag("VIFdma");
Freeze(g_vifCycles); // Dunno if this one is needed, but whatever, it's small. :)
Freeze(g_vifmask); // mask settings for VIF0 and VIF1
Freeze(vif0);
Freeze(nVif[0].bSize);
@ -88,6 +80,7 @@ __fi void vif0FBRST(u32 value) {
//Console.WriteLn("Vif0 Reset %x", vif0Regs.stat._u32);
memzero(vif0);
vif0.regs = &vif0Regs;
vif0ch.qwc = 0; //?
cpuRegs.interrupt &= ~1; //Stop all vif0 DMA's
psHu64(VIF0_FIFO) = 0;
@ -147,6 +140,8 @@ __fi void vif1FBRST(u32 value) {
if (FBRST(value).RST) // Reset Vif.
{
memzero(vif1);
vif1.regs = &vif1Regs;
//cpuRegs.interrupt &= ~((1 << 1) | (1 << 10)); //Stop all vif1 DMA's
vif1ch.qwc -= min((int)vif1ch.qwc, 16); //?
psHu64(VIF1_FIFO) = 0;
@ -271,9 +266,29 @@ __fi void vif1STAT(u32 value) {
#define caseVif(x) (idx ? VIF1_##x : VIF0_##x)
// Handles a 32-bit read from the VIF register range for the unit selected by
// the `idx` template parameter (caseVif picks VIF1_* when idx is non-zero and
// VIF0_* otherwise).
//
// The ROW0-3 and COL0-3 registers are served from the MaskRow / MaskCol
// members of the vifStruct rather than from the hardware register map; the
// matching cases in vifWrite32 store into those same members and return false.
// Every other address falls through to a plain psHu32(mem) read.
//
// NOTE(review): GetVifX presumably resolves to vif0 / vif1 based on idx --
// its definition is not visible in this hunk; confirm.
_vifT __fi u32 vifRead32(u32 mem) {
vifStruct& vif = GetVifX;
switch (mem) {
// Row registers: one 32-bit lane of MaskRow each.
case caseVif(ROW0): return vif.MaskRow._u32[0];
case caseVif(ROW1): return vif.MaskRow._u32[1];
case caseVif(ROW2): return vif.MaskRow._u32[2];
case caseVif(ROW3): return vif.MaskRow._u32[3];
// Column registers: one 32-bit lane of MaskCol each.
case caseVif(COL0): return vif.MaskCol._u32[0];
case caseVif(COL1): return vif.MaskCol._u32[1];
case caseVif(COL2): return vif.MaskCol._u32[2];
case caseVif(COL3): return vif.MaskCol._u32[3];
}
// Not a row/col register: read the value straight from the register map.
return psHu32(mem);
}
// returns FALSE if no writeback is needed (or writeback is handled internally)
// returns TRUE if the caller should writeback the value to the eeHw register map.
_vifT __fi bool vifWrite32(u32 mem, u32 value) {
vifStruct& vif = GetVifX;
switch (mem) {
case caseVif(MARK):
VIF_LOG("VIF%d_MARK write32 0x%8.8x", idx, value);
@ -297,33 +312,23 @@ _vifT __fi bool vifWrite32(u32 mem, u32 value) {
// standard register writes -- handled by caller.
break;
case caseVif(ROW0):
case caseVif(ROW1):
case caseVif(ROW2):
case caseVif(ROW3):
// Here's a neat way to obfuscate code. This is a super-fancy-complicated version
// of a standard psHu32(mem) = value; writeback. Handled by caller for us, thanks! --air
//if (!idx) g_vifmask.Row0[ (mem>>4)&3 ] = value;
//else g_vifmask.Row1[ (mem>>4)&3 ] = value;
//((u32*)&vifXRegs.r0) [((mem>>4)&3)*4] = value;
break;
case caseVif(ROW0): vif.MaskRow._u32[0] = value; return false;
case caseVif(ROW1): vif.MaskRow._u32[1] = value; return false;
case caseVif(ROW2): vif.MaskRow._u32[2] = value; return false;
case caseVif(ROW3): vif.MaskRow._u32[3] = value; return false;
case caseVif(COL0):
case caseVif(COL1):
case caseVif(COL2):
case caseVif(COL3):
// Here's a neat way to obfuscate code. This is a super-fancy-complicated version
// of a standard psHu32(mem) = value; writeback. Handled by caller for us, thanks! --air
//if (!idx) g_vifmask.Col0[ (mem>>4)&3 ] = value;
//else g_vifmask.Col1[ (mem>>4)&3 ] = value;
//((u32*)&vifXRegs.c0) [((mem>>4)&3)*4] = value;
break;
case caseVif(COL0): vif.MaskCol._u32[0] = value; return false;
case caseVif(COL1): vif.MaskCol._u32[1] = value; return false;
case caseVif(COL2): vif.MaskCol._u32[2] = value; return false;
case caseVif(COL3): vif.MaskCol._u32[3] = value; return false;
}
// fall-through case: issue standard writeback behavior.
return true;
}
template u32 vifRead32<0>(u32 mem);
template u32 vifRead32<1>(u32 mem);
template bool vifWrite32<0>(u32 mem, u32 value);
template bool vifWrite32<1>(u32 mem, u32 value);

View File

@ -213,8 +213,6 @@ struct VIFregisters {
u32 addr;
};
extern VIFregisters *vifRegs;
static VIFregisters& vif0Regs = (VIFregisters&)eeHw[0x3800];
static VIFregisters& vif1Regs = (VIFregisters&)eeHw[0x3C00];

View File

@ -19,16 +19,12 @@
#include "Gif.h"
#include "Vif_Dma.h"
VIFregisters *vifRegs;
vifStruct *vif;
u16 vifqwc = 0;
u32 g_vifCycles = 0;
u32 g_vu0Cycles = 0;
u32 g_vu1Cycles = 0;
u32 g_packetsizeonvu = 0;
__aligned16 VifMaskTypes g_vifmask;
extern u32 g_vifCycles;
static u32 qwctag(u32 mask)

View File

@ -38,8 +38,8 @@ static __fi void vifFlush(int idx) {
}
static __fi void vuExecMicro(int idx, u32 addr) {
VURegs* VU = nVif[idx].VU;
VIFregisters& vifRegs = VU->GetVifRegs();
VURegs& VU = vuRegs[idx];
VIFregisters& vifRegs = vifXRegs;
int startcycles = 0;
//vifFlush(idx);
@ -423,7 +423,7 @@ vifOp(vifCode_Offset) {
return 0;
}
template<int idx> static __fi int _vifCode_STColRow(const u32* data, u32* pmem1, u32* pmem2) {
template<int idx> static __fi int _vifCode_STColRow(const u32* data, u32* pmem2) {
vifStruct& vifX = GetVifX;
int ret = min(4 - vifX.tag.addr, vifX.vifpacketsize);
@ -432,16 +432,12 @@ template<int idx> static __fi int _vifCode_STColRow(const u32* data, u32* pmem1,
switch (ret) {
case 4:
pmem1[12] = data[3];
pmem2[3] = data[3];
case 3:
pmem1[8] = data[2];
pmem2[2] = data[2];
case 2:
pmem1[4] = data[1];
pmem2[1] = data[1];
case 1:
pmem1[0] = data[0];
pmem2[0] = data[0];
break;
jNO_DEFAULT
@ -462,10 +458,7 @@ vifOp(vifCode_STCol) {
return 1;
}
pass2 {
u32* cols = idx ? g_vifmask.Col1 : g_vifmask.Col0;
u32* pmem1 = &vifXRegs.c0 + (vifX.tag.addr << 2);
u32* pmem2 = cols + vifX.tag.addr;
return _vifCode_STColRow<idx>(data, pmem1, pmem2);
return _vifCode_STColRow<idx>(data, &vifX.MaskCol._u32[vifX.tag.addr]);
}
pass3 { VifCodeLog("STCol"); }
return 0;
@ -480,10 +473,7 @@ vifOp(vifCode_STRow) {
return 1;
}
pass2 {
u32* rows = idx ? g_vifmask.Row1 : g_vifmask.Row0;
u32* pmem1 = &vifXRegs.r0 + (vifX.tag.addr << 2);
u32* pmem2 = rows + vifX.tag.addr;
return _vifCode_STColRow<idx>(data, pmem1, pmem2);
return _vifCode_STColRow<idx>(data, &vifX.MaskRow._u32[vifX.tag.addr]);
}
pass3 { VifCodeLog("STRow"); }
return 0;
@ -516,11 +506,10 @@ vifOp(vifCode_STMod) {
vifOp(vifCode_Unpack) {
pass1 {
if (!idx) vifUnpackSetup<0>(data);
else vifUnpackSetup<1>(data);
vifUnpackSetup<idx>(data);
return 1;
}
pass2 { return nVifUnpack(idx, (u8*)data); }
pass2 { return nVifUnpack<idx>((u8*)data); }
pass3 { VifCodeLog("Unpack"); }
return 0;
}

View File

@ -56,6 +56,8 @@ union tTRXREG {
// NOTE, if debugging vif stalls, use sega classics, spyro, gt4, and taito
struct vifStruct {
u128 MaskRow, MaskCol;
vifCode tag;
int cmd;
int irq;
@ -67,6 +69,8 @@ struct vifStruct {
bool vifstalled;
bool stallontag;
VIFregisters* regs;
// GS registers used for calculating the size of the last local->host transfer initiated on the GS
// Transfer size calculation should be restricted to GS emulation in the future
tBITBLTBUF BITBLTBUF;
@ -82,10 +86,10 @@ struct vifStruct {
u8 GifWaitState; // 0 = General PATH checking, 1 = Flush path 3, 2 == Wait for VU1
};
extern vifStruct* vif;
extern vifStruct vif0, vif1;
extern __aligned16 vifStruct vif0, vif1;
extern u8 schedulepath3msk;
_vifT extern u32 vifRead32(u32 mem);
_vifT extern bool vifWrite32(u32 mem, u32 value);
extern void vif0Interrupt();
@ -122,15 +126,3 @@ extern u32 g_vu1Cycles;
extern u32 g_packetsizeonvu;
extern void vif0FLUSH();
extern void vif1FLUSH();
//------------------------------------------------------------------
// newVif SSE-optimized Row/Col Structs
//------------------------------------------------------------------
struct VifMaskTypes
{
u32 Row0[4], Col0[4];
u32 Row1[4], Col1[4];
};
extern __aligned16 VifMaskTypes g_vifmask; // This struct is used by newVif

View File

@ -25,110 +25,95 @@ enum UnpackOffset {
OFFSET_W = 3
};
static __fi u32 setVifRowRegs(u32 reg, u32 data) {
switch (reg) {
case 0: vifRegs->r0 = data; break;
case 1: vifRegs->r1 = data; break;
case 2: vifRegs->r2 = data; break;
case 3: vifRegs->r3 = data; break;
jNO_DEFAULT;
}
// Stores `data` into 32-bit lane `reg` of vif.MaskRow and returns the stored
// value, so a caller can assign the same value elsewhere in one expression.
// No bounds check is performed on `reg`.
// NOTE(review): callers are presumably expected to pass reg in 0..3 (the
// OFFSET_X..OFFSET_W values) -- confirm against the call sites.
static __fi u32 setVifRow(vifStruct& vif, u32 reg, u32 data) {
vif.MaskRow._u32[reg] = data;
return data;
}
static __fi u32 getVifRowRegs(u32 reg) {
switch (reg) {
case 0: return vifRegs->r0; break;
case 1: return vifRegs->r1; break;
case 2: return vifRegs->r2; break;
case 3: return vifRegs->r3; break;
jNO_DEFAULT;
}
return 0; // unreachable...
}
static __fi u32 getVifColRegs(u32 reg) {
switch (reg) {
case 0: return vifRegs->c0; break;
case 1: return vifRegs->c1; break;
case 2: return vifRegs->c2; break;
default: return vifRegs->c3; break;
}
return 0; // unreachable...
}
template< bool doMask >
static __ri void writeXYZW(u32 offnum, u32 &dest, u32 data) {
u32 vifRowReg = getVifRowRegs(offnum);
// cycle derives from vif.cl
// mode derives from vifRegs.mode
template< uint idx, uint mode, bool doMask >
static __ri void writeXYZW(u32 offnum, u32 &dest, u32 data, bool isV4_5 = false) {
int n = 0;
vifStruct& vif = GetVifX;
if (doMask) {
switch (vif->cl) {
case 0: n = (vifRegs->mask >> (offnum * 2)) & 0x3; break;
case 1: n = (vifRegs->mask >> ( 8 + (offnum * 2))) & 0x3; break;
case 2: n = (vifRegs->mask >> (16 + (offnum * 2))) & 0x3; break;
default: n = (vifRegs->mask >> (24 + (offnum * 2))) & 0x3; break;
const VIFregisters& regs = vifXRegs;
switch (vif.cl) {
case 0: n = (regs.mask >> (offnum * 2)) & 0x3; break;
case 1: n = (regs.mask >> ( 8 + (offnum * 2))) & 0x3; break;
case 2: n = (regs.mask >> (16 + (offnum * 2))) & 0x3; break;
default: n = (regs.mask >> (24 + (offnum * 2))) & 0x3; break;
}
}
// Four possible types of masking are handled below:
// 0 - Data
// 1 - MaskRow
// 2 - MaskCol
// 3 - Write protect
switch (n) {
case 0:
if ((vif->cmd & 0x6F) != 0x6f) {
switch (vifRegs->mode) {
case 1: dest = data + vifRowReg; break;
case 2: dest = setVifRowRegs(offnum, vifRowReg + data); break;
switch (mode) {
case 1: dest = data + vif.MaskRow._u32[offnum]; break;
case 2: dest = setVifRow(vif, offnum, vif.MaskRow._u32[offnum] + data); break;
default: dest = data; break;
}
}
else dest = data; // v4-5 Unpack Mode
break;
case 1: dest = vifRowReg; break;
case 2: dest = getVifColRegs(vif->cl); break;
case 1: dest = vif.MaskRow._u32[offnum]; break;
case 2: dest = vif.MaskCol._u32[min(vif.cl,3)]; break;
case 3: break;
}
}
#define tParam idx,mode,doMask
template < bool doMask, class T >
static void __fastcall UNPACK_S(u32 *dest, const T *data)
template < uint idx, uint mode, bool doMask, class T >
static void __fastcall UNPACK_S(u32* dest, const T* src)
{
u32 data = *src;
//S-# will always be a complete packet, no matter what. So we can skip the offset bits
writeXYZW<doMask>(OFFSET_X, *dest++, *data);
writeXYZW<doMask>(OFFSET_Y, *dest++, *data);
writeXYZW<doMask>(OFFSET_Z, *dest++, *data);
writeXYZW<doMask>(OFFSET_W, *dest , *data);
writeXYZW<tParam>(OFFSET_X, *(dest+0), data);
writeXYZW<tParam>(OFFSET_Y, *(dest+1), data);
writeXYZW<tParam>(OFFSET_Z, *(dest+2), data);
writeXYZW<tParam>(OFFSET_W, *(dest+3), data);
}
// The PS2 console actually writes v1v0v1v0 for all V2 unpacks -- the second v1v0 pair
// being officially "indeterminate" but some games very much depend on it.
template <bool doMask, class T>
static void __fastcall UNPACK_V2(u32 *dest, const T *data)
template < uint idx, uint mode, bool doMask, class T >
static void __fastcall UNPACK_V2(u32* dest, const T* src)
{
writeXYZW<doMask>(0, *dest++, *data);
writeXYZW<doMask>(1, *dest++, *(data+1));
writeXYZW<doMask>(2, *dest++, *data);
writeXYZW<doMask>(3, *dest++, *(data+1));
writeXYZW<tParam>(OFFSET_X, *(dest+0), *(src+0));
writeXYZW<tParam>(OFFSET_Y, *(dest+1), *(src+1));
writeXYZW<tParam>(OFFSET_Z, *(dest+2), *(src+0));
writeXYZW<tParam>(OFFSET_W, *(dest+3), *(src+1));
}
// V3 and V4 unpacks both use the V4 unpack logic, even though most of the OFFSET_W fields
// during V3 unpacking end up being overwritten by the next unpack. This is confirmed real
// hardware behavior that games such as Ape Escape 3 depend on.
template <bool doMask, class T>
static void __fastcall UNPACK_V4(u32 *dest, const T *data)
template < uint idx, uint mode, bool doMask, class T >
static void __fastcall UNPACK_V4(u32* dest, const T* src)
{
writeXYZW<doMask>(OFFSET_X, *dest++, *data++);
writeXYZW<doMask>(OFFSET_Y, *dest++, *data++);
writeXYZW<doMask>(OFFSET_Z, *dest++, *data++);
writeXYZW<doMask>(OFFSET_W, *dest , *data);
writeXYZW<tParam>(OFFSET_X, *(dest+0), *(src+0));
writeXYZW<tParam>(OFFSET_Y, *(dest+1), *(src+1));
writeXYZW<tParam>(OFFSET_Z, *(dest+2), *(src+2));
writeXYZW<tParam>(OFFSET_W, *(dest+3), *(src+3));
}
template< bool doMask >
static void __fastcall UNPACK_V4_5(u32 *dest, const u32 *data)
// V4_5 unpacks do not support the MODE register, and act as mode==0 always.
template< uint idx, bool doMask >
static void __fastcall UNPACK_V4_5(u32 *dest, const u32* src)
{
//As with S-#, this will always be a complete packet
writeXYZW<doMask>(OFFSET_X, *dest++, ((*data & 0x001f) << 3));
writeXYZW<doMask>(OFFSET_Y, *dest++, ((*data & 0x03e0) >> 2));
writeXYZW<doMask>(OFFSET_Z, *dest++, ((*data & 0x7c00) >> 7));
writeXYZW<doMask>(OFFSET_W, *dest, ((*data & 0x8000) >> 8));
u32 data = *src;
writeXYZW<idx,0,doMask>(OFFSET_X, *(dest+0), ((data & 0x001f) << 3), true);
writeXYZW<idx,0,doMask>(OFFSET_Y, *(dest+1), ((data & 0x03e0) >> 2), true);
writeXYZW<idx,0,doMask>(OFFSET_Z, *(dest+2), ((data & 0x7c00) >> 7), true);
writeXYZW<idx,0,doMask>(OFFSET_W, *(dest+3), ((data & 0x8000) >> 8), true);
}
// =====================================================================================================
@ -148,45 +133,50 @@ static void __fastcall UNPACK_V4_5(u32 *dest, const u32 *data)
//
#define _upk (UNPACKFUNCTYPE)
#define _odd (UNPACKFUNCTYPE_ODD)
#define _unpk_s(bits) (UNPACKFUNCTYPE_S##bits)
#define _unpk_u(bits) (UNPACKFUNCTYPE_U##bits)
#define _unpk(usn, bits) (UNPACKFUNCTYPE_##usn##bits)
// 32-bits versions are unsigned-only!!
#define UnpackFuncPair32( vt, doMask ) \
(UNPACKFUNCTYPE)_unpk_u(32) UNPACK_##vt<doMask, u32>, \
(UNPACKFUNCTYPE)_unpk_u(32) UNPACK_##vt<doMask, u32>
#define UnpackFuncSet( vt, idx, mode, usn, doMask ) \
(UNPACKFUNCTYPE)_unpk(u,32) UNPACK_##vt<idx, mode, doMask, u32>, \
(UNPACKFUNCTYPE)_unpk(usn,16) UNPACK_##vt<idx, mode, doMask, usn##16>, \
(UNPACKFUNCTYPE)_unpk(usn,8) UNPACK_##vt<idx, mode, doMask, usn##8> \
#define UnpackFuncPair( vt, bits, doMask ) \
(UNPACKFUNCTYPE)_unpk_u(bits) UNPACK_##vt<doMask, u##bits>, \
(UNPACKFUNCTYPE)_unpk_s(bits) UNPACK_##vt<doMask, s##bits>
#define UnpackV4_5set(idx, doMask) \
(UNPACKFUNCTYPE)_unpk(u,32) UNPACK_V4_5<idx, doMask> \
#define UnpackFuncSet( doMask ) \
{ UnpackFuncPair32( S, doMask ), 4, 4 }, /* 0x0 - S-32 */ \
{ UnpackFuncPair ( S, 16, doMask ), 2, 4 }, /* 0x1 - S-16 */ \
{ UnpackFuncPair ( S, 8, doMask ), 1, 4 }, /* 0x2 - S-8 */ \
{ NULL, NULL, 0, 0 }, /* 0x3 (NULL) */ \
{ UnpackFuncPair32( V2, doMask ), 8, 2 }, /* 0x4 - V2-32 */ \
{ UnpackFuncPair ( V2, 16, doMask ), 4, 2 }, /* 0x5 - V2-16 */ \
{ UnpackFuncPair ( V2, 8, doMask ), 2, 2 }, /* 0x6 - V2-8 */ \
{ NULL, NULL, 0, 0 }, /* 0x7 (NULL) */ \
{ UnpackFuncPair32( V4, doMask ), 12, 3 }, /* 0x8 - V3-32 */ \
{ UnpackFuncPair ( V4, 16, doMask ), 6, 3 }, /* 0x9 - V3-16 */ \
{ UnpackFuncPair ( V4, 8, doMask ), 3, 3 }, /* 0xA - V3-8 */ \
{ NULL, NULL, 0, 0 }, /* 0xB (NULL) */ \
{ UnpackFuncPair32( V4, doMask ), 16, 4 }, /* 0xC - V4-32 */ \
{ UnpackFuncPair ( V4, 16, doMask ), 8, 4 }, /* 0xD - V4-16 */ \
{ UnpackFuncPair ( V4, 8, doMask ), 4, 4 }, /* 0xE - V4-8 */ \
{ /* 0xF - V4-5 */ \
(UNPACKFUNCTYPE)_unpk_u(32)UNPACK_V4_5<doMask>, \
(UNPACKFUNCTYPE)_unpk_u(32)UNPACK_V4_5<doMask>, \
2, 4 \
#define UnpackModeSet(idx, mode) \
UnpackFuncSet( S, idx, mode, s, 0 ), NULL, \
UnpackFuncSet( V2, idx, mode, s, 0 ), NULL, \
UnpackFuncSet( V4, idx, mode, s, 0 ), NULL, \
UnpackFuncSet( V4, idx, mode, s, 0 ), UnpackV4_5set(idx, 0), \
\
UnpackFuncSet( S, idx, mode, s, 1 ), NULL, \
UnpackFuncSet( V2, idx, mode, s, 1 ), NULL, \
UnpackFuncSet( V4, idx, mode, s, 1 ), NULL, \
UnpackFuncSet( V4, idx, mode, s, 1 ), UnpackV4_5set(idx, 1), \
\
UnpackFuncSet( S, idx, mode, u, 0 ), NULL, \
UnpackFuncSet( V2, idx, mode, u, 0 ), NULL, \
UnpackFuncSet( V4, idx, mode, u, 0 ), NULL, \
UnpackFuncSet( V4, idx, mode, u, 0 ), UnpackV4_5set(idx, 0), \
\
UnpackFuncSet( S, idx, mode, u, 1 ), NULL, \
UnpackFuncSet( V2, idx, mode, u, 1 ), NULL, \
UnpackFuncSet( V4, idx, mode, u, 1 ), NULL, \
UnpackFuncSet( V4, idx, mode, u, 1 ), UnpackV4_5set(idx, 1)
__aligned16 const UNPACKFUNCTYPE VIFfuncTable[2][3][4 * 4 * 2 * 2] =
{
{
{ UnpackModeSet(0,0) },
{ UnpackModeSet(0,1) },
{ UnpackModeSet(0,2) }
},
const __aligned16 VIFUnpackFuncTable VIFfuncTable[32] =
{
UnpackFuncSet( false )
UnpackFuncSet( true )
{
{ UnpackModeSet(1,0) },
{ UnpackModeSet(1,1) },
{ UnpackModeSet(1,2) }
}
};
//----------------------------------------------------------------------------
@ -212,16 +202,23 @@ _vifT void vifUnpackSetup(const u32 *data) {
if (vifNum == 0) vifNum = 256;
vifXRegs.num = vifNum;
// Traditional-style way of calculating the gsize, based on VN/VL parameters.
// Useful when VN/VL are known template params, but currently they are not so we use
// the LUT instead (for now).
//uint vl = vifX.cmd & 0x03;
//uint vn = (vifX.cmd >> 2) & 0x3;
//uint gsize = ((32 >> vl) * (vn+1)) / 8;
const u8& gsize = nVifT[vifX.cmd & 0x0f];
if (vifXRegs.cycle.wl <= vifXRegs.cycle.cl) {
if (!idx) vif0.tag.size = ((vifNum * VIFfuncTable[ vif0.cmd & 0xf ].gsize) + 3) >> 2;
else vif1.tag.size = ((vifNum * VIFfuncTable[ vif1.cmd & 0xf ].gsize) + 3) >> 2;
vifX.tag.size = ((vifNum * gsize) + 3) / 4;
}
else {
int n = vifXRegs.cycle.cl * (vifNum / vifXRegs.cycle.wl) +
_limit(vifNum % vifXRegs.cycle.wl, vifXRegs.cycle.cl);
if (!idx) vif0.tag.size = ((n * VIFfuncTable[ vif0.cmd & 0xf ].gsize) + 3) >> 2;
else vif1.tag.size = ((n * VIFfuncTable[ vif1.cmd & 0xf ].gsize) + 3) >> 2;
vifX.tag.size = ((n * gsize) + 3) >> 2;
}
u32 addr = vifXRegs.code;

View File

@ -15,38 +15,28 @@
#pragma once
typedef void (__fastcall *UNPACKFUNCTYPE)(u32 *dest, const u32 *data);
typedef int (*UNPACKPARTFUNCTYPESSE)(u32 *dest, const u32 *data, int size);
struct vifStruct;
#define create_unpack_u_type(bits) typedef void (__fastcall *UNPACKFUNCTYPE_U##bits)(u32 *dest, const u##bits *data);
#define create_unpack_odd_u_type(bits) typedef void (__fastcall *UNPACKFUNCTYPE_ODD_U##bits)(u32 *dest, const u##bits *data, int size);
#define create_unpack_s_type(bits) typedef void (__fastcall *UNPACKFUNCTYPE_S##bits)(u32 *dest, const s##bits *data);
#define create_unpack_odd_s_type(bits) typedef void (__fastcall *UNPACKFUNCTYPE_ODD_S##bits)(u32 *dest, const s##bits *data, int size);
typedef void (__fastcall *UNPACKFUNCTYPE)(void* dest, const void* src);
#define create_unpack_u_type(bits) typedef void (__fastcall *UNPACKFUNCTYPE_u##bits)(u32* dest);
#define create_unpack_s_type(bits) typedef void (__fastcall *UNPACKFUNCTYPE_s##bits)(u32* dest);
#define create_some_unpacks(bits) \
create_unpack_u_type(bits); \
create_unpack_odd_u_type(bits); \
create_unpack_s_type(bits); \
create_unpack_odd_s_type(bits);
create_some_unpacks(32);
create_some_unpacks(16);
create_some_unpacks(8);
struct VIFUnpackFuncTable
{
UNPACKFUNCTYPE funcU;
UNPACKFUNCTYPE funcS;
extern __aligned16 const u8 nVifT[16];
u8 gsize; // size of data in bytes used for each write cycle
u8 qsize; // used for unpack parts, num of vectors that
// will be decompressed from data for 1 cycle
};
// Array sub-dimension order: [vifidx] [mode] (VN * VL * USN * doMask)
extern __aligned16 const UNPACKFUNCTYPE VIFfuncTable[2][3][(4 * 4 * 2 * 2)];
extern const __aligned16 VIFUnpackFuncTable VIFfuncTable[32];
extern int nVifUnpack (int idx, const u8 *data);
_vifT extern int nVifUnpack (const u8* data);
extern void resetNewVif(int idx);
template< int idx >
extern void vifUnpackSetup(const u32 *data);
extern void vifUnpackSetup(const u32* data);

View File

@ -32,12 +32,13 @@ typedef void (__fastcall *nVifrecCall)(uptr dest, uptr src);
#include "newVif_HashBucket.h"
extern void mVUmergeRegs(const xRegisterSSE& dest, const xRegisterSSE& src, int xyzw, bool modXYZW = 0);
extern void _nVifUnpack (int idx, const u8 *data, u32 size, bool isFill);
extern void dVifUnpack (int idx, const u8 *data, u32 size, bool isFill);
extern void _nVifUnpack (int idx, const u8* data, uint mode, bool isFill);
extern void dVifReset (int idx);
extern void dVifClose (int idx);
extern void VifUnpackSSE_Init();
_vifT extern void dVifUnpack (const u8* data, bool isFill);
#define VUFT VIFUnpackFuncTable
#define _v0 0
#define _v1 0x55
@ -62,9 +63,9 @@ struct __aligned16 nVifBlock {
u8 num; // [00] Num Field
u8 upkType; // [01] Unpack Type [usn*1:mask*1:upk*4]
u8 mode; // [02] Mode Field
u8 scl; // [03] Start Cycle
u8 cl; // [04] CL Field
u8 wl; // [05] WL Field
u8 scl; // [03] Start Cycle
u32 mask; // [06] Mask Field
u8 padding[2];// [10] through [11]
uptr startPtr; // [12] Start Ptr of RecGen Code
@ -78,14 +79,14 @@ struct __aligned16 nVifBlock {
#define _cmpS (sizeof(nVifBlock) - (4))
#define _tParams nVifBlock, _hSize, _cmpS
struct nVifStruct {
u32 idx; // VIF0 or VIF1
vifStruct* vif; // Vif Struct ptr
VIFregisters* vifRegs; // Vif Regs ptr
VURegs* VU; // VU Regs ptr
u32 vuMemLimit; // Use for fast AND
// Buffer for partial transfers (should always be first to ensure alignment)
// Maximum buffer size is 256 (vifRegs.Num max range) * 16 (quadword)
__aligned16 u8 buffer[256*16];
u32 bSize; // Size of 'buffer'
u32 bPtr;
u8 buffer[_1mb]; // Buffer for partial transfers
u32 idx; // VIF0 or VIF1
u8* recPtr; // Cur Pos to recompile to
u8* recEnd; // 'Safe' End of Rec Cache
BlockBuffer* vifCache; // Block Buffer
@ -103,7 +104,6 @@ struct nVifStruct {
};
extern __aligned16 nVifStruct nVif[2];
extern __aligned16 const u8 nVifT[16];
extern __aligned16 nVifCall nVifUpk[(2*2*16)*4]; // ([USN][Masking][Unpack Type]) [curCycle]
extern __aligned16 u32 nVifMask[3][4][4]; // [MaskNumber][CycleNumber][Vector]

View File

@ -58,6 +58,7 @@ VifUnpackSSE_Dynarec::VifUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlo
usn = (vB.upkType>>5) & 1;
doMask = (vB.upkType>>4) & 1;
doMode = vB.mode & 3;
vCL = 0;
}
#define makeMergeMask(x) { \
@ -65,15 +66,15 @@ VifUnpackSSE_Dynarec::VifUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlo
}
__fi void VifUnpackSSE_Dynarec::SetMasks(int cS) const {
const vifStruct& vif = v.idx ? vif1 : vif0;
u32 m0 = vB.mask;
u32 m1 = m0 & 0xaaaaaaaa;
u32 m2 =(~m1>>1) & m0;
u32 m3 = (m1>>1) & ~m0;
u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0;
u32* col = (v.idx) ? g_vifmask.Col1 : g_vifmask.Col0;
if((m2&&(doMask||isFill))||doMode) { xMOVAPS(xmmRow, ptr32[row]); }
if((m2&&(doMask||isFill))||doMode) { xMOVAPS(xmmRow, ptr128[&vif.MaskRow]); }
if (m3&&(doMask||isFill)) {
xMOVAPS(xmmCol0, ptr32[col]);
xMOVAPS(xmmCol0, ptr128[&vif.MaskCol]);
if ((cS>=2) && (m3&0x0000ff00)) xPSHUF.D(xmmCol1, xmmCol0, _v1);
if ((cS>=3) && (m3&0x00ff0000)) xPSHUF.D(xmmCol2, xmmCol0, _v2);
if ((cS>=4) && (m3&0xff000000)) xPSHUF.D(xmmCol3, xmmCol0, _v3);
@ -95,8 +96,8 @@ void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const {
makeMergeMask(m3);
makeMergeMask(m4);
if (doMask&&m4) { xMOVAPS(xmmTemp, ptr[dstIndirect]); } // Load Write Protect
if (doMask&&m2) { mergeVectors(regX, xmmRow, t, m2); } // Merge Row
if (doMask&&m3) { mergeVectors(regX, xRegisterSSE(xmmCol0.Id+cc), t, m3); } // Merge Col
if (doMask&&m2) { mergeVectors(regX, xmmRow, t, m2); } // Merge MaskRow
if (doMask&&m3) { mergeVectors(regX, xRegisterSSE(xmmCol0.Id+cc), t, m3); } // Merge MaskCol
if (doMask&&m4) { mergeVectors(regX, xmmTemp, t, m4); } // Merge Write Protect
if (doMode) {
u32 m5 = (~m1>>1) & ~m0;
@ -117,8 +118,7 @@ void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const {
}
void VifUnpackSSE_Dynarec::writeBackRow() const {
u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0;
xMOVAPS(ptr32[row], xmmRow);
xMOVAPS(ptr128[&((v.idx ? vif1 : vif0).MaskRow)], xmmRow);
DevCon.WriteLn("nVif: writing back row reg! [doMode = 2]");
// ToDo: Do we need to write back to vifregs.rX too!? :/
}
@ -138,16 +138,17 @@ static void ShiftDisplacementWindow( xAddressVoid& addr, const xRegister32& modR
if(addImm) xADD(modReg, addImm);
}
void VifUnpackSSE_Dynarec::CompileRoutine() {
const int upkNum = v.vif->cmd & 0xf;
void VifUnpackSSE_Dynarec::CompileRoutine(vifStruct& vif) {
const int upkNum = vB.upkType & 0xf;
const u8& vift = nVifT[upkNum];
const int cycleSize = isFill ? vB.cl : vB.wl;
const int blockSize = isFill ? vB.wl : vB.cl;
const int skipSize = blockSize - cycleSize;
int vNum = v.vifRegs->num;
vCL = v.vif->cl;
doMode = upkNum == 0xf ? 0 : doMode;
uint vNum = vB.num;
doMode = (upkNum == 0xf) ? 0 : doMode; // V4_5 has no mode feature.
pxAssume(vCL == 0);
// Value passed determines # of col regs we need to load
SetMasks(isFill ? blockSize : cycleSize);
@ -189,14 +190,17 @@ void VifUnpackSSE_Dynarec::CompileRoutine() {
}
if (doMode==2) writeBackRow();
xMOV(ptr32[&v.vif->cl], vCL);
xMOV(ptr32[&v.vifRegs->num], vNum);
xRET();
}
static __noinline u8* dVifsetVUptr(const nVifStruct& v, int cl, int wl, bool isFill) {
u8* startmem = v.VU->Mem + (v.vif->tag.addr & v.vuMemLimit);
u8* endmem = v.VU->Mem + (v.vuMemLimit+0x10);
_vifT static __fi u8* dVifsetVUptr(uint cl, uint wl, bool isFill) {
vifStruct& vif = GetVifX;
VIFregisters& vifRegs = vifXRegs;
const VURegs& VU = vuRegs[idx];
const uint vuMemLimit = idx ? 0x4000 : 0x1000;
u8* startmem = VU.Mem + (vif.tag.addr & (vuMemLimit-0x10));
u8* endmem = VU.Mem + vuMemLimit;
uint length = _vBlock.num * 16;
if (!isFill) {
@ -204,15 +208,15 @@ static __noinline u8* dVifsetVUptr(const nVifStruct& v, int cl, int wl, bool isF
// shouldn't count as wrapped data. Otherwise, a trailing skip can cause the emu to drop back
// to the interpreter. -- Refraction (test with MGS3)
int skipSize = (cl - wl) * 16;
int blocks = _vBlock.num / wl;
uint skipSize = (cl - wl) * 16;
uint blocks = _vBlock.num / wl;
length += (blocks-1) * skipSize;
}
if ( (startmem+length) <= endmem ) {
return startmem;
}
//Console.WriteLn("nVif%x - VU Mem Ptr Overflow; falling back to interpreter. Start = %x End = %x num = %x, wl = %x, cl = %x", v.idx, v.vif->tag.addr, v.vif->tag.addr + (_vBlock.num * 16), _vBlock.num, wl, cl);
//Console.WriteLn("nVif%x - VU Mem Ptr Overflow; falling back to interpreter. Start = %x End = %x num = %x, wl = %x, cl = %x", v.idx, vif.tag.addr, vif.tag.addr + (_vBlock.num * 16), _vBlock.num, wl, cl);
return NULL; // Fall Back to Interpreters which have wrap-around logic
}
@ -227,50 +231,57 @@ static __fi void dVifRecLimit(int idx) {
}
}
// Gcc complains about recursive functions being inlined.
void dVifUnpack(int idx, const u8 *data, u32 size, bool isFill) {
_vifT static __fi bool dVifExecuteUnpack(const u8* data, bool isFill)
{
const nVifStruct& v = nVif[idx];
const u8 upkType = v.vif->cmd & 0x1f | ((!!v.vif->usn) << 5);
const int doMask = v.vif->cmd & 0x10;
const int cycle_cl = v.vifRegs->cycle.cl;
const int cycle_wl = v.vifRegs->cycle.wl;
const int blockSize = isFill ? cycle_wl : cycle_cl;
if (v.vif->cl >= blockSize) v.vif->cl = 0;
_vBlock.upkType = upkType;
_vBlock.num = (u8&)v.vifRegs->num;
_vBlock.mode = (u8&)v.vifRegs->mode;
_vBlock.scl = v.vif->cl;
_vBlock.cl = cycle_cl;
_vBlock.wl = cycle_wl;
// Zero out the mask parameter if it's unused -- games leave random junk
// values here which cause false recblock cache misses.
_vBlock.mask = doMask ? v.vifRegs->mask : 0;
VIFregisters& vifRegs = vifXRegs;
if (nVifBlock* b = v.vifBlocks->find(&_vBlock)) {
if (u8* dest = dVifsetVUptr(v, cycle_cl, cycle_wl, isFill)) {
if (u8* dest = dVifsetVUptr<idx>(vifRegs.cycle.cl, vifRegs.cycle.wl, isFill)) {
//DevCon.WriteLn("Running Recompiled Block!");
((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data);
}
else {
//DevCon.WriteLn("Running Interpreter Block");
_nVifUnpack(idx, data, size, isFill);
_nVifUnpack(idx, data, vifRegs.mode, isFill);
}
return;
return true;
}
return false;
}
_vifT __fi void dVifUnpack(const u8* data, bool isFill) {
const nVifStruct& v = nVif[idx];
vifStruct& vif = GetVifX;
VIFregisters& vifRegs = vifXRegs;
const u8 upkType = vif.cmd & 0x1f | ((!!vif.usn) << 5);
const int doMask = vif.cmd & 0x10;
_vBlock.upkType = upkType;
_vBlock.num = (u8&)vifRegs.num;
_vBlock.mode = (u8&)vifRegs.mode;
//_vBlock.scl = vif.cl; // scl is always zero now (effectively padding)
_vBlock.cl = vifRegs.cycle.cl;
_vBlock.wl = vifRegs.cycle.wl;
// Zero out the mask parameter if it's unused -- games leave random junk
// values here which cause false recblock cache misses.
_vBlock.mask = doMask ? vifRegs.mask : 0;
//DevCon.WriteLn("nVif%d: Recompiled Block! [%d]", idx, nVif[idx].numBlocks++);
//DevCon.WriteLn(L"[num=% 3d][upkType=0x%02x][scl=%d][cl=%d][wl=%d][mode=%d][m=%d][mask=%s]",
// _vBlock.num, _vBlock.upkType, _vBlock.scl, _vBlock.cl, _vBlock.wl, _vBlock.mode,
// doMask >> 4, doMask ? wxsFormat( L"0x%08x", _vBlock.mask ).c_str() : L"ignored"
//);
if (dVifExecuteUnpack<idx>(data, isFill)) return;
xSetPtr(v.recPtr);
_vBlock.startPtr = (uptr)xGetAlignedCallTarget();
v.vifBlocks->add(_vBlock);
VifUnpackSSE_Dynarec( v, _vBlock ).CompileRoutine();
VifUnpackSSE_Dynarec( v, _vBlock ).CompileRoutine(vif);
nVif[idx].recPtr = xGetPtr();
// [TODO] : Ideally we should test recompile buffer limits prior to each instruction,
@ -279,5 +290,9 @@ void dVifUnpack(int idx, const u8 *data, u32 size, bool isFill) {
// Run the block we just compiled. Various conditions may force us to still use
// the interpreter unpacker though, so a recursive call is the safest way here...
dVifUnpack(idx, data, size, isFill);
//dVifUnpack<idx,isFill>(data);
dVifExecuteUnpack<idx>(data, isFill);
}
template void dVifUnpack<0>(const u8* data, bool isFill);
template void dVifUnpack<1>(const u8* data, bool isFill);

View File

@ -23,9 +23,18 @@
#include "newVif.h"
__aligned16 nVifStruct nVif[2];
__aligned16 nVifCall nVifUpk[(2*2*16) *4]; // ([USN][Masking][Unpack Type]) [curCycle]
__aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][Vector]
// Interpreter-style SSE unpacks. Array layout matches the interpreter C unpacks.
// ([USN][Masking][Unpack Type]) [curCycle]
__aligned16 nVifCall nVifUpk[(2*2*16) *4];
// This is used by the interpreted SSE unpacks only. Recompiled SSE unpacks
// and the interpreted C unpacks use the vif.MaskRow/MaskCol members directly.
// [MaskNumber][CycleNumber][Vector]
__aligned16 u32 nVifMask[3][4][4] = {0};
// Number of bytes of data in the source stream needed for each vector.
// [equivalent to ((32 >> VL) * (VN+1)) / 8]
__aligned16 const u8 nVifT[16] = {
4, // S-32
2, // S-16
@ -47,9 +56,9 @@ __aligned16 const u8 nVifT[16] = {
// ----------------------------------------------------------------------------
template< int idx, bool doMode, bool isFill >
__ri void __fastcall _nVifUnpackLoop(const u8 *data, u32 size);
__ri void __fastcall _nVifUnpackLoop(const u8* data);
typedef void __fastcall FnType_VifUnpackLoop(const u8 *data, u32 size);
typedef void __fastcall FnType_VifUnpackLoop(const u8* data);
typedef FnType_VifUnpackLoop* Fnptr_VifUnpackLoop;
// Unpacks Until 'Num' is 0
@ -67,10 +76,6 @@ void resetNewVif(int idx)
// changed for some reason.
nVif[idx].idx = idx;
nVif[idx].VU = idx ? &VU1 : &VU0;
nVif[idx].vuMemLimit = idx ? 0x3ff0 : 0xff0;
nVif[idx].vif = &GetVifX;
nVif[idx].vifRegs = &vifXRegs;
nVif[idx].bSize = 0;
memzero(nVif[idx].buffer);
@ -81,65 +86,75 @@ void closeNewVif(int idx) {
if (newVifDynaRec) dVifClose(idx);
}
// Returns a pointer into VU memory for the given byte offset. The offset is masked
// with 0x3ff0 when the VU index is non-zero and with 0xff0 otherwise; both masks also
// clear the low four bits of the offset before it is added to the memory base.
static __fi u8* setVUptr(int vuidx, const u8* vuMemBase, int offset) {
return (u8*)(vuMemBase + ( offset & (vuidx ? 0x3ff0 : 0xff0) ));
static __fi u8* getVUptr(uint idx, int offset) {
return (u8*)(vuRegs[idx].Mem + ( offset & (idx ? 0x3ff0 : 0xff0) ));
}
// Moves the VU memory write pointer forward by 'amount' bytes and adds the same
// amount to vif->tag.addr. If the pointer ends up at or beyond the end of VU memory
// (0x4000 bytes when vuidx is non-zero, 0x1000 bytes otherwise), it is wrapped around
// so that the overshoot is re-applied from the start of memory.
static __fi void incVUptr(int vuidx, u8* &ptr, const u8* vuMemBase, int amount) {
	pxAssume( ((uptr)ptr & 0xf) == 0 ); // alignment check

	const u8* const memEnd = vuMemBase + (vuidx ? 0x4000 : 0x1000);

	ptr += amount;
	vif->tag.addr += amount;

	// Distance past the end of memory; negative while still inside it.
	int overshoot = ptr - memEnd;
	if (overshoot < 0) return;

	ptr = (u8*)(vuMemBase + overshoot);
}
// Moves the VU memory write pointer forward by one 16-byte step and adds 16 to
// vif->tag.addr. When the pointer lands exactly on the end of VU memory (0x4000 bytes
// when vuidx is non-zero, 0x1000 bytes otherwise) it is moved back by the memory size.
static __fi void incVUptrBy16(int vuidx, u8* &ptr, const u8* vuMemBase) {
	pxAssume( ((uptr)ptr & 0xf) == 0 ); // alignment check

	ptr += 16;
	vif->tag.addr += 16;

	// Only an exact hit on the end address wraps; anything else is left untouched.
	if (ptr != (vuMemBase + (vuidx ? 0x4000 : 0x1000))) return;

	ptr -= (vuidx ? 0x4000 : 0x1000);
}
// Handles one incoming chunk of data for a VIF unpack.
//
// 'ret' is the smaller of vif.vifpacketsize and vif.tag.size and is what gets returned
// to the caller; 'size' is ret << 2 and is used as a byte count for the buffer copies
// (so ret appears to be counted in 32-bit words -- confirm against the callers).
//
// If this chunk completes the transfer (ret == tag.size), the data -- merged with any
// previously buffered fragment -- is handed to the recompiled or interpreted unpacker
// and the transfer state is cleared. Otherwise the chunk is appended to v.buffer and
// the NUM register / cl counter are advanced to account for the buffered vectors.
int nVifUnpack(int idx, const u8* data) {
_vifT int nVifUnpack(const u8* data) {
nVifStruct& v = nVif[idx];
vif = v.vif;
vifRegs = v.vifRegs;
vifStruct& vif = GetVifX;
VIFregisters& vifRegs = vifXRegs;
const int ret = aMin(vif->vifpacketsize, vif->tag.size);
const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
const uint ret = aMin(vif.vifpacketsize, vif.tag.size);
const bool isFill = (vifRegs.cycle.cl < vifRegs.cycle.wl);
s32 size = ret << 2;
if (ret == v.vif->tag.size) { // Full Transfer
if (ret == vif.tag.size) { // Full Transfer
if (v.bSize) { // Last transfer was partial
// Append the final chunk to the fragment buffer and unpack from the buffer instead.
memcpy_fast(&v.buffer[v.bSize], data, size);
v.bSize += size;
data = v.buffer;
size = v.bSize;
vifRegs.num = (vifXRegs.code >> 16) & 0xff; // grab NUM from the original VIFcode input.
}
if (size > 0 || isFill) {
if (newVifDynaRec) dVifUnpack(idx, data, size, isFill);
else _nVifUnpack(idx, data, size, isFill);
}
vif->tag.size = 0;
vif->cmd = 0;
vif.cl = 0;
if (newVifDynaRec) dVifUnpack<idx>(data, isFill);
else _nVifUnpack(idx, data, vifRegs.mode, isFill);
// The unpack is finished: clear the transfer state and the fragment buffer fill level.
vif.tag.size = 0;
vif.cmd = 0;
vifRegs.num = 0;
v.bSize = 0;
}
else { // Partial Transfer
// Queue the fragment; it is unpacked once the remainder of the transfer arrives.
memcpy_fast(&v.buffer[v.bSize], data, size);
v.bSize += size;
vif->tag.size -= ret;
vif.tag.size -= ret;
// Source bytes consumed per vector for this unpack type (looked up in nVifT).
const u8& vSize = nVifT[vif.cmd & 0x0f];
// We need to provide accurate accounting of the NUM register, in case games decided
// to read back from it mid-transfer. Since so few games actually use partial transfers
// of VIF unpacks, this code should not be any bottleneck.
while (size >= vSize) {
--vifRegs.num;
++vif.cl;
if (isFill) {
// In fill mode source data is only consumed while cl is below cycle.cl;
// the counter restarts when it reaches cycle.wl.
if (vif.cl < vifRegs.cycle.cl) size -= vSize;
else if (vif.cl == vifRegs.cycle.wl) vif.cl = 0;
}
else
{
size -= vSize;
if (vif.cl >= vifRegs.cycle.wl) vif.cl = 0;
}
}
}
return ret;
}
static void setMasks(int idx, const VIFregisters& v) {
u32* row = idx ? g_vifmask.Row1 : g_vifmask.Row0;
u32* col = idx ? g_vifmask.Col1 : g_vifmask.Col0;
template int nVifUnpack<0>(const u8* data);
template int nVifUnpack<1>(const u8* data);
// This is used by the interpreted SSE unpacks only. Recompiled SSE unpacks
// and the interpreted C unpacks use the vif.MaskRow/MaskCol members directly.
static void setMasks(const vifStruct& vif, const VIFregisters& v) {
for (int i = 0; i < 16; i++) {
int m = (v.mask >> (i*2)) & 3;
switch (m) {
@ -148,15 +163,15 @@ static void setMasks(int idx, const VIFregisters& v) {
nVifMask[1][i/4][i%4] = 0;
nVifMask[2][i/4][i%4] = 0;
break;
case 1: // Row
case 1: // MaskRow
nVifMask[0][i/4][i%4] = 0;
nVifMask[1][i/4][i%4] = 0;
nVifMask[2][i/4][i%4] = newVifDynaRec ? row[i%4] : ((u32*)&v.r0)[(i%4)*4];
nVifMask[2][i/4][i%4] = vif.MaskRow._u32[i%4];
break;
case 2: // Col
case 2: // MaskCol
nVifMask[0][i/4][i%4] = 0;
nVifMask[1][i/4][i%4] = 0;
nVifMask[2][i/4][i%4] = newVifDynaRec ? col[i/4] : ((u32*)&v.c0)[(i/4)*4];
nVifMask[2][i/4][i%4] = vif.MaskCol._u32[i/4];
break;
case 3: // Write Protect
nVifMask[0][i/4][i%4] = 0;
@ -175,74 +190,81 @@ static void setMasks(int idx, const VIFregisters& v) {
// a "win" to move code outside the loop, like normally in most other loop scenarios.
//
// The biggest bottleneck of the current code is the call/ret needed to invoke the SSE
// unpackers. A better option is to generate the entire vifRegs->num loop code as part
// unpackers. A better option is to generate the entire vifRegs.num loop code as part
// of the SSE template, and inline the SSE code into the heart of it. This both avoids
// the call/ret and opens the door for resolving some register dependency chains in the
// current emitted functions. (this is what zero's SSE does to get its final bit of
// speed advantage over the new vif). --air
//
// As a secondary optimization to above, special handlers could be generated for the
// cycleSize==1 case, which is used frequently enough, and results in enough code
// elimination that it would probably be a win in most cases (and for sure in many
// "slow" games that need it most). --air
// The BEST optimization strategy here is to use data available to us from the UNPACK dispatch
// -- namely the unpack type and mask flag -- in combination with the mode and usn values -- to
// generate ~600 special versions of this function. But since it's an interpreter, who gives
// a crap? Really? :p
//
// size - size of the packet fragment incoming from DMAC.
// Interpreter unpack loop. Each iteration writes one vector to VU memory at
// vif.tag.addr, advances the address by 16, decrements vifRegs.num and advances the
// vif.cl cycle counter, and keeps going until vifRegs.num reaches zero.
//
// Template parameters:
//   idx    - selects which VIF/VU pair is operated on.
//   doMode - when true the VIFfuncTable routine is called for each vector; otherwise
//            one of the nVifUpk SSE routines is called.
//   isFill - selects the filling-write cycle handling instead of the skipping-write one.
template< int idx, bool doMode, bool isFill >
__ri void __fastcall _nVifUnpackLoop(const u8 *data, u32 size) {
__ri void __fastcall _nVifUnpackLoop(const u8* data) {
const int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl;
const int blockSize = isFill ? vifRegs->cycle.wl : vifRegs->cycle.cl;
const int skipSize = blockSize - cycleSize;
//DevCon.WriteLn("[%d][%d][%d][num=%d][upk=%d][cl=%d][bl=%d][skip=%d]", isFill, doMask, doMode, vifRegs->num, upkNum, vif->cl, blockSize, skipSize);
vifStruct& vif = GetVifX;
VIFregisters& vifRegs = vifXRegs;
if (vif->cmd & 0x10) setMasks(idx, *vifRegs);
// skipSize used for skipping writes only
const int skipSize = (vifRegs.cycle.cl - vifRegs.cycle.wl) * 16;
const int usn = !!(vif->usn);
const int upkNum = vif->cmd & 0x1f;
//const s8& vift = nVifT[upkNum]; // might be useful later when other SSE paths are finished.
//DevCon.WriteLn("[%d][%d][%d][num=%d][upk=%d][cl=%d][bl=%d][skip=%d]", isFill, doMask, doMode, vifRegs.num, upkNum, vif.cl, blockSize, skipSize);
// nVifMask is only rebuilt for the SSE path, and only when bit 0x10 of the command is set.
if (!doMode && (vif.cmd & 0x10)) setMasks(vif, vifRegs);
const int usn = !!vif.usn;
const int upkNum = vif.cmd & 0x1f;
// Source bytes consumed per vector for this unpack type.
const u8& vSize = nVifT[upkNum & 0x0f];
//uint vl = vif.cmd & 0x03;
//uint vn = (vif.cmd >> 2) & 0x3;
//uint vSize = ((32 >> vl) * (vn+1)) / 8; // size of data (in bytes) used for each write cycle
// fnbase addresses a group of four consecutive nVifUpk entries; it is indexed below
// by the current cycle clamped to 3.
const nVifCall* fnbase = &nVifUpk[ ((usn*2*16) + upkNum) * (4*1) ];
const VIFUnpackFuncTable& ft = VIFfuncTable[upkNum];
UNPACKFUNCTYPE func = usn ? ft.funcU : ft.funcS;
const UNPACKFUNCTYPE ft = VIFfuncTable[idx][doMode ? vifRegs.mode : 0][ ((usn*2*16) + upkNum) ];
const u8* vuMemBase = (idx ? VU1 : VU0).Mem;
u8* dest = setVUptr(idx, vuMemBase, vif->tag.addr);
if (vif->cl >= blockSize) vif->cl = 0;
pxAssume (vif.cl == 0);
pxAssume (vifRegs.cycle.wl > 0);
do {
// The destination is recomputed from tag.addr on every pass, so the address masking
// in getVUptr is applied to each write.
u8* dest = getVUptr(idx, vif.tag.addr);
while (vifRegs->num) {
if (vif->cl < cycleSize) {
// This should always be true as per the _1mb buffer used to merge partial transfers.
pxAssume (size >= ft.gsize);
if (doMode) {
//DevCon.WriteLn("Non SSE; unpackNum = %d", upkNum);
func((u32*)dest, (u32*)data);
//if (1) {
ft(dest, data);
}
else {
//DevCon.WriteLn("SSE Unpack!");
fnbase[aMin(vif->cl, 3)](dest, data);
uint cl3 = aMin(vif.cl,3);
fnbase[cl3](dest, data);
}
data += ft.gsize;
size -= ft.gsize;
vifRegs->num--;
incVUptrBy16(idx, dest, vuMemBase);
if (++vif->cl == blockSize) vif->cl = 0;
}
else if (isFill) {
vif.tag.addr += 16;
--vifRegs.num;
++vif.cl;
if (isFill) {
//DevCon.WriteLn("isFill!");
func((u32*)dest, (u32*)data);
vifRegs->num--;
incVUptrBy16(idx, dest, vuMemBase);
if (++vif->cl == blockSize) vif->cl = 0;
// Filling write: the source pointer only advances while cl is below cycle.cl;
// the cycle counter restarts once it reaches cycle.wl.
if (vif.cl < vifRegs.cycle.cl) data += vSize;
else if (vif.cl == vifRegs.cycle.wl) vif.cl = 0;
}
else {
incVUptr(idx, dest, vuMemBase, 16 * skipSize);
vif->cl = 0;
else
{
data += vSize;
if (vif.cl >= vifRegs.cycle.wl) {
// Skipping write: once wl vectors have been written, advance the destination
// address by skipSize bytes and restart the cycle counter.
vif.tag.addr += skipSize;
vif.cl = 0;
}
}
} while (vifRegs.num);
}
// Dispatches an interpreted unpack through UnpackLoopTable, indexed by the VIF index,
// by whether the mode value is non-zero, and by the fill flag. The table entries are
// presumably the _nVifUnpackLoop instantiations -- the table definition is not shown
// here, so confirm against it.
__fi void _nVifUnpack(int idx, const u8 *data, u32 size, bool isFill) {
__fi void _nVifUnpack(int idx, const u8* data, uint mode, bool isFill) {
const bool doMode = !!vifRegs->mode;
UnpackLoopTable[idx][doMode][isFill]( data, size );
UnpackLoopTable[idx][!!mode][isFill]( data );
}

View File

@ -36,31 +36,6 @@ void mergeVectors(xRegisterSSE dest, xRegisterSSE src, xRegisterSSE temp, int xy
}
}
// Loads Row/Col Data from vifRegs instead of g_vifmask
// Useful for testing vifReg and g_vifmask inconsistency.
//
// Register usage as written below: the row registers r0-r3 are combined into xmm6,
// and the column registers c0-c3 end up in xmm2-xmm5.
void loadRowCol(nVifStruct& v) {
// Fetch the four row registers; r3 goes to xmm6, which receives the merged result.
xMOVAPS(xmm0, ptr32[&v.vifRegs->r0]);
xMOVAPS(xmm1, ptr32[&v.vifRegs->r1]);
xMOVAPS(xmm2, ptr32[&v.vifRegs->r2]);
xMOVAPS(xmm6, ptr32[&v.vifRegs->r3]);
// Shuffle each register with the _v0 pattern (presumably a broadcast of element 0 --
// confirm against the _v0 definition).
xPSHUF.D(xmm0, xmm0, _v0);
xPSHUF.D(xmm1, xmm1, _v0);
xPSHUF.D(xmm2, xmm2, _v0);
xPSHUF.D(xmm6, xmm6, _v0);
// Merge r0/r1/r2 into xmm6 using the masks 8, 4 and 2 (presumably one vector lane
// per mask bit -- confirm against mVUmergeRegs).
mVUmergeRegs(xmm6, xmm0, 8);
mVUmergeRegs(xmm6, xmm1, 4);
mVUmergeRegs(xmm6, xmm2, 2);
// xmm2 is reused from here on: its row value has already been merged into xmm6.
xMOVAPS(xmm2, ptr32[&v.vifRegs->c0]);
xMOVAPS(xmm3, ptr32[&v.vifRegs->c1]);
xMOVAPS(xmm4, ptr32[&v.vifRegs->c2]);
xMOVAPS(xmm5, ptr32[&v.vifRegs->c3]);
// Apply the same _v0 shuffle to each column register.
xPSHUF.D(xmm2, xmm2, _v0);
xPSHUF.D(xmm3, xmm3, _v0);
xPSHUF.D(xmm4, xmm4, _v0);
xPSHUF.D(xmm5, xmm5, _v0);
}
// =====================================================================================================
// VifUnpackSSE_Base Section
// =====================================================================================================

View File

@ -25,7 +25,6 @@
using namespace x86Emitter;
extern void mergeVectors(xRegisterSSE dest, xRegisterSSE src, xRegisterSSE temp, int xyzw);
extern void loadRowCol(nVifStruct& v);
// --------------------------------------------------------------------------------------
// VifUnpackSSE_Base
@ -127,7 +126,7 @@ public:
virtual bool IsUnmaskedOp() const{ return !doMode && !doMask; }
void CompileRoutine();
void CompileRoutine(vifStruct& vif);
protected:
virtual void doMaskWrite(const xRegisterSSE& regX) const;