mirror of https://github.com/PCSX2/pcsx2.git
Significant VIFunpack retooling. Interpreters are considerably more efficient, and Recompilers are slightly more efficient. Details:
* All remaining code for handling partial/fragmented unpacks removed. * vifRegs.NUM is now accurately simulated when queuing data from fragmented unpacks. * Reduced the VIFunpack fragment buffer from 1MB to 4KB (max size of an unpack due to NUM being limited to 8 bits). * Removed vif/vifRegs globals formerly used by VIF interpreters (everything relies on the templated vifIdx now -- simpler and faster!) * g_vifMask vars are integrated into vifStruct. * All VIF mask register stuff uses the SSE-friendly vifStruct.MaskRow/Col vars now. git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3762 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
e2cb52becf
commit
2f8f86a3eb
|
@ -48,7 +48,15 @@ mem32_t __fastcall _hwRead32(u32 mem)
|
|||
|
||||
case 0x02: return ipuRead32( mem );
|
||||
|
||||
case 0x03: return dmacRead32<0x03>( mem );
|
||||
case 0x03:
|
||||
if (mem >= EEMemoryMap::VIF0_Start)
|
||||
{
|
||||
if(mem >= EEMemoryMap::VIF1_Start)
|
||||
return vifRead32<1>(mem);
|
||||
else
|
||||
return vifRead32<0>(mem);
|
||||
}
|
||||
return dmacRead32<0x03>( mem );
|
||||
|
||||
case 0x04:
|
||||
case 0x05:
|
||||
|
|
|
@ -21,8 +21,8 @@
|
|||
#include "GS.h"
|
||||
#include "Gif.h"
|
||||
|
||||
vifStruct vif0;
|
||||
vifStruct vif1;
|
||||
__aligned16 vifStruct vif0, vif1;
|
||||
|
||||
tGSTransferStatus GSTransferStatus((STOPPED_MODE<<8) | (STOPPED_MODE<<4) | STOPPED_MODE);
|
||||
|
||||
void vif0Reset()
|
||||
|
@ -31,10 +31,7 @@ void vif0Reset()
|
|||
memzero(vif0);
|
||||
memzero(vif0Regs);
|
||||
|
||||
vif0Regs.stat.VPS = VPS_IDLE;
|
||||
vif0Regs.stat.FQC = 0;
|
||||
|
||||
vif0.done = false;
|
||||
vif0.regs = &vif0Regs;
|
||||
|
||||
resetNewVif(0);
|
||||
}
|
||||
|
@ -45,11 +42,7 @@ void vif1Reset()
|
|||
memzero(vif1);
|
||||
memzero(vif1Regs);
|
||||
|
||||
vif1Regs.stat.VPS = VPS_IDLE;
|
||||
vif1Regs.stat.FQC = 0; // FQC=0
|
||||
|
||||
vif1.done = false;
|
||||
cpuRegs.interrupt &= ~((1 << 1) | (1 << 10)); //Stop all vif1 DMA's
|
||||
vif1.regs = &vif1Regs;
|
||||
|
||||
resetNewVif(1);
|
||||
}
|
||||
|
@ -58,7 +51,6 @@ void SaveStateBase::vif0Freeze()
|
|||
{
|
||||
FreezeTag("VIFdma");
|
||||
Freeze(g_vifCycles); // Dunno if this one is needed, but whatever, it's small. :)
|
||||
Freeze(g_vifmask); // mask settings for VIF0 and VIF1
|
||||
Freeze(vif0);
|
||||
|
||||
Freeze(nVif[0].bSize);
|
||||
|
@ -88,6 +80,7 @@ __fi void vif0FBRST(u32 value) {
|
|||
//Console.WriteLn("Vif0 Reset %x", vif0Regs.stat._u32);
|
||||
|
||||
memzero(vif0);
|
||||
vif0.regs = &vif0Regs;
|
||||
vif0ch.qwc = 0; //?
|
||||
cpuRegs.interrupt &= ~1; //Stop all vif0 DMA's
|
||||
psHu64(VIF0_FIFO) = 0;
|
||||
|
@ -147,6 +140,8 @@ __fi void vif1FBRST(u32 value) {
|
|||
if (FBRST(value).RST) // Reset Vif.
|
||||
{
|
||||
memzero(vif1);
|
||||
vif1.regs = &vif1Regs;
|
||||
|
||||
//cpuRegs.interrupt &= ~((1 << 1) | (1 << 10)); //Stop all vif1 DMA's
|
||||
vif1ch.qwc -= min((int)vif1ch.qwc, 16); //?
|
||||
psHu64(VIF1_FIFO) = 0;
|
||||
|
@ -271,9 +266,29 @@ __fi void vif1STAT(u32 value) {
|
|||
|
||||
#define caseVif(x) (idx ? VIF1_##x : VIF0_##x)
|
||||
|
||||
// Reads a 32-bit value from the VIF register address `mem`.
//
// The ROW0-ROW3 and COL0-COL3 register addresses are served from the 32-bit
// lanes of vif.MaskRow / vif.MaskCol instead of the hardware register map; the
// matching vifWrite32 cases store written values into those same lanes.
// Any other address falls through to a plain psHu32(mem) read.
//
// NOTE(review): `idx` (used by caseVif) and GetVifX presumably come from the
// _vifT template prefix and select VIF0 vs. VIF1 -- confirm against their definitions.
_vifT __fi u32 vifRead32(u32 mem) {
|
||||
vifStruct& vif = GetVifX;
|
||||
|
||||
switch (mem) {
|
||||
// Row registers: one 32-bit lane of MaskRow per register.
case caseVif(ROW0): return vif.MaskRow._u32[0];
|
||||
case caseVif(ROW1): return vif.MaskRow._u32[1];
|
||||
case caseVif(ROW2): return vif.MaskRow._u32[2];
|
||||
case caseVif(ROW3): return vif.MaskRow._u32[3];
|
||||
|
||||
// Column registers: one 32-bit lane of MaskCol per register.
case caseVif(COL0): return vif.MaskCol._u32[0];
|
||||
case caseVif(COL1): return vif.MaskCol._u32[1];
|
||||
case caseVif(COL2): return vif.MaskCol._u32[2];
|
||||
case caseVif(COL3): return vif.MaskCol._u32[3];
|
||||
}
|
||||
|
||||
// Not a row/column register: read straight from the hardware register map.
return psHu32(mem);
|
||||
}
|
||||
|
||||
// returns FALSE if no writeback is needed (or writeback is handled internally)
|
||||
// returns TRUE if the caller should writeback the value to the eeHw register map.
|
||||
_vifT __fi bool vifWrite32(u32 mem, u32 value) {
|
||||
vifStruct& vif = GetVifX;
|
||||
|
||||
switch (mem) {
|
||||
case caseVif(MARK):
|
||||
VIF_LOG("VIF%d_MARK write32 0x%8.8x", idx, value);
|
||||
|
@ -297,33 +312,23 @@ _vifT __fi bool vifWrite32(u32 mem, u32 value) {
|
|||
// standard register writes -- handled by caller.
|
||||
break;
|
||||
|
||||
case caseVif(ROW0):
|
||||
case caseVif(ROW1):
|
||||
case caseVif(ROW2):
|
||||
case caseVif(ROW3):
|
||||
// Here's a neat way to obfuscate code. This is a super-fancy-complicated version
|
||||
// of a standard psHu32(mem) = value; writeback. Handled by caller for us, thanks! --air
|
||||
//if (!idx) g_vifmask.Row0[ (mem>>4)&3 ] = value;
|
||||
//else g_vifmask.Row1[ (mem>>4)&3 ] = value;
|
||||
//((u32*)&vifXRegs.r0) [((mem>>4)&3)*4] = value;
|
||||
break;
|
||||
case caseVif(ROW0): vif.MaskRow._u32[0] = value; return false;
|
||||
case caseVif(ROW1): vif.MaskRow._u32[1] = value; return false;
|
||||
case caseVif(ROW2): vif.MaskRow._u32[2] = value; return false;
|
||||
case caseVif(ROW3): vif.MaskRow._u32[3] = value; return false;
|
||||
|
||||
case caseVif(COL0):
|
||||
case caseVif(COL1):
|
||||
case caseVif(COL2):
|
||||
case caseVif(COL3):
|
||||
// Here's a neat way to obfuscate code. This is a super-fancy-complicated version
|
||||
// of a standard psHu32(mem) = value; writeback. Handled by caller for us, thanks! --air
|
||||
//if (!idx) g_vifmask.Col0[ (mem>>4)&3 ] = value;
|
||||
//else g_vifmask.Col1[ (mem>>4)&3 ] = value;
|
||||
//((u32*)&vifXRegs.c0) [((mem>>4)&3)*4] = value;
|
||||
break;
|
||||
case caseVif(COL0): vif.MaskCol._u32[0] = value; return false;
|
||||
case caseVif(COL1): vif.MaskCol._u32[1] = value; return false;
|
||||
case caseVif(COL2): vif.MaskCol._u32[2] = value; return false;
|
||||
case caseVif(COL3): vif.MaskCol._u32[3] = value; return false;
|
||||
}
|
||||
|
||||
// fall-through case: issue standard writeback behavior.
|
||||
return true;
|
||||
}
|
||||
|
||||
// Explicit instantiations of the VIF register accessors for template index 0 and 1.
// NOTE(review): presumably required because the template bodies are defined in this
// file while other translation units only see the extern declarations -- confirm.
template u32 vifRead32<0>(u32 mem);
|
||||
template u32 vifRead32<1>(u32 mem);
|
||||
|
||||
template bool vifWrite32<0>(u32 mem, u32 value);
|
||||
template bool vifWrite32<1>(u32 mem, u32 value);
|
||||
|
|
|
@ -213,8 +213,6 @@ struct VIFregisters {
|
|||
u32 addr;
|
||||
};
|
||||
|
||||
extern VIFregisters *vifRegs;
|
||||
|
||||
static VIFregisters& vif0Regs = (VIFregisters&)eeHw[0x3800];
|
||||
static VIFregisters& vif1Regs = (VIFregisters&)eeHw[0x3C00];
|
||||
|
||||
|
|
|
@ -19,16 +19,12 @@
|
|||
#include "Gif.h"
|
||||
#include "Vif_Dma.h"
|
||||
|
||||
VIFregisters *vifRegs;
|
||||
vifStruct *vif;
|
||||
u16 vifqwc = 0;
|
||||
u32 g_vifCycles = 0;
|
||||
u32 g_vu0Cycles = 0;
|
||||
u32 g_vu1Cycles = 0;
|
||||
u32 g_packetsizeonvu = 0;
|
||||
|
||||
__aligned16 VifMaskTypes g_vifmask;
|
||||
|
||||
extern u32 g_vifCycles;
|
||||
|
||||
static u32 qwctag(u32 mask)
|
||||
|
|
|
@ -38,8 +38,8 @@ static __fi void vifFlush(int idx) {
|
|||
}
|
||||
|
||||
static __fi void vuExecMicro(int idx, u32 addr) {
|
||||
VURegs* VU = nVif[idx].VU;
|
||||
VIFregisters& vifRegs = VU->GetVifRegs();
|
||||
VURegs& VU = vuRegs[idx];
|
||||
VIFregisters& vifRegs = vifXRegs;
|
||||
int startcycles = 0;
|
||||
//vifFlush(idx);
|
||||
|
||||
|
@ -423,7 +423,7 @@ vifOp(vifCode_Offset) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
template<int idx> static __fi int _vifCode_STColRow(const u32* data, u32* pmem1, u32* pmem2) {
|
||||
template<int idx> static __fi int _vifCode_STColRow(const u32* data, u32* pmem2) {
|
||||
vifStruct& vifX = GetVifX;
|
||||
|
||||
int ret = min(4 - vifX.tag.addr, vifX.vifpacketsize);
|
||||
|
@ -432,16 +432,12 @@ template<int idx> static __fi int _vifCode_STColRow(const u32* data, u32* pmem1,
|
|||
|
||||
switch (ret) {
|
||||
case 4:
|
||||
pmem1[12] = data[3];
|
||||
pmem2[3] = data[3];
|
||||
case 3:
|
||||
pmem1[8] = data[2];
|
||||
pmem2[2] = data[2];
|
||||
case 2:
|
||||
pmem1[4] = data[1];
|
||||
pmem2[1] = data[1];
|
||||
case 1:
|
||||
pmem1[0] = data[0];
|
||||
pmem2[0] = data[0];
|
||||
break;
|
||||
jNO_DEFAULT
|
||||
|
@ -462,10 +458,7 @@ vifOp(vifCode_STCol) {
|
|||
return 1;
|
||||
}
|
||||
pass2 {
|
||||
u32* cols = idx ? g_vifmask.Col1 : g_vifmask.Col0;
|
||||
u32* pmem1 = &vifXRegs.c0 + (vifX.tag.addr << 2);
|
||||
u32* pmem2 = cols + vifX.tag.addr;
|
||||
return _vifCode_STColRow<idx>(data, pmem1, pmem2);
|
||||
return _vifCode_STColRow<idx>(data, &vifX.MaskCol._u32[vifX.tag.addr]);
|
||||
}
|
||||
pass3 { VifCodeLog("STCol"); }
|
||||
return 0;
|
||||
|
@ -480,10 +473,7 @@ vifOp(vifCode_STRow) {
|
|||
return 1;
|
||||
}
|
||||
pass2 {
|
||||
u32* rows = idx ? g_vifmask.Row1 : g_vifmask.Row0;
|
||||
u32* pmem1 = &vifXRegs.r0 + (vifX.tag.addr << 2);
|
||||
u32* pmem2 = rows + vifX.tag.addr;
|
||||
return _vifCode_STColRow<idx>(data, pmem1, pmem2);
|
||||
return _vifCode_STColRow<idx>(data, &vifX.MaskRow._u32[vifX.tag.addr]);
|
||||
}
|
||||
pass3 { VifCodeLog("STRow"); }
|
||||
return 0;
|
||||
|
@ -516,11 +506,10 @@ vifOp(vifCode_STMod) {
|
|||
|
||||
vifOp(vifCode_Unpack) {
|
||||
pass1 {
|
||||
if (!idx) vifUnpackSetup<0>(data);
|
||||
else vifUnpackSetup<1>(data);
|
||||
vifUnpackSetup<idx>(data);
|
||||
return 1;
|
||||
}
|
||||
pass2 { return nVifUnpack(idx, (u8*)data); }
|
||||
pass2 { return nVifUnpack<idx>((u8*)data); }
|
||||
pass3 { VifCodeLog("Unpack"); }
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -56,6 +56,8 @@ union tTRXREG {
|
|||
|
||||
// NOTE, if debugging vif stalls, use sega classics, spyro, gt4, and taito
|
||||
struct vifStruct {
|
||||
u128 MaskRow, MaskCol;
|
||||
|
||||
vifCode tag;
|
||||
int cmd;
|
||||
int irq;
|
||||
|
@ -67,6 +69,8 @@ struct vifStruct {
|
|||
bool vifstalled;
|
||||
bool stallontag;
|
||||
|
||||
VIFregisters* regs;
|
||||
|
||||
// GS registers used for calculating the size of the last local->host transfer initiated on the GS
|
||||
// Transfer size calculation should be restricted to GS emulation in the future
|
||||
tBITBLTBUF BITBLTBUF;
|
||||
|
@ -82,10 +86,10 @@ struct vifStruct {
|
|||
u8 GifWaitState; // 0 = General PATH checking, 1 = Flush path 3, 2 == Wait for VU1
|
||||
};
|
||||
|
||||
extern vifStruct* vif;
|
||||
extern vifStruct vif0, vif1;
|
||||
extern __aligned16 vifStruct vif0, vif1;
|
||||
extern u8 schedulepath3msk;
|
||||
|
||||
_vifT extern u32 vifRead32(u32 mem);
|
||||
_vifT extern bool vifWrite32(u32 mem, u32 value);
|
||||
|
||||
extern void vif0Interrupt();
|
||||
|
@ -122,15 +126,3 @@ extern u32 g_vu1Cycles;
|
|||
extern u32 g_packetsizeonvu;
|
||||
extern void vif0FLUSH();
|
||||
extern void vif1FLUSH();
|
||||
|
||||
//------------------------------------------------------------------
|
||||
// newVif SSE-optimized Row/Col Structs
|
||||
//------------------------------------------------------------------
|
||||
|
||||
struct VifMaskTypes
|
||||
{
|
||||
u32 Row0[4], Col0[4];
|
||||
u32 Row1[4], Col1[4];
|
||||
};
|
||||
|
||||
extern __aligned16 VifMaskTypes g_vifmask; // This struct is used by newVif
|
||||
|
|
|
@ -25,110 +25,95 @@ enum UnpackOffset {
|
|||
OFFSET_W = 3
|
||||
};
|
||||
|
||||
static __fi u32 setVifRowRegs(u32 reg, u32 data) {
|
||||
switch (reg) {
|
||||
case 0: vifRegs->r0 = data; break;
|
||||
case 1: vifRegs->r1 = data; break;
|
||||
case 2: vifRegs->r2 = data; break;
|
||||
case 3: vifRegs->r3 = data; break;
|
||||
jNO_DEFAULT;
|
||||
}
|
||||
// Stores `data` into 32-bit lane `reg` of vif.MaskRow and returns the stored value,
// so a caller can update the row and assign the same value elsewhere in one expression.
// `reg` indexes MaskRow._u32 directly and is not range-checked here.
static __fi u32 setVifRow(vifStruct& vif, u32 reg, u32 data) {
|
||||
vif.MaskRow._u32[reg] = data;
|
||||
return data;
|
||||
}
|
||||
|
||||
static __fi u32 getVifRowRegs(u32 reg) {
|
||||
switch (reg) {
|
||||
case 0: return vifRegs->r0; break;
|
||||
case 1: return vifRegs->r1; break;
|
||||
case 2: return vifRegs->r2; break;
|
||||
case 3: return vifRegs->r3; break;
|
||||
jNO_DEFAULT;
|
||||
}
|
||||
return 0; // unreachable...
|
||||
}
|
||||
|
||||
static __fi u32 getVifColRegs(u32 reg) {
|
||||
switch (reg) {
|
||||
case 0: return vifRegs->c0; break;
|
||||
case 1: return vifRegs->c1; break;
|
||||
case 2: return vifRegs->c2; break;
|
||||
default: return vifRegs->c3; break;
|
||||
}
|
||||
return 0; // unreachable...
|
||||
}
|
||||
|
||||
template< bool doMask >
|
||||
static __ri void writeXYZW(u32 offnum, u32 &dest, u32 data) {
|
||||
u32 vifRowReg = getVifRowRegs(offnum);
|
||||
// cycle derives from vif.cl
|
||||
// mode derives from vifRegs.mode
|
||||
template< uint idx, uint mode, bool doMask >
|
||||
static __ri void writeXYZW(u32 offnum, u32 &dest, u32 data, bool isV4_5 = false) {
|
||||
int n = 0;
|
||||
|
||||
vifStruct& vif = GetVifX;
|
||||
|
||||
if (doMask) {
|
||||
switch (vif->cl) {
|
||||
case 0: n = (vifRegs->mask >> (offnum * 2)) & 0x3; break;
|
||||
case 1: n = (vifRegs->mask >> ( 8 + (offnum * 2))) & 0x3; break;
|
||||
case 2: n = (vifRegs->mask >> (16 + (offnum * 2))) & 0x3; break;
|
||||
default: n = (vifRegs->mask >> (24 + (offnum * 2))) & 0x3; break;
|
||||
const VIFregisters& regs = vifXRegs;
|
||||
switch (vif.cl) {
|
||||
case 0: n = (regs.mask >> (offnum * 2)) & 0x3; break;
|
||||
case 1: n = (regs.mask >> ( 8 + (offnum * 2))) & 0x3; break;
|
||||
case 2: n = (regs.mask >> (16 + (offnum * 2))) & 0x3; break;
|
||||
default: n = (regs.mask >> (24 + (offnum * 2))) & 0x3; break;
|
||||
}
|
||||
}
|
||||
|
||||
// Four possible types of masking are handled below:
|
||||
// 0 - Data
|
||||
// 1 - MaskRow
|
||||
// 2 - MaskCol
|
||||
// 3 - Write protect
|
||||
|
||||
switch (n) {
|
||||
case 0:
|
||||
if ((vif->cmd & 0x6F) != 0x6f) {
|
||||
switch (vifRegs->mode) {
|
||||
case 1: dest = data + vifRowReg; break;
|
||||
case 2: dest = setVifRowRegs(offnum, vifRowReg + data); break;
|
||||
switch (mode) {
|
||||
case 1: dest = data + vif.MaskRow._u32[offnum]; break;
|
||||
case 2: dest = setVifRow(vif, offnum, vif.MaskRow._u32[offnum] + data); break;
|
||||
default: dest = data; break;
|
||||
}
|
||||
}
|
||||
else dest = data; // v4-5 Unpack Mode
|
||||
break;
|
||||
case 1: dest = vifRowReg; break;
|
||||
case 2: dest = getVifColRegs(vif->cl); break;
|
||||
case 1: dest = vif.MaskRow._u32[offnum]; break;
|
||||
case 2: dest = vif.MaskCol._u32[min(vif.cl,3)]; break;
|
||||
case 3: break;
|
||||
}
|
||||
}
|
||||
#define tParam idx,mode,doMask
|
||||
|
||||
template < bool doMask, class T >
|
||||
static void __fastcall UNPACK_S(u32 *dest, const T *data)
|
||||
template < uint idx, uint mode, bool doMask, class T >
|
||||
static void __fastcall UNPACK_S(u32* dest, const T* src)
|
||||
{
|
||||
u32 data = *src;
|
||||
|
||||
//S-# will always be a complete packet, no matter what. So we can skip the offset bits
|
||||
writeXYZW<doMask>(OFFSET_X, *dest++, *data);
|
||||
writeXYZW<doMask>(OFFSET_Y, *dest++, *data);
|
||||
writeXYZW<doMask>(OFFSET_Z, *dest++, *data);
|
||||
writeXYZW<doMask>(OFFSET_W, *dest , *data);
|
||||
writeXYZW<tParam>(OFFSET_X, *(dest+0), data);
|
||||
writeXYZW<tParam>(OFFSET_Y, *(dest+1), data);
|
||||
writeXYZW<tParam>(OFFSET_Z, *(dest+2), data);
|
||||
writeXYZW<tParam>(OFFSET_W, *(dest+3), data);
|
||||
}
|
||||
|
||||
// The PS2 console actually writes v1v0v1v0 for all V2 unpacks -- the second v1v0 pair
|
||||
// being officially "indeterminate" but some games very much depend on it.
|
||||
template <bool doMask, class T>
|
||||
static void __fastcall UNPACK_V2(u32 *dest, const T *data)
|
||||
template < uint idx, uint mode, bool doMask, class T >
|
||||
static void __fastcall UNPACK_V2(u32* dest, const T* src)
|
||||
{
|
||||
writeXYZW<doMask>(0, *dest++, *data);
|
||||
writeXYZW<doMask>(1, *dest++, *(data+1));
|
||||
writeXYZW<doMask>(2, *dest++, *data);
|
||||
writeXYZW<doMask>(3, *dest++, *(data+1));
|
||||
writeXYZW<tParam>(OFFSET_X, *(dest+0), *(src+0));
|
||||
writeXYZW<tParam>(OFFSET_Y, *(dest+1), *(src+1));
|
||||
writeXYZW<tParam>(OFFSET_Z, *(dest+2), *(src+0));
|
||||
writeXYZW<tParam>(OFFSET_W, *(dest+3), *(src+1));
|
||||
}
|
||||
|
||||
// V3 and V4 unpacks both use the V4 unpack logic, even though most of the OFFSET_W fields
|
||||
// during V3 unpacking end up being overwritten by the next unpack. This is confirmed real
|
||||
// hardware behavior that games such as Ape Escape 3 depend on.
|
||||
template <bool doMask, class T>
|
||||
static void __fastcall UNPACK_V4(u32 *dest, const T *data)
|
||||
template < uint idx, uint mode, bool doMask, class T >
|
||||
static void __fastcall UNPACK_V4(u32* dest, const T* src)
|
||||
{
|
||||
writeXYZW<doMask>(OFFSET_X, *dest++, *data++);
|
||||
writeXYZW<doMask>(OFFSET_Y, *dest++, *data++);
|
||||
writeXYZW<doMask>(OFFSET_Z, *dest++, *data++);
|
||||
writeXYZW<doMask>(OFFSET_W, *dest , *data);
|
||||
writeXYZW<tParam>(OFFSET_X, *(dest+0), *(src+0));
|
||||
writeXYZW<tParam>(OFFSET_Y, *(dest+1), *(src+1));
|
||||
writeXYZW<tParam>(OFFSET_Z, *(dest+2), *(src+2));
|
||||
writeXYZW<tParam>(OFFSET_W, *(dest+3), *(src+3));
|
||||
}
|
||||
|
||||
template< bool doMask >
|
||||
static void __fastcall UNPACK_V4_5(u32 *dest, const u32 *data)
|
||||
// V4_5 unpacks do not support the MODE register, and act as mode==0 always.
|
||||
template< uint idx, bool doMask >
|
||||
static void __fastcall UNPACK_V4_5(u32 *dest, const u32* src)
|
||||
{
|
||||
//As with S-#, this will always be a complete packet
|
||||
writeXYZW<doMask>(OFFSET_X, *dest++, ((*data & 0x001f) << 3));
|
||||
writeXYZW<doMask>(OFFSET_Y, *dest++, ((*data & 0x03e0) >> 2));
|
||||
writeXYZW<doMask>(OFFSET_Z, *dest++, ((*data & 0x7c00) >> 7));
|
||||
writeXYZW<doMask>(OFFSET_W, *dest, ((*data & 0x8000) >> 8));
|
||||
u32 data = *src;
|
||||
|
||||
writeXYZW<idx,0,doMask>(OFFSET_X, *(dest+0), ((data & 0x001f) << 3), true);
|
||||
writeXYZW<idx,0,doMask>(OFFSET_Y, *(dest+1), ((data & 0x03e0) >> 2), true);
|
||||
writeXYZW<idx,0,doMask>(OFFSET_Z, *(dest+2), ((data & 0x7c00) >> 7), true);
|
||||
writeXYZW<idx,0,doMask>(OFFSET_W, *(dest+3), ((data & 0x8000) >> 8), true);
|
||||
}
|
||||
|
||||
// =====================================================================================================
|
||||
|
@ -148,45 +133,50 @@ static void __fastcall UNPACK_V4_5(u32 *dest, const u32 *data)
|
|||
//
|
||||
|
||||
#define _upk (UNPACKFUNCTYPE)
|
||||
#define _odd (UNPACKFUNCTYPE_ODD)
|
||||
#define _unpk_s(bits) (UNPACKFUNCTYPE_S##bits)
|
||||
#define _unpk_u(bits) (UNPACKFUNCTYPE_U##bits)
|
||||
#define _unpk(usn, bits) (UNPACKFUNCTYPE_##usn##bits)
|
||||
|
||||
// 32-bits versions are unsigned-only!!
|
||||
#define UnpackFuncPair32( vt, doMask ) \
|
||||
(UNPACKFUNCTYPE)_unpk_u(32) UNPACK_##vt<doMask, u32>, \
|
||||
(UNPACKFUNCTYPE)_unpk_u(32) UNPACK_##vt<doMask, u32>
|
||||
#define UnpackFuncSet( vt, idx, mode, usn, doMask ) \
|
||||
(UNPACKFUNCTYPE)_unpk(u,32) UNPACK_##vt<idx, mode, doMask, u32>, \
|
||||
(UNPACKFUNCTYPE)_unpk(usn,16) UNPACK_##vt<idx, mode, doMask, usn##16>, \
|
||||
(UNPACKFUNCTYPE)_unpk(usn,8) UNPACK_##vt<idx, mode, doMask, usn##8> \
|
||||
|
||||
#define UnpackFuncPair( vt, bits, doMask ) \
|
||||
(UNPACKFUNCTYPE)_unpk_u(bits) UNPACK_##vt<doMask, u##bits>, \
|
||||
(UNPACKFUNCTYPE)_unpk_s(bits) UNPACK_##vt<doMask, s##bits>
|
||||
#define UnpackV4_5set(idx, doMask) \
|
||||
(UNPACKFUNCTYPE)_unpk(u,32) UNPACK_V4_5<idx, doMask> \
|
||||
|
||||
#define UnpackFuncSet( doMask ) \
|
||||
{ UnpackFuncPair32( S, doMask ), 4, 4 }, /* 0x0 - S-32 */ \
|
||||
{ UnpackFuncPair ( S, 16, doMask ), 2, 4 }, /* 0x1 - S-16 */ \
|
||||
{ UnpackFuncPair ( S, 8, doMask ), 1, 4 }, /* 0x2 - S-8 */ \
|
||||
{ NULL, NULL, 0, 0 }, /* 0x3 (NULL) */ \
|
||||
{ UnpackFuncPair32( V2, doMask ), 8, 2 }, /* 0x4 - V2-32 */ \
|
||||
{ UnpackFuncPair ( V2, 16, doMask ), 4, 2 }, /* 0x5 - V2-16 */ \
|
||||
{ UnpackFuncPair ( V2, 8, doMask ), 2, 2 }, /* 0x6 - V2-8 */ \
|
||||
{ NULL, NULL, 0, 0 }, /* 0x7 (NULL) */ \
|
||||
{ UnpackFuncPair32( V4, doMask ), 12, 3 }, /* 0x8 - V3-32 */ \
|
||||
{ UnpackFuncPair ( V4, 16, doMask ), 6, 3 }, /* 0x9 - V3-16 */ \
|
||||
{ UnpackFuncPair ( V4, 8, doMask ), 3, 3 }, /* 0xA - V3-8 */ \
|
||||
{ NULL, NULL, 0, 0 }, /* 0xB (NULL) */ \
|
||||
{ UnpackFuncPair32( V4, doMask ), 16, 4 }, /* 0xC - V4-32 */ \
|
||||
{ UnpackFuncPair ( V4, 16, doMask ), 8, 4 }, /* 0xD - V4-16 */ \
|
||||
{ UnpackFuncPair ( V4, 8, doMask ), 4, 4 }, /* 0xE - V4-8 */ \
|
||||
{ /* 0xF - V4-5 */ \
|
||||
(UNPACKFUNCTYPE)_unpk_u(32)UNPACK_V4_5<doMask>, \
|
||||
(UNPACKFUNCTYPE)_unpk_u(32)UNPACK_V4_5<doMask>, \
|
||||
2, 4 \
|
||||
#define UnpackModeSet(idx, mode) \
|
||||
UnpackFuncSet( S, idx, mode, s, 0 ), NULL, \
|
||||
UnpackFuncSet( V2, idx, mode, s, 0 ), NULL, \
|
||||
UnpackFuncSet( V4, idx, mode, s, 0 ), NULL, \
|
||||
UnpackFuncSet( V4, idx, mode, s, 0 ), UnpackV4_5set(idx, 0), \
|
||||
\
|
||||
UnpackFuncSet( S, idx, mode, s, 1 ), NULL, \
|
||||
UnpackFuncSet( V2, idx, mode, s, 1 ), NULL, \
|
||||
UnpackFuncSet( V4, idx, mode, s, 1 ), NULL, \
|
||||
UnpackFuncSet( V4, idx, mode, s, 1 ), UnpackV4_5set(idx, 1), \
|
||||
\
|
||||
UnpackFuncSet( S, idx, mode, u, 0 ), NULL, \
|
||||
UnpackFuncSet( V2, idx, mode, u, 0 ), NULL, \
|
||||
UnpackFuncSet( V4, idx, mode, u, 0 ), NULL, \
|
||||
UnpackFuncSet( V4, idx, mode, u, 0 ), UnpackV4_5set(idx, 0), \
|
||||
\
|
||||
UnpackFuncSet( S, idx, mode, u, 1 ), NULL, \
|
||||
UnpackFuncSet( V2, idx, mode, u, 1 ), NULL, \
|
||||
UnpackFuncSet( V4, idx, mode, u, 1 ), NULL, \
|
||||
UnpackFuncSet( V4, idx, mode, u, 1 ), UnpackV4_5set(idx, 1)
|
||||
|
||||
__aligned16 const UNPACKFUNCTYPE VIFfuncTable[2][3][4 * 4 * 2 * 2] =
|
||||
{
|
||||
{
|
||||
{ UnpackModeSet(0,0) },
|
||||
{ UnpackModeSet(0,1) },
|
||||
{ UnpackModeSet(0,2) }
|
||||
},
|
||||
|
||||
const __aligned16 VIFUnpackFuncTable VIFfuncTable[32] =
|
||||
{
|
||||
UnpackFuncSet( false )
|
||||
UnpackFuncSet( true )
|
||||
{
|
||||
{ UnpackModeSet(1,0) },
|
||||
{ UnpackModeSet(1,1) },
|
||||
{ UnpackModeSet(1,2) }
|
||||
}
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
@ -212,16 +202,23 @@ _vifT void vifUnpackSetup(const u32 *data) {
|
|||
if (vifNum == 0) vifNum = 256;
|
||||
vifXRegs.num = vifNum;
|
||||
|
||||
// Traditional-style way of calculating the gsize, based on VN/VL parameters.
|
||||
// Useful when VN/VL are known template params, but currently they are not so we use
|
||||
// the LUT instead (for now).
|
||||
//uint vl = vifX.cmd & 0x03;
|
||||
//uint vn = (vifX.cmd >> 2) & 0x3;
|
||||
//uint gsize = ((32 >> vl) * (vn+1)) / 8;
|
||||
|
||||
const u8& gsize = nVifT[vifX.cmd & 0x0f];
|
||||
|
||||
if (vifXRegs.cycle.wl <= vifXRegs.cycle.cl) {
|
||||
if (!idx) vif0.tag.size = ((vifNum * VIFfuncTable[ vif0.cmd & 0xf ].gsize) + 3) >> 2;
|
||||
else vif1.tag.size = ((vifNum * VIFfuncTable[ vif1.cmd & 0xf ].gsize) + 3) >> 2;
|
||||
vifX.tag.size = ((vifNum * gsize) + 3) / 4;
|
||||
}
|
||||
else {
|
||||
int n = vifXRegs.cycle.cl * (vifNum / vifXRegs.cycle.wl) +
|
||||
_limit(vifNum % vifXRegs.cycle.wl, vifXRegs.cycle.cl);
|
||||
|
||||
if (!idx) vif0.tag.size = ((n * VIFfuncTable[ vif0.cmd & 0xf ].gsize) + 3) >> 2;
|
||||
else vif1.tag.size = ((n * VIFfuncTable[ vif1.cmd & 0xf ].gsize) + 3) >> 2;
|
||||
vifX.tag.size = ((n * gsize) + 3) >> 2;
|
||||
}
|
||||
|
||||
u32 addr = vifXRegs.code;
|
||||
|
|
|
@ -15,38 +15,28 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
typedef void (__fastcall *UNPACKFUNCTYPE)(u32 *dest, const u32 *data);
|
||||
typedef int (*UNPACKPARTFUNCTYPESSE)(u32 *dest, const u32 *data, int size);
|
||||
struct vifStruct;
|
||||
|
||||
#define create_unpack_u_type(bits) typedef void (__fastcall *UNPACKFUNCTYPE_U##bits)(u32 *dest, const u##bits *data);
|
||||
#define create_unpack_odd_u_type(bits) typedef void (__fastcall *UNPACKFUNCTYPE_ODD_U##bits)(u32 *dest, const u##bits *data, int size);
|
||||
#define create_unpack_s_type(bits) typedef void (__fastcall *UNPACKFUNCTYPE_S##bits)(u32 *dest, const s##bits *data);
|
||||
#define create_unpack_odd_s_type(bits) typedef void (__fastcall *UNPACKFUNCTYPE_ODD_S##bits)(u32 *dest, const s##bits *data, int size);
|
||||
typedef void (__fastcall *UNPACKFUNCTYPE)(void* dest, const void* src);
|
||||
|
||||
#define create_unpack_u_type(bits) typedef void (__fastcall *UNPACKFUNCTYPE_u##bits)(u32* dest);
|
||||
#define create_unpack_s_type(bits) typedef void (__fastcall *UNPACKFUNCTYPE_s##bits)(u32* dest);
|
||||
|
||||
#define create_some_unpacks(bits) \
|
||||
create_unpack_u_type(bits); \
|
||||
create_unpack_odd_u_type(bits); \
|
||||
create_unpack_s_type(bits); \
|
||||
create_unpack_odd_s_type(bits);
|
||||
|
||||
create_some_unpacks(32);
|
||||
create_some_unpacks(16);
|
||||
create_some_unpacks(8);
|
||||
|
||||
struct VIFUnpackFuncTable
|
||||
{
|
||||
UNPACKFUNCTYPE funcU;
|
||||
UNPACKFUNCTYPE funcS;
|
||||
extern __aligned16 const u8 nVifT[16];
|
||||
|
||||
u8 gsize; // size of data in bytes used for each write cycle
|
||||
u8 qsize; // used for unpack parts, num of vectors that
|
||||
// will be decompressed from data for 1 cycle
|
||||
};
|
||||
// Array sub-dimension order: [vifidx] [mode] (VN * VL * USN * doMask)
|
||||
extern __aligned16 const UNPACKFUNCTYPE VIFfuncTable[2][3][(4 * 4 * 2 * 2)];
|
||||
|
||||
extern const __aligned16 VIFUnpackFuncTable VIFfuncTable[32];
|
||||
|
||||
extern int nVifUnpack (int idx, const u8 *data);
|
||||
_vifT extern int nVifUnpack (const u8* data);
|
||||
extern void resetNewVif(int idx);
|
||||
|
||||
template< int idx >
|
||||
extern void vifUnpackSetup(const u32 *data);
|
||||
extern void vifUnpackSetup(const u32* data);
|
||||
|
|
|
@ -32,12 +32,13 @@ typedef void (__fastcall *nVifrecCall)(uptr dest, uptr src);
|
|||
#include "newVif_HashBucket.h"
|
||||
|
||||
extern void mVUmergeRegs(const xRegisterSSE& dest, const xRegisterSSE& src, int xyzw, bool modXYZW = 0);
|
||||
extern void _nVifUnpack (int idx, const u8 *data, u32 size, bool isFill);
|
||||
extern void dVifUnpack (int idx, const u8 *data, u32 size, bool isFill);
|
||||
extern void _nVifUnpack (int idx, const u8* data, uint mode, bool isFill);
|
||||
extern void dVifReset (int idx);
|
||||
extern void dVifClose (int idx);
|
||||
extern void VifUnpackSSE_Init();
|
||||
|
||||
_vifT extern void dVifUnpack (const u8* data, bool isFill);
|
||||
|
||||
#define VUFT VIFUnpackFuncTable
|
||||
#define _v0 0
|
||||
#define _v1 0x55
|
||||
|
@ -62,9 +63,9 @@ struct __aligned16 nVifBlock {
|
|||
u8 num; // [00] Num Field
|
||||
u8 upkType; // [01] Unpack Type [usn*1:mask*1:upk*4]
|
||||
u8 mode; // [02] Mode Field
|
||||
u8 scl; // [03] Start Cycle
|
||||
u8 cl; // [04] CL Field
|
||||
u8 wl; // [05] WL Field
|
||||
u8 scl; // [03] Start Cycle
|
||||
u32 mask; // [06] Mask Field
|
||||
u8 padding[2];// [10] through [11]
|
||||
uptr startPtr; // [12] Start Ptr of RecGen Code
|
||||
|
@ -78,14 +79,14 @@ struct __aligned16 nVifBlock {
|
|||
#define _cmpS (sizeof(nVifBlock) - (4))
|
||||
#define _tParams nVifBlock, _hSize, _cmpS
|
||||
struct nVifStruct {
|
||||
u32 idx; // VIF0 or VIF1
|
||||
vifStruct* vif; // Vif Struct ptr
|
||||
VIFregisters* vifRegs; // Vif Regs ptr
|
||||
VURegs* VU; // VU Regs ptr
|
||||
u32 vuMemLimit; // Use for fast AND
|
||||
|
||||
// Buffer for partial transfers (should always be first to ensure alignment)
|
||||
// Maximum buffer size is 256 (vifRegs.Num max range) * 16 (quadword)
|
||||
__aligned16 u8 buffer[256*16];
|
||||
u32 bSize; // Size of 'buffer'
|
||||
u32 bPtr;
|
||||
u8 buffer[_1mb]; // Buffer for partial transfers
|
||||
|
||||
u32 idx; // VIF0 or VIF1
|
||||
u8* recPtr; // Cur Pos to recompile to
|
||||
u8* recEnd; // 'Safe' End of Rec Cache
|
||||
BlockBuffer* vifCache; // Block Buffer
|
||||
|
@ -103,7 +104,6 @@ struct nVifStruct {
|
|||
};
|
||||
|
||||
extern __aligned16 nVifStruct nVif[2];
|
||||
extern __aligned16 const u8 nVifT[16];
|
||||
extern __aligned16 nVifCall nVifUpk[(2*2*16)*4]; // ([USN][Masking][Unpack Type]) [curCycle]
|
||||
extern __aligned16 u32 nVifMask[3][4][4]; // [MaskNumber][CycleNumber][Vector]
|
||||
|
||||
|
|
|
@ -58,6 +58,7 @@ VifUnpackSSE_Dynarec::VifUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlo
|
|||
usn = (vB.upkType>>5) & 1;
|
||||
doMask = (vB.upkType>>4) & 1;
|
||||
doMode = vB.mode & 3;
|
||||
vCL = 0;
|
||||
}
|
||||
|
||||
#define makeMergeMask(x) { \
|
||||
|
@ -65,15 +66,15 @@ VifUnpackSSE_Dynarec::VifUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlo
|
|||
}
|
||||
|
||||
__fi void VifUnpackSSE_Dynarec::SetMasks(int cS) const {
|
||||
const vifStruct& vif = v.idx ? vif1 : vif0;
|
||||
|
||||
u32 m0 = vB.mask;
|
||||
u32 m1 = m0 & 0xaaaaaaaa;
|
||||
u32 m2 =(~m1>>1) & m0;
|
||||
u32 m3 = (m1>>1) & ~m0;
|
||||
u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0;
|
||||
u32* col = (v.idx) ? g_vifmask.Col1 : g_vifmask.Col0;
|
||||
if((m2&&(doMask||isFill))||doMode) { xMOVAPS(xmmRow, ptr32[row]); }
|
||||
if((m2&&(doMask||isFill))||doMode) { xMOVAPS(xmmRow, ptr128[&vif.MaskRow]); }
|
||||
if (m3&&(doMask||isFill)) {
|
||||
xMOVAPS(xmmCol0, ptr32[col]);
|
||||
xMOVAPS(xmmCol0, ptr128[&vif.MaskCol]);
|
||||
if ((cS>=2) && (m3&0x0000ff00)) xPSHUF.D(xmmCol1, xmmCol0, _v1);
|
||||
if ((cS>=3) && (m3&0x00ff0000)) xPSHUF.D(xmmCol2, xmmCol0, _v2);
|
||||
if ((cS>=4) && (m3&0xff000000)) xPSHUF.D(xmmCol3, xmmCol0, _v3);
|
||||
|
@ -95,8 +96,8 @@ void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const {
|
|||
makeMergeMask(m3);
|
||||
makeMergeMask(m4);
|
||||
if (doMask&&m4) { xMOVAPS(xmmTemp, ptr[dstIndirect]); } // Load Write Protect
|
||||
if (doMask&&m2) { mergeVectors(regX, xmmRow, t, m2); } // Merge Row
|
||||
if (doMask&&m3) { mergeVectors(regX, xRegisterSSE(xmmCol0.Id+cc), t, m3); } // Merge Col
|
||||
if (doMask&&m2) { mergeVectors(regX, xmmRow, t, m2); } // Merge MaskRow
|
||||
if (doMask&&m3) { mergeVectors(regX, xRegisterSSE(xmmCol0.Id+cc), t, m3); } // Merge MaskCol
|
||||
if (doMask&&m4) { mergeVectors(regX, xmmTemp, t, m4); } // Merge Write Protect
|
||||
if (doMode) {
|
||||
u32 m5 = (~m1>>1) & ~m0;
|
||||
|
@ -117,8 +118,7 @@ void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const {
|
|||
}
|
||||
|
||||
void VifUnpackSSE_Dynarec::writeBackRow() const {
|
||||
u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0;
|
||||
xMOVAPS(ptr32[row], xmmRow);
|
||||
xMOVAPS(ptr128[&((v.idx ? vif1 : vif0).MaskRow)], xmmRow);
|
||||
DevCon.WriteLn("nVif: writing back row reg! [doMode = 2]");
|
||||
// ToDo: Do we need to write back to vifregs.rX too!? :/
|
||||
}
|
||||
|
@ -138,16 +138,17 @@ static void ShiftDisplacementWindow( xAddressVoid& addr, const xRegister32& modR
|
|||
if(addImm) xADD(modReg, addImm);
|
||||
}
|
||||
|
||||
void VifUnpackSSE_Dynarec::CompileRoutine() {
|
||||
const int upkNum = v.vif->cmd & 0xf;
|
||||
void VifUnpackSSE_Dynarec::CompileRoutine(vifStruct& vif) {
|
||||
const int upkNum = vB.upkType & 0xf;
|
||||
const u8& vift = nVifT[upkNum];
|
||||
const int cycleSize = isFill ? vB.cl : vB.wl;
|
||||
const int blockSize = isFill ? vB.wl : vB.cl;
|
||||
const int skipSize = blockSize - cycleSize;
|
||||
|
||||
int vNum = v.vifRegs->num;
|
||||
vCL = v.vif->cl;
|
||||
doMode = upkNum == 0xf ? 0 : doMode;
|
||||
uint vNum = vB.num;
|
||||
doMode = (upkNum == 0xf) ? 0 : doMode; // V4_5 has no mode feature.
|
||||
|
||||
pxAssume(vCL == 0);
|
||||
|
||||
// Value passed determines # of col regs we need to load
|
||||
SetMasks(isFill ? blockSize : cycleSize);
|
||||
|
@ -189,14 +190,17 @@ void VifUnpackSSE_Dynarec::CompileRoutine() {
|
|||
}
|
||||
|
||||
if (doMode==2) writeBackRow();
|
||||
xMOV(ptr32[&v.vif->cl], vCL);
|
||||
xMOV(ptr32[&v.vifRegs->num], vNum);
|
||||
xRET();
|
||||
}
|
||||
|
||||
static __noinline u8* dVifsetVUptr(const nVifStruct& v, int cl, int wl, bool isFill) {
|
||||
u8* startmem = v.VU->Mem + (v.vif->tag.addr & v.vuMemLimit);
|
||||
u8* endmem = v.VU->Mem + (v.vuMemLimit+0x10);
|
||||
_vifT static __fi u8* dVifsetVUptr(uint cl, uint wl, bool isFill) {
|
||||
vifStruct& vif = GetVifX;
|
||||
VIFregisters& vifRegs = vifXRegs;
|
||||
const VURegs& VU = vuRegs[idx];
|
||||
const uint vuMemLimit = idx ? 0x4000 : 0x1000;
|
||||
|
||||
u8* startmem = VU.Mem + (vif.tag.addr & (vuMemLimit-0x10));
|
||||
u8* endmem = VU.Mem + vuMemLimit;
|
||||
uint length = _vBlock.num * 16;
|
||||
|
||||
if (!isFill) {
|
||||
|
@ -204,15 +208,15 @@ static __noinline u8* dVifsetVUptr(const nVifStruct& v, int cl, int wl, bool isF
|
|||
// shouldn't count as wrapped data. Otherwise, a trailing skip can cause the emu to drop back
|
||||
// to the interpreter. -- Refraction (test with MGS3)
|
||||
|
||||
int skipSize = (cl - wl) * 16;
|
||||
int blocks = _vBlock.num / wl;
|
||||
uint skipSize = (cl - wl) * 16;
|
||||
uint blocks = _vBlock.num / wl;
|
||||
length += (blocks-1) * skipSize;
|
||||
}
|
||||
|
||||
if ( (startmem+length) <= endmem ) {
|
||||
return startmem;
|
||||
}
|
||||
//Console.WriteLn("nVif%x - VU Mem Ptr Overflow; falling back to interpreter. Start = %x End = %x num = %x, wl = %x, cl = %x", v.idx, v.vif->tag.addr, v.vif->tag.addr + (_vBlock.num * 16), _vBlock.num, wl, cl);
|
||||
//Console.WriteLn("nVif%x - VU Mem Ptr Overflow; falling back to interpreter. Start = %x End = %x num = %x, wl = %x, cl = %x", v.idx, vif.tag.addr, vif.tag.addr + (_vBlock.num * 16), _vBlock.num, wl, cl);
|
||||
return NULL; // Fall Back to Interpreters which have wrap-around logic
|
||||
}
|
||||
|
||||
|
@ -227,50 +231,57 @@ static __fi void dVifRecLimit(int idx) {
|
|||
}
|
||||
}
|
||||
|
||||
// Gcc complains about recursive functions being inlined.
|
||||
void dVifUnpack(int idx, const u8 *data, u32 size, bool isFill) {
|
||||
|
||||
_vifT static __fi bool dVifExecuteUnpack(const u8* data, bool isFill)
|
||||
{
|
||||
const nVifStruct& v = nVif[idx];
|
||||
const u8 upkType = v.vif->cmd & 0x1f | ((!!v.vif->usn) << 5);
|
||||
const int doMask = v.vif->cmd & 0x10;
|
||||
const int cycle_cl = v.vifRegs->cycle.cl;
|
||||
const int cycle_wl = v.vifRegs->cycle.wl;
|
||||
const int blockSize = isFill ? cycle_wl : cycle_cl;
|
||||
|
||||
if (v.vif->cl >= blockSize) v.vif->cl = 0;
|
||||
|
||||
_vBlock.upkType = upkType;
|
||||
_vBlock.num = (u8&)v.vifRegs->num;
|
||||
_vBlock.mode = (u8&)v.vifRegs->mode;
|
||||
_vBlock.scl = v.vif->cl;
|
||||
_vBlock.cl = cycle_cl;
|
||||
_vBlock.wl = cycle_wl;
|
||||
|
||||
// Zero out the mask parameter if it's unused -- games leave random junk
|
||||
// values here which cause false recblock cache misses.
|
||||
_vBlock.mask = doMask ? v.vifRegs->mask : 0;
|
||||
VIFregisters& vifRegs = vifXRegs;
|
||||
|
||||
if (nVifBlock* b = v.vifBlocks->find(&_vBlock)) {
|
||||
if (u8* dest = dVifsetVUptr(v, cycle_cl, cycle_wl, isFill)) {
|
||||
if (u8* dest = dVifsetVUptr<idx>(vifRegs.cycle.cl, vifRegs.cycle.wl, isFill)) {
|
||||
//DevCon.WriteLn("Running Recompiled Block!");
|
||||
((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data);
|
||||
}
|
||||
else {
|
||||
//DevCon.WriteLn("Running Interpreter Block");
|
||||
_nVifUnpack(idx, data, size, isFill);
|
||||
_nVifUnpack(idx, data, vifRegs.mode, isFill);
|
||||
}
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
_vifT __fi void dVifUnpack(const u8* data, bool isFill) {
|
||||
|
||||
const nVifStruct& v = nVif[idx];
|
||||
vifStruct& vif = GetVifX;
|
||||
VIFregisters& vifRegs = vifXRegs;
|
||||
|
||||
const u8 upkType = vif.cmd & 0x1f | ((!!vif.usn) << 5);
|
||||
const int doMask = vif.cmd & 0x10;
|
||||
|
||||
_vBlock.upkType = upkType;
|
||||
_vBlock.num = (u8&)vifRegs.num;
|
||||
_vBlock.mode = (u8&)vifRegs.mode;
|
||||
//_vBlock.scl = vif.cl; // scl is always zero now (effectively padding)
|
||||
_vBlock.cl = vifRegs.cycle.cl;
|
||||
_vBlock.wl = vifRegs.cycle.wl;
|
||||
|
||||
// Zero out the mask parameter if it's unused -- games leave random junk
|
||||
// values here which cause false recblock cache misses.
|
||||
_vBlock.mask = doMask ? vifRegs.mask : 0;
|
||||
|
||||
//DevCon.WriteLn("nVif%d: Recompiled Block! [%d]", idx, nVif[idx].numBlocks++);
|
||||
//DevCon.WriteLn(L"[num=% 3d][upkType=0x%02x][scl=%d][cl=%d][wl=%d][mode=%d][m=%d][mask=%s]",
|
||||
// _vBlock.num, _vBlock.upkType, _vBlock.scl, _vBlock.cl, _vBlock.wl, _vBlock.mode,
|
||||
// doMask >> 4, doMask ? wxsFormat( L"0x%08x", _vBlock.mask ).c_str() : L"ignored"
|
||||
//);
|
||||
|
||||
if (dVifExecuteUnpack<idx>(data, isFill)) return;
|
||||
|
||||
xSetPtr(v.recPtr);
|
||||
_vBlock.startPtr = (uptr)xGetAlignedCallTarget();
|
||||
v.vifBlocks->add(_vBlock);
|
||||
VifUnpackSSE_Dynarec( v, _vBlock ).CompileRoutine();
|
||||
VifUnpackSSE_Dynarec( v, _vBlock ).CompileRoutine(vif);
|
||||
nVif[idx].recPtr = xGetPtr();
|
||||
|
||||
// [TODO] : Ideally we should test recompile buffer limits prior to each instruction,
|
||||
|
@ -279,5 +290,9 @@ void dVifUnpack(int idx, const u8 *data, u32 size, bool isFill) {
|
|||
|
||||
// Run the block we just compiled. Various conditions may force us to still use
|
||||
// the interpreter unpacker though, so a recursive call is the safest way here...
|
||||
dVifUnpack(idx, data, size, isFill);
|
||||
//dVifUnpack<idx,isFill>(data);
|
||||
dVifExecuteUnpack<idx>(data, isFill);
|
||||
}
|
||||
|
||||
template void dVifUnpack<0>(const u8* data, bool isFill);
|
||||
template void dVifUnpack<1>(const u8* data, bool isFill);
|
||||
|
|
|
@ -23,9 +23,18 @@
|
|||
#include "newVif.h"
|
||||
|
||||
__aligned16 nVifStruct nVif[2];
|
||||
__aligned16 nVifCall nVifUpk[(2*2*16) *4]; // ([USN][Masking][Unpack Type]) [curCycle]
|
||||
__aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][Vector]
|
||||
|
||||
// Interpreter-style SSE unpacks. Array layout matches the interpreter C unpacks.
|
||||
// ([USN][Masking][Unpack Type]) [curCycle]
|
||||
__aligned16 nVifCall nVifUpk[(2*2*16) *4];
|
||||
|
||||
// This is used by the interpreted SSE unpacks only. Recompiled SSE unpacks
|
||||
// and the interpreted C unpacks use the vif.MaskRow/MaskCol members directly.
|
||||
// [MaskNumber][CycleNumber][Vector]
|
||||
__aligned16 u32 nVifMask[3][4][4] = {0};
|
||||
|
||||
// Number of bytes of data in the source stream needed for each vector.
|
||||
// [equivalent to ((32 >> VL) * (VN+1)) / 8]
|
||||
__aligned16 const u8 nVifT[16] = {
|
||||
4, // S-32
|
||||
2, // S-16
|
||||
|
@ -47,9 +56,9 @@ __aligned16 const u8 nVifT[16] = {
|
|||
|
||||
// ----------------------------------------------------------------------------
|
||||
template< int idx, bool doMode, bool isFill >
|
||||
__ri void __fastcall _nVifUnpackLoop(const u8 *data, u32 size);
|
||||
__ri void __fastcall _nVifUnpackLoop(const u8* data);
|
||||
|
||||
typedef void __fastcall FnType_VifUnpackLoop(const u8 *data, u32 size);
|
||||
typedef void __fastcall FnType_VifUnpackLoop(const u8* data);
|
||||
typedef FnType_VifUnpackLoop* Fnptr_VifUnpackLoop;
|
||||
|
||||
// Unpacks Until 'Num' is 0
|
||||
|
@ -67,10 +76,6 @@ void resetNewVif(int idx)
|
|||
// changed for some reason.
|
||||
|
||||
nVif[idx].idx = idx;
|
||||
nVif[idx].VU = idx ? &VU1 : &VU0;
|
||||
nVif[idx].vuMemLimit = idx ? 0x3ff0 : 0xff0;
|
||||
nVif[idx].vif = &GetVifX;
|
||||
nVif[idx].vifRegs = &vifXRegs;
|
||||
nVif[idx].bSize = 0;
|
||||
memzero(nVif[idx].buffer);
|
||||
|
||||
|
@ -81,65 +86,75 @@ void closeNewVif(int idx) {
|
|||
if (newVifDynaRec) dVifClose(idx);
|
||||
}
|
||||
|
||||
static __fi u8* setVUptr(int vuidx, const u8* vuMemBase, int offset) {
|
||||
return (u8*)(vuMemBase + ( offset & (vuidx ? 0x3ff0 : 0xff0) ));
|
||||
static __fi u8* getVUptr(uint idx, int offset) {
|
||||
return (u8*)(vuRegs[idx].Mem + ( offset & (idx ? 0x3ff0 : 0xff0) ));
|
||||
}
|
||||
|
||||
static __fi void incVUptr(int vuidx, u8* &ptr, const u8* vuMemBase, int amount) {
|
||||
pxAssume( ((uptr)ptr & 0xf) == 0 ); // alignment check
|
||||
ptr += amount;
|
||||
vif->tag.addr += amount;
|
||||
int diff = ptr - (vuMemBase + (vuidx ? 0x4000 : 0x1000));
|
||||
if (diff >= 0) {
|
||||
ptr = (u8*)(vuMemBase + diff);
|
||||
}
|
||||
}
|
||||
|
||||
static __fi void incVUptrBy16(int vuidx, u8* &ptr, const u8* vuMemBase) {
|
||||
pxAssume( ((uptr)ptr & 0xf) == 0 ); // alignment check
|
||||
ptr += 16;
|
||||
vif->tag.addr += 16;
|
||||
if( ptr == (vuMemBase + (vuidx ? 0x4000 : 0x1000)) ) {
|
||||
ptr -= (vuidx ? 0x4000 : 0x1000);
|
||||
}
|
||||
}
|
||||
|
||||
int nVifUnpack(int idx, const u8* data) {
|
||||
_vifT int nVifUnpack(const u8* data) {
|
||||
nVifStruct& v = nVif[idx];
|
||||
vif = v.vif;
|
||||
vifRegs = v.vifRegs;
|
||||
vifStruct& vif = GetVifX;
|
||||
VIFregisters& vifRegs = vifXRegs;
|
||||
|
||||
const int ret = aMin(vif->vifpacketsize, vif->tag.size);
|
||||
const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
|
||||
const uint ret = aMin(vif.vifpacketsize, vif.tag.size);
|
||||
const bool isFill = (vifRegs.cycle.cl < vifRegs.cycle.wl);
|
||||
s32 size = ret << 2;
|
||||
|
||||
if (ret == v.vif->tag.size) { // Full Transfer
|
||||
if (ret == vif.tag.size) { // Full Transfer
|
||||
if (v.bSize) { // Last transfer was partial
|
||||
memcpy_fast(&v.buffer[v.bSize], data, size);
|
||||
v.bSize += size;
|
||||
data = v.buffer;
|
||||
size = v.bSize;
|
||||
|
||||
vifRegs.num = (vifXRegs.code >> 16) & 0xff; // grab NUM form the original VIFcode input.
|
||||
}
|
||||
if (size > 0 || isFill) {
|
||||
if (newVifDynaRec) dVifUnpack(idx, data, size, isFill);
|
||||
else _nVifUnpack(idx, data, size, isFill);
|
||||
}
|
||||
vif->tag.size = 0;
|
||||
vif->cmd = 0;
|
||||
|
||||
vif.cl = 0;
|
||||
|
||||
if (newVifDynaRec) dVifUnpack<idx>(data, isFill);
|
||||
else _nVifUnpack(idx, data, vifRegs.mode, isFill);
|
||||
|
||||
vif.tag.size = 0;
|
||||
vif.cmd = 0;
|
||||
vifRegs.num = 0;
|
||||
v.bSize = 0;
|
||||
}
|
||||
else { // Partial Transfer
|
||||
memcpy_fast(&v.buffer[v.bSize], data, size);
|
||||
v.bSize += size;
|
||||
vif->tag.size -= ret;
|
||||
vif.tag.size -= ret;
|
||||
|
||||
const u8& vSize = nVifT[vif.cmd & 0x0f];
|
||||
|
||||
// We need to provide accurate accounting of the NUM register, in case games decide
|
||||
// to read back from it mid-transfer. Since so few games actually use partial transfers
|
||||
// of VIF unpacks, this code should not be any bottleneck.
|
||||
|
||||
while (size >= vSize) {
|
||||
--vifRegs.num;
|
||||
++vif.cl;
|
||||
|
||||
if (isFill) {
|
||||
if (vif.cl < vifRegs.cycle.cl) size -= vSize;
|
||||
else if (vif.cl == vifRegs.cycle.wl) vif.cl = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
size -= vSize;
|
||||
if (vif.cl >= vifRegs.cycle.wl) vif.cl = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void setMasks(int idx, const VIFregisters& v) {
|
||||
u32* row = idx ? g_vifmask.Row1 : g_vifmask.Row0;
|
||||
u32* col = idx ? g_vifmask.Col1 : g_vifmask.Col0;
|
||||
template int nVifUnpack<0>(const u8* data);
|
||||
template int nVifUnpack<1>(const u8* data);
|
||||
|
||||
// This is used by the interpreted SSE unpacks only. Recompiled SSE unpacks
|
||||
// and the interpreted C unpacks use the vif.MaskRow/MaskCol members directly.
|
||||
static void setMasks(const vifStruct& vif, const VIFregisters& v) {
|
||||
for (int i = 0; i < 16; i++) {
|
||||
int m = (v.mask >> (i*2)) & 3;
|
||||
switch (m) {
|
||||
|
@ -148,15 +163,15 @@ static void setMasks(int idx, const VIFregisters& v) {
|
|||
nVifMask[1][i/4][i%4] = 0;
|
||||
nVifMask[2][i/4][i%4] = 0;
|
||||
break;
|
||||
case 1: // Row
|
||||
case 1: // MaskRow
|
||||
nVifMask[0][i/4][i%4] = 0;
|
||||
nVifMask[1][i/4][i%4] = 0;
|
||||
nVifMask[2][i/4][i%4] = newVifDynaRec ? row[i%4] : ((u32*)&v.r0)[(i%4)*4];
|
||||
nVifMask[2][i/4][i%4] = vif.MaskRow._u32[i%4];
|
||||
break;
|
||||
case 2: // Col
|
||||
case 2: // MaskCol
|
||||
nVifMask[0][i/4][i%4] = 0;
|
||||
nVifMask[1][i/4][i%4] = 0;
|
||||
nVifMask[2][i/4][i%4] = newVifDynaRec ? col[i/4] : ((u32*)&v.c0)[(i/4)*4];
|
||||
nVifMask[2][i/4][i%4] = vif.MaskCol._u32[i/4];
|
||||
break;
|
||||
case 3: // Write Protect
|
||||
nVifMask[0][i/4][i%4] = 0;
|
||||
|
@ -175,74 +190,81 @@ static void setMasks(int idx, const VIFregisters& v) {
|
|||
// a "win" to move code outside the loop, like normally in most other loop scenarios.
|
||||
//
|
||||
// The biggest bottleneck of the current code is the call/ret needed to invoke the SSE
|
||||
// unpackers. A better option is to generate the entire vifRegs->num loop code as part
|
||||
// unpackers. A better option is to generate the entire vifRegs.num loop code as part
|
||||
// of the SSE template, and inline the SSE code into the heart of it. This both avoids
|
||||
// the call/ret and opens the door for resolving some register dependency chains in the
|
||||
// current emitted functions. (this is what zero's SSE does to get its final bit of
|
||||
// speed advantage over the new vif). --air
|
||||
//
|
||||
// As a secondary optimization to above, special handlers could be generated for the
|
||||
// cycleSize==1 case, which is used frequently enough, and results in enough code
|
||||
// elimination that it would probably be a win in most cases (and for sure in many
|
||||
// "slow" games that need it most). --air
|
||||
// The BEST optimization strategy here is to use data available to us from the UNPACK dispatch
|
||||
// -- namely the unpack type and mask flag -- in combination with mode and usn values -- to
|
||||
// generate ~600 special versions of this function. But since it's an interpreter, who gives
|
||||
// a crap? Really? :p
|
||||
//
|
||||
|
||||
// size - size of the packet fragment incoming from DMAC.
|
||||
template< int idx, bool doMode, bool isFill >
|
||||
__ri void __fastcall _nVifUnpackLoop(const u8 *data, u32 size) {
|
||||
__ri void __fastcall _nVifUnpackLoop(const u8* data) {
|
||||
|
||||
const int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl;
|
||||
const int blockSize = isFill ? vifRegs->cycle.wl : vifRegs->cycle.cl;
|
||||
const int skipSize = blockSize - cycleSize;
|
||||
//DevCon.WriteLn("[%d][%d][%d][num=%d][upk=%d][cl=%d][bl=%d][skip=%d]", isFill, doMask, doMode, vifRegs->num, upkNum, vif->cl, blockSize, skipSize);
|
||||
vifStruct& vif = GetVifX;
|
||||
VIFregisters& vifRegs = vifXRegs;
|
||||
|
||||
if (vif->cmd & 0x10) setMasks(idx, *vifRegs);
|
||||
// skipSize used for skipping writes only
|
||||
const int skipSize = (vifRegs.cycle.cl - vifRegs.cycle.wl) * 16;
|
||||
|
||||
const int usn = !!(vif->usn);
|
||||
const int upkNum = vif->cmd & 0x1f;
|
||||
//const s8& vift = nVifT[upkNum]; // might be useful later when other SSE paths are finished.
|
||||
//DevCon.WriteLn("[%d][%d][%d][num=%d][upk=%d][cl=%d][bl=%d][skip=%d]", isFill, doMask, doMode, vifRegs.num, upkNum, vif.cl, blockSize, skipSize);
|
||||
|
||||
if (!doMode && (vif.cmd & 0x10)) setMasks(vif, vifRegs);
|
||||
|
||||
const int usn = !!vif.usn;
|
||||
const int upkNum = vif.cmd & 0x1f;
|
||||
const u8& vSize = nVifT[upkNum & 0x0f];
|
||||
//uint vl = vif.cmd & 0x03;
|
||||
//uint vn = (vif.cmd >> 2) & 0x3;
|
||||
//uint vSize = ((32 >> vl) * (vn+1)) / 8; // size of data (in bytes) used for each write cycle
|
||||
|
||||
const nVifCall* fnbase = &nVifUpk[ ((usn*2*16) + upkNum) * (4*1) ];
|
||||
const VIFUnpackFuncTable& ft = VIFfuncTable[upkNum];
|
||||
UNPACKFUNCTYPE func = usn ? ft.funcU : ft.funcS;
|
||||
const UNPACKFUNCTYPE ft = VIFfuncTable[idx][doMode ? vifRegs.mode : 0][ ((usn*2*16) + upkNum) ];
|
||||
|
||||
const u8* vuMemBase = (idx ? VU1 : VU0).Mem;
|
||||
u8* dest = setVUptr(idx, vuMemBase, vif->tag.addr);
|
||||
if (vif->cl >= blockSize) vif->cl = 0;
|
||||
pxAssume (vif.cl == 0);
|
||||
pxAssume (vifRegs.cycle.wl > 0);
|
||||
|
||||
do {
|
||||
u8* dest = getVUptr(idx, vif.tag.addr);
|
||||
|
||||
while (vifRegs->num) {
|
||||
if (vif->cl < cycleSize) {
|
||||
// This should always be true as per the _1mb buffer used to merge partial transfers.
|
||||
pxAssume (size >= ft.gsize);
|
||||
if (doMode) {
|
||||
//DevCon.WriteLn("Non SSE; unpackNum = %d", upkNum);
|
||||
func((u32*)dest, (u32*)data);
|
||||
//if (1) {
|
||||
ft(dest, data);
|
||||
}
|
||||
else {
|
||||
//DevCon.WriteLn("SSE Unpack!");
|
||||
fnbase[aMin(vif->cl, 3)](dest, data);
|
||||
uint cl3 = aMin(vif.cl,3);
|
||||
fnbase[cl3](dest, data);
|
||||
}
|
||||
data += ft.gsize;
|
||||
size -= ft.gsize;
|
||||
vifRegs->num--;
|
||||
incVUptrBy16(idx, dest, vuMemBase);
|
||||
if (++vif->cl == blockSize) vif->cl = 0;
|
||||
}
|
||||
else if (isFill) {
|
||||
|
||||
vif.tag.addr += 16;
|
||||
--vifRegs.num;
|
||||
++vif.cl;
|
||||
|
||||
if (isFill) {
|
||||
//DevCon.WriteLn("isFill!");
|
||||
func((u32*)dest, (u32*)data);
|
||||
vifRegs->num--;
|
||||
incVUptrBy16(idx, dest, vuMemBase);
|
||||
if (++vif->cl == blockSize) vif->cl = 0;
|
||||
if (vif.cl < vifRegs.cycle.cl) data += vSize;
|
||||
else if (vif.cl == vifRegs.cycle.wl) vif.cl = 0;
|
||||
}
|
||||
else {
|
||||
incVUptr(idx, dest, vuMemBase, 16 * skipSize);
|
||||
vif->cl = 0;
|
||||
else
|
||||
{
|
||||
data += vSize;
|
||||
|
||||
if (vif.cl >= vifRegs.cycle.wl) {
|
||||
vif.tag.addr += skipSize;
|
||||
vif.cl = 0;
|
||||
}
|
||||
}
|
||||
} while (vifRegs.num);
|
||||
}
|
||||
|
||||
__fi void _nVifUnpack(int idx, const u8 *data, u32 size, bool isFill) {
|
||||
__fi void _nVifUnpack(int idx, const u8* data, uint mode, bool isFill) {
|
||||
|
||||
const bool doMode = !!vifRegs->mode;
|
||||
UnpackLoopTable[idx][doMode][isFill]( data, size );
|
||||
UnpackLoopTable[idx][!!mode][isFill]( data );
|
||||
}
|
||||
|
||||
|
|
|
@ -36,31 +36,6 @@ void mergeVectors(xRegisterSSE dest, xRegisterSSE src, xRegisterSSE temp, int xy
|
|||
}
|
||||
}
|
||||
|
||||
// Loads Row/Col Data from vifRegs instead of g_vifmask
|
||||
// Useful for testing vifReg and g_vifmask inconsistency.
|
||||
void loadRowCol(nVifStruct& v) {
|
||||
xMOVAPS(xmm0, ptr32[&v.vifRegs->r0]);
|
||||
xMOVAPS(xmm1, ptr32[&v.vifRegs->r1]);
|
||||
xMOVAPS(xmm2, ptr32[&v.vifRegs->r2]);
|
||||
xMOVAPS(xmm6, ptr32[&v.vifRegs->r3]);
|
||||
|
||||
xPSHUF.D(xmm0, xmm0, _v0);
|
||||
xPSHUF.D(xmm1, xmm1, _v0);
|
||||
xPSHUF.D(xmm2, xmm2, _v0);
|
||||
xPSHUF.D(xmm6, xmm6, _v0);
|
||||
mVUmergeRegs(xmm6, xmm0, 8);
|
||||
mVUmergeRegs(xmm6, xmm1, 4);
|
||||
mVUmergeRegs(xmm6, xmm2, 2);
|
||||
xMOVAPS(xmm2, ptr32[&v.vifRegs->c0]);
|
||||
xMOVAPS(xmm3, ptr32[&v.vifRegs->c1]);
|
||||
xMOVAPS(xmm4, ptr32[&v.vifRegs->c2]);
|
||||
xMOVAPS(xmm5, ptr32[&v.vifRegs->c3]);
|
||||
xPSHUF.D(xmm2, xmm2, _v0);
|
||||
xPSHUF.D(xmm3, xmm3, _v0);
|
||||
xPSHUF.D(xmm4, xmm4, _v0);
|
||||
xPSHUF.D(xmm5, xmm5, _v0);
|
||||
}
|
||||
|
||||
// =====================================================================================================
|
||||
// VifUnpackSSE_Base Section
|
||||
// =====================================================================================================
|
||||
|
|
|
@ -25,7 +25,6 @@
|
|||
using namespace x86Emitter;
|
||||
|
||||
extern void mergeVectors(xRegisterSSE dest, xRegisterSSE src, xRegisterSSE temp, int xyzw);
|
||||
extern void loadRowCol(nVifStruct& v);
|
||||
|
||||
// --------------------------------------------------------------------------------------
|
||||
// VifUnpackSSE_Base
|
||||
|
@ -127,7 +126,7 @@ public:
|
|||
|
||||
virtual bool IsUnmaskedOp() const{ return !doMode && !doMask; }
|
||||
|
||||
void CompileRoutine();
|
||||
void CompileRoutine(vifStruct& vif);
|
||||
|
||||
protected:
|
||||
virtual void doMaskWrite(const xRegisterSSE& regX) const;
|
||||
|
|
Loading…
Reference in New Issue