newVif: I'm a terrible person. What started out as an optimization turned into this.

* Optimized codegen of the VPU recompiler using displaced memory offsets (1-2% speedup; a sketch of the idea follows this list)
* Undid a lot of the .inl stuff in favor of a more traditional .cpp code layout (explained below)
* Removed some redundant code and turned some macros into functions.
* Renamed a few things to VPU (Vector Processing Unit, which is the specific name of the logic core that performs VIF command processing and unpacking on the PS2)
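
For context on the first bullet: x86 can encode an indirect displacement in a single byte only while it stays within [-128, 127]; larger offsets cost four bytes on every instruction. A minimal sketch of the trick, assuming an unpack loop that writes 16 bytes per vector (the committed implementation is ShiftDisplacementWindow in VpuUnpackSSE_Dynarec.cpp below):

    ; without the window shift, displacements grow past 0x7f:
    movaps [ecx+0x100], xmm0    ; needs a 4-byte displacement
    ; with it, one ADD periodically re-bases the register:
    add    ecx, 0xf0            ; shift the displacement window
    movaps [ecx+0x10],  xmm0    ; back to the 1-byte (disp8) form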

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2387 96395faa-99c1-11dd-bbfe-3dabce05a288
Jake.Stine 2009-12-23 13:22:30 +00:00
parent 5c8f4ded22
commit b27b89b162
16 changed files with 827 additions and 806 deletions

View File

@ -99,15 +99,16 @@ extern pxDoAssertFnType* pxDoAssert;
// rear ugly heads in optimized builds, this is one of the few tools we have.
#define pxAssertRel(cond, msg) ( (likely(cond)) || (pxOnAssert(pxAssertSpot(cond), msg), false) )
#define pxAssumeMsg(cond, msg) ((void) ( (!likely(cond)) && (pxOnAssert(pxAssertSpot(cond), msg), false) ))
#define pxAssumeRel(cond, msg) ((void) ( (!likely(cond)) && (pxOnAssert(pxAssertSpot(cond), msg), false) ))
#define pxFailRel(msg) pxAssumeRel(false, msg)
#if defined(PCSX2_DEBUG)
# define pxAssertMsg(cond, msg) pxAssertRel(cond, msg)
# define pxAssertDev(cond, msg) pxAssertMsg(cond, msg)
# define pxAssume(cond) pxAssumeMsg(cond, wxNullChar)
# define pxAssumeDev(cond, msg) pxAssumeMsg(cond, msg)
# define pxAssumeMsg(cond, msg) pxAssumeRel(cond, msg)
# define pxAssumeDev(cond, msg) pxAssumeRel(cond, msg)
# define pxFail(msg) pxAssumeMsg(false, msg)
# define pxFailDev(msg) pxAssumeDev(false, msg)
@ -120,7 +121,7 @@ extern pxDoAssertFnType* pxDoAssert;
# define pxAssertMsg(cond, msg) (likely(cond))
# define pxAssertDev(cond, msg) pxAssertRel(cond, msg)
# define pxAssume(cond) (__assume(cond))
# define pxAssumeMsg(cond, msg) (__assume(cond))
# define pxAssumeDev(cond, msg) pxAssumeMsg(cond, msg)
# define pxFail(msg) (__assume(false))
@ -134,7 +135,7 @@ extern pxDoAssertFnType* pxDoAssert;
# define pxAssertMsg(cond, msg) (likely(cond))
# define pxAssertDev(cond, msg) (likely(cond))
# define pxAssume(cond) (__assume(cond))
# define pxAssumeMsg(cond, msg) (__assume(cond))
# define pxAssumeDev(cond, msg) (__assume(cond))
# define pxFail(msg) (__assume(false))
@ -143,6 +144,7 @@ extern pxDoAssertFnType* pxDoAssert;
#endif
#define pxAssert(cond) pxAssertMsg(cond, wxNullChar)
#define pxAssume(cond) pxAssumeMsg(cond, wxNullChar)
#define pxAssertRelease( cond, msg )
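Usage note (illustrative, not part of the diff): pxAssert only checks its condition in debug/devel builds, while pxAssume additionally hands the condition to the compiler as an optimization hint (__assume) in release builds, so it must genuinely always hold. A hypothetical helper mirroring the vif_size function that appears later in this commit:

    static u32 vif_bufsize(u8 idx) {
        pxAssume(idx < 2);                   // debug: asserted; release: __assume(idx < 2)
        return (idx == 0) ? 0x1000 : 0x4000; // optimizer may exploit the idx < 2 hint
    }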

View File

@ -490,6 +490,11 @@ template< typename T > void xWrite( T val );
__forceinline xAddressInfo operator+( s32 imm ) const { return xAddressInfo( *this ).Add( imm ); }
__forceinline xAddressInfo operator-( s32 imm ) const { return xAddressInfo( *this ).Add( -imm ); }
__forceinline xAddressInfo operator+( const void* addr ) const { return xAddressInfo( *this ).Add( (uptr)addr ); }
__forceinline void operator+=( const xAddressReg& right ) { Add( right ); }
__forceinline void operator+=( const xAddressInfo& right ) { Add( right ); }
__forceinline void operator+=( s32 imm ) { Add( imm ); }
__forceinline void operator-=( s32 imm ) { Add( -imm ); }
};
extern const xRegisterSSE

View File

@ -515,6 +515,10 @@
<Unit filename="../x86/ix86-32/iR5900Shift.cpp" />
<Unit filename="../x86/ix86-32/iR5900Templates.cpp" />
<Unit filename="../x86/ix86-32/recVTLB.cpp" />
<Unit filename="../x86/VpuUnpackSSE.cpp" />
<Unit filename="../x86/VpuUnpackSSE.h" />
<Unit filename="../x86/VpuUnpackSSE_Dynarec.cpp" />
<Unit filename="../x86/newVof_Unpack.cpp" />
<Unit filename="../x86/microVU.cpp" />
<Unit filename="../x86/microVU.h" />
<Unit filename="../x86/microVU_Alloc.inl" />
@ -534,12 +538,8 @@
<Unit filename="../x86/microVU_Upper.inl" />
<Unit filename="../x86/newVif.h" />
<Unit filename="../x86/newVif_BlockBuffer.h" />
<Unit filename="../x86/newVif_Dynarec.inl" />
<Unit filename="../x86/newVif_HashBucket.h" />
<Unit filename="../x86/newVif_OldUnpack.inl" />
<Unit filename="../x86/newVif_Tables.inl" />
<Unit filename="../x86/newVif_Unpack.inl" />
<Unit filename="../x86/newVif_UnpackGen.inl" />
<Unit filename="../x86/sVU_Debug.h" />
<Unit filename="../x86/sVU_Lower.cpp" />
<Unit filename="../x86/sVU_Micro.cpp" />

View File

@ -320,7 +320,6 @@ static int __fastcall Vif1TransDirectHL(u32 *data)
static int __fastcall Vif1TransUnpack(u32 *data)
{
#ifdef newVif1
extern int nVifUnpack(int idx, u8 *data);
return nVifUnpack(1, (u8*)data);
#endif

View File

@ -75,19 +75,21 @@ template<const u32 VIFdmanum> void ProcessMemSkip(u32 size, u32 unpackType);
template<const u32 VIFdmanum> u32 VIFalign(u32 *data, vifCode *v, u32 size);
template<const u32 VIFdmanum> void VIFunpack(u32 *data, vifCode *v, u32 size);
template<const u32 VIFdmanum> void vuExecMicro(u32 addr);
extern __forceinline void vif0FLUSH();
extern __forceinline void vif1FLUSH();
extern void vif0FLUSH();
extern void vif1FLUSH();
static __forceinline u32 vif_size(u8 num)
{
return (num == 0) ? 0x1000 : 0x4000;
}
//#define newVif // Enable 'newVif' Code (if the below macros are not defined, it will use old non-sse code)
//#define newVif1 // Use New Code for Vif1 Unpacks (needs newVif defined)
#define newVif // Enable 'newVif' Code (if the below macros are not defined, it will use old non-sse code)
#define newVif1 // Use New Code for Vif1 Unpacks (needs newVif defined)
//#define newVif0 // Use New Code for Vif0 Unpacks (not implemented)
#ifndef newVif
#ifdef newVif
extern int nVifUnpack(int idx, u8 *data);
#else
//# define NON_SSE_UNPACKS // Turns off SSE Unpacks (slower)
#endif

View File

@ -860,7 +860,7 @@
>
</File>
<File
RelativePath="..\..\x86\newVif_Unpack.inl"
RelativePath="..\..\x86\newVif_Unpack.cpp"
>
</File>
<File
@ -871,11 +871,19 @@
Name="Dynarec"
>
<File
RelativePath="..\..\x86\newVif_Dynarec.inl"
RelativePath="..\..\x86\newVif_Tables.inl"
>
</File>
<File
RelativePath="..\..\x86\newVif_Tables.inl"
RelativePath="..\..\x86\VpuUnpackSSE.cpp"
>
</File>
<File
RelativePath="..\..\x86\VpuUnpackSSE.h"
>
</File>
<File
RelativePath="..\..\x86\VpuUnpackSSE_Dynarec.cpp"
>
</File>
</Filter>

pcsx2/x86/VpuUnpackSSE.cpp (new file, 285 lines)
View File

@ -0,0 +1,285 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2009 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "PrecompiledHeader.h"
#include "VpuUnpackSSE.h"
#define xMOV8(regX, loc) xMOVSSZX(regX, loc)
#define xMOV16(regX, loc) xMOVSSZX(regX, loc)
#define xMOV32(regX, loc) xMOVSSZX(regX, loc)
#define xMOV64(regX, loc) xMOVUPS(regX, loc)
#define xMOV128(regX, loc) xMOVUPS(regX, loc)
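// Note: the 8/16-bit forms still perform a full 32-bit MOVSS load; the unpack
// routines below keep only the low 8/16 bits (the rest is shifted out), so the
// small over-read is benign here.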
static __pagealigned u8 nVifUpkExec[__pagesize*4];
// =====================================================================================================
// VpuUnpackSSE_Base Section
// =====================================================================================================
VpuUnpackSSE_Base::VpuUnpackSSE_Base()
: dstIndirect(ecx) // parameter 1 of __fastcall
, srcIndirect(edx) // parameter 2 of __fastcall
{
}
void VpuUnpackSSE_Base::xMovDest(const xRegisterSSE& srcReg) const {
if (!doMode && !doMask) { xMOVAPS (ptr[dstIndirect], srcReg); }
else { doMaskWrite(srcReg); }
}
void VpuUnpackSSE_Base::xShiftR(const xRegisterSSE& regX, int n) const {
if (usn) { xPSRL.D(regX, n); }
else { xPSRA.D(regX, n); }
}
void VpuUnpackSSE_Base::xPMOVXX8(const xRegisterSSE& regX) const {
if (usn) xPMOVZX.BD(regX, ptr32[srcIndirect]);
else xPMOVSX.BD(regX, ptr32[srcIndirect]);
}
void VpuUnpackSSE_Base::xPMOVXX16(const xRegisterSSE& regX) const {
if (usn) xPMOVZX.WD(regX, ptr64[srcIndirect]);
else xPMOVSX.WD(regX, ptr64[srcIndirect]);
}
void VpuUnpackSSE_Base::xUPK_S_32() const {
xMOV32 (xmm0, ptr32[srcIndirect]);
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
}
void VpuUnpackSSE_Base::xUPK_S_16() const {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0);
}
else {
xMOV16 (xmm0, ptr32[srcIndirect]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
}
void VpuUnpackSSE_Base::xUPK_S_8() const {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0);
}
else {
xMOV8 (xmm0, ptr32[srcIndirect]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
}
void VpuUnpackSSE_Base::xUPK_V2_32() const {
xMOV64 (xmm0, ptr32[srcIndirect]);
xMovDest (xmm0);
}
void VpuUnpackSSE_Base::xUPK_V2_16() const {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0);
}
else {
xMOV32 (xmm0, ptr32[srcIndirect]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
}
void VpuUnpackSSE_Base::xUPK_V2_8() const {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0);
}
else {
xMOV16 (xmm0, ptr32[srcIndirect]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
}
void VpuUnpackSSE_Base::xUPK_V3_32() const {
xMOV128 (xmm0, ptr32[srcIndirect]);
xMovDest (xmm0);
}
void VpuUnpackSSE_Base::xUPK_V3_16() const {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0);
}
else {
xMOV64 (xmm0, ptr32[srcIndirect]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
}
void VpuUnpackSSE_Base::xUPK_V3_8() const {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0);
}
else {
xMOV32 (xmm0, ptr32[srcIndirect]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
}
void VpuUnpackSSE_Base::xUPK_V4_32() const {
xMOV128 (xmm0, ptr32[srcIndirect]);
xMovDest (xmm0);
}
void VpuUnpackSSE_Base::xUPK_V4_16() const {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0);
}
else {
xMOV64 (xmm0, ptr32[srcIndirect]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
}
void VpuUnpackSSE_Base::xUPK_V4_8() const {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0);
}
else {
xMOV32 (xmm0, ptr32[srcIndirect]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
}
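// V4-5 expands a 16-bit A1|B5|G5|R5 color into four 32-bit fields, one channel
// per field; each 5-bit channel ends up in bits 3..7 of its word (alpha in bit 7).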
void VpuUnpackSSE_Base::xUPK_V4_5() const {
xMOV16 (xmm0, ptr32[srcIndirect]);
xPSHUF.D (xmm0, xmm0, _v0);
xPSLL.D (xmm0, 3); // ABG|R5.000
xMOVAPS (xmm1, xmm0); // x|x|x|R
xPSRL.D (xmm0, 8); // ABG
xPSLL.D (xmm0, 3); // AB|G5.000
mVUmergeRegs(XMM1, XMM0, 0x4); // x|x|G|R
xPSRL.D (xmm0, 8); // AB
xPSLL.D (xmm0, 3); // A|B5.000
mVUmergeRegs(XMM1, XMM0, 0x2); // x|B|G|R
xPSRL.D (xmm0, 8); // A
xPSLL.D (xmm0, 7); // A.0000000
mVUmergeRegs(XMM1, XMM0, 0x1); // A|B|G|R
xPSLL.D (xmm1, 24); // can optimize to
xPSRL.D (xmm1, 24); // single AND...
xMovDest (xmm1);
}
void VpuUnpackSSE_Base::xUnpack( int upknum )
{
switch( upknum )
{
case 0: xUPK_S_32(); break;
case 1: xUPK_S_16(); break;
case 2: xUPK_S_8(); break;
case 4: xUPK_V2_32(); break;
case 5: xUPK_V2_16(); break;
case 6: xUPK_V2_8(); break;
case 8: xUPK_V3_32(); break;
case 9: xUPK_V3_16(); break;
case 10: xUPK_V3_8(); break;
case 12: xUPK_V4_32(); break;
case 13: xUPK_V4_16(); break;
case 14: xUPK_V4_8(); break;
case 15: xUPK_V4_5(); break;
case 3:
case 7:
case 11:
pxFailRel( wxsFormat( L"Vpu/Vif - Invalid Unpack! [%d]", upknum ) );
break;
}
}
// =====================================================================================================
// VpuUnpackSSE_Simple
// =====================================================================================================
VpuUnpackSSE_Simple::VpuUnpackSSE_Simple(bool usn_, bool domask_, int curCycle_)
{
curCycle = curCycle_;
usn = usn_;
doMask = domask_;
}
void VpuUnpackSSE_Simple::doMaskWrite(const xRegisterSSE& regX) const {
xMOVAPS(xmm7, ptr[dstIndirect]);
int offX = aMin(curCycle, 3);
xPAND(regX, ptr32[nVifMask[0][offX]]);
xPAND(xmm7, ptr32[nVifMask[1][offX]]);
xPOR (regX, ptr32[nVifMask[2][offX]]);
xPOR (regX, xmm7);
xMOVAPS(ptr[dstIndirect], regX);
}
// ecx = dest, edx = src
static void nVifGen(int usn, int mask, int curCycle) {
int usnpart = usn*2*16;
int maskpart = mask*16;
int curpart = curCycle;
VpuUnpackSSE_Simple vpugen( !!usn, !!mask, curCycle );
for( int i=0; i<16; ++i )
{
nVifCall& ucall( nVifUpk[((usnpart+maskpart+i) * 4) + (curpart)] );
ucall = NULL;
if( nVifT[i] == 0 ) continue;
ucall = (nVifCall)xGetAlignedCallTarget();
vpugen.xUnpack(i);
xRET();
pxAssert( ((uptr)xGetPtr() - (uptr)nVifUpkExec) < sizeof(nVifUpkExec) );
}
}
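For reference, how a generated entry is looked up and called (an illustrative sketch; the variable names are hypothetical, the (dest, src) argument order follows the 'ecx = dest, edx = src' note above, and the index layout mirrors nVifGen):

    // index = ((usn*32 + mask*16 + upkType) * 4) + curCycle
    nVifCall unpack = nVifUpk[((usn*32 + mask*16 + upkNum) * 4) + curCycle];
    if (unpack != NULL) unpack(dest, src); // NULL entries are the invalid types (3/7/11)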
void VpuUnpackSSE_Init()
{
HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadWrite, false);
memset8<0xcc>( nVifUpkExec );
xSetPtr( nVifUpkExec );
for (int a = 0; a < 2; a++) {
for (int b = 0; b < 2; b++) {
for (int c = 0; c < 4; c++) {
nVifGen(a, b, c);
}}}
HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadOnly, true);
}

pcsx2/x86/VpuUnpackSSE.h (new file, 134 lines)
View File

@ -0,0 +1,134 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2009 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "Common.h"
#include "VifDma_internal.h"
#include "newVif.h"
#include <xmmintrin.h>
#include <emmintrin.h>
using namespace x86Emitter;
// --------------------------------------------------------------------------------------
// VpuUnpackSSE_Base
// --------------------------------------------------------------------------------------
class VpuUnpackSSE_Base
{
public:
bool usn; // unsigned flag
bool doMask; // masking write enable flag
int doMode; // two bit value representing... something!
protected:
xAddressInfo dstIndirect;
xAddressInfo srcIndirect;
public:
VpuUnpackSSE_Base();
virtual ~VpuUnpackSSE_Base() throw() {}
void xUnpack( int upktype );
protected:
virtual void doMaskWrite(const xRegisterSSE& regX ) const=0;
virtual void xMovDest(const xRegisterSSE& srcReg) const;
virtual void xShiftR(const xRegisterSSE& regX, int n) const;
virtual void xPMOVXX8(const xRegisterSSE& regX) const;
virtual void xPMOVXX16(const xRegisterSSE& regX) const;
virtual void xUPK_S_32() const;
virtual void xUPK_S_16() const;
virtual void xUPK_S_8() const;
virtual void xUPK_V2_32() const;
virtual void xUPK_V2_16() const;
virtual void xUPK_V2_8() const;
virtual void xUPK_V3_32() const;
virtual void xUPK_V3_16() const;
virtual void xUPK_V3_8() const;
virtual void xUPK_V4_32() const;
virtual void xUPK_V4_16() const;
virtual void xUPK_V4_8() const;
virtual void xUPK_V4_5() const;
};
// --------------------------------------------------------------------------------------
// VpuUnpackSSE_Simple
// --------------------------------------------------------------------------------------
class VpuUnpackSSE_Simple : public VpuUnpackSSE_Base
{
typedef VpuUnpackSSE_Base _parent;
public:
int curCycle;
public:
VpuUnpackSSE_Simple(bool usn_, bool domask_, int curCycle_);
virtual ~VpuUnpackSSE_Simple() throw() {}
protected:
virtual void doMaskWrite(const xRegisterSSE& regX ) const;
};
// --------------------------------------------------------------------------------------
// VpuUnpackSSE_Dynarec
// --------------------------------------------------------------------------------------
class VpuUnpackSSE_Dynarec : public VpuUnpackSSE_Base
{
typedef VpuUnpackSSE_Base _parent;
public:
bool isFill;
protected:
const nVifStruct& v; // vif0 or vif1
const nVifBlock& vB; // some pre-collected data from VifStruct
int vCL; // internal copy of vif->cl
public:
VpuUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlock& vifBlock_);
VpuUnpackSSE_Dynarec(const VpuUnpackSSE_Dynarec& src) // copy constructor
: _parent(src)
, v(src.v)
, vB(src.vB)
{
isFill = src.isFill;
vCL = src.vCL;
}
virtual ~VpuUnpackSSE_Dynarec() throw() {}
void CompileRoutine();
protected:
virtual void doMaskWrite(const xRegisterSSE& regX) const;
void SetMasks(int cS) const;
void writeBackRow() const;
static VpuUnpackSSE_Dynarec FillingWrite( const VpuUnpackSSE_Dynarec& src )
{
VpuUnpackSSE_Dynarec fillingWrite( src );
fillingWrite.doMask = true;
fillingWrite.doMode = 0;
return fillingWrite;
}
};

View File

@ -0,0 +1,278 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2009 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
// newVif Dynarec - Dynamically Recompiles Vif 'unpack' Packets
// authors: cottonvibes(@gmail.com)
// Jake.Stine (@gmail.com)
#include "PrecompiledHeader.h"
#include "VpuUnpackSSE.h"
static __aligned16 nVifBlock _vBlock = {0};
static __pagealigned u8 nVifMemCmp[__pagesize];
static void emitCustomCompare() {
HostSys::MemProtectStatic(nVifMemCmp, Protect_ReadWrite, false);
memset8<0xcc>(nVifMemCmp);
xSetPtr(nVifMemCmp);
xMOVAPS (xmm0, ptr32[ecx]);
xPCMP.EQD(xmm0, ptr32[edx]);
xMOVMSKPS(eax, xmm0);
xAND (eax, 0x7); // ignore top 4 bytes (recBlock pointer)
xRET();
HostSys::MemProtectStatic(nVifMemCmp, Protect_ReadOnly, true);
}
void dVifInit(int idx) {
nVif[idx].idx = idx;
nVif[idx].VU = idx ? &VU1 : &VU0;
nVif[idx].vif = idx ? &vif1 : &vif0;
nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs;
nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000));
nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0;
nVif[idx].vifCache = new BlockBuffer(_1mb*4); // 4mb Rec Cache
nVif[idx].vifBlocks = new HashBucket<_tParams>();
nVif[idx].recPtr = nVif[idx].vifCache->getBlock();
nVif[idx].recEnd = &nVif[idx].recPtr[nVif[idx].vifCache->getSize()-(_1mb/4)]; // .25mb Safe Zone
//emitCustomCompare();
}
// Loads Row/Col Data from vifRegs instead of g_vifmask
// Useful for testing vifReg and g_vifmask inconsistency.
static void loadRowCol(nVifStruct& v) {
xMOVAPS(xmm0, ptr32[&v.vifRegs->r0]);
xMOVAPS(xmm1, ptr32[&v.vifRegs->r1]);
xMOVAPS(xmm2, ptr32[&v.vifRegs->r2]);
xMOVAPS(xmm6, ptr32[&v.vifRegs->r3]);
xPSHUF.D(xmm0, xmm0, _v0);
xPSHUF.D(xmm1, xmm1, _v0);
xPSHUF.D(xmm2, xmm2, _v0);
xPSHUF.D(xmm6, xmm6, _v0);
mVUmergeRegs(XMM6, XMM0, 8);
mVUmergeRegs(XMM6, XMM1, 4);
mVUmergeRegs(XMM6, XMM2, 2);
xMOVAPS(xmm2, ptr32[&v.vifRegs->c0]);
xMOVAPS(xmm3, ptr32[&v.vifRegs->c1]);
xMOVAPS(xmm4, ptr32[&v.vifRegs->c2]);
xMOVAPS(xmm5, ptr32[&v.vifRegs->c3]);
xPSHUF.D(xmm2, xmm2, _v0);
xPSHUF.D(xmm3, xmm3, _v0);
xPSHUF.D(xmm4, xmm4, _v0);
xPSHUF.D(xmm5, xmm5, _v0);
}
VpuUnpackSSE_Dynarec::VpuUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlock& vifBlock_)
: v(vif_)
, vB(vifBlock_)
{
isFill = (vB.cl < vB.wl);
usn = (vB.upkType>>5) & 1;
doMask = (vB.upkType>>4) & 1;
doMode = vB.mode & 3;
}
#define makeMergeMask(x) { \
x = ((x&0x40)>>6) | ((x&0x10)>>3) | (x&4) | ((x&1)<<3); \
}
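// makeMergeMask converts a one-bit-per-field value laid out at bits 0/2/4/6
// (x/y/z/w) into the xyzw selector expected by mVUmergeRegs (x=8,y=4,z=2,w=1);
// e.g. 0x41 (w and x fields set) becomes 0x9.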
_f void VpuUnpackSSE_Dynarec::SetMasks(int cS) const {
u32 m0 = vB.mask;
u32 m1 = m0 & 0xaaaaaaaa;
u32 m2 =(~m1>>1) & m0;
u32 m3 = (m1>>1) & ~m0;
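// Each 2-bit field of the VIF mask selects 0=data, 1=row, 2=col, 3=write-protect;
// m2 flags the '01' (row) fields and m3 the '10' (col) fields.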
u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0;
u32* col = (v.idx) ? g_vifmask.Col1 : g_vifmask.Col0;
if((m2&&doMask) || doMode) { xMOVAPS(xmmRow, ptr32[row]); }
if (m3&&doMask) {
xMOVAPS(xmmCol0, ptr32[col]);
if ((cS>=2) && (m3&0x0000ff00)) xPSHUF.D(xmmCol1, xmmCol0, _v1);
if ((cS>=3) && (m3&0x00ff0000)) xPSHUF.D(xmmCol2, xmmCol0, _v2);
if ((cS>=4) && (m3&0xff000000)) xPSHUF.D(xmmCol3, xmmCol0, _v3);
if ((cS>=1) && (m3&0x000000ff)) xPSHUF.D(xmmCol0, xmmCol0, _v0);
}
//if (mask||mode) loadRowCol(v);
}
void VpuUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const {
pxAssumeDev(regX.Id <= 1, "Reg Overflow! XMM2 thru XMM6 are reserved for masking.");
int cc = aMin(vCL, 3);
u32 m0 = (vB.mask >> (cc * 8)) & 0xff;
u32 m1 = m0 & 0xaaaa;
u32 m2 =(~m1>>1) & m0;
u32 m3 = (m1>>1) & ~m0;
u32 m4 = (m1>>1) & m0;
makeMergeMask(m2);
makeMergeMask(m3);
makeMergeMask(m4);
if (doMask&&m4) { xMOVAPS(xmmTemp, ptr[dstIndirect]); } // Load Write Protect
if (doMask&&m2) { mVUmergeRegs(regX.Id, xmmRow.Id, m2); } // Merge Row
if (doMask&&m3) { mVUmergeRegs(regX.Id, xmmCol0.Id+cc, m3); } // Merge Col
if (doMask&&m4) { mVUmergeRegs(regX.Id, xmmTemp.Id, m4); } // Merge Write Protect
if (doMode) {
u32 m5 = (~m1>>1) & ~m0;
if (!doMask) m5 = 0xf;
else makeMergeMask(m5);
if (m5 < 0xf) {
xPXOR(xmmTemp, xmmTemp);
mVUmergeRegs(xmmTemp.Id, xmmRow.Id, m5);
xPADD.D(regX, xmmTemp);
if (doMode==2) mVUmergeRegs(xmmRow.Id, regX.Id, m5);
}
else if (m5 == 0xf) {
xPADD.D(regX, xmmRow);
if (doMode==2) xMOVAPS(xmmRow, regX);
}
}
xMOVAPS(ptr32[dstIndirect], regX);
}
void VpuUnpackSSE_Dynarec::writeBackRow() const {
u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0;
xMOVAPS(ptr32[row], xmmRow);
DevCon.WriteLn("nVif: writing back row reg! [doMode = 2]");
// ToDo: Do we need to write back to vifregs.rX too!? :/
}
static void ShiftDisplacementWindow( xAddressInfo& addr, const xRegister32& modReg )
{
// Shifts the displacement factor of a given indirect address, so that the address
// remains in the optimal 0xf0 range (which allows for byte-form displacements when
// generating instructions).
int addImm = 0;
while( addr.Displacement >= 0x80 )
{
addImm += 0xf0;
addr -= 0xf0;
}
if(addImm) xADD(modReg, addImm);
}
void VpuUnpackSSE_Dynarec::CompileRoutine() {
const int upkNum = vB.upkType & 0xf;
const u8& vift = nVifT[upkNum];
const int cycleSize = isFill ? vB.cl : vB.wl;
const int blockSize = isFill ? vB.wl : vB.cl;
const int skipSize = blockSize - cycleSize;
int vNum = vifRegs->num;
vCL = vif->cl;
SetMasks(cycleSize);
while (vNum) {
ShiftDisplacementWindow( srcIndirect, edx );
ShiftDisplacementWindow( dstIndirect, ecx );
if (vCL < cycleSize) {
xUnpack(upkNum);
srcIndirect += vift;
dstIndirect += 16;
vNum--;
if (++vCL == blockSize) vCL = 0;
}
else if (isFill) {
DevCon.WriteLn("filling mode!");
VpuUnpackSSE_Dynarec::FillingWrite( *this ).xUnpack(upkNum);
dstIndirect += 16;
vNum--;
if (++vCL == blockSize) vCL = 0;
}
else {
dstIndirect += (16 * skipSize);
vCL = 0;
}
}
if (doMode==2) writeBackRow();
xMOV(ptr32[&vif->cl], vCL);
xMOV(ptr32[&vifRegs->num], vNum);
xRET();
}
static _f u8* dVifsetVUptr(const nVifStruct& v, int offset) {
u8* ptr = (u8*)(v.VU->Mem + (offset & v.vuMemLimit));
u8* endPtr = ptr + _vBlock.num * 16;
if (endPtr > v.vuMemEnd) {
DevCon.WriteLn("nVif - VU Mem Ptr Overflow; falling back to interpreter.");
ptr = NULL; // Fall Back to Interpreters which have wrap-around logic
}
return ptr;
}
static _f void dVifRecLimit(int idx) {
if (nVif[idx].recPtr > nVif[idx].recEnd) {
DevCon.WriteLn("nVif Rec - Out of Rec Cache! [%x > %x]", nVif[idx].recPtr, nVif[idx].recEnd);
nVif[idx].vifBlocks->clear();
nVif[idx].recPtr = nVif[idx].vifCache->getBlock();
}
}
_f void dVifUnpack(int idx, u8 *data, u32 size, bool isFill) {
const nVifStruct& v = nVif[idx];
const u8 upkType = vif->cmd & 0x1f | ((!!vif->usn) << 5);
const int doMask = (upkType>>4) & 1;
const int cycle_cl = vifRegs->cycle.cl;
const int cycle_wl = vifRegs->cycle.wl;
const int cycleSize = isFill ? cycle_cl : cycle_wl;
const int blockSize = isFill ? cycle_wl : cycle_cl;
if (vif->cl >= blockSize) vif->cl = 0;
_vBlock.upkType = upkType;
_vBlock.num = *(u8*)&vifRegs->num;
_vBlock.mode = *(u8*)&vifRegs->mode;
_vBlock.scl = vif->cl;
_vBlock.cl = cycle_cl;
_vBlock.wl = cycle_wl;
// Zero out the mask parameter if it's unused -- games leave random junk
// values here which cause false recblock cache misses.
_vBlock.mask = doMask ? vifRegs->mask : 0x00;
if (nVifBlock* b = v.vifBlocks->find(&_vBlock)) {
if( u8* dest = dVifsetVUptr(v, vif->tag.addr) ) {
//DevCon.WriteLn("Running Recompiled Block!");
((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data);
}
else {
//DevCon.WriteLn("Running Interpreter Block");
_nVifUnpack(idx, data, size, isFill);
}
return;
}
static int recBlockNum = 0;
DevCon.WriteLn("nVif: Recompiled Block! [%d]", recBlockNum++);
DevCon.WriteLn(L"\t(num=0x%02x, upkType=0x%02x, mode=0x%02x, scl=0x%02x, cl/wl=0x%x/0x%x, mask=%s)",
_vBlock.num, _vBlock.upkType, _vBlock.mode, _vBlock.scl, _vBlock.cl, _vBlock.wl,
doMask ? wxsFormat( L"0x%08x", _vBlock.mask ).c_str() : L"ignored"
);
xSetPtr(v.recPtr);
_vBlock.startPtr = (uptr)xGetAlignedCallTarget();
v.vifBlocks->add(_vBlock);
VpuUnpackSSE_Dynarec( v, _vBlock ).CompileRoutine();
nVif[idx].recPtr = xGetPtr();
dVifRecLimit(idx);
// Run the block we just compiled. Various conditions may force us to still use
// the interpreter unpacker though, so a recursive call is the safest way here...
dVifUnpack(idx, data, size, isFill);
}

View File

@ -15,6 +15,12 @@
#pragma once
#include "Vif.h"
#include "VU.h"
#include "x86emitter/x86emitter.h"
using namespace x86Emitter;
#ifdef newVif
// newVif_HashBucket.h uses this typedef, so it has to be declared first.
@ -23,17 +29,12 @@ typedef void (__fastcall *nVifrecCall)(uptr dest, uptr src);
#include "newVif_BlockBuffer.h"
#include "newVif_HashBucket.h"
#include "x86emitter/x86emitter.h"
using namespace x86Emitter;
extern void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0);
extern void nVifGen (int usn, int mask, int curCycle);
extern void _nVifUnpack (int idx, u8 *data, u32 size);
extern void dVifUnpack (int idx, u8 *data, u32 size);
extern void dVifInit (int idx);
static __pagealigned u8 nVifUpkExec[__pagesize*4];
static __aligned16 nVifCall nVifUpk[(2*2*16) *4]; // ([USN][Masking][Unpack Type]) [curCycle]
static __aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][Vector]
extern void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0);
extern void _nVifUnpack (int idx, u8 *data, u32 size, bool isFill);
extern void dVifUnpack (int idx, u8 *data, u32 size, bool isFill);
extern void dVifInit (int idx);
extern void VpuUnpackSSE_Init();
#define VUFT VIFUnpackFuncTable
#define _1mb (0x100000)
@ -56,7 +57,10 @@ static __aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][
# pragma warning(disable:4996) // 'function': was declared deprecated
#endif
struct __aligned16 nVifBlock { // Ordered for Hashing
// nVifBlock - Ordered for Hashing; the 'num' field and the lower 6 bits of upkType are
// used as the hash bucket selector.
//
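// (newVif_HashBucket.h derives the bucket index from the first 32 bits of the
// struct, so the field order below doubles as the hash-key layout.)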
struct __aligned16 nVifBlock {
u8 num; // [00] Num Field
u8 upkType; // [01] Unpack Type [usn*1:mask*1:upk*4]
u8 mode; // [02] Mode Field
@ -88,63 +92,14 @@ struct nVifStruct {
u8* recEnd; // End of Rec Cache
BlockBuffer* vifCache; // Block Buffer
HashBucket<_tParams>* vifBlocks; // Vif Blocks
nVifBlock* vifBlock; // Current Vif Block Ptr
};
// Contents of this table are doubled up for doMask(false) and doMask(true) lookups.
// (note: currently unused, I'm using gsize in the interp tables instead since it
// seems to be faster for now, which may change when nVif isn't reliant on interpreted
// unpackers anymore --air)
static const u32 nVifT[32] = {
4, // S-32
2, // S-16
1, // S-8
0, // ----
8, // V2-32
4, // V2-16
2, // V2-8
0, // ----
12,// V3-32
6, // V3-16
3, // V3-8
0, // ----
16,// V4-32
8, // V4-16
4, // V4-8
2, // V4-5
extern __aligned16 nVifStruct nVif[2];
extern __aligned16 const u8 nVifT[32];
extern __aligned16 nVifCall nVifUpk[(2*2*16) *4]; // ([USN][Masking][Unpack Type]) [curCycle]
extern __aligned16 u32 nVifMask[3][4][4]; // [MaskNumber][CycleNumber][Vector]
// Second verse, same as the first!
4,2,1,0,8,4,2,0,12,6,3,0,16,8,4,2
};
template< int idx, bool doMode, bool isFill, bool singleUnpack >
__releaseinline void __fastcall _nVifUnpackLoop(u8 *data, u32 size);
typedef void (__fastcall* Fnptr_VifUnpackLoop)(u8 *data, u32 size);
// Unpacks Until 'Num' is 0
static const __aligned16 Fnptr_VifUnpackLoop UnpackLoopTable[2][2][2] = {
{{ _nVifUnpackLoop<0,0,0,0>, _nVifUnpackLoop<0,0,1,0> },
{ _nVifUnpackLoop<0,1,0,0>, _nVifUnpackLoop<0,1,1,0> },},
{{ _nVifUnpackLoop<1,0,0,0>, _nVifUnpackLoop<1,0,1,0> },
{ _nVifUnpackLoop<1,1,0,0>, _nVifUnpackLoop<1,1,1,0> },},
};
// Unpacks until 1 normal write cycle unpack has been written to VU mem
static const __aligned16 Fnptr_VifUnpackLoop UnpackSingleTable[2][2][2] = {
{{ _nVifUnpackLoop<0,0,0,1>, _nVifUnpackLoop<0,0,1,1> },
{ _nVifUnpackLoop<0,1,0,1>, _nVifUnpackLoop<0,1,1,1> },},
{{ _nVifUnpackLoop<1,0,0,1>, _nVifUnpackLoop<1,0,1,1> },
{ _nVifUnpackLoop<1,1,0,1>, _nVifUnpackLoop<1,1,1,1> },},
};
#define useOldUnpack 0 // Use code in newVif_OldUnpack.inl
#define newVifDynaRec 1 // Use code in newVif_Dynarec.inl
#include "newVif_OldUnpack.inl"
#include "newVif_Unpack.inl"
#include "newVif_UnpackGen.inl"
#include "newVif_Tables.inl"
#include "newVif_Dynarec.inl"
static const bool useOldUnpack = false; // Use code in newVif_OldUnpack.inl
static const bool newVifDynaRec = true; // Use code in newVif_Dynarec.inl
#endif

View File

@ -25,7 +25,7 @@
// just use 'new' and 'delete' for initialization and
// deletion/cleanup respectively...
class BlockBuffer {
private:
protected:
u32 mSize; // Cur Size
u32 mSizeT; // Total Size
u8* mData; // Data Ptr

View File

@ -1,163 +0,0 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2009 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
// newVif Dynarec - Dynamically Recompiles Vif 'unpack' Packets
// authors: cottonvibes(@gmail.com)
// Jake.Stine (@gmail.com)
#pragma once
void dVifInit(int idx) {
nVif[idx].idx = idx;
nVif[idx].VU = idx ? &VU1 : &VU0;
nVif[idx].vif = idx ? &vif1 : &vif0;
nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs;
nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000));
nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0;
nVif[idx].vifCache = new BlockBuffer(_1mb*4); // 4mb Rec Cache
nVif[idx].vifBlocks = new HashBucket<_tParams>();
nVif[idx].recPtr = nVif[idx].vifCache->getBlock();
nVif[idx].recEnd = &nVif[idx].recPtr[nVif[idx].vifCache->getSize()-(_1mb/4)]; // .25mb Safe Zone
emitCustomCompare();
}
_f void dVifRecLimit(int idx) {
if (nVif[idx].recPtr > nVif[idx].recEnd) {
DevCon.WriteLn("nVif Rec - Out of Rec Cache! [%x > %x]", nVif[idx].recPtr, nVif[idx].recEnd);
nVif[idx].vifBlocks->clear();
nVif[idx].recPtr = nVif[idx].vifCache->getBlock();
}
}
_f void dVifSetMasks(nVifStruct& v, int mask, int mode, int cS) {
u32 m0 = v.vifBlock->mask;
u32 m1 = m0 & 0xaaaaaaaa;
u32 m2 =(~m1>>1) & m0;
u32 m3 = (m1>>1) & ~m0;
u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0;
u32* col = (v.idx) ? g_vifmask.Col1 : g_vifmask.Col0;
if((m2&&mask) || mode) { xMOVAPS(xmmRow, ptr32[row]); }
if (m3&&mask) {
xMOVAPS(xmmCol0, ptr32[col]);
if ((cS>=2) && (m3&0x0000ff00)) xPSHUF.D(xmmCol1, xmmCol0, _v1);
if ((cS>=3) && (m3&0x00ff0000)) xPSHUF.D(xmmCol2, xmmCol0, _v2);
if ((cS>=4) && (m3&0xff000000)) xPSHUF.D(xmmCol3, xmmCol0, _v3);
if ((cS>=1) && (m3&0x000000ff)) xPSHUF.D(xmmCol0, xmmCol0, _v0);
}
//if (mask||mode) loadRowCol(v);
}
void dVifRecompile(nVifStruct& v, nVifBlock* vB) {
const bool isFill = (vB->cl < vB->wl);
const int usn = (vB->upkType>>5)&1;
const int doMask = (vB->upkType>>4)&1;
const int upkNum = vB->upkType & 0xf;
const u32& vift = nVifT[upkNum];
const int doMode = vifRegs->mode & 3;
const int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl;
const int blockSize = isFill ? vifRegs->cycle.wl : vifRegs->cycle.cl;
const int skipSize = blockSize - cycleSize;
const bool simpleBlock = (vifRegs->num == 1);
const int backupCL = vif->cl;
const int backupNum = vifRegs->num;
if (vif->cl >= blockSize) vif->cl = 0;
v.vifBlock = vB;
xSetPtr(v.recPtr);
xAlignPtr(16);
vB->startPtr = (uptr)xGetPtr();
dVifSetMasks(v, doMask, doMode, cycleSize);
while (vifRegs->num) {
if (vif->cl < cycleSize) {
xUnpack[upkNum](&v, doMode<<1 | doMask);
if (!simpleBlock) xADD(edx, vift);
if (!simpleBlock) xADD(ecx, 16);
vifRegs->num--;
if (++vif->cl == blockSize) vif->cl = 0;
}
else if (isFill) {
DevCon.WriteLn("filling mode!");
xUnpack[upkNum](&v, 1);
xADD(ecx, 16);
vifRegs->num--;
if (++vif->cl == blockSize) vif->cl = 0;
}
else {
xADD(ecx, 16 * skipSize);
vif->cl = 0;
}
}
if (doMode==2) writeBackRow(v);
xMOV(ptr32[&vif->cl], vif->cl);
xMOV(ptr32[&vifRegs->num], vifRegs->num);
xRET();
v.recPtr = xGetPtr();
vif->cl = backupCL;
vifRegs->num = backupNum;
}
static __aligned16 nVifBlock _vBlock = {0};
_f u8* dVifsetVUptr(nVifStruct& v, int offset) {
u8* ptr = (u8*)(v.VU->Mem + (offset & v.vuMemLimit));
u8* endPtr = ptr + _vBlock.num * 16;
if (endPtr > v.vuMemEnd) {
DevCon.WriteLn("nVif - VU Mem Ptr Overflow!");
ptr = NULL; // Fall Back to Interpreters which have wrap-around logic
}
return ptr;
}
void dVifUnpack(int idx, u8 *data, u32 size) {
nVifStruct& v = nVif[idx];
const u8 upkType = vif->cmd & 0x1f | ((!!(vif->usn)) << 5);
const int doMask = (upkType>>4)&1;
_vBlock.upkType = upkType;
_vBlock.num = *(u8*)&vifRegs->num;
_vBlock.mode = *(u8*)&vifRegs->mode;
_vBlock.scl = vif->cl;
_vBlock.cl = vifRegs->cycle.cl;
_vBlock.wl = vifRegs->cycle.wl;
// Zero out the mask parameter if it's unused -- games leave random junk
// values here which cause false recblock cache misses.
_vBlock.mask = doMask ? vifRegs->mask : 0x00;
if (nVifBlock* b = v.vifBlocks->find(&_vBlock)) {
if( u8* dest = dVifsetVUptr(v, vif->tag.addr) ) {
//DevCon.WriteLn("Running Recompiled Block!");
((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data);
}
else {
//DevCon.WriteLn("Running Interpreter Block");
_nVifUnpack(idx, data, size);
}
return;
}
static int recBlockNum = 0;
DevCon.WriteLn("nVif: Recompiled Block! [%d]", recBlockNum++);
DevCon.WriteLn(L"\t(num=0x%02x, upkType=0x%02x, mode=0x%02x, scl=0x%02x, cl=0x%x, wl=0x%x, mask=%s)",
_vBlock.num, _vBlock.upkType, _vBlock.mode, _vBlock.scl, _vBlock.cl, _vBlock.wl,
doMask ? wxsFormat( L"0x%08x", _vBlock.mask ).c_str() : L"ignored"
);
dVifRecompile(v, &_vBlock);
v.vifBlocks->add(&_vBlock);
dVifRecLimit(idx);
dVifUnpack(idx, data, size);
}

View File

@ -15,8 +15,6 @@
#pragma once
static __pagealigned u8 nVifMemCmp[__pagesize];
template< typename T >
struct SizeChain
{
@ -66,8 +64,8 @@ public:
if( bucket.Size > 3 ) DevCon.Warning( "recVifUnpk: Bucket 0x%04x has %d micro-programs", d % hSize, bucket.Size );
return NULL;
}
__forceinline void add(T* dataPtr) {
u32 d = *(u32*)dataPtr;
__forceinline void add(const T& dataPtr) {
u32 d = (u32&)dataPtr;
SizeChain<T>& bucket( mBucket[d % hSize] );
if( bucket.Chain = (T*)_aligned_realloc( bucket.Chain, sizeof(T)*(bucket.Size+1), 16), bucket.Chain==NULL ) {
@ -76,7 +74,7 @@ public:
wxEmptyString
);
}
memcpy_fast(&bucket.Chain[bucket.Size++], dataPtr, sizeof(T));
memcpy_fast(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(T));
}
void clear() {
for (int i = 0; i < hSize; i++) {

View File

@ -1,287 +0,0 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2009 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#define vUPK(x) void x(nVifStruct* v, int doMask)
#define _doUSN (v->vifBlock->upkType & 0x20)
#undef xMovDest
#undef xShiftR
#undef xPMOVXX8
#undef xPMOVXX16
#undef xMaskWrite
#define makeMergeMask(x) { \
x = ((x&0x40)>>6) | ((x&0x10)>>3) | (x&4) | ((x&1)<<3); \
}
void doMaskWrite(const xRegisterSSE& regX, nVifStruct* v, int doMask) {
if (regX.Id > 1) DevCon.WriteLn("Reg Overflow!!!");
int doMode = doMask>>1; doMask &= 1;
int cc = aMin(v->vif->cl, 3);
u32 m0 = (v->vifBlock->mask >> (cc * 8)) & 0xff;
u32 m1 = m0 & 0xaaaa;
u32 m2 =(~m1>>1) & m0;
u32 m3 = (m1>>1) & ~m0;
u32 m4 = (m1>>1) & m0;
makeMergeMask(m2);
makeMergeMask(m3);
makeMergeMask(m4);
if (doMask&&m4) { xMOVAPS(xmmTemp, ptr32[ecx]); } // Load Write Protect
if (doMask&&m2) { mVUmergeRegs(regX.Id, xmmRow.Id, m2); } // Merge Row
if (doMask&&m3) { mVUmergeRegs(regX.Id, xmmCol0.Id+cc, m3); } // Merge Col
if (doMask&&m4) { mVUmergeRegs(regX.Id, xmmTemp.Id, m4); } // Merge Write Protect
if (doMode) {
u32 m5 = (~m1>>1) & ~m0;
if (!doMask) m5 = 0xf;
else makeMergeMask(m5);
if (m5 < 0xf) {
xPXOR(xmmTemp, xmmTemp);
mVUmergeRegs(xmmTemp.Id, xmmRow.Id, m5);
xPADD.D(regX, xmmTemp);
if (doMode==2) mVUmergeRegs(xmmRow.Id, regX.Id, m5);
}
else if (m5 == 0xf) {
xPADD.D(regX, xmmRow);
if (doMode==2) xMOVAPS(xmmRow, regX);
}
}
xMOVAPS(ptr32[ecx], regX);
}
#define xMovDest(regX) { \
if (!doMask){ xMOVAPS (ptr32[ecx], regX); } \
else { doMaskWrite(regX, v, doMask); } \
}
#define xShiftR(regX, n) { \
if (_doUSN) { xPSRL.D(regX, n); } \
else { xPSRA.D(regX, n); } \
}
#define xPMOVXX8(regX, src) { \
if (_doUSN) xPMOVZX.BD(regX, src); \
else xPMOVSX.BD(regX, src); \
}
#define xPMOVXX16(regX, src) { \
if (_doUSN) xPMOVZX.WD(regX, src); \
else xPMOVSX.WD(regX, src); \
}
// ecx = dest, edx = src
vUPK(nVif_S_32) {
xMOV32 (xmm0, ptr32[edx]);
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
}
vUPK(nVif_S_16) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
xMOV16 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
}
vUPK(nVif_S_8) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
xMOV8 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
}
vUPK(nVif_V2_32) {
xMOV64 (xmm0, ptr32[edx]);
xMovDest (xmm0);
}
vUPK(nVif_V2_16) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
xMOV32 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
}
vUPK(nVif_V2_8) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
xMOV16 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
}
vUPK(nVif_V3_32) {
xMOV128 (xmm0, ptr32[edx]);
xMovDest (xmm0);
}
vUPK(nVif_V3_16) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
xMOV64 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
}
vUPK(nVif_V3_8) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
xMOV32 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
}
vUPK(nVif_V4_32) {
xMOV128 (xmm0, ptr32[edx]);
xMovDest (xmm0);
}
vUPK(nVif_V4_16) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
xMOV64 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
}
vUPK(nVif_V4_8) {
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
xMOV32 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
}
vUPK(nVif_V4_5) {
xMOV16 (xmm0, ptr32[edx]);
xPSHUF.D (xmm0, xmm0, _v0);
xPSLL.D (xmm0, 3); // ABG|R5.000
xMOVAPS (xmm1, xmm0); // x|x|x|R
xPSRL.D (xmm0, 8); // ABG
xPSLL.D (xmm0, 3); // AB|G5.000
mVUmergeRegs(XMM1, XMM0, 0x4); // x|x|G|R
xPSRL.D (xmm0, 8); // AB
xPSLL.D (xmm0, 3); // A|B5.000
mVUmergeRegs(XMM1, XMM0, 0x2); // x|B|G|R
xPSRL.D (xmm0, 8); // A
xPSLL.D (xmm0, 7); // A.0000000
mVUmergeRegs(XMM1, XMM0, 0x1); // A|B|G|R
xPSLL.D (xmm1, 24); // can optimize to
xPSRL.D (xmm1, 24); // single AND...
xMovDest (xmm1);
}
vUPK(nVif_unkown) {
Console.Error("nVif%d - Invalid Unpack! [%d]", v->idx, v->vif->tag.cmd & 0xf);
}
void (*xUnpack[16])(nVifStruct* v, int doMask) = {
nVif_S_32,
nVif_S_16,
nVif_S_8,
nVif_unkown,
nVif_V2_32,
nVif_V2_16,
nVif_V2_8,
nVif_unkown,
nVif_V3_32,
nVif_V3_16,
nVif_V3_8,
nVif_unkown,
nVif_V4_32,
nVif_V4_16,
nVif_V4_8,
nVif_V4_5,
};
// Loads Row/Col Data from vifRegs instead of g_vifmask
// Useful for testing vifReg and g_vifmask inconsistency.
void loadRowCol(nVifStruct& v) {
xMOVAPS(xmm0, ptr32[&v.vifRegs->r0]);
xMOVAPS(xmm1, ptr32[&v.vifRegs->r1]);
xMOVAPS(xmm2, ptr32[&v.vifRegs->r2]);
xMOVAPS(xmm6, ptr32[&v.vifRegs->r3]);
xPSHUF.D(xmm0, xmm0, _v0);
xPSHUF.D(xmm1, xmm1, _v0);
xPSHUF.D(xmm2, xmm2, _v0);
xPSHUF.D(xmm6, xmm6, _v0);
mVUmergeRegs(XMM6, XMM0, 8);
mVUmergeRegs(XMM6, XMM1, 4);
mVUmergeRegs(XMM6, XMM2, 2);
xMOVAPS(xmm2, ptr32[&v.vifRegs->c0]);
xMOVAPS(xmm3, ptr32[&v.vifRegs->c1]);
xMOVAPS(xmm4, ptr32[&v.vifRegs->c2]);
xMOVAPS(xmm5, ptr32[&v.vifRegs->c3]);
xPSHUF.D(xmm2, xmm2, _v0);
xPSHUF.D(xmm3, xmm3, _v0);
xPSHUF.D(xmm4, xmm4, _v0);
xPSHUF.D(xmm5, xmm5, _v0);
}
void writeBackRow(nVifStruct& v) {
u32* row = (v.idx) ? g_vifmask.Row1 : g_vifmask.Row0;
xMOVAPS(ptr32[row], xmmRow);
DevCon.WriteLn("nVif: writing back row reg! [doMode = 2]");
// ToDo: Do we need to write back to vifregs.rX too!? :/
}
void emitCustomCompare() {
HostSys::MemProtectStatic(nVifMemCmp, Protect_ReadWrite, false);
memset8<0xcc>(nVifMemCmp);
xSetPtr(nVifMemCmp);
xMOVAPS (xmm0, ptr32[ecx]);
xPCMP.EQD(xmm0, ptr32[edx]);
xMOVMSKPS(eax, xmm0);
xAND (eax, 0x7); // ignore top 4 bytes (recBlock pointer)
xRET();
HostSys::MemProtectStatic(nVifMemCmp, Protect_ReadOnly, true);
}

View File

@ -17,9 +17,66 @@
// authors: cottonvibes(@gmail.com)
// Jake.Stine (@gmail.com)
#pragma once
#include "PrecompiledHeader.h"
#include "Common.h"
#include "VifDma_internal.h"
#include "newVif.h"
static __aligned16 nVifStruct nVif[2];
#ifdef newVif
#include "newVif_OldUnpack.inl"
__aligned16 nVifStruct nVif[2];
__aligned16 nVifCall nVifUpk[(2*2*16) *4]; // ([USN][Masking][Unpack Type]) [curCycle]
__aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][Vector]
// Contents of this table are doubled up for doMask(false) and doMask(true) lookups.
// (note: currently unused, I'm using gsize in the interp tables instead since it
// seems to be faster for now, which may change when nVif isn't reliant on interpreted
// unpackers anymore --air)
__aligned16 const u8 nVifT[32] = {
4, // S-32
2, // S-16
1, // S-8
0, // ----
8, // V2-32
4, // V2-16
2, // V2-8
0, // ----
12,// V3-32
6, // V3-16
3, // V3-8
0, // ----
16,// V4-32
8, // V4-16
4, // V4-8
2, // V4-5
// Second verse, same as the first!
4,2,1,0,8,4,2,0,12,6,3,0,16,8,4,2
};
// ----------------------------------------------------------------------------
template< int idx, bool doMode, bool isFill, bool singleUnpack >
__releaseinline void __fastcall _nVifUnpackLoop(u8 *data, u32 size);
typedef void (__fastcall* Fnptr_VifUnpackLoop)(u8 *data, u32 size);
// Unpacks Until 'Num' is 0
static const __aligned16 Fnptr_VifUnpackLoop UnpackLoopTable[2][2][2] = {
{{ _nVifUnpackLoop<0,0,0,0>, _nVifUnpackLoop<0,0,1,0> },
{ _nVifUnpackLoop<0,1,0,0>, _nVifUnpackLoop<0,1,1,0> },},
{{ _nVifUnpackLoop<1,0,0,0>, _nVifUnpackLoop<1,0,1,0> },
{ _nVifUnpackLoop<1,1,0,0>, _nVifUnpackLoop<1,1,1,0> },},
};
// Unpacks until 1 normal write cycle unpack has been written to VU mem
static const __aligned16 Fnptr_VifUnpackLoop UnpackSingleTable[2][2][2] = {
{{ _nVifUnpackLoop<0,0,0,1>, _nVifUnpackLoop<0,0,1,1> },
{ _nVifUnpackLoop<0,1,0,1>, _nVifUnpackLoop<0,1,1,1> },},
{{ _nVifUnpackLoop<1,0,0,1>, _nVifUnpackLoop<1,0,1,1> },
{ _nVifUnpackLoop<1,1,0,1>, _nVifUnpackLoop<1,1,1,1> },},
};
// ----------------------------------------------------------------------------
void initNewVif(int idx) {
nVif[idx].idx = idx;
@ -31,26 +88,15 @@ void initNewVif(int idx) {
nVif[idx].vifCache = NULL;
nVif[idx].partTransfer = 0;
HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadWrite, false);
memset8<0xcc>( nVifUpkExec );
xSetPtr( nVifUpkExec );
for (int a = 0; a < 2; a++) {
for (int b = 0; b < 2; b++) {
for (int c = 0; c < 4; c++) {
nVifGen(a, b, c);
}}}
HostSys::MemProtectStatic(nVifUpkExec, Protect_ReadOnly, true);
VpuUnpackSSE_Init();
if (newVifDynaRec) dVifInit(idx);
}
_f u8* setVUptr(int vuidx, const u8* vuMemBase, int offset) {
static _f u8* setVUptr(int vuidx, const u8* vuMemBase, int offset) {
return (u8*)(vuMemBase + ( offset & (vuidx ? 0x3ff0 : 0xff0) ));
}
_f void incVUptr(int vuidx, u8* &ptr, const u8* vuMemBase, int amount) {
static _f void incVUptr(int vuidx, u8* &ptr, const u8* vuMemBase, int amount) {
pxAssert( ((uptr)ptr & 0xf) == 0 ); // alignment check
ptr += amount;
int diff = ptr - (vuMemBase + (vuidx ? 0x4000 : 0x1000));
@ -59,7 +105,7 @@ _f void incVUptr(int vuidx, u8* &ptr, const u8* vuMemBase, int amount) {
}
}
_f void incVUptrBy16(int vuidx, u8* &ptr, const u8* vuMemBase) {
static _f void incVUptrBy16(int vuidx, u8* &ptr, const u8* vuMemBase) {
pxAssert( ((uptr)ptr & 0xf) == 0 ); // alignment check
ptr += 16;
if( ptr == (vuMemBase + (vuidx ? 0x4000 : 0x1000)) )
@ -73,16 +119,16 @@ int nVifUnpack(int idx, u8* data) {
vifRegs = v.vifRegs;
int ret = aMin(vif->vifpacketsize, vif->tag.size);
s32 size = ret << 2;
u32 vifT = nVifT[vif->cmd & 0xf];
const u8& vifT = nVifT[vif->cmd & 0xf];
vif->tag.size -= ret;
const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
if (v.partTransfer) { // Last transfer was a partial vector transfer...
const bool doMode = vifRegs->mode && !(vif->tag.cmd & 0x10);
const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
const u8 upkNum = vif->cmd & 0x1f;
const VUFT& ft = VIFfuncTable[upkNum];
const int diff = ft.gsize - v.partTransfer;
const int diff = vifT - v.partTransfer;
memcpy(&v.partBuffer[v.partTransfer], data, diff);
UnpackSingleTable[idx][doMode][isFill]( v.partBuffer, size );
data += diff;
@ -95,8 +141,8 @@ int nVifUnpack(int idx, u8* data) {
u32 oldNum = vifRegs->num;
if (size > 0) {
if (newVifDynaRec) dVifUnpack(idx, data, size);
else _nVifUnpack(idx, data, size);
if (newVifDynaRec) dVifUnpack(idx, data, size, isFill);
else _nVifUnpack(idx, data, size, isFill);
}
u32 s =(size/vifT) * vifT;
@ -230,7 +276,7 @@ __releaseinline void __fastcall _nVifUnpackLoop(u8 *data, u32 size) {
}
}
_f void _nVifUnpack(int idx, u8 *data, u32 size) {
_f void _nVifUnpack(int idx, u8 *data, u32 size, bool isFill) {
if (useOldUnpack) {
if (!idx) VIFunpack<0>((u32*)data, &vif0.tag, size>>2);
@ -239,7 +285,6 @@ _f void _nVifUnpack(int idx, u8 *data, u32 size) {
}
const bool doMode = vifRegs->mode && !(vif->tag.cmd & 0x10);
const bool isFill = (vifRegs->cycle.cl < vifRegs->cycle.wl);
UnpackLoopTable[idx][doMode][isFill]( data, size );
}
#endif

View File

@ -1,240 +0,0 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2009 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#define xMaskWrite(regX) { \
xMOVAPS(xmm7, ptr32[ecx]); \
int offX = aMin(curCycle, 3); \
xPAND(regX, ptr32[nVifMask[0][offX]]); \
xPAND(xmm7, ptr32[nVifMask[1][offX]]); \
xPOR (regX, ptr32[nVifMask[2][offX]]); \
xPOR (regX, xmm7); \
xMOVAPS(ptr32[ecx], regX); \
}
#define xMovDest(regX) { \
if (!mask) { xMOVAPS (ptr32[ecx], regX); } \
else { xMaskWrite(regX); } \
}
#define xShiftR(regX, n) { \
if (usn) { xPSRL.D(regX, n); } \
else { xPSRA.D(regX, n); } \
}
#define xPMOVXX8(regX, src) { \
if (usn) xPMOVZX.BD(regX, src); \
else xPMOVSX.BD(regX, src); \
}
#define xPMOVXX16(regX, src) { \
if (usn) xPMOVZX.WD(regX, src); \
else xPMOVSX.WD(regX, src); \
}
struct VifUnpackIndexer {
int usn, mask;
int curCycle, cyclesToWrite;
nVifCall& GetCall(int packType) const {
int usnpart = usn*2*16;
int maskpart = mask*16;
int packpart = packType;
int curpart = curCycle;
return nVifUpk[((usnpart+maskpart+packpart) * 4) + (curpart)];
}
void xSetCall(int packType) const {
GetCall( packType ) = (nVifCall)xGetAlignedCallTarget();
}
void xSetNullCall(int packType) const {
GetCall( packType ) = NULL;
}
};
// xMOVSS doesn't seem to have all overloads defined with new emitter
#define xMOVSSS(regX, loc) SSE_MOVSS_Rm_to_XMM(0, 2, 0)
#define xMOV8(regX, loc) xMOVSSS(regX, loc)
#define xMOV16(regX, loc) xMOVSSS(regX, loc)
#define xMOV32(regX, loc) xMOVSSS(regX, loc)
#define xMOV64(regX, loc) xMOVUPS(regX, loc)
#define xMOV128(regX, loc) xMOVUPS(regX, loc)
// ecx = dest, edx = src
void nVifGen(int usn, int mask, int curCycle) {
const VifUnpackIndexer indexer = { usn, mask, curCycle, 0 };
indexer.xSetCall(0x0); // S-32
xMOV32 (xmm0, ptr32[edx]);
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
xRET();
indexer.xSetCall(0x1); // S-16
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
xMOV16 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
xRET();
indexer.xSetCall(0x2); // S-8
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
xMOV8 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xPSHUF.D (xmm1, xmm0, _v0);
xMovDest (xmm1);
xRET();
indexer.xSetNullCall(0x3); // ----
indexer.xSetCall(0x4); // V2-32
xMOV64 (xmm0, ptr32[edx]);
xMovDest (xmm0);
xRET();
indexer.xSetCall(0x5); // V2-16
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
xMOV32 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
xRET();
indexer.xSetCall(0x6); // V2-8
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
xMOV16 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
xRET();
indexer.xSetNullCall(0x7); // ----
indexer.xSetCall(0x8); // V3-32
xMOV128 (xmm0, ptr32[edx]);
xMovDest (xmm0);
xRET();
indexer.xSetCall(0x9); // V3-16
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
xMOV64 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
xRET();
indexer.xSetCall(0xa); // V3-8
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
xMOV32 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
xRET();
indexer.xSetNullCall(0xb); // ----
indexer.xSetCall(0xc); // V4-32
xMOV128 (xmm0, ptr32[edx]);
xMovDest (xmm0);
xRET();
indexer.xSetCall(0xd); // V4-16
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX16 (xmm0, ptr64[edx]);
}
else {
xMOV64 (xmm0, ptr32[edx]);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 16);
}
xMovDest (xmm0);
xRET();
indexer.xSetCall(0xe); // V4-8
if (x86caps.hasStreamingSIMD4Extensions) {
xPMOVXX8 (xmm0, ptr32[edx]);
}
else {
xMOV32 (xmm0, ptr32[edx]);
xPUNPCK.LBW(xmm0, xmm0);
xPUNPCK.LWD(xmm0, xmm0);
xShiftR (xmm0, 24);
}
xMovDest (xmm0);
xRET();
// A | B5 | G5 | R5
// ..0.. A 0000000 | ..0.. B 000 | ..0.. G 000 | ..0.. R 000
// Optimization: This function has a *really* long dependency chain.
// It would be better if the [edx] is loaded into multiple regs and
// then the regs are shifted each independently, instead of using the
// progressive shift->move pattern below. --air
indexer.xSetCall(0xf); // V4-5
xMOV16 (xmm0, ptr32[edx]);
xMOVAPS (xmm1, xmm0);
xPSLL.D (xmm1, 3); // ABG|R5.000
xMOVAPS (xmm2, xmm1);// R5.000 (garbage upper bits)
xPSRL.D (xmm1, 8); // ABG
xPSLL.D (xmm1, 3); // AB|G5.000
xMOVAPS (xmm3, xmm1);// G5.000 (garbage upper bits)
xPSRL.D (xmm1, 8); // AB
xPSLL.D (xmm1, 3); // A|B5.000
xMOVAPS (xmm4, xmm1);// B5.000 (garbage upper bits)
xPSRL.D (xmm1, 8); // A
xPSLL.D (xmm1, 7); // A.0000000
xPSHUF.D (xmm1, xmm1, _v0); // A|A|A|A
xPSHUF.D (xmm3, xmm3, _v0); // G|G|G|G
xPSHUF.D (xmm4, xmm4, _v0); // B|B|B|B
mVUmergeRegs(XMM2, XMM1, 0x3); // A|x|x|R
mVUmergeRegs(XMM2, XMM3, 0x4); // A|x|G|R
mVUmergeRegs(XMM2, XMM4, 0x2); // A|B|G|R
xPSLL.D (xmm2, 24); // can optimize to
xPSRL.D (xmm2, 24); // single AND...
xMovDest (xmm2);
xRET();
pxAssert( ((uptr)xGetPtr() - (uptr)nVifUpkExec) < sizeof(nVifUpkExec) );
}