/* PCSX2 - PS2 Emulator for PCs
 * Copyright (C) 2002-2010 PCSX2 Dev Team
 *
 * PCSX2 is free software: you can redistribute it and/or modify it under the terms
 * of the GNU Lesser General Public License as published by the Free Software Found-
 * ation, either version 3 of the License, or (at your option) any later version.
 *
 * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with PCSX2.
 * If not, see <http://www.gnu.org/licenses/>.
 */

#include "PrecompiledHeader.h"
|
|
|
|
#include "Common.h"
|
|
#include "GS.h"
|
|
#include "R5900OpcodeTables.h"
|
|
#include "iR5900.h"
|
|
#include "iMMI.h"
|
|
#include "iFPU.h"
|
|
#include "iCOP0.h"
|
|
#include "VUmicro.h"
|
|
#include "VUflags.h"
|
|
#include "sVU_Micro.h"
|
|
#include "sVU_Debug.h"
|
|
#include "sVU_zerorec.h"
|
|
|
|
#ifdef _WIN32
|
|
#pragma warning(disable:4244)
|
|
#pragma warning(disable:4761)
|
|
#endif
|
|
//------------------------------------------------------------------
|
|
|
|
// fixme - VUmicro should really use its own static vars for pc and branch.
|
|
// Sharing with the EE's copies of pc and branch is not cool! (air)
|
|
|
|
//------------------------------------------------------------------
|
|
// Helper Macros
|
|
//------------------------------------------------------------------
|
|
#define _Ft_ (( VU->code >> 16) & 0x1F) // The rt part of the instruction register
|
|
#define _Fs_ (( VU->code >> 11) & 0x1F) // The rd part of the instruction register
|
|
#define _Fd_ (( VU->code >> 6) & 0x1F) // The sa part of the instruction register
|
|
#define _It_ (_Ft_ & 15)
|
|
#define _Is_ (_Fs_ & 15)
|
|
#define _Id_ (_Fd_ & 15)
|
|
|
|
#define _X (( VU->code>>24) & 0x1)
|
|
#define _Y (( VU->code>>23) & 0x1)
|
|
#define _Z (( VU->code>>22) & 0x1)
|
|
#define _W (( VU->code>>21) & 0x1)
|
|
|
|
#define _XYZW_SS (_X+_Y+_Z+_W==1)
|
|
|
|
#define _Fsf_ (( VU->code >> 21) & 0x03)
|
|
#define _Ftf_ (( VU->code >> 23) & 0x03)
|
|
|
|
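// _Imm11_ sign-extends the 11-bit immediate in the opcode's low bits:
// bit 10 is the sign, so e.g. 0x7ff decodes to -1 and 0x3ff to +1023.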
#define _Imm11_ (s32)(VU->code & 0x400 ? 0xfffffc00 | (VU->code & 0x3ff) : VU->code & 0x3ff)
#define _UImm11_ (s32)(VU->code & 0x7ff)

#define VU_VFx_ADDR(x) (uptr)&VU->VF[x].UL[0]
#define VU_VFy_ADDR(x) (uptr)&VU->VF[x].UL[1]
#define VU_VFz_ADDR(x) (uptr)&VU->VF[x].UL[2]
#define VU_VFw_ADDR(x) (uptr)&VU->VF[x].UL[3]

#define VU_REGR_ADDR (uptr)&VU->VI[REG_R]
#define VU_REGQ_ADDR (uptr)&VU->VI[REG_Q]
#define VU_REGMAC_ADDR (uptr)&VU->VI[REG_MAC_FLAG]

#define VU_VI_ADDR(x, read) GetVIAddr(VU, x, read, info)

#define VU_ACCx_ADDR (uptr)&VU->ACC.UL[0]
#define VU_ACCy_ADDR (uptr)&VU->ACC.UL[1]
#define VU_ACCz_ADDR (uptr)&VU->ACC.UL[2]
#define VU_ACCw_ADDR (uptr)&VU->ACC.UL[3]

#define _X_Y_Z_W ((( VU->code >> 21 ) & 0xF ) )
//------------------------------------------------------------------


//------------------------------------------------------------------
// Global Variables
//------------------------------------------------------------------
int vucycle;

const __aligned16 float s_fones[8]  = {1.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f};
const __aligned16 u32 s_mask[4]     = {0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff};
const __aligned16 u32 s_expmask[4]  = {0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
const __aligned16 u32 g_minvals[4]  = {0xff7fffff, 0xff7fffff, 0xff7fffff, 0xff7fffff};
const __aligned16 u32 g_maxvals[4]  = {0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff};
const __aligned16 u32 const_clip[8] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff,
                                       0x80000000, 0x80000000, 0x80000000, 0x80000000};

const __aligned(64) u32 g_ones[4]   = {0x00000001, 0x00000001, 0x00000001, 0x00000001};

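// Per-component clamp tables, indexed by the 4-bit XYZW field (x = bit 3 ... w = bit 0).
// Selected components hold -fMax/+fMax (0xff7fffff/0x7f7fffff); unselected ones hold
// identity values for PMINUD/PMINSD (0xffffffff/0x7fffffff), so a single packed min
// only clamps the components named by the mask.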
const __aligned16 u32 g_minvals_XYZW[16][4] =
{
	{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, //0000
	{ 0xffffffff, 0xffffffff, 0xffffffff, 0xff7fffff }, //0001
	{ 0xffffffff, 0xffffffff, 0xff7fffff, 0xffffffff }, //0010
	{ 0xffffffff, 0xffffffff, 0xff7fffff, 0xff7fffff }, //0011
	{ 0xffffffff, 0xff7fffff, 0xffffffff, 0xffffffff }, //0100
	{ 0xffffffff, 0xff7fffff, 0xffffffff, 0xff7fffff }, //0101
	{ 0xffffffff, 0xff7fffff, 0xff7fffff, 0xffffffff }, //0110
	{ 0xffffffff, 0xff7fffff, 0xff7fffff, 0xff7fffff }, //0111
	{ 0xff7fffff, 0xffffffff, 0xffffffff, 0xffffffff }, //1000
	{ 0xff7fffff, 0xffffffff, 0xffffffff, 0xff7fffff }, //1001
	{ 0xff7fffff, 0xffffffff, 0xff7fffff, 0xffffffff }, //1010
	{ 0xff7fffff, 0xffffffff, 0xff7fffff, 0xff7fffff }, //1011
	{ 0xff7fffff, 0xff7fffff, 0xffffffff, 0xffffffff }, //1100
	{ 0xff7fffff, 0xff7fffff, 0xffffffff, 0xff7fffff }, //1101
	{ 0xff7fffff, 0xff7fffff, 0xff7fffff, 0xffffffff }, //1110
	{ 0xff7fffff, 0xff7fffff, 0xff7fffff, 0xff7fffff }, //1111
};
const __aligned16 u32 g_maxvals_XYZW[16][4] =
{
	{ 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //0000
	{ 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7f7fffff }, //0001
	{ 0x7fffffff, 0x7fffffff, 0x7f7fffff, 0x7fffffff }, //0010
	{ 0x7fffffff, 0x7fffffff, 0x7f7fffff, 0x7f7fffff }, //0011
	{ 0x7fffffff, 0x7f7fffff, 0x7fffffff, 0x7fffffff }, //0100
	{ 0x7fffffff, 0x7f7fffff, 0x7fffffff, 0x7f7fffff }, //0101
	{ 0x7fffffff, 0x7f7fffff, 0x7f7fffff, 0x7fffffff }, //0110
	{ 0x7fffffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff }, //0111
	{ 0x7f7fffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //1000
	{ 0x7f7fffff, 0x7fffffff, 0x7fffffff, 0x7f7fffff }, //1001
	{ 0x7f7fffff, 0x7fffffff, 0x7f7fffff, 0x7fffffff }, //1010
	{ 0x7f7fffff, 0x7fffffff, 0x7f7fffff, 0x7f7fffff }, //1011
	{ 0x7f7fffff, 0x7f7fffff, 0x7fffffff, 0x7fffffff }, //1100
	{ 0x7f7fffff, 0x7f7fffff, 0x7fffffff, 0x7f7fffff }, //1101
	{ 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7fffffff }, //1110
	{ 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff }, //1111
};
//------------------------------------------------------------------

//------------------------------------------------------------------
// VU Pipeline/Test Stalls/Analyzing Functions
//------------------------------------------------------------------
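// Pipe bookkeeping: each busy pipe records its issue cycle (sCycle) and latency
// (Cycle), and a flush retires pipes whose results are ready. When 'intermediate'
// is set (called mid-instruction from stall detection) the compare is strict,
// so a result landing exactly on the current cycle still counts as busy.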
void _recvuFMACflush(VURegs * VU, bool intermediate) {
	int i;

	for (i=0; i<8; i++) {
		if (VU->fmac[i].enable == 0) continue;

		if( intermediate ) {
			if ((vucycle - VU->fmac[i].sCycle) > VU->fmac[i].Cycle) {
//				VUM_LOG("flushing FMAC pipe[%d]", i);
				VU->fmac[i].enable = 0;
			}
		}
		else {
			if ((vucycle - VU->fmac[i].sCycle) >= VU->fmac[i].Cycle) {
//				VUM_LOG("flushing FMAC pipe[%d]", i);
				VU->fmac[i].enable = 0;
			}
		}
	}
}

void _recvuFDIVflush(VURegs * VU, bool intermediate) {
	if (VU->fdiv.enable == 0) return;

	if( intermediate ) {
		if ((vucycle - VU->fdiv.sCycle) > VU->fdiv.Cycle) {
//			Console.WriteLn("flushing FDIV pipe");
			VU->fdiv.enable = 0;
		}
	}
	else {
		if ((vucycle - VU->fdiv.sCycle) >= VU->fdiv.Cycle) {
//			Console.WriteLn("flushing FDIV pipe");
			VU->fdiv.enable = 0;
		}
	}
}

void _recvuEFUflush(VURegs * VU, bool intermediate) {
	if (VU->efu.enable == 0) return;

	if( intermediate ) {
		if ((vucycle - VU->efu.sCycle) > VU->efu.Cycle) {
//			Console.WriteLn("flushing EFU pipe");
			VU->efu.enable = 0;
		}
	}
	else {
		if ((vucycle - VU->efu.sCycle) >= VU->efu.Cycle) {
//			Console.WriteLn("flushing EFU pipe");
			VU->efu.enable = 0;
		}
	}
}

void _recvuIALUflush(VURegs * VU, bool intermediate) {
	int i;

	for (i=0; i<8; i++) {
		if (VU->ialu[i].enable == 0) continue;

		if( intermediate ) {
			if ((vucycle - VU->ialu[i].sCycle) > VU->ialu[i].Cycle) {
//				VUM_LOG("flushing IALU pipe[%d]", i);
				VU->ialu[i].enable = 0;
			}
		}
		else {
			if ((vucycle - VU->ialu[i].sCycle) >= VU->ialu[i].Cycle) {
//				VUM_LOG("flushing IALU pipe[%d]", i);
				VU->ialu[i].enable = 0;
			}
		}
	}
}

void _recvuTestPipes(VURegs * VU, bool intermediate) { // intermediate = true if called by upper FMAC stall detection
	_recvuFMACflush(VU, intermediate);
	_recvuFDIVflush(VU, intermediate);
	_recvuEFUflush(VU, intermediate);
	_recvuIALUflush(VU, intermediate);
}

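// FMAC results are modeled with a flat 4-cycle latency (mask = 4 below); the
// disabled code was an experiment with per-component delays that was abandoned.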
void _recvuFMACTestStall(VURegs * VU, int reg, int xyzw) {
	int cycle;
	int i;
	u32 mask = 0;

	for (i=0; i<8; i++) {
		if (VU->fmac[i].enable == 0) continue;
		if (VU->fmac[i].reg == reg && (VU->fmac[i].xyzw & xyzw)) break;
	}

	if (i == 8) return;

	// do a per-channel delay
	// old code
//	cycle = VU->fmac[i].Cycle - (vucycle - VU->fmac[i].sCycle);

	// new code
	mask = 4; // w
//	if( VU->fmac[i].xyzw & 1 ) mask = 4; // w
//	else if( VU->fmac[i].xyzw & 2 ) mask = 3; // z
//	else if( VU->fmac[i].xyzw & 4 ) mask = 2; // y
//	else {
//		assert(VU->fmac[i].xyzw & 8 );
//		mask = 1; // x
//	}

//	mask = 0;
//	if( VU->fmac[i].xyzw & 1 ) mask++; // w
//	else if( VU->fmac[i].xyzw & 2 ) mask++; // z
//	else if( VU->fmac[i].xyzw & 4 ) mask++; // y
//	else if( VU->fmac[i].xyzw & 8 ) mask++; // x

	assert( (int)VU->fmac[i].sCycle < (int)vucycle );
	cycle = 0;
	if( vucycle - VU->fmac[i].sCycle < mask )
		cycle = mask - (vucycle - VU->fmac[i].sCycle);

	VU->fmac[i].enable = 0;
	vucycle += cycle;
	_recvuTestPipes(VU, true); // for lower instructions
}

void _recvuIALUTestStall(VURegs * VU, int reg) {
	int cycle;
	int i;
	u32 latency;

	for (i=0; i<8; i++) {
		if (VU->ialu[i].enable == 0) continue;
		if (VU->ialu[i].reg == reg) break;
	}

	if (i == 8) return;

	latency = VU->ialu[i].Cycle + 1;
	cycle = 0;
	if( vucycle - VU->ialu[i].sCycle < latency )
		cycle = latency - (vucycle - VU->ialu[i].sCycle);

	VU->ialu[i].enable = 0;
	vucycle += cycle;
	_recvuTestPipes(VU, true);
}

void _recvuFMACAdd(VURegs * VU, int reg, int xyzw) {
	int i;

	/* find a free fmac pipe */
	for (i=0; i<8; i++) {
		if (VU->fmac[i].enable == 1) continue;
		break;
	}

	if (i==8) Console.Error("*PCSX2*: error, out of fmacs");
//	VUM_LOG("adding FMAC pipe[%d]; reg %d", i, reg);

	VU->fmac[i].enable = 1;
	VU->fmac[i].sCycle = vucycle;
	VU->fmac[i].Cycle = 3;
	VU->fmac[i].xyzw = xyzw;
	VU->fmac[i].reg = reg;
}

void _recvuFDIVAdd(VURegs * VU, int cycles) {
//	Console.WriteLn("adding FDIV pipe");
	VU->fdiv.enable = 1;
	VU->fdiv.sCycle = vucycle;
	VU->fdiv.Cycle = cycles;
}

void _recvuEFUAdd(VURegs * VU, int cycles) {
//	Console.WriteLn("adding EFU pipe");
	VU->efu.enable = 1;
	VU->efu.sCycle = vucycle;
	VU->efu.Cycle = cycles;
}

void _recvuIALUAdd(VURegs * VU, int reg, int cycles) {
	int i;

	/* find a free ialu pipe */
	for (i=0; i<8; i++) {
		if (VU->ialu[i].enable == 1) continue;
		break;
	}

	if (i==8) Console.Error("*PCSX2*: error, out of ialus");

	VU->ialu[i].enable = 1;
	VU->ialu[i].sCycle = vucycle;
	VU->ialu[i].Cycle = cycles;
	VU->ialu[i].reg = reg;
}

void _recvuTestIALUStalls(VURegs * VU, _VURegsNum *VUregsn) {

	int VIread0 = 0, VIread1 = 0; // max 2 integer registers are read simultaneously
	int i;

	for(i=0;i<16;i++) { // find used integer (vi00-vi15) registers
		if( (VUregsn->VIread >> i) & 1 ) {
			if( VIread0 ) VIread1 = i;
			else VIread0 = i;
		}
	}

	if( VIread0 ) _recvuIALUTestStall(VU, VIread0);
	if( VIread1 ) _recvuIALUTestStall(VU, VIread1);
}

void _recvuAddIALUStalls(VURegs * VU, _VURegsNum *VUregsn) {
	if (VUregsn->VIwrite && VUregsn->cycles) {
		int VIWrite0 = 0;
		int i;

		for(i=0;i<16;i++) { // find used (vi00-vi15) registers
			if( (VUregsn->VIwrite >> i) & 1 ) {
				VIWrite0 = i;
			}
		}
		if( VIWrite0 ) _recvuIALUAdd(VU, VIWrite0, VUregsn->cycles);
	}
}

void _recvuTestFMACStalls(VURegs * VU, _VURegsNum *VUregsn, bool upper) {

	if( VUregsn->VFread0 && (VUregsn->VFread0 == VUregsn->VFread1) ) {
		_recvuFMACTestStall(VU, VUregsn->VFread0, VUregsn->VFr0xyzw|VUregsn->VFr1xyzw);
	}
	else {
		if (VUregsn->VFread0) _recvuFMACTestStall(VU, VUregsn->VFread0, VUregsn->VFr0xyzw);
		if (VUregsn->VFread1) _recvuFMACTestStall(VU, VUregsn->VFread1, VUregsn->VFr1xyzw);
	}

	if( !upper && VUregsn->VIread ) _recvuTestIALUStalls(VU, VUregsn); // for lower instructions which read integer reg
}

void _recvuAddFMACStalls(VURegs * VU, _VURegsNum *VUregsn) {

	if (VUregsn->VFwrite) _recvuFMACAdd(VU, VUregsn->VFwrite, VUregsn->VFwxyzw);
	else if (VUregsn->VIwrite & (1 << REG_CLIP_FLAG)) _recvuFMACAdd(VU, -REG_CLIP_FLAG, 0); // REG_CLIP_FLAG pipe
	else _recvuFMACAdd(VU, 0, 0); // cause no data dependency with fp registers
}

void _recvuFlushFDIV(VURegs * VU) {
	int cycle;

	if (VU->fdiv.enable == 0) return;

	cycle = VU->fdiv.Cycle + 1 - (vucycle - VU->fdiv.sCycle); // VU->fdiv.Cycle contains the latency minus 1 (6 or 12)
//	Console.WriteLn("waiting FDIV pipe %d", cycle);
	VU->fdiv.enable = 0;
	vucycle += cycle;
}

void _recvuFlushEFU(VURegs * VU) {
	int cycle;

	if (VU->efu.enable == 0) return;

	cycle = VU->efu.Cycle - (vucycle - VU->efu.sCycle);
//	Console.WriteLn("waiting EFU pipe %d", cycle);
	VU->efu.enable = 0;
	vucycle += cycle;
}

void _recvuTestFDIVStalls(VURegs * VU, _VURegsNum *VUregsn) {
	_recvuTestFMACStalls(VU, VUregsn, false);
	_recvuFlushFDIV(VU);
}

void _recvuTestEFUStalls(VURegs * VU, _VURegsNum *VUregsn) {
	_recvuTestFMACStalls(VU, VUregsn, false);
	_recvuFlushEFU(VU);
}

void _recvuAddFDIVStalls(VURegs * VU, _VURegsNum *VUregsn) {
//	_vuTestFMACStalls(VURegs * VU, _VURegsNum *VUregsn);
	if (VUregsn->VIwrite & (1 << REG_Q)) {
		_recvuFDIVAdd(VU, VUregsn->cycles);
	}
}

void _recvuAddEFUStalls(VURegs * VU, _VURegsNum *VUregsn) {
//	_vuTestFMACStalls(VURegs * VU, _VURegsNum *VUregsn);
	if (VUregsn->VIwrite & (1 << REG_P)) {
		_recvuEFUAdd(VU, VUregsn->cycles);
	}
}

void _recvuTestUpperStalls(VURegs * VU, _VURegsNum *VUregsn) {
	switch (VUregsn->pipe) {
		case VUPIPE_FMAC: _recvuTestFMACStalls(VU, VUregsn, true); break;
	}
}

void _recvuTestLowerStalls(VURegs * VU, _VURegsNum *VUregsn) {
	switch (VUregsn->pipe) {
		case VUPIPE_FMAC:   _recvuTestFMACStalls(VU, VUregsn, false); break;
		case VUPIPE_FDIV:   _recvuTestFDIVStalls(VU, VUregsn); break;
		case VUPIPE_EFU:    _recvuTestEFUStalls(VU, VUregsn); break;
		case VUPIPE_IALU:   _recvuTestIALUStalls(VU, VUregsn); break;
		case VUPIPE_BRANCH: _recvuTestIALUStalls(VU, VUregsn); break;
	}
}

void _recvuAddUpperStalls(VURegs * VU, _VURegsNum *VUregsn) {
	switch (VUregsn->pipe) {
		case VUPIPE_FMAC: _recvuAddFMACStalls(VU, VUregsn); break;
	}
}

void _recvuAddLowerStalls(VURegs * VU, _VURegsNum *VUregsn) {
	switch (VUregsn->pipe) {
		case VUPIPE_FMAC: _recvuAddFMACStalls(VU, VUregsn); break;
		case VUPIPE_FDIV: _recvuAddFDIVStalls(VU, VUregsn); break;
		case VUPIPE_EFU:  _recvuAddEFUStalls(VU, VUregsn); break;
		case VUPIPE_IALU: _recvuAddIALUStalls(VU, VUregsn); break; // note: only ILW and ILWR cause stall in IALU pipe
	}
}

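// Analyzes one VU instruction pair (lower word at ptr[0], upper at ptr[1]):
// decodes register usage, applies/records pipeline stalls, and fills 'info'
// with the cycle count plus q/p pipe and status/mac flag usage for the recompiler.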
void SuperVUAnalyzeOp(VURegs *VU, _vuopinfo *info, _VURegsNum* pCodeRegs)
{
	_VURegsNum* lregs;
	_VURegsNum* uregs;
	int *ptr;

	lregs = pCodeRegs;
	uregs = pCodeRegs+1;

	ptr = (int*)&VU->Micro[pc];
	pc += 8;

	if (ptr[1] & 0x40000000) { // EOP
		branch |= 8;
	}

	VU->code = ptr[1];
	if (VU == &VU1) VU1regs_UPPER_OPCODE[VU->code & 0x3f](uregs);
	else VU0regs_UPPER_OPCODE[VU->code & 0x3f](uregs);

	_recvuTestUpperStalls(VU, uregs);
	switch(VU->code & 0x3f) {
		case 0x10: case 0x11: case 0x12: case 0x13:
		case 0x14: case 0x15: case 0x16: case 0x17:
		case 0x1d: case 0x1f:
		case 0x2b: case 0x2f:
			break;

		case 0x3c:
			switch ((VU->code >> 6) & 0x1f) {
				case 0x4: case 0x5:
					break;
				default:
					info->statusflag = 4;
					info->macflag = 4;
					break;
			}
			break;
		case 0x3d:
			switch ((VU->code >> 6) & 0x1f) {
				case 0x4: case 0x5: case 0x7:
					break;
				default:
					info->statusflag = 4;
					info->macflag = 4;
					break;
			}
			break;
		case 0x3e:
			switch ((VU->code >> 6) & 0x1f) {
				case 0x4: case 0x5:
					break;
				default:
					info->statusflag = 4;
					info->macflag = 4;
					break;
			}
			break;
		case 0x3f:
			switch ((VU->code >> 6) & 0x1f) {
				case 0x4: case 0x5: case 0x7: case 0xb:
					break;
				default:
					info->statusflag = 4;
					info->macflag = 4;
					break;
			}
			break;

		default:
			info->statusflag = 4;
			info->macflag = 4;
			break;
	}

	if (uregs->VIread & (1 << REG_Q)) { info->q |= 2; }
	if (uregs->VIread & (1 << REG_P)) { info->p |= 2; assert( VU == &VU1 ); }

	// check upper flags
	if (ptr[1] & 0x80000000) { // I flag
		info->cycle = vucycle;
		memzero(*lregs);
	}
	else {

		VU->code = ptr[0];
		if (VU == &VU1) VU1regs_LOWER_OPCODE[VU->code >> 25](lregs);
		else VU0regs_LOWER_OPCODE[VU->code >> 25](lregs);

		_recvuTestLowerStalls(VU, lregs);
		info->cycle = vucycle;

		if (lregs->pipe == VUPIPE_BRANCH) {
			branch |= 1;
		}

		if (lregs->VIwrite & (1 << REG_Q)) {
			info->q |= 4;
			info->cycles = lregs->cycles;
			info->pqinst = (VU->code&2)>>1; // rsqrt is 2
		}
		else if (lregs->pipe == VUPIPE_FDIV) {
			info->q |= 8|1;
			info->pqinst = 0;
		}

		if (lregs->VIwrite & (1 << REG_P)) {
			assert( VU == &VU1 );
			info->p |= 4;
			info->cycles = lregs->cycles;

			switch( VU->code & 0xff ) {
				case 0xfd: info->pqinst = 0; break; //eatan
				case 0x7c: info->pqinst = 0; break; //eatanxy
				case 0x7d: info->pqinst = 0; break; //eatanzy
				case 0xfe: info->pqinst = 1; break; //eexp
				case 0xfc: info->pqinst = 2; break; //esin
				case 0x3f: info->pqinst = 3; break; //erleng
				case 0x3e: info->pqinst = 4; break; //eleng
				case 0x3d: info->pqinst = 4; break; //ersadd
				case 0xbd: info->pqinst = 4; break; //ersqrt
				case 0xbe: info->pqinst = 5; break; //ercpr
				case 0xbc: info->pqinst = 5; break; //esqrt
				case 0x7e: info->pqinst = 5; break; //esum
				case 0x3c: info->pqinst = 6; break; //esadd
				default: assert(0);
			}
		}
		else if (lregs->pipe == VUPIPE_EFU) {
			info->p |= 8|1;
		}

		if (lregs->VIread & (1 << REG_STATUS_FLAG)) info->statusflag |= VUOP_READ;
		if (lregs->VIread & (1 << REG_MAC_FLAG))    info->macflag    |= VUOP_READ;

		if (lregs->VIwrite & (1 << REG_STATUS_FLAG)) info->statusflag |= VUOP_WRITE;
		if (lregs->VIwrite & (1 << REG_MAC_FLAG))    info->macflag    |= VUOP_WRITE;

		if (lregs->VIread & (1 << REG_Q)) { info->q |= 2; }
		if (lregs->VIread & (1 << REG_P)) { info->p |= 2; assert( VU == &VU1 ); }

		_recvuAddLowerStalls(VU, lregs);
	}

	_recvuAddUpperStalls(VU, uregs);
	_recvuTestPipes(VU, false);

	vucycle++;
}

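// Register setup for COP2 (macro mode) recompilation: allocates XMM registers
// for the instruction's VF operands and packs the assignments into the returned
// 'info' bitfield via the PROCESS_EE_SET_* macros.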
int eeVURecompileCode(VURegs *VU, _VURegsNum* regs)
{
	int info = 0;
	int vfread0 = -1, vfread1 = -1, vfwrite = -1, vfacc = -1, vftemp = -1;

	assert( regs != NULL );

	if( regs->VFread0 ) _addNeededVFtoXMMreg(regs->VFread0);
	if( regs->VFread1 ) _addNeededVFtoXMMreg(regs->VFread1);
	if( regs->VFwrite ) _addNeededVFtoXMMreg(regs->VFwrite);
	if( regs->VIread & (1<<REG_ACC_FLAG) ) _addNeededACCtoXMMreg();
	if( regs->VIread & (1<<REG_VF0_FLAG) ) _addNeededVFtoXMMreg(0);

	// alloc
	if( regs->VFread0 ) vfread0 = _allocVFtoXMMreg(VU, -1, regs->VFread0, MODE_READ);
	else if( regs->VIread & (1<<REG_VF0_FLAG) ) vfread0 = _allocVFtoXMMreg(VU, -1, 0, MODE_READ);
	if( regs->VFread1 ) vfread1 = _allocVFtoXMMreg(VU, -1, regs->VFread1, MODE_READ);
	else if( (regs->VIread & (1<<REG_VF0_FLAG)) && regs->VFr1xyzw != 0xff) vfread1 = _allocVFtoXMMreg(VU, -1, 0, MODE_READ);

	if( regs->VIread & (1<<REG_ACC_FLAG) ) {
		vfacc = _allocACCtoXMMreg(VU, -1, ((regs->VIwrite&(1<<REG_ACC_FLAG))?MODE_WRITE:0)|MODE_READ);
	}
	else if( regs->VIwrite & (1<<REG_ACC_FLAG) ) {
		vfacc = _allocACCtoXMMreg(VU, -1, MODE_WRITE|(regs->VFwxyzw != 0xf?MODE_READ:0));
	}

	if( regs->VFwrite ) {
		assert( !(regs->VIwrite&(1<<REG_ACC_FLAG)) );
		vfwrite = _allocVFtoXMMreg(VU, -1, regs->VFwrite, MODE_WRITE|(regs->VFwxyzw != 0xf?MODE_READ:0));
	}

	if( vfacc >= 0 ) info |= PROCESS_EE_SET_ACC(vfacc);
	if( vfwrite >= 0 ) {
		if( regs->VFwrite == _Ft_ && vfread1 < 0 ) {
			info |= PROCESS_EE_SET_T(vfwrite);
		}
		else {
			assert( regs->VFwrite == _Fd_ );
			info |= PROCESS_EE_SET_D(vfwrite);
		}
	}

	if( vfread0 >= 0 ) info |= PROCESS_EE_SET_S(vfread0);
	if( vfread1 >= 0 ) info |= PROCESS_EE_SET_T(vfread1);

	vftemp = _allocTempXMMreg(XMMT_FPS, -1);
	info |= PROCESS_VU_SET_TEMP(vftemp);

	if( regs->VIwrite & (1 << REG_CLIP_FLAG) ) {
		// CLIP inst needs two extra temp registers, so put them in EEREC_D and EEREC_ACC
		int t1reg = _allocTempXMMreg(XMMT_FPS, -1);
		int t2reg = _allocTempXMMreg(XMMT_FPS, -1);

		info |= PROCESS_EE_SET_D(t1reg);
		info |= PROCESS_EE_SET_ACC(t2reg);

		_freeXMMreg(t1reg); // don't need
		_freeXMMreg(t2reg); // don't need
	}
	else if( regs->VIwrite & (1<<REG_P) ) {
		int t1reg = _allocTempXMMreg(XMMT_FPS, -1);
		info |= PROCESS_EE_SET_D(t1reg);
		_freeXMMreg(t1reg); // don't need
	}

	_freeXMMreg(vftemp); // don't need it

	return info;
}
//------------------------------------------------------------------


//------------------------------------------------------------------
// Misc VU/VI Allocation Functions
//------------------------------------------------------------------
// returns the correct VI addr
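// Note: for writes (read != 1), the flag registers and Q/P map to separate shadow
// fields (VU->macflag/clipflag/statusflag/q/p) rather than the VI array, so
// in-flight pipeline values can be tracked apart from the architected registers.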
u32 GetVIAddr(VURegs * VU, int reg, int read, int info)
{
	if( info & PROCESS_VU_SUPER ) return SuperVUGetVIAddr(reg, read);
	if( info & PROCESS_VU_COP2 ) return (uptr)&VU->VI[reg].UL;

	if( read != 1 ) {
		if( reg == REG_MAC_FLAG ) return (uptr)&VU->macflag;
		if( reg == REG_CLIP_FLAG ) return (uptr)&VU->clipflag;
		if( reg == REG_STATUS_FLAG ) return (uptr)&VU->statusflag;
		if( reg == REG_Q ) return (uptr)&VU->q;
		if( reg == REG_P ) return (uptr)&VU->p;
	}

	return (uptr)&VU->VI[reg].UL;
}

// gets a temp reg that is not EEREC_TEMP
int _vuGetTempXMMreg(int info)
{
	int t1reg = -1;

	if( _hasFreeXMMreg() ) {
		t1reg = _allocTempXMMreg(XMMT_FPS, -1);

		if( t1reg == EEREC_TEMP ) {
			if( _hasFreeXMMreg() ) {
				int t = _allocTempXMMreg(XMMT_FPS, -1);
				_freeXMMreg(t1reg);
				t1reg = t;
			}
			else {
				_freeXMMreg(t1reg);
				t1reg = -1;
			}
		}
	}

	return t1reg;
}
//------------------------------------------------------------------


//------------------------------------------------------------------
// Misc VU Reg Flipping/Merging Functions
//------------------------------------------------------------------
void _unpackVF_xyzw(int dstreg, int srcreg, int xyzw)
{
	switch (xyzw) {
		case 0: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x00); break;
		case 1: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x55); break;
		case 2: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xaa); break;
		case 3: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xff); break;
	}
}

void _unpackVFSS_xyzw(int dstreg, int srcreg, int xyzw)
{
	switch (xyzw) {
		case 0: SSE_MOVSS_XMM_to_XMM(dstreg, srcreg); break;
		case 1: if ( x86caps.hasStreamingSIMD4Extensions ) SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(1, 0, 0));
				else SSE2_PSHUFLW_XMM_to_XMM(dstreg, srcreg, 0xee);
				break;
		case 2: SSE_MOVHLPS_XMM_to_XMM(dstreg, srcreg); break;
		case 3: if ( x86caps.hasStreamingSIMD4Extensions ) SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(3, 0, 0));
				else { SSE_MOVHLPS_XMM_to_XMM(dstreg, srcreg); SSE2_PSHUFLW_XMM_to_XMM(dstreg, dstreg, 0xee); }
				break;
	}
}

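// For single-component ops: swaps the selected component (y, z, or w) into the
// x slot. Each shuffle immediate used here is its own inverse, so performing
// the same flip again restores the original component order.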
void _vuFlipRegSS(VURegs * VU, int reg)
{
	assert( _XYZW_SS );
	if( _Y ) SSE2_PSHUFLW_XMM_to_XMM(reg, reg, 0x4e);
	else if( _Z ) SSE_SHUFPS_XMM_to_XMM(reg, reg, 0xc6);
	else if( _W ) SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x27);
}

void _vuFlipRegSS_xyzw(int reg, int xyzw)
{
	switch ( xyzw ) {
		case 1: SSE2_PSHUFLW_XMM_to_XMM(reg, reg, 0x4e); break;
		case 2: SSE_SHUFPS_XMM_to_XMM(reg, reg, 0xc6); break;
		case 3: SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x27); break;
	}
}

void _vuMoveSS(VURegs * VU, int dstreg, int srcreg)
{
	assert( _XYZW_SS );
	if( _Y ) _unpackVFSS_xyzw(dstreg, srcreg, 1);
	else if( _Z ) _unpackVFSS_xyzw(dstreg, srcreg, 2);
	else if( _W ) _unpackVFSS_xyzw(dstreg, srcreg, 3);
	else _unpackVFSS_xyzw(dstreg, srcreg, 0);
}

// Merge masks in the comments below are in wzyx order: 1 = take that component
// from src, 0 = keep dest. A trailing 's' marks variants that leave src unmodified.
void VU_MERGE0(int dest, int src) { // 0000s
}
void VU_MERGE1(int dest, int src) { // 1000
	SSE_MOVHLPS_XMM_to_XMM(src, dest);
	SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc4);
}
void VU_MERGE1b(int dest, int src) { // 1000s
	SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
	SSE_MOVSS_XMM_to_XMM(dest, src);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
}
void VU_MERGE2(int dest, int src) { // 0100
	SSE_MOVHLPS_XMM_to_XMM(src, dest);
	SSE_SHUFPS_XMM_to_XMM(dest, src, 0x64);
}
void VU_MERGE2b(int dest, int src) { // 0100s
	SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
	SSE_MOVSS_XMM_to_XMM(dest, src);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
}
void VU_MERGE3(int dest, int src) { // 1100s
	SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4);
}
void VU_MERGE4(int dest, int src) { // 0010
	SSE_MOVSS_XMM_to_XMM(src, dest);
	SSE2_MOVSD_XMM_to_XMM(dest, src);
}
void VU_MERGE4b(int dest, int src) { // 0010s
	SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
	SSE_MOVSS_XMM_to_XMM(dest, src);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
}
void VU_MERGE5(int dest, int src) { // 1010
	SSE_SHUFPS_XMM_to_XMM(dest, src, 0xd8);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xd8);
}
void VU_MERGE5b(int dest, int src) { // 1010s
	SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
	SSE_MOVSS_XMM_to_XMM(dest, src);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
	SSE_MOVSS_XMM_to_XMM(dest, src);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
}
void VU_MERGE6(int dest, int src) { // 0110
	SSE_SHUFPS_XMM_to_XMM(dest, src, 0x9c);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x78);
}
void VU_MERGE6b(int dest, int src) { // 0110s
	SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
	SSE_MOVSS_XMM_to_XMM(dest, src);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
	SSE_MOVSS_XMM_to_XMM(dest, src);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
}
void VU_MERGE7(int dest, int src) { // 1110
	SSE_MOVSS_XMM_to_XMM(src, dest);
	SSE_MOVAPS_XMM_to_XMM(dest, src);
}
void VU_MERGE7b(int dest, int src) { // 1110s
	SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
	SSE_MOVSS_XMM_to_XMM(dest, src);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
}
void VU_MERGE8(int dest, int src) { // 0001s
	SSE_MOVSS_XMM_to_XMM(dest, src);
}
void VU_MERGE9(int dest, int src) { // 1001
	SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc9);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xd2);
}
void VU_MERGE9b(int dest, int src) { // 1001s
	SSE_MOVSS_XMM_to_XMM(dest, src);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
	SSE_MOVSS_XMM_to_XMM(dest, src);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
}
void VU_MERGE10(int dest, int src) { // 0101
	SSE_SHUFPS_XMM_to_XMM(dest, src, 0x8d);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x72);
}
void VU_MERGE10b(int dest, int src) { // 0101s
	SSE_MOVSS_XMM_to_XMM(dest, src);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
	SSE_MOVSS_XMM_to_XMM(dest, src);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
}
void VU_MERGE11(int dest, int src) { // 1101s
	SSE_MOVSS_XMM_to_XMM(dest, src);
	SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4);
}
void VU_MERGE12(int dest, int src) { // 0011
	SSE2_MOVSD_XMM_to_XMM(dest, src);
}
void VU_MERGE13(int dest, int src) { // 1011
	SSE_MOVHLPS_XMM_to_XMM(dest, src);
	SSE_SHUFPS_XMM_to_XMM(src, dest, 0x64);
	SSE_MOVAPS_XMM_to_XMM(dest, src);
}
void VU_MERGE13b(int dest, int src) { // 1011s
	SSE_MOVSS_XMM_to_XMM(dest, src);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
	SSE_MOVSS_XMM_to_XMM(dest, src);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0x27);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0x27);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
	SSE_MOVSS_XMM_to_XMM(dest, src);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
}
void VU_MERGE14(int dest, int src) { // 0111
	SSE_MOVHLPS_XMM_to_XMM(dest, src);
	SSE_SHUFPS_XMM_to_XMM(src, dest, 0xc4);
	SSE_MOVAPS_XMM_to_XMM(dest, src);
}
void VU_MERGE14b(int dest, int src) { // 0111s
	SSE_MOVSS_XMM_to_XMM(dest, src);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
	SSE_MOVSS_XMM_to_XMM(dest, src);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
	SSE_MOVSS_XMM_to_XMM(dest, src);
	SSE_SHUFPS_XMM_to_XMM(src, src, 0xC6);
	SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xC6);
}
void VU_MERGE15(int dest, int src) { // 1111s
	SSE_MOVAPS_XMM_to_XMM(dest, src);
}

typedef void (*VUMERGEFN)(int dest, int src);

static VUMERGEFN s_VuMerge[16] = {
	VU_MERGE0,  VU_MERGE1,  VU_MERGE2,  VU_MERGE3,
	VU_MERGE4,  VU_MERGE5,  VU_MERGE6,  VU_MERGE7,
	VU_MERGE8,  VU_MERGE9,  VU_MERGE10, VU_MERGE11,
	VU_MERGE12, VU_MERGE13, VU_MERGE14, VU_MERGE15 };

static VUMERGEFN s_VuMerge2[16] = {
	VU_MERGE0,  VU_MERGE1b,  VU_MERGE2b,  VU_MERGE3,
	VU_MERGE4b, VU_MERGE5b,  VU_MERGE6b,  VU_MERGE7b,
	VU_MERGE8,  VU_MERGE9b,  VU_MERGE10b, VU_MERGE11,
	VU_MERGE12, VU_MERGE13b, VU_MERGE14b, VU_MERGE15 };

// Modifies the Source Reg!
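// SSE4 fast path note: xyzw is ordered x = bit 3 ... w = bit 0, while the BLENDPS
// immediate selects element i (x = element 0) with bit i, so the nibble is
// bit-reversed first. Example: mask 0b1100 (x and y) becomes imm 0b0011.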
void VU_MERGE_REGS_CUSTOM(int dest, int src, int xyzw) {
	xyzw &= 0xf;
	if ( (dest != src) && (xyzw != 0) ) {
		if ( x86caps.hasStreamingSIMD4Extensions && (xyzw != 0x8) && (xyzw != 0xf) ) {
			xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3);
			SSE4_BLENDPS_XMM_to_XMM(dest, src, xyzw);
		}
		else s_VuMerge[xyzw](dest, src);
	}
}
// Doesn't Modify the Source Reg! (ToDo: s_VuMerge2() has room for optimization)
void VU_MERGE_REGS_SAFE(int dest, int src, int xyzw) {
	xyzw &= 0xf;
	if ( (dest != src) && (xyzw != 0) ) {
		if ( x86caps.hasStreamingSIMD4Extensions && (xyzw != 0x8) && (xyzw != 0xf) ) {
			xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3);
			SSE4_BLENDPS_XMM_to_XMM(dest, src, xyzw);
		}
		else s_VuMerge2[xyzw](dest, src);
	}
}
//------------------------------------------------------------------


//------------------------------------------------------------------
// Misc VU Reg Clamping/Overflow Functions
//------------------------------------------------------------------
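// Integer-compare clamping: IEEE float bit patterns sort such that PMINSD against
// +fMax clamps +NaN/+Inf, and PMINUD against -fMax (0xff7fffff) clamps -NaN/-Inf.
// CLAMP_SIGN_SSE4 keeps the clamped value's sign. CLAMP_NORMAL_SSE4 additionally
// flips the sign of any lane the unsigned min moved by more than 1 (a NaN payload,
// not -Inf), which maps negative NaNs to +fMax.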
#define CLAMP_NORMAL_SSE4(n) \
	SSE_MOVAPS_XMM_to_XMM(regTemp, regd);\
	SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[n][0]);\
	SSE2_PSUBD_XMM_to_XMM(regTemp, regd);\
	SSE2_PCMPGTD_M128_to_XMM(regTemp, (uptr)&g_ones[0]);\
	SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[n][0]);\
	SSE2_PSLLD_I8_to_XMM(regTemp, 31);\
	SSE_XORPS_XMM_to_XMM(regd, regTemp);

#define CLAMP_SIGN_SSE4(n) \
	SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[n][0]);\
	SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[n][0]);

void vFloat0(int regd, int regTemp) { } //0000
void vFloat1(int regd, int regTemp) { //1000
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
}
void vFloat1c(int regd, int regTemp) { //1000
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_SIGN_SSE4(1);
	}
	else {
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
		SSE_ORPS_XMM_to_XMM(regd, regTemp);
	}
}
void vFloat2(int regd, int regTemp) { //0100
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
}
void vFloat2c(int regd, int regTemp) { //0100
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_SIGN_SSE4(2);
	}
	else {
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
		SSE_ORPS_XMM_to_XMM(regd, regTemp);
	}
}
void vFloat3(int regd, int regTemp) { //1100
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
}
void vFloat3b(int regd, int regTemp) { //1100 //regTemp is Modified
	SSE2_MOVSD_XMM_to_XMM(regTemp, regd);
	SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
	SSE2_MOVSD_XMM_to_XMM(regd, regTemp);
}
void vFloat3c(int regd, int regTemp) { //1100
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_SIGN_SSE4(3);
	}
	else {
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
		SSE_ORPS_XMM_to_XMM(regd, regTemp);
	}
}
void vFloat4(int regd, int regTemp) { //0010
	SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
}
void vFloat4c(int regd, int regTemp) { //0010
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_SIGN_SSE4(4);
	}
	else {
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
		SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
		SSE_ORPS_XMM_to_XMM(regd, regTemp);
	}
}
void vFloat5(int regd, int regTemp) { //1010
	SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
}
void vFloat5b(int regd, int regTemp) { //1010 //regTemp is Modified
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_NORMAL_SSE4(5);
	}
	else {
		SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
	}
}
void vFloat5c(int regd, int regTemp) { //1010
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_SIGN_SSE4(5);
	}
	else {
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
		SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
		SSE_ORPS_XMM_to_XMM(regd, regTemp);
	}
}
void vFloat6(int regd, int regTemp) { //0110
	SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
}
void vFloat6b(int regd, int regTemp) { //0110 //regTemp is Modified
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_NORMAL_SSE4(6);
	}
	else {
		SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
	}
}
void vFloat6c(int regd, int regTemp) { //0110
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_SIGN_SSE4(6);
	}
	else {
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
		SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
		SSE_ORPS_XMM_to_XMM(regd, regTemp);
	}
}
void vFloat7(int regd, int regTemp) { //1110
	SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x39);
}
void vFloat7_useEAX(int regd, int regTemp) { //1110 //EAX is Modified
	SSE2_MOVD_XMM_to_R(EAX, regd);
	SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
	if ( x86caps.hasStreamingSIMD4Extensions )
		SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
	else {
		SSE_PINSRW_R32_to_XMM(regd, EAX, 0);
		SHR32ItoR(EAX, 16);
		SSE_PINSRW_R32_to_XMM(regd, EAX, 1);
	}
}
void vFloat7b(int regd, int regTemp) { //1110 //regTemp is Modified
	SSE_MOVSS_XMM_to_XMM(regTemp, regd);
	SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
	SSE_MOVSS_XMM_to_XMM(regd, regTemp);
}
void vFloat7c(int regd, int regTemp) { //1110
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_SIGN_SSE4(7);
	}
	else {
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
		SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x39);
		SSE_ORPS_XMM_to_XMM(regd, regTemp);
	}
}
void vFloat7c_useEAX(int regd, int regTemp) { //1110 //EAX is Modified
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_SIGN_SSE4(7);
	}
	else {
		SSE2_MOVD_XMM_to_R(EAX, regd);
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
		SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
		SSE_ORPS_XMM_to_XMM(regd, regTemp);
		SSE2_MOVD_R_to_XMM(regTemp, EAX);
		SSE_MOVSS_XMM_to_XMM(regd, regTemp);
	}
}
void vFloat8(int regd, int regTemp) { //0001
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
}
void vFloat8c(int regd, int regTemp) { //0001
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_SIGN_SSE4(8);
	}
	else {
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_ORPS_XMM_to_XMM(regd, regTemp);
	}
}
void vFloat9(int regd, int regTemp) { //1001
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
}
void vFloat9b(int regd, int regTemp) { //1001 //regTemp is Modified
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_NORMAL_SSE4(9);
	}
	else {
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
	}
}
void vFloat9c(int regd, int regTemp) { //1001
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_SIGN_SSE4(9);
	}
	else {
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
		SSE_ORPS_XMM_to_XMM(regd, regTemp);
	}
}
void vFloat10(int regd, int regTemp) { //0101
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
}
void vFloat10b(int regd, int regTemp) { //0101 //regTemp is Modified
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_NORMAL_SSE4(10);
	}
	else {
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
	}
}
void vFloat10c(int regd, int regTemp) { //0101
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_SIGN_SSE4(10);
	}
	else {
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
		SSE_ORPS_XMM_to_XMM(regd, regTemp);
	}
}
void vFloat11(int regd, int regTemp) { //1101
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
}
void vFloat11_useEAX(int regd, int regTemp) { //1101 //EAX is Modified
	SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
	SSE2_MOVD_XMM_to_R(EAX, regd);
	SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
	if ( x86caps.hasStreamingSIMD4Extensions )
		SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
	else {
		SSE_PINSRW_R32_to_XMM(regd, EAX, 0);
		SHR32ItoR(EAX, 16);
		SSE_PINSRW_R32_to_XMM(regd, EAX, 1);
	}
	SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
}
void vFloat11b(int regd, int regTemp) { //1101 //regTemp is Modified
	SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
	SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
	SSE_MOVSS_XMM_to_XMM(regTemp, regd);
	SSE2_MOVSD_XMM_to_XMM(regd, regTemp);
}
void vFloat11c(int regd, int regTemp) { //1101
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_SIGN_SSE4(11);
	}
	else {
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
		SSE_ORPS_XMM_to_XMM(regd, regTemp);
	}
}
void vFloat11c_useEAX(int regd, int regTemp) { //1101 // EAX is modified
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_SIGN_SSE4(11);
	}
	else {
		SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
		SSE2_MOVD_XMM_to_R(EAX, regd);
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
		SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
		SSE_ORPS_XMM_to_XMM(regd, regTemp);
		SSE2_MOVD_R_to_XMM(regTemp, EAX);
		SSE_MOVSS_XMM_to_XMM(regd, regTemp);
		SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
	}
}
void vFloat12(int regd, int regTemp) { //0011
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
}
void vFloat12b(int regd, int regTemp) { //0011 //regTemp is Modified
	SSE_MOVHLPS_XMM_to_XMM(regTemp, regd);
	SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
	SSE2_PUNPCKLQDQ_XMM_to_XMM(regd, regTemp);
}
void vFloat12c(int regd, int regTemp) { //0011
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_SIGN_SSE4(12);
	}
	else {
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
		SSE_ORPS_XMM_to_XMM(regd, regTemp);
	}
}
void vFloat13(int regd, int regTemp) { //1011
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
}
void vFloat13_useEAX(int regd, int regTemp) { //1011 // EAX is modified
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
	SSE2_MOVD_XMM_to_R(EAX, regd);
	SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
	if ( x86caps.hasStreamingSIMD4Extensions )
		SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
	else {
		SSE_PINSRW_R32_to_XMM(regd, EAX, 0);
		SHR32ItoR(EAX, 16);
		SSE_PINSRW_R32_to_XMM(regd, EAX, 1);
	}
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
}
void vFloat13b(int regd, int regTemp) { //1011 //regTemp is Modified
	SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
	SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
	SSE_MOVHLPS_XMM_to_XMM(regTemp, regd);
	SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0x64);
}
void vFloat13c(int regd, int regTemp) { //1011
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_SIGN_SSE4(13);
	}
	else {
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
		SSE_ORPS_XMM_to_XMM(regd, regTemp);
	}
}
void vFloat13c_useEAX(int regd, int regTemp) { //1011 // EAX is modified
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_SIGN_SSE4(13);
	}
	else {
		SSE2_PSHUFD_XMM_to_XMM(regd, regd, 0xc6);
		SSE2_MOVD_XMM_to_R(EAX, regd);
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
		SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
		SSE_ORPS_XMM_to_XMM(regd, regTemp);
		SSE2_MOVD_R_to_XMM(regTemp, EAX);
		SSE_MOVSS_XMM_to_XMM(regd, regTemp);
		SSE2_PSHUFD_XMM_to_XMM(regd, regd, 0xc6);
	}
}
void vFloat14(int regd, int regTemp) { //0111
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
	SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
}
void vFloat14_useEAX(int regd, int regTemp) { //0111 // EAX is modified
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
	SSE2_MOVD_XMM_to_R(EAX, regd);
	SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
	if ( x86caps.hasStreamingSIMD4Extensions )
		SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
	else {
		SSE_PINSRW_R32_to_XMM(regd, EAX, 0);
		SHR32ItoR(EAX, 16);
		SSE_PINSRW_R32_to_XMM(regd, EAX, 1);
	}
	SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
}
void vFloat14b(int regd, int regTemp) { //0111 //regTemp is Modified
	SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
	SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
	SSE_MOVHLPS_XMM_to_XMM(regTemp, regd);
	SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0xc4);
}
void vFloat14c(int regd, int regTemp) { //0111
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_SIGN_SSE4(14);
	}
	else {
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
		SSE_ORPS_XMM_to_XMM(regd, regTemp);
	}
}
void vFloat14c_useEAX(int regd, int regTemp) { //0111 // EAX is modified
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_SIGN_SSE4(14);
	}
	else {
		SSE2_PSHUFD_XMM_to_XMM(regd, regd, 0x27);
		SSE2_MOVD_XMM_to_R(EAX, regd);
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
		SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
		SSE_ORPS_XMM_to_XMM(regd, regTemp);
		SSE2_MOVD_R_to_XMM(regTemp, EAX);
		SSE_MOVSS_XMM_to_XMM(regd, regTemp);
		SSE2_PSHUFD_XMM_to_XMM(regd, regd, 0x27);
	}
}
void vFloat15(int regd, int regTemp) { //1111
	SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
}
void vFloat15c(int regd, int regTemp) { //1111
	if ( x86caps.hasStreamingSIMD4Extensions ) {
		CLAMP_SIGN_SSE4(15);
	}
	else {
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
		SSE_MINPS_M128_to_XMM(regd, (uptr)&g_maxvals[0]);
		SSE_MAXPS_M128_to_XMM(regd, (uptr)&g_minvals[0]);
		SSE_ORPS_XMM_to_XMM(regd, regTemp);
	}
}

vFloat vFloats1[16] = { //regTemp is not modified
	vFloat0, vFloat1, vFloat2, vFloat3,
	vFloat4, vFloat5, vFloat6, vFloat7,
	vFloat8, vFloat9, vFloat10, vFloat11,
	vFloat12, vFloat13, vFloat14, vFloat15 };

vFloat vFloats1_useEAX[16] = { //regTemp is not modified but EAX is used
	vFloat0, vFloat1, vFloat2, vFloat3,
	vFloat4, vFloat5, vFloat6, vFloat7_useEAX,
	vFloat8, vFloat9, vFloat10, vFloat11_useEAX,
	vFloat12, vFloat13_useEAX, vFloat14_useEAX, vFloat15 };

vFloat vFloats2[16] = { //regTemp is modified
	vFloat0, vFloat1, vFloat2, vFloat3b,
	vFloat4, vFloat5b, vFloat6b, vFloat7b,
	vFloat8, vFloat9b, vFloat10b, vFloat11b,
	vFloat12b, vFloat13b, vFloat14b, vFloat15 };

vFloat vFloats4[16] = { //regTemp is modified
	vFloat0, vFloat1c, vFloat2c, vFloat3c,
	vFloat4c, vFloat5c, vFloat6c, vFloat7c,
	vFloat8c, vFloat9c, vFloat10c, vFloat11c,
	vFloat12c, vFloat13c, vFloat14c, vFloat15c };

vFloat vFloats4_useEAX[16] = { //regTemp is modified and EAX is used
	vFloat0, vFloat1c, vFloat2c, vFloat3c,
	vFloat4c, vFloat5c, vFloat6c, vFloat7c_useEAX,
	vFloat8c, vFloat9c, vFloat10c, vFloat11c_useEAX,
	vFloat12c, vFloat13c_useEAX, vFloat14c_useEAX, vFloat15c };

//------------------------------------------------------------------
// Clamping Functions (wrappers for the vFloat* functions)
// vuFloat         : "normal" clamping
// vuFloat_useEAX  : "normal" clamping (faster but EAX is modified)
// vuFloat2        : "normal" clamping (fastest but regTemp is modified)
// vuFloat3        : "preserve sign" clamping for a value in memory
// vuFloat4        : "preserve sign" clamping (regTemp is modified; *FASTEST* on SSE4 CPUs)
// vuFloat4_useEAX : "preserve sign" clamping (faster but regTemp and EAX are modified)
// vuFloat5        : wrapper function for vuFloat2 and vuFloat4
// vuFloat5_useEAX : wrapper function for vuFloat2 and vuFloat4_useEAX
// vuFloatExtra    : for debugging
//
// Notice 1: vuFloat*_useEAX may be slower on AMD CPUs, which have independent execution
//           pipelines for vector and scalar instructions (needs checking)
// Notice 2: recVUMI_MUL_xyzw_toD and recVUMI_MADD_xyzw_toD use the vFloat tables directly!
//------------------------------------------------------------------

// Clamps +/-NaN to +fMax and +/-Inf to +/-fMax (doesn't use any temp regs)
void vuFloat( int info, int regd, int XYZW) {
	if( CHECK_VU_OVERFLOW ) {
		/*if ( (XYZW != 0) && (XYZW != 8) && (XYZW != 0xF) ) {
			int t1reg = _vuGetTempXMMreg(info);
			if (t1reg >= 0) {
				vuFloat2( regd, t1reg, XYZW );
				_freeXMMreg( t1reg );
				return;
			}
		}*/
		//vuFloatExtra(regd, XYZW);
		vFloats1[XYZW](regd, regd);
	}
}

// Clamps +/-NaN to +fMax and +/-Inf to +/-fMax (uses EAX as a temp register; faster but **destroys EAX**)
void vuFloat_useEAX( int info, int regd, int XYZW) {
	if( CHECK_VU_OVERFLOW ) {
		vFloats1_useEAX[XYZW](regd, regd);
	}
}

// Clamps +/-NaN to +fMax and +/-Inf to +/-fMax (uses a temp reg)
void vuFloat2(int regd, int regTemp, int XYZW) {
	if( CHECK_VU_OVERFLOW ) {
		//vuFloatExtra(regd, XYZW);
		vFloats2[XYZW](regd, regTemp);
	}
}

// Clamps +/-NaN and +/-Inf to +/-fMax (uses a temp reg)
void vuFloat4(int regd, int regTemp, int XYZW) {
	if( CHECK_VU_OVERFLOW ) {
		vFloats4[XYZW](regd, regTemp);
	}
}

// Clamps +/-NaN and +/-Inf to +/-fMax (uses a temp reg, and uses EAX as a temp register; faster but **destroys EAX**)
void vuFloat4_useEAX(int regd, int regTemp, int XYZW) {
	if( CHECK_VU_OVERFLOW ) {
		vFloats4_useEAX[XYZW](regd, regTemp);
	}
}

// Uses vuFloat4 or vuFloat2 depending on the CHECK_VU_SIGN_OVERFLOW setting
void vuFloat5(int regd, int regTemp, int XYZW) {
	if (CHECK_VU_SIGN_OVERFLOW) {
		vuFloat4(regd, regTemp, XYZW);
	}
	else vuFloat2(regd, regTemp, XYZW);
}

// Uses vuFloat4_useEAX or vuFloat2 depending on the CHECK_VU_SIGN_OVERFLOW setting (uses EAX as a temp register; faster but **destroys EAX**)
void vuFloat5_useEAX(int regd, int regTemp, int XYZW) {
	if (CHECK_VU_SIGN_OVERFLOW) {
		vuFloat4_useEAX(regd, regTemp, XYZW);
	}
	else vuFloat2(regd, regTemp, XYZW);
}

// Clamps +/-Infs to +/-fMax, and +/-NaNs to +/-fMax
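// Operates on the 32-bit value at x86ptr directly: the signed compare catches
// +Inf/+NaN bit patterns (>= 0x7f800000) and the unsigned compare catches
// -Inf/-NaN patterns (>= 0xff800000), each replaced by the same-signed fMax.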
void vuFloat3(uptr x86ptr) {
	u8* pjmp;

	if( CHECK_VU_OVERFLOW ) {
		CMP32ItoM(x86ptr, 0x7f800000 );
		pjmp = JL8(0); // Signed Comparison
		MOV32ItoM(x86ptr, 0x7f7fffff );
		x86SetJ8(pjmp);

		CMP32ItoM(x86ptr, 0xff800000 );
		pjmp = JB8(0); // Unsigned Comparison
		MOV32ItoM(x86ptr, 0xff7fffff );
		x86SetJ8(pjmp);
	}
}

__aligned16 u64 vuFloatData[4];

// Makes NaN == 0, Infinities stay the same; Very Slow - Use only for debugging
void vuFloatExtra( int regd, int XYZW) {
	int t1reg = (regd == 0) ? (regd + 1) : (regd - 1);
	int t2reg = (regd <= 1) ? (regd + 2) : (regd - 2);
	SSE_MOVAPS_XMM_to_M128( (uptr)&vuFloatData[0], t1reg );
	SSE_MOVAPS_XMM_to_M128( (uptr)&vuFloatData[2], t2reg );

	SSE_XORPS_XMM_to_XMM(t1reg, t1reg);
	SSE_CMPORDPS_XMM_to_XMM(t1reg, regd);
	SSE_MOVAPS_XMM_to_XMM(t2reg, regd);
	SSE_ANDPS_XMM_to_XMM(t2reg, t1reg);
	VU_MERGE_REGS_CUSTOM(regd, t2reg, XYZW);

	SSE_MOVAPS_M128_to_XMM( t1reg, (uptr)&vuFloatData[0] );
	SSE_MOVAPS_M128_to_XMM( t2reg, (uptr)&vuFloatData[2] );
}

static __aligned16 u32 tempRegX[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000};

// Called by the testWhenOverflow() function
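// Masking with 0xff800000 keeps only the sign and exponent bits, so an all-ones
// exponent (0x7f800000 / 0xff800000) flags any +/-Inf or +/-NaN component.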
void testPrintOverflow() {
	tempRegX[0] &= 0xff800000;
	tempRegX[1] &= 0xff800000;
	tempRegX[2] &= 0xff800000;
	tempRegX[3] &= 0xff800000;
	if ( (tempRegX[0] == 0x7f800000) || (tempRegX[1] == 0x7f800000) || (tempRegX[2] == 0x7f800000) || (tempRegX[3] == 0x7f800000) )
		Console.Warning( "VU OVERFLOW!: Changing to +Fmax!!!!!!!!!!!!" );
	if ( (tempRegX[0] == 0xff800000) || (tempRegX[1] == 0xff800000) || (tempRegX[2] == 0xff800000) || (tempRegX[3] == 0xff800000) )
		Console.Warning( "VU OVERFLOW!: Changing to -Fmax!!!!!!!!!!!!" );
}

// Outputs to the console when an overflow has occurred.
void testWhenOverflow(int info, int regd, int t0reg) {
	SSE_MOVAPS_XMM_to_M128((uptr)tempRegX, regd);
	CALLFunc((uptr)testPrintOverflow);
}