Added various VU optimizations by tmkk, including different clamping optimizations.

I also fixed some VU functions to use the appropriate clamp modes, which I had forgotten to change in the past.

git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@670 a6443dda-0b58-4228-96e9-037be469359c
This commit is contained in:
cottonvibes 2009-01-31 06:43:55 +00:00 committed by Gregory Hainaut
parent 7adb8c864f
commit 167399c52c
6 changed files with 604 additions and 253 deletions

View File

@ -94,6 +94,63 @@ PCSX2_ALIGNED16(u32 g_minvals[4]) = {0xff7fffff, 0xff7fffff, 0xff7fffff, 0xff7ff
PCSX2_ALIGNED16(u32 g_maxvals[4]) = {0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff}; PCSX2_ALIGNED16(u32 g_maxvals[4]) = {0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff};
PCSX2_ALIGNED16(u32 const_clip[8]) = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, PCSX2_ALIGNED16(u32 const_clip[8]) = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff,
0x80000000, 0x80000000, 0x80000000, 0x80000000}; 0x80000000, 0x80000000, 0x80000000, 0x80000000};
PCSX2_ALIGNED16(u32 g_minvals_XYZW[16][4]) =
{
{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, //0000
{ 0xffffffff, 0xffffffff, 0xffffffff, 0xff7fffff }, //0001
{ 0xffffffff, 0xffffffff, 0xff7fffff, 0xffffffff }, //0010
{ 0xffffffff, 0xffffffff, 0xff7fffff, 0xff7fffff }, //0011
{ 0xffffffff, 0xff7fffff, 0xffffffff, 0xffffffff }, //0100
{ 0xffffffff, 0xff7fffff, 0xffffffff, 0xff7fffff }, //0101
{ 0xffffffff, 0xff7fffff, 0xff7fffff, 0xffffffff }, //0110
{ 0xffffffff, 0xff7fffff, 0xff7fffff, 0xff7fffff }, //0111
{ 0xff7fffff, 0xffffffff, 0xffffffff, 0xffffffff }, //1000
{ 0xff7fffff, 0xffffffff, 0xffffffff, 0xff7fffff }, //1001
{ 0xff7fffff, 0xffffffff, 0xff7fffff, 0xffffffff }, //1010
{ 0xff7fffff, 0xffffffff, 0xff7fffff, 0xff7fffff }, //1011
{ 0xff7fffff, 0xff7fffff, 0xffffffff, 0xffffffff }, //1100
{ 0xff7fffff, 0xff7fffff, 0xffffffff, 0xff7fffff }, //1101
{ 0xff7fffff, 0xff7fffff, 0xff7fffff, 0xffffffff }, //1110
{ 0xff7fffff, 0xff7fffff, 0xff7fffff, 0xff7fffff }, //1111
};
PCSX2_ALIGNED16(u32 g_maxvals_XYZW[16][4])=
{
{ 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //0000
{ 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7f7fffff }, //0001
{ 0x7fffffff, 0x7fffffff, 0x7f7fffff, 0x7fffffff }, //0010
{ 0x7fffffff, 0x7fffffff, 0x7f7fffff, 0x7f7fffff }, //0011
{ 0x7fffffff, 0x7f7fffff, 0x7fffffff, 0x7fffffff }, //0100
{ 0x7fffffff, 0x7f7fffff, 0x7fffffff, 0x7f7fffff }, //0101
{ 0x7fffffff, 0x7f7fffff, 0x7f7fffff, 0x7fffffff }, //0110
{ 0x7fffffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff }, //0111
{ 0x7f7fffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //1000
{ 0x7f7fffff, 0x7fffffff, 0x7fffffff, 0x7f7fffff }, //1001
{ 0x7f7fffff, 0x7fffffff, 0x7f7fffff, 0x7fffffff }, //1010
{ 0x7f7fffff, 0x7fffffff, 0x7f7fffff, 0x7f7fffff }, //1011
{ 0x7f7fffff, 0x7f7fffff, 0x7fffffff, 0x7fffffff }, //1100
{ 0x7f7fffff, 0x7f7fffff, 0x7fffffff, 0x7f7fffff }, //1101
{ 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7fffffff }, //1110
{ 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff }, //1111
};
PCSX2_ALIGNED16(u32 g_NaNs_XYZW[16][4])=
{
{ 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //0000
{ 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7f800000 }, //0001
{ 0x7fffffff, 0x7fffffff, 0x7f800000, 0x7fffffff }, //0010
{ 0x7fffffff, 0x7fffffff, 0x7f800000, 0x7f800000 }, //0011
{ 0x7fffffff, 0x7f800000, 0x7fffffff, 0x7fffffff }, //0100
{ 0x7fffffff, 0x7f800000, 0x7fffffff, 0x7f800000 }, //0101
{ 0x7fffffff, 0x7f800000, 0x7f800000, 0x7fffffff }, //0110
{ 0x7fffffff, 0x7f800000, 0x7f800000, 0x7f800000 }, //0111
{ 0x7f800000, 0x7fffffff, 0x7fffffff, 0x7fffffff }, //1000
{ 0x7f800000, 0x7fffffff, 0x7fffffff, 0x7f800000 }, //1001
{ 0x7f800000, 0x7fffffff, 0x7f800000, 0x7fffffff }, //1010
{ 0x7f800000, 0x7fffffff, 0x7f800000, 0x7f800000 }, //1011
{ 0x7f800000, 0x7f800000, 0x7fffffff, 0x7fffffff }, //1100
{ 0x7f800000, 0x7f800000, 0x7fffffff, 0x7f800000 }, //1101
{ 0x7f800000, 0x7f800000, 0x7f800000, 0x7fffffff }, //1110
{ 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }, //1111
};
//------------------------------------------------------------------ //------------------------------------------------------------------
//------------------------------------------------------------------ //------------------------------------------------------------------
@ -589,13 +646,11 @@ void _unpackVFSS_xyzw(int dstreg, int srcreg, int xyzw)
switch (xyzw) { switch (xyzw) {
case 0: SSE_MOVSS_XMM_to_XMM(dstreg, srcreg); break; case 0: SSE_MOVSS_XMM_to_XMM(dstreg, srcreg); break;
case 1: if ( cpucaps.hasStreamingSIMD4Extensions ) SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(1, 0, 0)); case 1: if ( cpucaps.hasStreamingSIMD4Extensions ) SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(1, 0, 0));
else if ( cpucaps.hasStreamingSIMD3Extensions ) SSE3_MOVSHDUP_XMM_to_XMM(dstreg, srcreg); else SSE2_PSHUFLW_XMM_to_XMM(dstreg, srcreg, 0xee);
else { SSE_MOVAPS_XMM_to_XMM(dstreg, srcreg); SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0x55); }
break; break;
case 2: SSE_MOVHLPS_XMM_to_XMM(dstreg, srcreg); break; case 2: SSE_MOVHLPS_XMM_to_XMM(dstreg, srcreg); break;
case 3: if ( cpucaps.hasStreamingSIMD4Extensions ) SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(3, 0, 0)); case 3: if ( cpucaps.hasStreamingSIMD4Extensions ) SSE4_INSERTPS_XMM_to_XMM(dstreg, srcreg, _MM_MK_INSERTPS_NDX(3, 0, 0));
else if ( cpucaps.hasStreamingSIMD3Extensions && dstreg != srcreg ) { SSE3_MOVSHDUP_XMM_to_XMM(dstreg, srcreg); SSE_MOVHLPS_XMM_to_XMM(dstreg, dstreg); } else { SSE_MOVHLPS_XMM_to_XMM(dstreg, srcreg); SSE2_PSHUFLW_XMM_to_XMM(dstreg, dstreg, 0xee); }
else { SSE_MOVAPS_XMM_to_XMM(dstreg, srcreg); SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0xff); }
break; break;
} }
} }
@ -603,7 +658,7 @@ void _unpackVFSS_xyzw(int dstreg, int srcreg, int xyzw)
void _vuFlipRegSS(VURegs * VU, int reg) void _vuFlipRegSS(VURegs * VU, int reg)
{ {
assert( _XYZW_SS ); assert( _XYZW_SS );
if( _Y ) SSE_SHUFPS_XMM_to_XMM(reg, reg, 0xe1); if( _Y ) SSE2_PSHUFLW_XMM_to_XMM(reg, reg, 0x4e);
else if( _Z ) SSE_SHUFPS_XMM_to_XMM(reg, reg, 0xc6); else if( _Z ) SSE_SHUFPS_XMM_to_XMM(reg, reg, 0xc6);
else if( _W ) SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x27); else if( _W ) SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x27);
} }
@ -611,7 +666,7 @@ void _vuFlipRegSS(VURegs * VU, int reg)
void _vuFlipRegSS_xyzw(int reg, int xyzw) void _vuFlipRegSS_xyzw(int reg, int xyzw)
{ {
switch ( xyzw ) { switch ( xyzw ) {
case 1: SSE_SHUFPS_XMM_to_XMM(reg, reg, 0xe1); break; case 1: SSE2_PSHUFLW_XMM_to_XMM(reg, reg, 0x4e); break;
case 2: SSE_SHUFPS_XMM_to_XMM(reg, reg, 0xc6); break; case 2: SSE_SHUFPS_XMM_to_XMM(reg, reg, 0xc6); break;
case 3: SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x27); break; case 3: SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x27); break;
} }
@ -833,13 +888,19 @@ void vFloat1(int regd, int regTemp) { //1000
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
} }
void vFloat1c(int regd, int regTemp) { //1000 void vFloat1c(int regd, int regTemp) { //1000
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[1][0]);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[1][0]);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); }
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); else {
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ORPS_XMM_to_XMM(regd, regTemp); SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_ORPS_XMM_to_XMM(regd, regTemp);
}
} }
void vFloat2(int regd, int regTemp) { //0100 void vFloat2(int regd, int regTemp) { //0100
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
@ -848,13 +909,19 @@ void vFloat2(int regd, int regTemp) { //0100
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
} }
void vFloat2c(int regd, int regTemp) { //0100 void vFloat2c(int regd, int regTemp) { //0100
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[2][0]);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[2][0]);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); }
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); else {
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ORPS_XMM_to_XMM(regd, regTemp); SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_ORPS_XMM_to_XMM(regd, regTemp);
}
} }
void vFloat3(int regd, int regTemp) { //1100 void vFloat3(int regd, int regTemp) { //1100
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
@ -866,40 +933,64 @@ void vFloat3(int regd, int regTemp) { //1100
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
} }
void vFloat3b(int regd, int regTemp) { //1100 //regTemp is Modified void vFloat3b(int regd, int regTemp) { //1100 //regTemp is Modified
SSE2_MOVSD_XMM_to_XMM(regTemp, regd); if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); SSE2_PCMPEQD_XMM_to_XMM(regTemp, regTemp);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); SSE2_PSLLD_I8_to_XMM(regTemp, 31);
SSE2_MOVSD_XMM_to_XMM(regd, regTemp); SSE_XORPS_XMM_to_XMM(regTemp, regd);
SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[3][0]);
SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[3][0]);
SSE2_PCMPGTD_M128_to_XMM(regTemp, (uptr)&g_NaNs_XYZW[3][0]);
SSE2_PSLLD_I8_to_XMM(regTemp, 31);
SSE_XORPS_XMM_to_XMM(regd, regTemp);
}
else {
SSE2_MOVSD_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE2_MOVSD_XMM_to_XMM(regd, regTemp);
}
} }
void vFloat3c(int regd, int regTemp) { //1100 void vFloat3c(int regd, int regTemp) { //1100
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[3][0]);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[3][0]);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); }
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); else {
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_ORPS_XMM_to_XMM(regd, regTemp); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
SSE_ORPS_XMM_to_XMM(regd, regTemp);
}
} }
void vFloat4(int regd, int regTemp) { //0010 void vFloat4(int regd, int regTemp) { //0010
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1); SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1); SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
} }
void vFloat4c(int regd, int regTemp) { //0010 void vFloat4c(int regd, int regTemp) { //0010
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[4][0]);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1); SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[4][0]);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); }
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); else {
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ORPS_XMM_to_XMM(regd, regTemp); SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
SSE_ORPS_XMM_to_XMM(regd, regTemp);
}
} }
void vFloat5(int regd, int regTemp) { //1010 void vFloat5(int regd, int regTemp) { //1010
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1); SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
@ -907,20 +998,47 @@ void vFloat5(int regd, int regTemp) { //1010
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
} }
void vFloat5b(int regd, int regTemp) { //1010 //regTemp is Modified
if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE2_PCMPEQD_XMM_to_XMM(regTemp, regTemp);
SSE2_PSLLD_I8_to_XMM(regTemp, 31);
SSE_XORPS_XMM_to_XMM(regTemp, regd);
SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[5][0]);
SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[5][0]);
SSE2_PCMPGTD_M128_to_XMM(regTemp, (uptr)&g_NaNs_XYZW[5][0]);
SSE2_PSLLD_I8_to_XMM(regTemp, 31);
SSE_XORPS_XMM_to_XMM(regd, regTemp);
}
else {
SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
}
}
void vFloat5c(int regd, int regTemp) { //1010 void vFloat5c(int regd, int regTemp) { //1010
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[5][0]);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1); SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[5][0]);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); }
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); else {
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_ORPS_XMM_to_XMM(regd, regTemp); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
SSE_ORPS_XMM_to_XMM(regd, regTemp);
}
} }
void vFloat6(int regd, int regTemp) { //0110 void vFloat6(int regd, int regTemp) { //0110
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1); SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
@ -928,20 +1046,47 @@ void vFloat6(int regd, int regTemp) { //0110
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
} }
void vFloat6b(int regd, int regTemp) { //0110 //regTemp is Modified
if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE2_PCMPEQD_XMM_to_XMM(regTemp, regTemp);
SSE2_PSLLD_I8_to_XMM(regTemp, 31);
SSE_XORPS_XMM_to_XMM(regTemp, regd);
SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[6][0]);
SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[6][0]);
SSE2_PCMPGTD_M128_to_XMM(regTemp, (uptr)&g_NaNs_XYZW[6][0]);
SSE2_PSLLD_I8_to_XMM(regTemp, 31);
SSE_XORPS_XMM_to_XMM(regd, regTemp);
}
else {
SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
}
}
void vFloat6c(int regd, int regTemp) { //0110 void vFloat6c(int regd, int regTemp) { //0110
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[6][0]);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1); SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[6][0]);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); }
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); else {
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_ORPS_XMM_to_XMM(regd, regTemp); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
SSE_ORPS_XMM_to_XMM(regd, regTemp);
}
} }
void vFloat7(int regd, int regTemp) { //1110 void vFloat7(int regd, int regTemp) { //1110
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1); SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
@ -965,25 +1110,43 @@ void vFloat7_useEAX(int regd, int regTemp) { //1110 //EAX is Modified
} }
} }
void vFloat7b(int regd, int regTemp) { //1110 //regTemp is Modified void vFloat7b(int regd, int regTemp) { //1110 //regTemp is Modified
SSE_MOVSS_XMM_to_XMM(regTemp, regd); if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); SSE2_PCMPEQD_XMM_to_XMM(regTemp, regTemp);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); SSE2_PSLLD_I8_to_XMM(regTemp, 31);
SSE_MOVSS_XMM_to_XMM(regd, regTemp); SSE_XORPS_XMM_to_XMM(regTemp, regd);
SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[7][0]);
SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[7][0]);
SSE2_PCMPGTD_M128_to_XMM(regTemp, (uptr)&g_NaNs_XYZW[7][0]);
SSE2_PSLLD_I8_to_XMM(regTemp, 31);
SSE_XORPS_XMM_to_XMM(regd, regTemp);
}
else {
SSE_MOVSS_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE_MOVSS_XMM_to_XMM(regd, regTemp);
}
} }
void vFloat7c(int regd, int regTemp) { //1110 void vFloat7c(int regd, int regTemp) { //1110
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[7][0]);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1); SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[7][0]);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); }
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); else {
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x39); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_ORPS_XMM_to_XMM(regd, regTemp); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x39);
SSE_ORPS_XMM_to_XMM(regd, regTemp);
}
} }
void vFloat7c_useEAX(int regd, int regTemp) { //1110 //EAX is Modified void vFloat7c_useEAX(int regd, int regTemp) { //1110 //EAX is Modified
SSE2_MOVD_XMM_to_R(EAX, regd); SSE2_MOVD_XMM_to_R(EAX, regd);
@ -992,23 +1155,25 @@ void vFloat7c_useEAX(int regd, int regTemp) { //1110 //EAX is Modified
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE_ORPS_XMM_to_XMM(regd, regTemp); SSE_ORPS_XMM_to_XMM(regd, regTemp);
if ( cpucaps.hasStreamingSIMD4Extensions ) SSE2_MOVD_R_to_XMM(regTemp, EAX);
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00); SSE_MOVSS_XMM_to_XMM(regd, regTemp);
else {
SSE2_MOVD_R_to_XMM(regTemp, EAX);
SSE_MOVSS_XMM_to_XMM(regd, regTemp);
}
} }
void vFloat8(int regd, int regTemp) { //0001 void vFloat8(int regd, int regTemp) { //0001
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
} }
void vFloat8c(int regd, int regTemp) { //0001 void vFloat8c(int regd, int regTemp) { //0001
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[8][0]);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[8][0]);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); }
SSE_ORPS_XMM_to_XMM(regd, regTemp); else {
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_ORPS_XMM_to_XMM(regd, regTemp);
}
} }
void vFloat9(int regd, int regTemp) { //1001 void vFloat9(int regd, int regTemp) { //1001
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
@ -1018,16 +1183,42 @@ void vFloat9(int regd, int regTemp) { //1001
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
} }
void vFloat9b(int regd, int regTemp) { //1001 //regTemp is Modified
if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE2_PCMPEQD_XMM_to_XMM(regTemp, regTemp);
SSE2_PSLLD_I8_to_XMM(regTemp, 31);
SSE_XORPS_XMM_to_XMM(regTemp, regd);
SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[9][0]);
SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[9][0]);
SSE2_PCMPGTD_M128_to_XMM(regTemp, (uptr)&g_NaNs_XYZW[9][0]);
SSE2_PSLLD_I8_to_XMM(regTemp, 31);
SSE_XORPS_XMM_to_XMM(regd, regTemp);
}
else {
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
}
}
void vFloat9c(int regd, int regTemp) { //1001 void vFloat9c(int regd, int regTemp) { //1001
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[9][0]);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[9][0]);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); }
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); else {
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_ORPS_XMM_to_XMM(regd, regTemp); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_ORPS_XMM_to_XMM(regd, regTemp);
}
} }
void vFloat10(int regd, int regTemp) { //0101 void vFloat10(int regd, int regTemp) { //0101
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
@ -1037,16 +1228,42 @@ void vFloat10(int regd, int regTemp) { //0101
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
} }
void vFloat10b(int regd, int regTemp) { //0101 //regTemp is Modified
if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE2_PCMPEQD_XMM_to_XMM(regTemp, regTemp);
SSE2_PSLLD_I8_to_XMM(regTemp, 31);
SSE_XORPS_XMM_to_XMM(regTemp, regd);
SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[10][0]);
SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[10][0]);
SSE2_PCMPGTD_M128_to_XMM(regTemp, (uptr)&g_NaNs_XYZW[10][0]);
SSE2_PSLLD_I8_to_XMM(regTemp, 31);
SSE_XORPS_XMM_to_XMM(regd, regTemp);
}
else {
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
}
}
void vFloat10c(int regd, int regTemp) { //0101 void vFloat10c(int regd, int regTemp) { //0101
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[10][0]);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[10][0]);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); }
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); else {
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_ORPS_XMM_to_XMM(regd, regTemp); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_ORPS_XMM_to_XMM(regd, regTemp);
}
} }
void vFloat11(int regd, int regTemp) { //1101 void vFloat11(int regd, int regTemp) { //1101
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
@ -1060,7 +1277,7 @@ void vFloat11(int regd, int regTemp) { //1101
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
} }
void vFloat11_useEAX(int regd, int regTemp) { //1101 //EAX is Modified void vFloat11_useEAX(int regd, int regTemp) { //1101 //EAX is Modified
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1); SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
SSE2_MOVD_XMM_to_R(EAX, regd); SSE2_MOVD_XMM_to_R(EAX, regd);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
@ -1071,80 +1288,106 @@ void vFloat11_useEAX(int regd, int regTemp) { //1101 //EAX is Modified
SHR32ItoR(EAX, 16); SHR32ItoR(EAX, 16);
SSE_PINSRW_R32_to_XMM(regd, EAX, 1); SSE_PINSRW_R32_to_XMM(regd, EAX, 1);
} }
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1); SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
} }
void vFloat11b(int regd, int regTemp) { //1101 //regTemp is Modified void vFloat11b(int regd, int regTemp) { //1101 //regTemp is Modified
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); SSE2_PCMPEQD_XMM_to_XMM(regTemp, regTemp);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); SSE2_PSLLD_I8_to_XMM(regTemp, 31);
if ( cpucaps.hasStreamingSIMD4Extensions ) SSE_XORPS_XMM_to_XMM(regTemp, regd);
SSE4_BLENDPS_XMM_to_XMM(regd, regTemp, 0x02); SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[11][0]);
SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[11][0]);
SSE2_PCMPGTD_M128_to_XMM(regTemp, (uptr)&g_NaNs_XYZW[11][0]);
SSE2_PSLLD_I8_to_XMM(regTemp, 31);
SSE_XORPS_XMM_to_XMM(regd, regTemp);
}
else { else {
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE_MOVSS_XMM_to_XMM(regTemp, regd); SSE_MOVSS_XMM_to_XMM(regTemp, regd);
SSE2_MOVSD_XMM_to_XMM(regd, regTemp); SSE2_MOVSD_XMM_to_XMM(regd, regTemp);
} }
} }
void vFloat11c(int regd, int regTemp) { //1101 void vFloat11c(int regd, int regTemp) { //1101
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[11][0]);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[11][0]);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); }
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); else {
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_ORPS_XMM_to_XMM(regd, regTemp); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
SSE_ORPS_XMM_to_XMM(regd, regTemp);
}
} }
void vFloat11c_useEAX(int regd, int regTemp) { //1101 // EAX is modified void vFloat11c_useEAX(int regd, int regTemp) { //1101 // EAX is modified
SSE2_PSHUFD_XMM_to_XMM(regTemp, regd, 0xe1); SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
SSE2_MOVD_XMM_to_R(EAX, regTemp); SSE2_MOVD_XMM_to_R(EAX, regTemp);
SSE_MOVAPS_XMM_to_XMM(regTemp, regd); SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
if ( cpucaps.hasStreamingSIMD4Extensions ) { SSE_ORPS_XMM_to_XMM(regTemp, regd);
SSE_ORPS_XMM_to_XMM(regd, regTemp); SSE2_MOVD_R_to_XMM(regd, EAX);
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x01); SSE_MOVLHPS_XMM_to_XMM(regd, regTemp);
} SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0xe2);
else {
SSE_ORPS_XMM_to_XMM(regTemp, regd);
SSE2_MOVD_R_to_XMM(regd, EAX);
SSE_MOVLHPS_XMM_to_XMM(regd, regTemp);
SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0xe2);
}
} }
void vFloat12(int regd, int regTemp) { //0011 void vFloat12(int regd, int regTemp) { //0011
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1); SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1); SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
} }
// Clamp lanes 0 and 1 (write mask 0011) of regd; regTemp is clobbered.
// SSE4.1 path: regTemp is built as regd with its sign bits flipped
// (PCMPEQD -> all ones, PSLLD 31 -> 0x80000000 per lane, XORPS). PMINSD
// against the per-mask max table clamps positive NaN/Inf lanes and PMINUD
// against the per-mask min table clamps negative lanes (negative floats
// compare as large unsigned ints). The trailing PCMPGTD against the
// g_NaNs_XYZW table (defined elsewhere in this file) plus PSLLD/XORPS
// flips the sign back on lanes whose original value required it —
// presumably negative-NaN inputs; confirm against the table contents.
// Non-SSE4 path: save the high quadword, MINPS/MAXPS the whole register,
// then splice the untouched high half back with PUNPCKLQDQ.
// (Reconstructed from fused old/new diff lines; post-change code.)
void vFloat12b(int regd, int regTemp) { //0011 //regTemp is Modified
	if ( cpucaps.hasStreamingSIMD4Extensions ) {
		SSE2_PCMPEQD_XMM_to_XMM(regTemp, regTemp);
		SSE2_PSLLD_I8_to_XMM(regTemp, 31);
		SSE_XORPS_XMM_to_XMM(regTemp, regd); // regTemp = regd with signs flipped
		SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[12][0]);
		SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[12][0]);
		SSE2_PCMPGTD_M128_to_XMM(regTemp, (uptr)&g_NaNs_XYZW[12][0]);
		SSE2_PSLLD_I8_to_XMM(regTemp, 31);
		SSE_XORPS_XMM_to_XMM(regd, regTemp); // restore signs where needed
	}
	else {
		SSE_MOVHLPS_XMM_to_XMM(regTemp, regd); // save unclamped z/w
		SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
		SSE2_PUNPCKLQDQ_XMM_to_XMM(regd, regTemp); // restore z/w
	}
}
// Sign-preserving clamp of lanes 0 and 1 (write mask 0011); regTemp is
// clobbered on the non-SSE4 path.
// SSE4.1 path: PMINSD with the per-mask max table clamps positive
// NaN/Inf as signed ints; PMINUD with the per-mask min table clamps the
// negative range (negative floats order as large unsigned ints), so the
// sign is preserved without an explicit mask. Unclamped lanes hold
// 0xffffffff in the tables, making both mins no-ops there.
// Non-SSE4 path: extract the sign bits (const_clip[4..7] = 0x80000000),
// clamp lane 0, swap lanes 0/1 via PSHUFLW 0x4e, clamp again, swap back,
// and OR the saved signs back in.
// (Reconstructed from fused old/new diff lines; post-change code.)
void vFloat12c(int regd, int regTemp) { //0011
	if ( cpucaps.hasStreamingSIMD4Extensions ) {
		SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[12][0]);
		SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[12][0]);
	}
	else {
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); // save sign bits
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
		SSE_ORPS_XMM_to_XMM(regd, regTemp); // restore signs
	}
}
void vFloat13(int regd, int regTemp) { //1011 void vFloat13(int regd, int regTemp) { //1011
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1); SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
@ -1167,29 +1410,43 @@ void vFloat13_useEAX(int regd, int regTemp) { //1011 // EAX is modified
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
} }
// Clamp three lanes (write mask 1011 per the original comment, i.e. lane 2
// left untouched); regTemp is clobbered.
// SSE4.1 path: same sign-flip / PMINSD / PMINUD / PCMPGTD-vs-g_NaNs_XYZW
// scheme as vFloat12b, using the [13] rows of the per-mask tables
// (defined elsewhere in this file).
// Non-SSE4 path: MINPS/MAXPS the whole register, then rebuild the result
// with MOVHLPS + SHUFPS 0x64 so the untouched lane keeps its original
// value from the regTemp copy.
// (Reconstructed from fused old/new diff lines; post-change code.)
void vFloat13b(int regd, int regTemp) { //1011 //regTemp is Modified
	if ( cpucaps.hasStreamingSIMD4Extensions ) {
		SSE2_PCMPEQD_XMM_to_XMM(regTemp, regTemp);
		SSE2_PSLLD_I8_to_XMM(regTemp, 31);
		SSE_XORPS_XMM_to_XMM(regTemp, regd); // regTemp = regd with signs flipped
		SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[13][0]);
		SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[13][0]);
		SSE2_PCMPGTD_M128_to_XMM(regTemp, (uptr)&g_NaNs_XYZW[13][0]);
		SSE2_PSLLD_I8_to_XMM(regTemp, 31);
		SSE_XORPS_XMM_to_XMM(regd, regTemp); // restore signs where needed
	}
	else {
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd); // keep unclamped copy
		SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
		SSE_MOVHLPS_XMM_to_XMM(regTemp, regd);
		SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0x64);
	}
}
// Sign-preserving clamp for write mask 1011 (per the original comment);
// regTemp is clobbered on the non-SSE4 path.
// SSE4.1 path: PMINSD/PMINUD against the [13] rows of the per-mask
// tables — see vFloat12c for how this preserves signs with only two ops.
// Non-SSE4 path: save all sign bits, clamp lane 0, rotate the three
// clamped lanes through lane 0 with PSHUFLW 0x4e / SHUFPS 0x27, clamp
// each, undo the net permutation with SHUFPS 0x2d, then OR signs back.
// (Reconstructed from fused old/new diff lines; post-change code.)
void vFloat13c(int regd, int regTemp) { //1011
	if ( cpucaps.hasStreamingSIMD4Extensions ) {
		SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[13][0]);
		SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[13][0]);
	}
	else {
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); // save sign bits
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d); // undo net permutation
		SSE_ORPS_XMM_to_XMM(regd, regTemp); // restore signs
	}
}
void vFloat13c_useEAX(int regd, int regTemp) { //1011 // EAX is modified void vFloat13c_useEAX(int regd, int regTemp) { //1011 // EAX is modified
SSE2_PSHUFD_XMM_to_XMM(regTemp, regd, 0xd2); SSE2_PSHUFD_XMM_to_XMM(regTemp, regd, 0xd2);
@ -1199,18 +1456,14 @@ void vFloat13c_useEAX(int regd, int regTemp) { //1011 // EAX is modified
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE_ORPS_XMM_to_XMM(regd, regTemp); SSE_ORPS_XMM_to_XMM(regd, regTemp);
if ( cpucaps.hasStreamingSIMD4Extensions ) SSE2_MOVD_R_to_XMM(regTemp, EAX);
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x02); SSE_SHUFPS_XMM_to_XMM(regTemp, regd, 0xf0);
else { SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0x84);
SSE2_MOVD_R_to_XMM(regTemp, EAX);
SSE_SHUFPS_XMM_to_XMM(regTemp, regd, 0xf0);
SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0x84);
}
} }
void vFloat14(int regd, int regTemp) { //0111 void vFloat14(int regd, int regTemp) { //0111
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1); SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals); SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals); SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
@ -1233,29 +1486,43 @@ void vFloat14_useEAX(int regd, int regTemp) { //0111 // EAX is modified
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27); SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
} }
// Clamp three lanes (write mask 0111 per the original comment, i.e. one
// lane left untouched); regTemp is clobbered.
// SSE4.1 path: same sign-flip / PMINSD / PMINUD / PCMPGTD-vs-g_NaNs_XYZW
// scheme as vFloat12b, using the [14] rows of the per-mask tables
// (defined elsewhere in this file).
// Non-SSE4 path: MINPS/MAXPS the whole register, then rebuild with
// MOVHLPS + SHUFPS 0xc4 so the untouched lane keeps its original value
// from the regTemp copy.
// (Reconstructed from fused old/new diff lines; post-change code.)
void vFloat14b(int regd, int regTemp) { //0111 //regTemp is Modified
	if ( cpucaps.hasStreamingSIMD4Extensions ) {
		SSE2_PCMPEQD_XMM_to_XMM(regTemp, regTemp);
		SSE2_PSLLD_I8_to_XMM(regTemp, 31);
		SSE_XORPS_XMM_to_XMM(regTemp, regd); // regTemp = regd with signs flipped
		SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[14][0]);
		SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[14][0]);
		SSE2_PCMPGTD_M128_to_XMM(regTemp, (uptr)&g_NaNs_XYZW[14][0]);
		SSE2_PSLLD_I8_to_XMM(regTemp, 31);
		SSE_XORPS_XMM_to_XMM(regd, regTemp); // restore signs where needed
	}
	else {
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd); // keep unclamped copy
		SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
		SSE_MOVHLPS_XMM_to_XMM(regTemp, regd);
		SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0xc4);
	}
}
// Sign-preserving clamp for write mask 0111 (per the original comment);
// regTemp is clobbered on the non-SSE4 path.
// SSE4.1 path: PMINSD/PMINUD against the [14] rows of the per-mask
// tables — see vFloat12c for how this preserves signs with only two ops.
// Non-SSE4 path: save all sign bits, clamp lane 0, rotate the three
// clamped lanes through lane 0 with PSHUFLW 0x4e / SHUFPS 0xc6, clamp
// each, undo the net permutation with SHUFPS 0xc9, then OR signs back.
// (Reconstructed from fused old/new diff lines; post-change code.)
void vFloat14c(int regd, int regTemp) { //0111
	if ( cpucaps.hasStreamingSIMD4Extensions ) {
		SSE4_PMINSD_M128_to_XMM(regd, (uptr)&g_maxvals_XYZW[14][0]);
		SSE4_PMINUD_M128_to_XMM(regd, (uptr)&g_minvals_XYZW[14][0]);
	}
	else {
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); // save sign bits
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE2_PSHUFLW_XMM_to_XMM(regd, regd, 0x4e);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
		SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
		SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
		SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9); // undo net permutation
		SSE_ORPS_XMM_to_XMM(regd, regTemp); // restore signs
	}
}
void vFloat14c_useEAX(int regd, int regTemp) { //0111 // EAX is modified void vFloat14c_useEAX(int regd, int regTemp) { //0111 // EAX is modified
SSE2_PSHUFD_XMM_to_XMM(regTemp, regd, 0x93); SSE2_PSHUFD_XMM_to_XMM(regTemp, regd, 0x93);
@ -1265,24 +1532,26 @@ void vFloat14c_useEAX(int regd, int regTemp) { //0111 // EAX is modified
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals); SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals); SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE_ORPS_XMM_to_XMM(regd, regTemp); SSE_ORPS_XMM_to_XMM(regd, regTemp);
if ( cpucaps.hasStreamingSIMD4Extensions ) SSE2_MOVD_R_to_XMM(regTemp, EAX);
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x03); SSE_SHUFPS_XMM_to_XMM(regTemp, regd, 0xa0);
else { SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0x24);
SSE2_MOVD_R_to_XMM(regTemp, EAX);
SSE_SHUFPS_XMM_to_XMM(regTemp, regd, 0xa0);
SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0x24);
}
} }
// Clamp all four lanes (write mask 1111) of regd to [-fMax, +fMax].
// g_maxvals/g_minvals hold 0x7f7fffff / 0xff7fffff (+/-FLT_MAX); a NaN
// lane becomes +fMax because MINPS returns the second (memory) operand
// when either source is NaN. regTemp is unused.
// (Reconstructed: original text had each line duplicated by a fused diff.)
void vFloat15(int regd, int regTemp) { //1111
	SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
	SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
}
// Sign-preserving clamp of all four lanes (write mask 1111); regTemp is
// clobbered on the non-SSE4 path.
// SSE4.1 path (two instructions): PMINSD vs +fMax clamps the positive
// range as signed ints; PMINUD vs 0xff7fffff clamps the negative range,
// since negative floats compare as large unsigned integers. Signs are
// preserved without any masking.
// Non-SSE4 path: extract sign bits with const_clip[4..7] (0x80000000),
// MINPS/MAXPS to +/-fMax, then OR the original signs back in.
// (Reconstructed from fused old/new diff lines; post-change code.)
void vFloat15c(int regd, int regTemp) { //1111
	if ( cpucaps.hasStreamingSIMD4Extensions ) {
		SSE4_PMINSD_M128_to_XMM(regd, (uptr)g_maxvals);
		SSE4_PMINUD_M128_to_XMM(regd, (uptr)g_minvals);
	}
	else {
		SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
		SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]); // save sign bits
		SSE_MINPS_M128_to_XMM(regd, (uptr)&g_maxvals[0]);
		SSE_MAXPS_M128_to_XMM(regd, (uptr)&g_minvals[0]);
		SSE_ORPS_XMM_to_XMM(regd, regTemp); // restore signs
	}
}
vFloat vFloats1[16] = { //regTemp is not modified vFloat vFloats1[16] = { //regTemp is not modified
@ -1299,8 +1568,8 @@ vFloat vFloats1_useEAX[16] = { //regTemp is not modified but EAX is used
// Dispatch table of clamp routines indexed by the 4-bit XYZW write mask;
// these variants may modify regTemp (the b-suffixed entries use it).
// (Reconstructed from fused old/new diff lines: rows 2 and 3 now use the
// b variants vFloat5b/6b/9b/10b, which are defined elsewhere in this file.)
vFloat vFloats2[16] = { //regTemp is modified
	vFloat0, vFloat1, vFloat2, vFloat3b,
	vFloat4, vFloat5b, vFloat6b, vFloat7b,
	vFloat8, vFloat9b, vFloat10b, vFloat11b,
	vFloat12b, vFloat13b, vFloat14b, vFloat15 };
vFloat vFloats4[16] = { //regTemp is modified vFloat vFloats4[16] = { //regTemp is modified
@ -1321,7 +1590,7 @@ vFloat vFloats4_useEAX[16] = { //regTemp is modified and EAX is used
// vuFloat_useEAX : "normal" clamping (faster but EAX is modified)
// vuFloat2 : "normal" clamping (fastest but regTemp is modified)
// vuFloat3 : "preserve sign" clamping for pointer
// vuFloat4 : "preserve sign" clamping (regTemp is modified; *FASTEST* on SSE4 CPUs)
// vuFloat4_useEAX : "preserve sign" clamping (faster but regTemp and EAX are modified)
// vuFloat5 : wrapper function for vuFloat2 and vuFloat4
// vuFloat5_useEAX : wrapper function for vuFloat2 and vuFloat4_useEAX
@ -1348,7 +1617,7 @@ void vuFloat( int info, int regd, int XYZW) {
} }
} }
// Clamps +/-NaN to +fMax and +/-Inf to +/-fMax (uses EAX as a temp register; faster but **destroys EAX**)
void vuFloat_useEAX( int info, int regd, int XYZW) { void vuFloat_useEAX( int info, int regd, int XYZW) {
if( CHECK_VU_OVERFLOW ) { if( CHECK_VU_OVERFLOW ) {
vFloats1_useEAX[XYZW](regd, regd); vFloats1_useEAX[XYZW](regd, regd);
@ -1370,10 +1639,13 @@ void vuFloat4(int regd, int regTemp, int XYZW) {
} }
} }
// Clamps +/-NaN and +/-Inf to +/-fMax (uses a temp reg, and uses EAX as a temp register; faster but **destroys EAX**)
// Sign-preserving clamp of regd's masked lanes, gated on CHECK_VU_OVERFLOW.
// On SSE4.1 CPUs the plain vFloats4 table is used (the SSE4 PMINSD/PMINUD
// sequences need no integer scratch register); otherwise the _useEAX table
// is used, which destroys EAX. regTemp may be clobbered either way.
// (Reconstructed from fused old/new diff lines; post-change code.)
void vuFloat4_useEAX(int regd, int regTemp, int XYZW) {
	if( CHECK_VU_OVERFLOW ) {
		if ( cpucaps.hasStreamingSIMD4Extensions )
			vFloats4[XYZW](regd, regTemp);
		else
			vFloats4_useEAX[XYZW](regd, regTemp);
	}
}

View File

@ -72,6 +72,8 @@ typedef void (*vFloat)(int regd, int regTemp);
extern vFloat vFloats1[16]; extern vFloat vFloats1[16];
extern vFloat vFloats1_useEAX[16]; extern vFloat vFloats1_useEAX[16];
extern vFloat vFloats2[16]; extern vFloat vFloats2[16];
extern vFloat vFloats4[16];
extern vFloat vFloats4_useEAX[16];
extern PCSX2_ALIGNED16(float s_fones[8]); extern PCSX2_ALIGNED16(float s_fones[8]);
extern PCSX2_ALIGNED16(u32 s_mask[4]); extern PCSX2_ALIGNED16(u32 s_mask[4]);
extern PCSX2_ALIGNED16(u32 s_expmask[4]); extern PCSX2_ALIGNED16(u32 s_expmask[4]);
@ -283,4 +285,4 @@ void recVUMI_XTOP(VURegs *vuRegs, int info);
void recVUMI_XITOP(VURegs *vuRegs, int info); void recVUMI_XITOP(VURegs *vuRegs, int info);
void recVUMI_XTOP( VURegs *VU , int info); void recVUMI_XTOP( VURegs *VU , int info);
#endif /* __IVUMICRO_H__ */ #endif /* __IVUMICRO_H__ */

View File

@ -140,7 +140,7 @@ void recVUMI_DIV(VURegs *VU, int info)
SSE_DIVSS_XMM_to_XMM(EEREC_TEMP, EEREC_T); SSE_DIVSS_XMM_to_XMM(EEREC_TEMP, EEREC_T);
_vuFlipRegSS_xyzw(EEREC_T, _Ftf_); _vuFlipRegSS_xyzw(EEREC_T, _Ftf_);
vuFloat2(EEREC_TEMP, EEREC_TEMP, 0x8); vuFloat_useEAX(info, EEREC_TEMP, 0x8);
x86SetJ32(bjmp32); x86SetJ32(bjmp32);
@ -226,9 +226,9 @@ void recVUMI_RSQRT(VURegs *VU, int info)
x86SetJ8(ajmp8); x86SetJ8(ajmp8);
_unpackVFSS_xyzw(t1reg, EEREC_S, _Fsf_); _unpackVFSS_xyzw(t1reg, EEREC_S, _Fsf_);
if (CHECK_VU_EXTRA_OVERFLOW) vuFloat2(t1reg, t1reg, 0x8); // Clamp Infinities if (CHECK_VU_EXTRA_OVERFLOW) vuFloat_useEAX(info, t1reg, 0x8); // Clamp Infinities
SSE_DIVSS_XMM_to_XMM(t1reg, EEREC_TEMP); SSE_DIVSS_XMM_to_XMM(t1reg, EEREC_TEMP);
vuFloat2(t1reg, t1reg, 0x8); vuFloat_useEAX(info, t1reg, 0x8);
SSE_MOVSS_XMM_to_M32(VU_VI_ADDR(REG_Q, 0), t1reg); SSE_MOVSS_XMM_to_M32(VU_VI_ADDR(REG_Q, 0), t1reg);
x86SetJ8(bjmp8); x86SetJ8(bjmp8);
@ -813,7 +813,7 @@ void _saveEAX(VURegs *VU, int x86reg, uptr offset, int info)
else SSE_MOVHPS_XMM_to_M64(offset+8, EEREC_S); else SSE_MOVHPS_XMM_to_M64(offset+8, EEREC_S);
break; break;
case 4: // Y case 4: // Y
SSE2_PSHUFD_XMM_to_XMM(EEREC_TEMP, EEREC_S, 0xe1); SSE2_PSHUFLW_XMM_to_XMM(EEREC_TEMP, EEREC_S, 0x4e);
if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+4); if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+4);
else SSE_MOVSS_XMM_to_M32(offset+4, EEREC_TEMP); else SSE_MOVSS_XMM_to_M32(offset+4, EEREC_TEMP);
break; break;
@ -1566,7 +1566,7 @@ void vuSqSumXYZ(int regd, int regs, int regtemp) // regd.x = x ^ 2 + y ^ 2 + z
} }
else { else {
SSE_MOVSS_XMM_to_XMM(regd, regtemp); SSE_MOVSS_XMM_to_XMM(regd, regtemp);
SSE_SHUFPS_XMM_to_XMM(regtemp, regtemp, 0xE1); // wzyx -> wzxy SSE2_PSHUFLW_XMM_to_XMM(regtemp, regtemp, 0x4e); // wzyx -> wzxy
SSE_ADDSS_XMM_to_XMM(regd, regtemp); // x ^ 2 + y ^ 2 SSE_ADDSS_XMM_to_XMM(regd, regtemp); // x ^ 2 + y ^ 2
SSE_SHUFPS_XMM_to_XMM(regtemp, regtemp, 0xD2); // wzxy -> wxyz SSE_SHUFPS_XMM_to_XMM(regtemp, regtemp, 0xD2); // wzxy -> wxyz
SSE_ADDSS_XMM_to_XMM(regd, regtemp); // x ^ 2 + y ^ 2 + z ^ 2 SSE_ADDSS_XMM_to_XMM(regd, regtemp); // x ^ 2 + y ^ 2 + z ^ 2
@ -1710,7 +1710,7 @@ void recVUMI_ESUM( VURegs *VU, int info )
if( cpucaps.hasStreamingSIMD3Extensions ) { if( cpucaps.hasStreamingSIMD3Extensions ) {
SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S); SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
if (CHECK_VU_EXTRA_OVERFLOW) vuFloat2(EEREC_TEMP, EEREC_TEMP, 0xf); if (CHECK_VU_EXTRA_OVERFLOW) vuFloat_useEAX(info, EEREC_TEMP, 0xf);
SSE3_HADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); SSE3_HADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
SSE3_HADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); SSE3_HADDPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP);
} }
@ -1744,11 +1744,11 @@ void recVUMI_ERCPR( VURegs *VU, int info )
SSE_DIVSS_XMM_to_XMM(EEREC_TEMP, EEREC_S); SSE_DIVSS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
break; break;
case 1: //0010 case 1: //0010
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xe1); SSE2_PSHUFLW_XMM_to_XMM(EEREC_S, EEREC_S, 0x4e);
if (CHECK_VU_EXTRA_OVERFLOW) vuFloat5_useEAX(EEREC_S, EEREC_TEMP, 8); if (CHECK_VU_EXTRA_OVERFLOW) vuFloat5_useEAX(EEREC_S, EEREC_TEMP, 8);
SSE_MOVSS_M32_to_XMM(EEREC_TEMP, (uptr)VU_ONE); // temp <- 1 SSE_MOVSS_M32_to_XMM(EEREC_TEMP, (uptr)VU_ONE); // temp <- 1
SSE_DIVSS_XMM_to_XMM(EEREC_TEMP, EEREC_S); SSE_DIVSS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xe1); SSE2_PSHUFLW_XMM_to_XMM(EEREC_S, EEREC_S, 0x4e);
break; break;
case 2: //0100 case 2: //0100
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xc6); SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xc6);

View File

@ -1502,9 +1502,8 @@ void recVUMI_MUL_toD(VURegs *VU, int regd, int info)
{ {
//SysPrintf ("recVUMI_MUL_toD \n"); //SysPrintf ("recVUMI_MUL_toD \n");
if (CHECK_VU_EXTRA_OVERFLOW) { if (CHECK_VU_EXTRA_OVERFLOW) {
//using vuFloat instead of vuFloat2 incase regd == EEREC_TEMP if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
if (_Fs_) vuFloat_useEAX( info, EEREC_S, _X_Y_Z_W ); if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W );
if (_Ft_) vuFloat_useEAX( info, EEREC_T, _X_Y_Z_W );
} }
if (_X_Y_Z_W == 1 && (_Ft_ == 0 || _Fs_==0) ) { // W if (_X_Y_Z_W == 1 && (_Ft_ == 0 || _Fs_==0) ) { // W
@ -1545,7 +1544,7 @@ void recVUMI_MUL_iq_toD(VURegs *VU, uptr addr, int regd, int info)
//SysPrintf ("recVUMI_MUL_iq_toD \n"); //SysPrintf ("recVUMI_MUL_iq_toD \n");
if (CHECK_VU_EXTRA_OVERFLOW) { if (CHECK_VU_EXTRA_OVERFLOW) {
vuFloat3(addr); vuFloat3(addr);
if (_Fs_) vuFloat_useEAX( info, EEREC_S, _X_Y_Z_W ); if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
} }
if( _XYZW_SS ) { if( _XYZW_SS ) {
@ -1599,11 +1598,12 @@ void recVUMI_MUL_xyzw_toD(VURegs *VU, int xyzw, int regd, int info)
{ {
//SysPrintf ("recVUMI_MUL_xyzw_toD \n"); //SysPrintf ("recVUMI_MUL_xyzw_toD \n");
if (CHECK_VU_EXTRA_OVERFLOW) { if (CHECK_VU_EXTRA_OVERFLOW) {
if (_Ft_) vuFloat_useEAX( info, EEREC_T, ( 1 << (3 - xyzw) ) ); if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) );
}
if (_Fs_) { // This is needed for alot of games; so always clamp this operand
if (CHECK_VU_SIGN_OVERFLOW) vFloats4_useEAX[_X_Y_Z_W]( EEREC_S, EEREC_TEMP ); // Always clamp EEREC_S, regardless if CHECK_VU_OVERFLOW is set
else vFloats2[_X_Y_Z_W]( EEREC_S, EEREC_TEMP ); // Always clamp EEREC_S, regardless if CHECK_VU_OVERFLOW is set
} }
// This is needed for alot of games
if (_Fs_) vFloats1_useEAX[_X_Y_Z_W]( EEREC_S, EEREC_S ); // Always clamp EEREC_S, regardless if CHECK_VU_OVERFLOW is set
if( _Ft_ == 0 ) { if( _Ft_ == 0 ) {
if( xyzw < 3 ) { if( xyzw < 3 ) {
if (_X_Y_Z_W != 0xf) { if (_X_Y_Z_W != 0xf) {
@ -1736,9 +1736,9 @@ void recVUMI_MADD_toD(VURegs *VU, int regd, int info)
{ {
//SysPrintf ("recVUMI_MADD_toD \n"); //SysPrintf ("recVUMI_MADD_toD \n");
if (CHECK_VU_EXTRA_OVERFLOW) { if (CHECK_VU_EXTRA_OVERFLOW) {
if (_Fs_) vuFloat_useEAX( info, EEREC_S, _X_Y_Z_W ); if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
if (_Ft_) vuFloat_useEAX( info, EEREC_T, _X_Y_Z_W ); if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, _X_Y_Z_W );
vuFloat_useEAX( info, EEREC_ACC, _X_Y_Z_W ); vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W );
} }
if( _X_Y_Z_W == 8 ) { if( _X_Y_Z_W == 8 ) {
@ -1804,8 +1804,8 @@ void recVUMI_MADD_iq_toD(VURegs *VU, uptr addr, int regd, int info)
//SysPrintf ("recVUMI_MADD_iq_toD \n"); //SysPrintf ("recVUMI_MADD_iq_toD \n");
if (CHECK_VU_EXTRA_OVERFLOW) { if (CHECK_VU_EXTRA_OVERFLOW) {
vuFloat3(addr); vuFloat3(addr);
if (_Fs_) vuFloat_useEAX( info, EEREC_S, _X_Y_Z_W ); if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
vuFloat_useEAX( info, EEREC_ACC, _X_Y_Z_W ); vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W );
} }
if( _X_Y_Z_W == 8 ) { if( _X_Y_Z_W == 8 ) {
@ -1891,12 +1891,13 @@ void recVUMI_MADD_xyzw_toD(VURegs *VU, int xyzw, int regd, int info)
{ {
//SysPrintf ("recVUMI_MADD_xyzw_toD \n"); //SysPrintf ("recVUMI_MADD_xyzw_toD \n");
if (CHECK_VU_EXTRA_OVERFLOW) { if (CHECK_VU_EXTRA_OVERFLOW) {
if (_Ft_) vuFloat_useEAX( info, EEREC_T, ( 1 << (3 - xyzw) ) ); if (_Ft_) vuFloat5_useEAX( EEREC_T, EEREC_TEMP, ( 1 << (3 - xyzw) ) );
vuFloat_useEAX( info, EEREC_ACC, _X_Y_Z_W ); vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W );
}
if (_Fs_) { // This is needed for alot of games; so always clamp this operand
if (CHECK_VU_SIGN_OVERFLOW) vFloats4_useEAX[_X_Y_Z_W]( EEREC_S, EEREC_TEMP ); // Always clamp EEREC_S, regardless if CHECK_VU_OVERFLOW is set
else vFloats2[_X_Y_Z_W]( EEREC_S, EEREC_TEMP ); // Always clamp EEREC_S, regardless if CHECK_VU_OVERFLOW is set
} }
// This is needed for alot of games
if (_Fs_) vFloats1_useEAX[_X_Y_Z_W]( EEREC_S, EEREC_S ); // Always clamp EEREC_S, regardless if CHECK_VU_OVERFLOW is set
if( _Ft_ == 0 ) { if( _Ft_ == 0 ) {
if( xyzw == 3 ) { if( xyzw == 3 ) {
@ -2094,9 +2095,9 @@ void recVUMI_MSUB_toD(VURegs *VU, int regd, int info)
{ {
//SysPrintf ("recVUMI_MSUB_toD \n"); //SysPrintf ("recVUMI_MSUB_toD \n");
if (CHECK_VU_EXTRA_OVERFLOW) { if (CHECK_VU_EXTRA_OVERFLOW) {
if (_Fs_) vuFloat_useEAX( info, EEREC_S, _X_Y_Z_W ); if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
if (_Ft_) vuFloat_useEAX( info, EEREC_T, _X_Y_Z_W ); if (_Ft_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
vuFloat_useEAX( info, EEREC_ACC, _X_Y_Z_W ); vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W );
} }
if (_X_Y_Z_W != 0xf) { if (_X_Y_Z_W != 0xf) {
@ -2155,8 +2156,8 @@ void recVUMI_MSUB_temp_toD(VURegs *VU, int regd, int info)
{ {
//SysPrintf ("recVUMI_MSUB_temp_toD \n"); //SysPrintf ("recVUMI_MSUB_temp_toD \n");
if (CHECK_VU_EXTRA_OVERFLOW) { if (CHECK_VU_EXTRA_OVERFLOW) {
if (_Fs_) vuFloat_useEAX( info, EEREC_S, _X_Y_Z_W ); if (_Fs_) vuFloat5_useEAX( EEREC_S, EEREC_TEMP, _X_Y_Z_W );
vuFloat_useEAX( info, EEREC_ACC, _X_Y_Z_W ); vuFloat5_useEAX( EEREC_ACC, EEREC_TEMP, _X_Y_Z_W );
} }
if (_X_Y_Z_W != 0xf) { if (_X_Y_Z_W != 0xf) {
@ -3069,4 +3070,4 @@ void recVUMI_CLIP(VURegs *VU, int info)
_freeX86reg(x86temp1); _freeX86reg(x86temp1);
_freeX86reg(x86temp2); _freeX86reg(x86temp2);
} }

View File

@ -1620,6 +1620,14 @@ extern void SSE4_BLENDVPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from);
extern void SSE4_BLENDVPS_M128_to_XMM(x86SSERegType to, uptr from); extern void SSE4_BLENDVPS_M128_to_XMM(x86SSERegType to, uptr from);
extern void SSE4_PMOVSXDQ_XMM_to_XMM(x86SSERegType to, x86SSERegType from); extern void SSE4_PMOVSXDQ_XMM_to_XMM(x86SSERegType to, x86SSERegType from);
extern void SSE4_PINSRD_R32_to_XMM(x86SSERegType to, x86IntRegType from, u8 imm8); extern void SSE4_PINSRD_R32_to_XMM(x86SSERegType to, x86IntRegType from, u8 imm8);
extern void SSE4_PMAXSD_XMM_to_XMM(x86SSERegType to, x86SSERegType from);
extern void SSE4_PMINSD_XMM_to_XMM(x86SSERegType to, x86SSERegType from);
extern void SSE4_PMAXUD_XMM_to_XMM(x86SSERegType to, x86SSERegType from);
extern void SSE4_PMINUD_XMM_to_XMM(x86SSERegType to, x86SSERegType from);
extern void SSE4_PMAXSD_M128_to_XMM(x86SSERegType to, uptr from);
extern void SSE4_PMINSD_M128_to_XMM(x86SSERegType to, uptr from);
extern void SSE4_PMAXUD_M128_to_XMM(x86SSERegType to, uptr from);
extern void SSE4_PMINUD_M128_to_XMM(x86SSERegType to, uptr from);
//********************* //*********************
// SSE-X - uses both SSE,SSE2 code and tries to keep consistensies between the data // SSE-X - uses both SSE,SSE2 code and tries to keep consistensies between the data
@ -1737,4 +1745,4 @@ __forceinline void write32(u32 val )
x86Ptr += 4; x86Ptr += 4;
} }
#endif // __IX86_H__ #endif // __IX86_H__

View File

@ -1152,6 +1152,74 @@ __forceinline void SSE4_PINSRD_R32_to_XMM(x86SSERegType to, x86IntRegType from,
write8(imm8); write8(imm8);
} }
// Emit SSE4.1 "pmaxsd xmm(to), xmm(from)" — packed maximum of signed
// 32-bit integers. Encoding: 66 [REX] 0F 38 3D /r. write24 stores its
// argument little-endian, so 0x3D380F emits the bytes 0F 38 3D.
__forceinline void SSE4_PMAXSD_XMM_to_XMM(x86SSERegType to, x86SSERegType from)
{
write8(0x66); // mandatory 66 prefix
RexRB(0, to, from); // REX prefix when extended registers are used (x86-64)
write24(0x3D380F); // opcode bytes 0F 38 3D
ModRM(3, to, from); // mod=3: register-direct operand
}
// Emit SSE4.1 "pminsd xmm(to), xmm(from)" — packed minimum of signed
// 32-bit integers. Encoding: 66 [REX] 0F 38 39 /r (write24 is
// little-endian, so 0x39380F emits 0F 38 39).
__forceinline void SSE4_PMINSD_XMM_to_XMM(x86SSERegType to, x86SSERegType from)
{
write8(0x66); // mandatory 66 prefix
RexRB(0, to, from); // REX prefix when extended registers are used (x86-64)
write24(0x39380F); // opcode bytes 0F 38 39
ModRM(3, to, from); // mod=3: register-direct operand
}
// Emit SSE4.1 "pmaxud xmm(to), xmm(from)" — packed maximum of unsigned
// 32-bit integers. Encoding: 66 [REX] 0F 38 3F /r (write24 is
// little-endian, so 0x3F380F emits 0F 38 3F).
__forceinline void SSE4_PMAXUD_XMM_to_XMM(x86SSERegType to, x86SSERegType from)
{
write8(0x66); // mandatory 66 prefix
RexRB(0, to, from); // REX prefix when extended registers are used (x86-64)
write24(0x3F380F); // opcode bytes 0F 38 3F
ModRM(3, to, from); // mod=3: register-direct operand
}
// Emit SSE4.1 "pminud xmm(to), xmm(from)" — packed minimum of unsigned
// 32-bit integers. Encoding: 66 [REX] 0F 38 3B /r (write24 is
// little-endian, so 0x3B380F emits 0F 38 3B).
__forceinline void SSE4_PMINUD_XMM_to_XMM(x86SSERegType to, x86SSERegType from)
{
write8(0x66); // mandatory 66 prefix
RexRB(0, to, from); // REX prefix when extended registers are used (x86-64)
write24(0x3B380F); // opcode bytes 0F 38 3B
ModRM(3, to, from); // mod=3: register-direct operand
}
// Emit SSE4.1 "pmaxsd xmm(to), [from]" — packed signed 32-bit max with a
// 128-bit memory source at absolute address 'from'.
// Encoding: 66 [REX] 0F 38 3D /r with mod=00, rm=101 (disp32).
__forceinline void SSE4_PMAXSD_M128_to_XMM(x86SSERegType to, uptr from)
{
write8(0x66); // mandatory 66 prefix
RexR(0, to); // REX prefix for an extended destination register (x86-64)
write24(0x3D380F); // opcode bytes 0F 38 3D
ModRM( 0, to, DISP32 ); // mod=0 + disp32 addressing
write32(MEMADDR(from, 4)); // absolute address operand (MEMADDR fixes it up)
}
// Emit SSE4.1 "pminsd xmm(to), [from]" — packed signed 32-bit min with a
// 128-bit memory source at absolute address 'from'.
// Encoding: 66 [REX] 0F 38 39 /r with mod=00, rm=101 (disp32).
__forceinline void SSE4_PMINSD_M128_to_XMM(x86SSERegType to, uptr from)
{
write8(0x66); // mandatory 66 prefix
RexR(0, to); // REX prefix for an extended destination register (x86-64)
write24(0x39380F); // opcode bytes 0F 38 39
ModRM( 0, to, DISP32 ); // mod=0 + disp32 addressing
write32(MEMADDR(from, 4)); // absolute address operand (MEMADDR fixes it up)
}
// Emit SSE4.1 "pmaxud xmm(to), [from]" — packed unsigned 32-bit max with
// a 128-bit memory source at absolute address 'from'.
// Encoding: 66 [REX] 0F 38 3F /r with mod=00, rm=101 (disp32).
__forceinline void SSE4_PMAXUD_M128_to_XMM(x86SSERegType to, uptr from)
{
write8(0x66); // mandatory 66 prefix
RexR(0, to); // REX prefix for an extended destination register (x86-64)
write24(0x3F380F); // opcode bytes 0F 38 3F
ModRM( 0, to, DISP32 ); // mod=0 + disp32 addressing
write32(MEMADDR(from, 4)); // absolute address operand (MEMADDR fixes it up)
}
// Emit SSE4.1 "pminud xmm(to), [from]" — packed unsigned 32-bit min with
// a 128-bit memory source at absolute address 'from'.
// Encoding: 66 [REX] 0F 38 3B /r with mod=00, rm=101 (disp32).
__forceinline void SSE4_PMINUD_M128_to_XMM(x86SSERegType to, uptr from)
{
write8(0x66); // mandatory 66 prefix
RexR(0, to); // REX prefix for an extended destination register (x86-64)
write24(0x3B380F); // opcode bytes 0F 38 3B
ModRM( 0, to, DISP32 ); // mod=0 + disp32 addressing
write32(MEMADDR(from, 4)); // absolute address operand (MEMADDR fixes it up)
}
// SSE-X // SSE-X
__forceinline void SSEX_MOVDQA_M128_to_XMM( x86SSERegType to, uptr from ) __forceinline void SSEX_MOVDQA_M128_to_XMM( x86SSERegType to, uptr from )
{ {
@ -1312,4 +1380,4 @@ __forceinline void SSEX_MOVHLPS_XMM_to_XMM( x86SSERegType to, x86SSERegType from
else { else {
SSE_MOVHLPS_XMM_to_XMM(to, from); SSE_MOVHLPS_XMM_to_XMM(to, from);
} }
} }