Tmmk fixed a bug in saveEAX where a vector wasn't saved in some situations. He also optimized some VU functions, so this revision brings about 5% more speed ;)

git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@631 a6443dda-0b58-4228-96e9-037be469359c
ramapcsx2 authored on 2009-01-25 13:09:54 +00:00, committed by Gregory Hainaut
parent dde14aab30
commit c787b00cc7
5 changed files with 607 additions and 206 deletions

View File

@ -576,12 +576,11 @@ int _vuGetTempXMMreg(int info)
//------------------------------------------------------------------
void _unpackVF_xyzw(int dstreg, int srcreg, int xyzw)
{
SSE_MOVAPS_XMM_to_XMM(dstreg, srcreg);
switch (xyzw) {
case 0: SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0x00); break;
case 1: SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0x55); break;
case 2: SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0xaa); break;
case 3: SSE_SHUFPS_XMM_to_XMM(dstreg, dstreg, 0xff); break;
case 0: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x00); break;
case 1: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0x55); break;
case 2: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xaa); break;
case 3: SSE2_PSHUFD_XMM_to_XMM(dstreg, srcreg, 0xff); break;
}
}
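For reference, the new path emits a single PSHUFD that reads srcreg directly, where the old path first copied srcreg into dstreg with MOVAPS and then shuffled the copy with SHUFPS. A minimal intrinsics sketch of the same broadcast (illustrative only; the recompiler emits the instruction through the SSE2_PSHUFD wrapper):

#include <emmintrin.h> // SSE2

// Broadcast component `xyzw` (0..3) of src into every lane of the result,
// which is what the single PSHUFD above does.
static __m128 unpackVF_xyzw_sketch(__m128 src, int xyzw)
{
    const __m128i s = _mm_castps_si128(src);
    switch (xyzw) {
        case 0:  return _mm_castsi128_ps(_mm_shuffle_epi32(s, 0x00));
        case 1:  return _mm_castsi128_ps(_mm_shuffle_epi32(s, 0x55));
        case 2:  return _mm_castsi128_ps(_mm_shuffle_epi32(s, 0xaa));
        default: return _mm_castsi128_ps(_mm_shuffle_epi32(s, 0xff));
    }
}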
@ -657,8 +656,9 @@ void VU_MERGE3(int dest, int src) { // 1100s
}
void VU_MERGE4(int dest, int src) { // 0010
SSE_MOVSS_XMM_to_XMM(src, dest);
SSE_SHUFPS_XMM_to_XMM(src, dest, 0xe4);
SSE_MOVAPS_XMM_to_XMM(dest, src);
//SSE_SHUFPS_XMM_to_XMM(src, dest, 0xe4);
//SSE_MOVAPS_XMM_to_XMM(dest, src);
SSE2_MOVSD_XMM_to_XMM(dest, src);
}
void VU_MERGE4b(int dest, int src) { // 0010s
SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
@ -743,16 +743,7 @@ void VU_MERGE11(int dest, int src) { // 1101s
SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4);
}
void VU_MERGE12(int dest, int src) { // 0011
SSE_SHUFPS_XMM_to_XMM(src, dest, 0xe4);
SSE_MOVAPS_XMM_to_XMM(dest, src);
}
void VU_MERGE12b(int dest, int src) { // 0011
SSE_MOVSS_XMM_to_XMM(dest, src);
SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
SSE_MOVSS_XMM_to_XMM(dest, src);
SSE_SHUFPS_XMM_to_XMM(src, src, 0xE1);
SSE_SHUFPS_XMM_to_XMM(dest, dest, 0xE1);
SSE2_MOVSD_XMM_to_XMM(dest, src);
}
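The register form of MOVSD copies the low 64 bits of src (the first two float fields) into dest and leaves dest's upper half alone, which is why one instruction can replace the shuffle sequences removed from VU_MERGE4 and VU_MERGE12 above. A minimal intrinsics sketch of that merge:

#include <emmintrin.h> // SSE2

// Take the low two floats from src, keep the high two floats of dest,
// as SSE2_MOVSD_XMM_to_XMM(dest, src) does.
static __m128 merge_low_qword(__m128 dest, __m128 src)
{
    return _mm_castpd_ps(_mm_move_sd(_mm_castps_pd(dest), _mm_castps_pd(src)));
}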
void VU_MERGE13(int dest, int src) { // 1011
SSE_MOVHLPS_XMM_to_XMM(dest, src);
@ -806,7 +797,7 @@ static VUMERGEFN s_VuMerge2[16] = {
VU_MERGE0, VU_MERGE1b, VU_MERGE2b, VU_MERGE3,
VU_MERGE4b, VU_MERGE5b, VU_MERGE6b, VU_MERGE7b,
VU_MERGE8, VU_MERGE9b, VU_MERGE10b, VU_MERGE11,
VU_MERGE12b, VU_MERGE13b, VU_MERGE14b, VU_MERGE15 };
VU_MERGE12, VU_MERGE13b, VU_MERGE14b, VU_MERGE15 };
// Modifies the Source Reg!
void VU_MERGE_REGS_CUSTOM(int dest, int src, int xyzw) {
@ -876,6 +867,12 @@ void vFloat3(int regd, int regTemp) { //1100
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
}
void vFloat3b(int regd, int regTemp) { //1100 //regTemp is Modified
SSE2_MOVSD_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE2_MOVSD_XMM_to_XMM(regd, regTemp);
}
void vFloat3c(int regd, int regTemp) { //1100
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
@ -912,6 +909,12 @@ void vFloat5(int regd, int regTemp) { //1010
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
}
void vFloat5b(int regd, int regTemp) { //1010
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE4_BLENDPS_XMM_to_XMM(regd, regTemp, 0x5);
}
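The new vFloat*b variants for SSE4.1-capable CPUs clamp all four lanes with MINPS/MAXPS and then restore the lanes that must stay untouched from the saved copy in regTemp with one BLENDPS. A sketch of the pattern, with the bounds passed in to stand for g_maxvals/g_minvals:

#include <smmintrin.h> // SSE4.1

// Clamp every lane, then blend back the original value for each lane whose
// bit is set in blend_imm (BLENDPS takes those lanes from its second operand),
// so only the remaining lanes end up clamped. vFloat5b above passes 0x5.
template <int blend_imm>
static __m128 clamp_selected(__m128 v, __m128 maxvals, __m128 minvals)
{
    const __m128 original = v;                   // saved copy, like regTemp
    v = _mm_min_ps(v, maxvals);                  // MINPS
    v = _mm_max_ps(v, minvals);                  // MAXPS
    return _mm_blend_ps(v, original, blend_imm); // SSE4_BLENDPS
}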
void vFloat5c(int regd, int regTemp) { //1010
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
@ -933,6 +936,12 @@ void vFloat6(int regd, int regTemp) { //0110
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
}
void vFloat6b(int regd, int regTemp) { //0110
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE4_BLENDPS_XMM_to_XMM(regd, regTemp, 0x9);
}
void vFloat6c(int regd, int regTemp) { //0110
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
@ -946,37 +955,44 @@ void vFloat6c(int regd, int regTemp) { //0110
SSE_ORPS_XMM_to_XMM(regd, regTemp);
}
void vFloat7(int regd, int regTemp) { //1110
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x39);
if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE2_MOVD_XMM_to_R(EAX, regd);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
}
else {
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x39);
}
}
void vFloat7b(int regd, int regTemp) { //1110 //regTemp is Modified
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regTemp, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regTemp, (uptr)g_minvals);
VU_MERGE_REGS_CUSTOM(regd, regTemp, 7);
SSE_MOVSS_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE_MOVSS_XMM_to_XMM(regd, regTemp);
}
void vFloat7c(int regd, int regTemp) { //1110
SSE2_MOVD_XMM_to_R(EAX, regd);
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x39);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE_ORPS_XMM_to_XMM(regd, regTemp);
if ( cpucaps.hasStreamingSIMD4Extensions )
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
else {
SSE2_MOVD_R_to_XMM(regTemp, EAX);
SSE_MOVSS_XMM_to_XMM(regd, regTemp);
}
}
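When only a single lane has to survive, the SSE4.1 branch parks that lane in EAX with MOVD, clamps the whole register, and writes the saved value back with PINSRD, replacing the long SHUFPS/MINSS/MAXSS chains of the fallback path. Roughly, in intrinsics form:

#include <smmintrin.h> // SSE4.1

// vFloat7-style clamp: clamp y, z and w but leave x exactly as it was.
static __m128 clamp_keep_x(__m128 v, __m128 maxvals, __m128 minvals)
{
    const int saved_x = _mm_cvtsi128_si32(_mm_castps_si128(v)); // MOVD EAX, xmm
    v = _mm_min_ps(v, maxvals);
    v = _mm_max_ps(v, minvals);
    return _mm_castsi128_ps(
        _mm_insert_epi32(_mm_castps_si128(v), saved_x, 0));     // PINSRD lane 0
}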
void vFloat8(int regd, int regTemp) { //0001
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
@ -997,6 +1013,12 @@ void vFloat9(int regd, int regTemp) { //1001
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
}
void vFloat9b(int regd, int regTemp) { //1001
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE4_BLENDPS_XMM_to_XMM(regd, regTemp, 0x6);
}
void vFloat9c(int regd, int regTemp) { //1001
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
@ -1016,6 +1038,12 @@ void vFloat10(int regd, int regTemp) { //0101
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
}
void vFloat10b(int regd, int regTemp) { //0101
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE4_BLENDPS_XMM_to_XMM(regd, regTemp, 0xa);
}
void vFloat10c(int regd, int regTemp) { //0101
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
@ -1028,35 +1056,54 @@ void vFloat10c(int regd, int regTemp) { //0101
SSE_ORPS_XMM_to_XMM(regd, regTemp);
}
void vFloat11(int regd, int regTemp) { //1101
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
SSE2_MOVD_XMM_to_R(EAX, regd);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
}
else {
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
}
}
void vFloat11b(int regd, int regTemp) { //1101 //regTemp is Modified
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regTemp, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regTemp, (uptr)g_minvals);
VU_MERGE_REGS_CUSTOM(regd, regTemp, 11);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
if ( cpucaps.hasStreamingSIMD4Extensions )
SSE4_BLENDPS_XMM_to_XMM(regd, regTemp, 0x02);
else {
SSE_MOVSS_XMM_to_XMM(regTemp, regd);
SSE2_MOVSD_XMM_to_XMM(regd, regTemp);
}
}
void vFloat11c(int regd, int regTemp) { //1101
SSE2_PSHUFD_XMM_to_XMM(regTemp, regd, 0xe1);
SSE2_MOVD_XMM_to_R(EAX, regTemp);
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x36);
SSE_ORPS_XMM_to_XMM(regd, regTemp);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_ORPS_XMM_to_XMM(regd, regTemp);
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x01);
}
else {
SSE_ORPS_XMM_to_XMM(regTemp, regd);
SSE2_MOVD_R_to_XMM(regd, EAX);
SSE_MOVLHPS_XMM_to_XMM(regd, regTemp);
SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0xe2);
}
}
void vFloat12(int regd, int regTemp) { //0011
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
@ -1066,6 +1113,12 @@ void vFloat12(int regd, int regTemp) { //0011
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
}
void vFloat12b(int regd, int regTemp) { //0011 //regTemp is Modified
SSE_MOVHLPS_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE2_PUNPCKLQDQ_XMM_to_XMM(regd, regTemp);
}
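vFloat12b saves the upper qword (the z and w fields) in regTemp with MOVHLPS, clamps everything, and stitches the halves back together with PUNPCKLQDQ. In intrinsics terms, approximately:

#include <emmintrin.h> // SSE2

// Clamp x and y, keep z and w untouched.
static __m128 clamp_low_keep_high(__m128 v, __m128 maxvals, __m128 minvals)
{
    const __m128 high = _mm_movehl_ps(v, v);        // MOVHLPS: z,w into low qword
    v = _mm_min_ps(v, maxvals);
    v = _mm_max_ps(v, minvals);
    return _mm_castsi128_ps(_mm_unpacklo_epi64(     // PUNPCKLQDQ
        _mm_castps_si128(v), _mm_castps_si128(high)));
}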
void vFloat12c(int regd, int regTemp) { //0011
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
@ -1078,66 +1131,100 @@ void vFloat12c(int regd, int regTemp) { //0011
SSE_ORPS_XMM_to_XMM(regd, regTemp);
}
void vFloat13(int regd, int regTemp) { //1011
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE2_MOVD_XMM_to_R(EAX, regd);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
}
else {
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
}
}
void vFloat13b(int regd, int regTemp) { //1011 //regTemp is Modified
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regTemp, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regTemp, (uptr)g_minvals);
VU_MERGE_REGS_CUSTOM(regd, regTemp, 13);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
if ( cpucaps.hasStreamingSIMD4Extensions )
SSE4_BLENDPS_XMM_to_XMM(regd, regTemp, 0x04);
else {
SSE_MOVHLPS_XMM_to_XMM(regTemp, regd);
SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0x64);
}
}
void vFloat13c(int regd, int regTemp) { //1011
SSE2_PSHUFD_XMM_to_XMM(regTemp, regd, 0xd2);
SSE2_MOVD_XMM_to_R(EAX, regTemp);
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x2d);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE_ORPS_XMM_to_XMM(regd, regTemp);
if ( cpucaps.hasStreamingSIMD4Extensions )
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x02);
else {
SSE2_MOVD_R_to_XMM(regTemp, EAX);
SSE_SHUFPS_XMM_to_XMM(regTemp, regd, 0xf0);
SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0x84);
}
}
void vFloat14(int regd, int regTemp) { //0111
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
SSE2_MOVD_XMM_to_R(EAX, regd);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x00);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0x27);
}
else {
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
}
}
void vFloat14b(int regd, int regTemp) { //0111 //regTemp is Modified
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_MINPS_M128_to_XMM(regTemp, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regTemp, (uptr)g_minvals);
VU_MERGE_REGS_CUSTOM(regd, regTemp, 14);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
if ( cpucaps.hasStreamingSIMD4Extensions )
SSE4_BLENDPS_XMM_to_XMM(regd, regTemp, 0x08);
else {
SSE_MOVHLPS_XMM_to_XMM(regTemp, regd);
SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0xc4);
}
}
void vFloat14c(int regd, int regTemp) { //0111
SSE2_PSHUFD_XMM_to_XMM(regTemp, regd, 0x93);
SSE2_MOVD_XMM_to_R(EAX, regTemp);
SSE_MOVAPS_XMM_to_XMM(regTemp, regd);
SSE_ANDPS_M128_to_XMM(regTemp, (uptr)&const_clip[4]);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xe1);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc6);
SSE_MINSS_M32_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXSS_M32_to_XMM(regd, (uptr)g_minvals);
SSE_SHUFPS_XMM_to_XMM(regd, regd, 0xc9);
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
SSE_MAXPS_M128_to_XMM(regd, (uptr)g_minvals);
SSE_ORPS_XMM_to_XMM(regd, regTemp);
if ( cpucaps.hasStreamingSIMD4Extensions )
SSE4_PINSRD_R32_to_XMM(regd, EAX, 0x03);
else {
SSE2_MOVD_R_to_XMM(regTemp, EAX);
SSE_SHUFPS_XMM_to_XMM(regTemp, regd, 0xa0);
SSE_SHUFPS_XMM_to_XMM(regd, regTemp, 0x24);
}
}
void vFloat15(int regd, int regTemp) { //1111
SSE_MINPS_M128_to_XMM(regd, (uptr)g_maxvals);
@ -1158,10 +1245,10 @@ vFloat vFloats1[16] = { //regTemp is not modified
vFloat12, vFloat13, vFloat14, vFloat15 };
vFloat vFloats2[16] = { //regTemp is modified
vFloat0, vFloat1, vFloat2, vFloat3,
vFloat0, vFloat1, vFloat2, vFloat3b,
vFloat4, vFloat5, vFloat6, vFloat7b,
vFloat8, vFloat9, vFloat10, vFloat11b,
vFloat12, vFloat13b, vFloat14b, vFloat15 };
vFloat12b, vFloat13b, vFloat14b, vFloat15 };
vFloat vFloats4[16] = { //regTemp is modified
vFloat0, vFloat1c, vFloat2c, vFloat3c,
@ -1269,4 +1356,4 @@ void SetVUNanMode(int mode)
{
g_VuNanHandling = mode;
if ( mode ) SysPrintf("enabling vunan mode");
}
}

View File

@ -799,10 +799,9 @@ void _saveEAX(VURegs *VU, int x86reg, uptr offset, int info)
switch ( _X_Y_Z_W ) {
case 1: // W*
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0x27);
if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset+12);
else SSE_MOVSS_XMM_to_M32(offset+12, EEREC_S);
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0x27);
SSE2_PSHUFD_XMM_to_XMM(EEREC_TEMP, EEREC_S, 0x27);
if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+12);
else SSE_MOVSS_XMM_to_M32(offset+12, EEREC_TEMP);
break;
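The reworked W-only store shuffles into EEREC_TEMP with PSHUFD instead of shuffling EEREC_S in place, storing, and shuffling it back, which saves an instruction and leaves the source register untouched. A sketch of what the emitted pair does:

#include <emmintrin.h> // SSE2

// Store only the W field of vec to dest[3] (offset+12) without modifying vec:
// PSHUFD 0x27 brings W into lane 0 of a temporary, MOVSS writes that lane out.
static void store_w_only(float* dest, __m128 vec)
{
    const __m128i tmp = _mm_shuffle_epi32(_mm_castps_si128(vec), 0x27);
    _mm_store_ss(dest + 3, _mm_castsi128_ps(tmp));
}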
case 2: // Z*
SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
@ -817,39 +816,35 @@ void _saveEAX(VURegs *VU, int x86reg, uptr offset, int info)
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xB1);
SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
if ( x86reg >= 0 ) {
SSE_MOVLPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+4);
SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset+4);
SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+12);
}
else {
SSE_MOVLPS_XMM_to_M64(offset+4, EEREC_S);
SSE_MOVSS_XMM_to_M32(offset+4, EEREC_S);
SSE_MOVSS_XMM_to_M32(offset+12, EEREC_TEMP);
}
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xB1);
break;
case 4: // Y*
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xe1);
if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset+4);
else SSE_MOVSS_XMM_to_M32(offset+4, EEREC_S);
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xe1);
SSE2_PSHUFD_XMM_to_XMM(EEREC_TEMP, EEREC_S, 0xe1);
if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+4);
else SSE_MOVSS_XMM_to_M32(offset+4, EEREC_TEMP);
break;
case 6: // YZ
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xc9);
if ( x86reg >= 0 ) SSE_MOVLPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+4);
else SSE_MOVLPS_XMM_to_M64(offset+4, EEREC_S);
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xd2);
SSE2_PSHUFD_XMM_to_XMM(EEREC_TEMP, EEREC_S, 0xc9);
if ( x86reg >= 0 ) SSE_MOVLPS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+4);
else SSE_MOVLPS_XMM_to_M64(offset+4, EEREC_TEMP);
break;
case 7: // YZW
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0x39);
SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
SSE2_PSHUFD_XMM_to_XMM(EEREC_TEMP, EEREC_S, 0x93); //ZYXW
if ( x86reg >= 0 ) {
SSE_MOVLPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+4);
SSE_MOVHPS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+4);
SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+12);
}
else {
SSE_MOVLPS_XMM_to_M64(offset+4, EEREC_S);
SSE_MOVHPS_XMM_to_M64(offset+4, EEREC_TEMP);
SSE_MOVSS_XMM_to_M32(offset+12, EEREC_TEMP);
}
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0x93);
break;
case 8: // X*
if ( x86reg >= 0 ) SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset);
@ -867,26 +862,41 @@ void _saveEAX(VURegs *VU, int x86reg, uptr offset, int info)
else SSE_MOVSS_XMM_to_M32(offset+12, EEREC_TEMP);
break;
//case 10: break;
//case 11: break;
case 10: //XZ
SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
if ( x86reg >= 0 ) {
SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset);
SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+8);
}
else {
SSE_MOVSS_XMM_to_M32(offset, EEREC_S);
SSE_MOVSS_XMM_to_M32(offset+8, EEREC_TEMP);
}
break;
case 11: //XZW
if ( x86reg >= 0 ) {
SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_S, offset);
SSE_MOVHPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+8);
}
else {
SSE_MOVSS_XMM_to_M32(offset, EEREC_S);
SSE_MOVHPS_XMM_to_M64(offset+8, EEREC_S);
}
break;
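Destination masks 10 (XZ) and 11 (XZW) were previously commented out, so stores using them wrote nothing; this is the saveEAX bug mentioned in the commit message. The new XZW path writes X with MOVSS and Z/W with MOVHPS. In intrinsics form, roughly:

#include <xmmintrin.h> // SSE

// Store X, Z and W of vec (the XZW mask), leaving dest[1] (the Y field) alone.
static void store_xzw(float* dest, __m128 vec)
{
    _mm_store_ss(dest, vec);                                  // MOVSS  -> offset
    _mm_storeh_pi(reinterpret_cast<__m64*>(dest + 2), vec);   // MOVHPS -> offset+8
}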
case 12: // XY
if ( x86reg >= 0 ) SSE_MOVLPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+0);
else SSE_MOVLPS_XMM_to_M64(offset, EEREC_S);
break;
case 13: // XYW
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xB4); //ZWYX
SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
SSE2_PSHUFD_XMM_to_XMM(EEREC_TEMP, EEREC_S, 0x4b); //YXZW
if ( x86reg >= 0 ) {
SSE_MOVLPS_XMM_to_RmOffset(x86reg, EEREC_S, offset+0);
SSE_MOVHPS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+0);
SSE_MOVSS_XMM_to_RmOffset(x86reg, EEREC_TEMP, offset+12);
}
else {
SSE_MOVLPS_XMM_to_M64(offset, EEREC_S);
SSE_MOVHPS_XMM_to_M64(offset, EEREC_TEMP);
SSE_MOVSS_XMM_to_M32(offset+12, EEREC_TEMP);
}
SSE_SHUFPS_XMM_to_XMM(EEREC_S, EEREC_S, 0xB4);
break;
case 14: // XYZ
SSE_MOVHLPS_XMM_to_XMM(EEREC_TEMP, EEREC_S);
@ -2006,4 +2016,4 @@ void VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr)
mtgsThread->SendDataPacket();
}
}
//------------------------------------------------------------------
//------------------------------------------------------------------

View File

@ -182,7 +182,6 @@ void recUpdateFlags(VURegs * VU, int reg, int info)
}
SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x1B); // Flip wzyx to xyzw
XOR32RtoR(x86macflag, x86macflag); // Clear Mac Flag
MOV32MtoR(x86temp, prevstataddr); // Load the previous status in to x86temp
AND16ItoR(x86temp, 0xff0); // Keep Sticky and D/I flags
@ -202,13 +201,12 @@ void recUpdateFlags(VURegs * VU, int reg, int info)
SSE_ANDPS_M128_to_XMM(t1reg, (uptr)VU_Zero_Helper_Mask);
SSE_CMPEQPS_M128_to_XMM(t1reg, (uptr)VU_Pos_Infinity); // If infinity, then overflow has occurred (NaNs don't report as overflow)
SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the previous calculation
SSE_MOVMSKPS_XMM_to_R32(x86macflag, t1reg); // Move the sign bits of the previous calculation
AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Has Overflowed" bits from the previous calculation (also make sure we're only grabbing from the XYZW being modified)
AND16ItoR(x86macflag, _X_Y_Z_W ); // Grab "Has Overflowed" bits from the previous calculation (also make sure we're only grabbing from the XYZW being modified)
pjmp = JZ8(0); // Skip if none are
OR16ItoR(x86temp, 0x208); // OS, O flags
SHL16ItoR(EAX, 12);
OR32RtoR(x86macflag, EAX);
SHL16ItoR(x86macflag, 12);
if (_XYZW_SS) pjmp32 = JMP32(0); // Skip Underflow Check
x86SetJ8(pjmp);
@ -246,35 +244,91 @@ void recUpdateFlags(VURegs * VU, int reg, int info)
vuFloat2(reg, t1reg, flipMask[_X_Y_Z_W]); // Clamp overflowed vectors that were modified (remember reg's vectors have been flipped, so have to use a flipmask)
//-------------------------Check for Signed flags------------------------------
if (_XYZW_SS) {
//-------------------------Check for Signed flags------------------------------
// The following code makes sure the Signed Bit isn't set with Negative Zero
SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg
SSE_CMPNEPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is not zero
SSE_ANDPS_XMM_to_XMM(t1reg, reg);
// The following code makes sure the Signed Bit isn't set with Negative Zero
SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg
SSE_CMPEQPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is zero
SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the t1reg
if (CHECK_VU_EXTRA_FLAGS) {
SSE_ANDNPS_XMM_to_XMM(t1reg, reg);
SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the t1reg
AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Signed" bits from the previous calculation
pjmp = JZ8(0); // Skip if none are
OR16ItoR(x86temp, 0x82); // SS, S flags
SHL16ItoR(EAX, 4);
OR32RtoR(x86macflag, EAX);
if (_XYZW_SS) pjmp2 = JMP8(0); // If negative and not Zero, we can skip the Zero Flag checking
x86SetJ8(pjmp);
AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Signed" bits from the previous calculation
pjmp = JZ8(0); // Skip if none are
OR16ItoR(x86temp, 0x82); // SS, S flags
SHL16ItoR(EAX, 4);
OR32RtoR(x86macflag, EAX);
pjmp2 = JMP8(0); // If negative and not Zero, we can skip the Zero Flag checking
x86SetJ8(pjmp);
}
else {
SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the t1reg (for zero flag)
SSE_ANDNPS_XMM_to_XMM(t1reg, reg);
SSE_MOVMSKPS_XMM_to_R32(x86macflag, t1reg); // Move the sign bits of the t1reg
//-------------------------Check for Zero flags------------------------------
SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg
SSE_CMPEQPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is zero
AND16ItoR(x86macflag, _X_Y_Z_W ); // Grab "Is Signed" bits from the previous calculation
pjmp = JZ8(0); // Skip if none are
OR16ItoR(x86temp, 0x82); // SS, S flags
SHL16ItoR(x86macflag, 4);
pjmp2 = JMP8(0); // If negative and not Zero, we can skip the Zero Flag checking
x86SetJ8(pjmp);
}
SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the previous calculation
//-------------------------Check for Zero flags------------------------------
if (CHECK_VU_EXTRA_FLAGS) {
SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg
SSE_CMPEQPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is zero
AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Zero" bits from the previous calculation
pjmp = JZ8(0); // Skip if none are
OR16ItoR(x86temp, 0x41); // ZS, Z flags
OR32RtoR(x86macflag, EAX);
x86SetJ8(pjmp);
SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the previous calculation
}
AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Zero" bits from the previous calculation
pjmp = JZ8(0); // Skip if none are
OR16ItoR(x86temp, 0x41); // ZS, Z flags
OR32RtoR(x86macflag, EAX);
x86SetJ8(pjmp);
}
else {
//-------------------------Check for Zero flags------------------------------
SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg
SSE_CMPEQPS_XMM_to_XMM(t1reg, reg); // Set all F's if each vector is zero
if (CHECK_VU_EXTRA_FLAGS) {
SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the previous calculation
AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Zero" bits from the previous calculation
pjmp = JZ8(0); // Skip if none are
OR16ItoR(x86temp, 0x41); // ZS, Z flags
OR32RtoR(x86macflag, EAX);
x86SetJ8(pjmp);
}
else {
SSE_MOVMSKPS_XMM_to_R32(x86macflag, t1reg); // Move the sign bits of the previous calculation
AND16ItoR(x86macflag, _X_Y_Z_W ); // Grab "Is Zero" bits from the previous calculation
pjmp = JZ8(0); // Skip if none are
OR16ItoR(x86temp, 0x41); // ZS, Z flags
x86SetJ8(pjmp);
}
//-------------------------Check for Signed flags------------------------------
// The following code makes sure the Signed Bit isn't set with Negative Zero
SSE_ANDNPS_XMM_to_XMM(t1reg, reg);
SSE_MOVMSKPS_XMM_to_R32(EAX, t1reg); // Move the sign bits of the t1reg
AND16ItoR(EAX, _X_Y_Z_W ); // Grab "Is Signed" bits from the previous calculation
pjmp = JZ8(0); // Skip if none are
OR16ItoR(x86temp, 0x82); // SS, S flags
SHL16ItoR(EAX, 4);
OR32RtoR(x86macflag, EAX);
x86SetJ8(pjmp);
}
//-------------------------Finally: Send the Flags to the Mac Flag Address------------------------------
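For orientation: the MAC flag assembled here packs the per-lane zero results into bits 3:0 and the per-lane sign results into bits 7:4 (hence the SHL 4 before the OR), with the overflow lanes going to the top nibble via the SHL 12 further up, while x86temp accumulates the sticky status bits (the 0x41, 0x82 and 0x208 ORs). A scalar sketch of the zero/sign packing:

#include <cstdint>

// Combine the MOVMSKPS results for "is zero" and "is negative" into the low
// byte of the VU MAC flag: zero bits in 3:0, sign bits in 7:4.
static uint32_t pack_mac_zero_sign(uint32_t zero_mask4, uint32_t sign_mask4)
{
    return (zero_mask4 & 0xf) | ((sign_mask4 & 0xf) << 4);
}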
@ -298,8 +352,6 @@ void recUpdateFlags(VURegs * VU, int reg, int info)
//
// Note: See FPU_ADD_SUB() for more info on what this is doing.
//------------------------------------------------------------------
static const PCSX2_ALIGNED16(u32 VU_fullmask[4]) = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff};
static const PCSX2_ALIGNED16(u32 VU_helperbyte[4]) = {0xff, 0xff, 0xff, 0xff};
static PCSX2_ALIGNED16(u32 VU_addsuband[2][4]);
static PCSX2_ALIGNED16(u32 VU_addsub_reg[2][4]);
static u32 ecx_temp_loc;
@ -313,17 +365,16 @@ void VU_ADD_SUB(u32 regd, u32 regt, int is_sub, int info)
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[0][0], regd);
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[1][0], regt);
SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_fullmask[0]);
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsuband[0][0], regd);
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsuband[1][0], regd);
SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]);
SSE2_PSRLD_I8_to_XMM(regd, 23);
SSE2_PSRLD_I8_to_XMM(regt, 23);
SSE2_PSLLD_I8_to_XMM(regd, 1);
SSE2_PSLLD_I8_to_XMM(regt, 1);
SSE2_PAND_M128_to_XMM(regd, (uptr)&VU_helperbyte[0]);
SSE2_PAND_M128_to_XMM(regt, (uptr)&VU_helperbyte[0]);
SSE2_PSRLD_I8_to_XMM(regd, 24);
SSE2_PSRLD_I8_to_XMM(regt, 24);
SSE2_PSUBD_XMM_to_XMM(regd, regt);
@ -389,6 +440,88 @@ void VU_ADD_SUB(u32 regd, u32 regt, int is_sub, int info)
_freeX86reg(temp2);
}
void VU_ADD_SUB_SSE4(u32 regd, u32 regt, int is_sub, int info)
{
u8 *localptr[4][8];
int temp1 = _allocX86reg(ECX, X86TYPE_TEMP, 0, 0); //receives regd//_allocX86reg(ECX, X86TYPE_TEMP, 0, ((info&PROCESS_VU_SUPER)?0:MODE_NOFRAME)|mode);
int temp2 = ALLOCTEMPX86(0);
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[0][0], regd);
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[1][0], regt);
SSE2_PSLLD_I8_to_XMM(regd, 1);
SSE2_PSLLD_I8_to_XMM(regt, 1);
SSE2_PSRLD_I8_to_XMM(regd, 24);
SSE2_PSRLD_I8_to_XMM(regt, 24);
SSE2_PSUBD_XMM_to_XMM(regd, regt);
#define PERFORM_SSE4(i) \
\
SSE_PEXTRW_XMM_to_R32(temp1, regd, i*2); \
MOVSX32R16toR(temp1, temp1); \
CMP32ItoR(temp1, 25);\
localptr[i][0] = JGE8(0);\
CMP32ItoR(temp1, 0);\
localptr[i][1] = JG8(0);\
localptr[i][2] = JE8(0);\
CMP32ItoR(temp1, -25);\
localptr[i][3] = JLE8(0);\
\
NEG32R(temp1); \
DEC32R(temp1);\
MOV32ItoR(temp2, 0xffffffff); \
SHL32CLtoR(temp2); \
SSE4_PINSRD_R32_to_XMM(regd, temp2, i); \
localptr[i][4] = JMP8(0);\
\
x86SetJ8(localptr[i][0]);\
MOV32ItoR(temp2, 0xffffffff); \
SSE4_PINSRD_R32_to_XMM(regd, temp2, i); \
SHL32ItoR(temp2, 31); \
SSE4_PINSRD_R32_to_XMM(regt, temp2, i); \
localptr[i][5] = JMP8(0);\
\
x86SetJ8(localptr[i][1]);\
DEC32R(temp1);\
MOV32ItoR(temp2, 0xffffffff);\
SSE4_PINSRD_R32_to_XMM(regd, temp2, i); \
SHL32CLtoR(temp2); \
SSE4_PINSRD_R32_to_XMM(regt, temp2, i); \
localptr[i][6] = JMP8(0);\
\
x86SetJ8(localptr[i][3]);\
MOV32ItoR(temp2, 0x80000000); \
SSE4_PINSRD_R32_to_XMM(regd, temp2, i); \
localptr[i][7] = JMP8(0);\
\
x86SetJ8(localptr[i][2]);\
\
x86SetJ8(localptr[i][4]);\
x86SetJ8(localptr[i][5]);\
x86SetJ8(localptr[i][6]);\
x86SetJ8(localptr[i][7]);
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
PERFORM_SSE4(0);
PERFORM_SSE4(1);
PERFORM_SSE4(2);
PERFORM_SSE4(3);
#undef PERFORM_SSE4
SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask
SSE_ANDPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]); //regt contains mask
if (is_sub) SSE_SUBPS_XMM_to_XMM(regd, regt);
else SSE_ADDPS_XMM_to_XMM(regd, regt);
SSE_MOVAPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]);
_freeX86reg(temp1);
_freeX86reg(temp2);
}
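VU_ADD_SUB_SSE4 implements the add/sub hack per lane: it compares the operands' exponents and masks low mantissa bits of the operand with the smaller exponent (or reduces it to its sign bit once the gap reaches 25), mimicking the VU's lack of guard bits, before running the real ADDPS/SUBPS on the masked values. A scalar sketch of the masking rule the PINSRD branches encode (the diff == 0 case keeps both operands as-is):

#include <cstdint>
#include <cstring>

// Build the per-operand AND masks, given diff = exponent(a) - exponent(b).
static void vu_addsub_masks(float a, float b, uint32_t& mask_a, uint32_t& mask_b)
{
    uint32_t ia, ib;
    std::memcpy(&ia, &a, 4);
    std::memcpy(&ib, &b, 4);
    const int diff = int((ia >> 23) & 0xff) - int((ib >> 23) & 0xff);

    mask_a = 0xffffffffu;
    mask_b = 0xffffffffu;
    if (diff >= 25)       mask_b = 0x80000000u;                // b: sign bit only
    else if (diff > 0)    mask_b = 0xffffffffu << (diff - 1);  // drop b's low bits
    else if (diff <= -25) mask_a = 0x80000000u;                // a: sign bit only
    else if (diff < 0)    mask_a = 0xffffffffu << (-diff - 1); // drop a's low bits
}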
void VU_ADD_SUB_SS(u32 regd, u32 regt, int is_sub, int is_mem, int info)
{
u8 *localptr[8];
@ -399,22 +532,17 @@ void VU_ADD_SUB_SS(u32 regd, u32 regt, int is_sub, int is_mem, int info)
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[0][0], regd);
if (!is_mem) SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[1][0], regt);
SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_fullmask[0]);
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsuband[0][0], regd);
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsuband[1][0], regd);
SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]);
SSE_PEXTRW_XMM_to_R32(temp1, regd, 1);
SHR32ItoR(temp1, 23 - 16);
SSE2_MOVD_XMM_to_R(temp1, regd);
SHR32ItoR(temp1, 23);
if (is_mem) {
MOV32MtoR(temp2, addrt);
MOV32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
SHR32ItoR(temp2, 23);
}
else {
SSE_PEXTRW_XMM_to_R32(temp2, regt, 1);
SHR32ItoR(temp2, 23 - 16);
SSE2_MOVD_XMM_to_R(temp2, regt);
SHR32ItoR(temp2, 23);
}
AND32ItoR(temp1, 0xff);
@ -432,24 +560,60 @@ void VU_ADD_SUB_SS(u32 regd, u32 regt, int is_sub, int is_mem, int info)
NEG32R(temp1);
DEC32R(temp1);
MOV32ItoR(temp2, 0xffffffff);
SHL32CLtoR(temp2);
MOV32RtoM((uptr)&VU_addsuband[0][0], temp2);
MOV32ItoR(temp2, 0xffffffff);
SHL32CLtoR(temp2);
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
if (is_mem) {
SSE_PINSRW_R32_to_XMM(regd, temp2, 0);
SHR32ItoR(temp2, 16);
SSE_PINSRW_R32_to_XMM(regd, temp2, 1);
}
else {
SSE2_MOVD_R_to_XMM(regt, temp2);
SSE_MOVSS_XMM_to_XMM(regd, regt);
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
}
localptr[4] = JMP8(0);
x86SetJ8(localptr[0]);
MOV32ItoM((uptr)&VU_addsuband[1][0], 0x80000000);
MOV32ItoR(temp2, 0x80000000);
if (is_mem)
AND32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
else {
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
SSE2_MOVD_R_to_XMM(regd, temp2);
SSE_MOVSS_XMM_to_XMM(regt, regd);
}
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
localptr[5] = JMP8(0);
x86SetJ8(localptr[1]);
DEC32R(temp1);
MOV32ItoR(temp2, 0xffffffff);
SHL32CLtoR(temp2);
MOV32RtoM((uptr)&VU_addsuband[1][0], temp2);
if (is_mem)
AND32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
else {
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
SSE2_MOVD_R_to_XMM(regd, temp2);
SSE_MOVSS_XMM_to_XMM(regt, regd);
}
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
localptr[6] = JMP8(0);
x86SetJ8(localptr[3]);
MOV32ItoM((uptr)&VU_addsuband[0][0], 0x80000000);
MOV32ItoR(temp2, 0x80000000);
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
if (is_mem) {
SSE_PINSRW_R32_to_XMM(regd, temp2, 0);
SHR32ItoR(temp2, 16);
SSE_PINSRW_R32_to_XMM(regd, temp2, 1);
}
else {
SSE2_MOVD_R_to_XMM(regt, temp2);
SSE_MOVSS_XMM_to_XMM(regd, regt);
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
}
localptr[7] = JMP8(0);
x86SetJ8(localptr[2]);
@ -460,21 +624,121 @@ void VU_ADD_SUB_SS(u32 regd, u32 regt, int is_sub, int is_mem, int info)
if (is_mem)
{
SSE_MOVSS_M32_to_XMM(regd, addrt);
SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsuband[1][0]); //regd contains addrt
SSE_MOVSS_XMM_to_M32((uptr)&VU_addsub_reg[1][0], regd);
SSE_MOVAPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]);
SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsuband[0][0]);
SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask
if (is_sub) SSE_SUBSS_M32_to_XMM(regd, (uptr)&VU_addsub_reg[1][0]);
else SSE_ADDSS_M32_to_XMM(regd, (uptr)&VU_addsub_reg[1][0]);
}
else
{
SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsuband[0][0]);
SSE_ANDPS_M128_to_XMM(regt, (uptr)&VU_addsuband[1][0]);
SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask
SSE_ANDPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]); //regt contains mask
if (is_sub) SSE_SUBSS_XMM_to_XMM(regd, regt);
else SSE_ADDSS_XMM_to_XMM(regd, regt);
SSE_MOVAPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]);
}
_freeX86reg(temp1);
_freeX86reg(temp2);
}
void VU_ADD_SUB_SS_SSE4(u32 regd, u32 regt, int is_sub, int is_mem, int info)
{
u8 *localptr[8];
u32 addrt = regt; //for case is_mem
int temp1 = _allocX86reg(ECX, X86TYPE_TEMP, 0, 0); //receives regd //_allocX86reg(ECX, X86TYPE_TEMP, 0, ((info&PROCESS_VU_SUPER)?0:MODE_NOFRAME)|mode);
int temp2 = ALLOCTEMPX86(0);
SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[0][0], regd);
if (!is_mem) SSE_MOVAPS_XMM_to_M128((uptr)&VU_addsub_reg[1][0], regt);
SSE2_MOVD_XMM_to_R(temp1, regd);
SHR32ItoR(temp1, 23);
if (is_mem) {
MOV32MtoR(temp2, addrt);
MOV32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
SHR32ItoR(temp2, 23);
}
else {
SSE2_MOVD_XMM_to_R(temp2, regt);
SHR32ItoR(temp2, 23);
}
AND32ItoR(temp1, 0xff);
AND32ItoR(temp2, 0xff);
SUB32RtoR(temp1, temp2); //temp1 = exponent difference
CMP32ItoR(temp1, 25);
localptr[0] = JGE8(0);
CMP32ItoR(temp1, 0);
localptr[1] = JG8(0);
localptr[2] = JE8(0);
CMP32ItoR(temp1, -25);
localptr[3] = JLE8(0);
NEG32R(temp1);
DEC32R(temp1);
MOV32ItoR(temp2, 0xffffffff);
SHL32CLtoR(temp2);
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
SSE4_PINSRD_R32_to_XMM(regd, temp2, 0);
if (!is_mem)
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
localptr[4] = JMP8(0);
x86SetJ8(localptr[0]);
MOV32ItoR(temp2, 0x80000000);
if (is_mem)
AND32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
else {
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
SSE4_PINSRD_R32_to_XMM(regt, temp2, 0);
}
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
localptr[5] = JMP8(0);
x86SetJ8(localptr[1]);
DEC32R(temp1);
MOV32ItoR(temp2, 0xffffffff);
SHL32CLtoR(temp2);
if (is_mem)
AND32RtoM((uptr)&VU_addsub_reg[1][0], temp2);
else {
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
SSE4_PINSRD_R32_to_XMM(regt, temp2, 0);
}
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
localptr[6] = JMP8(0);
x86SetJ8(localptr[3]);
MOV32ItoR(temp2, 0x80000000);
SSE2_PCMPEQB_XMM_to_XMM(regd, regd);
SSE4_PINSRD_R32_to_XMM(regd, temp2, 0);
if (!is_mem)
SSE2_PCMPEQB_XMM_to_XMM(regt, regt);
localptr[7] = JMP8(0);
x86SetJ8(localptr[2]);
x86SetJ8(localptr[4]);
x86SetJ8(localptr[5]);
x86SetJ8(localptr[6]);
x86SetJ8(localptr[7]);
if (is_mem)
{
SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask
if (is_sub) SSE_SUBSS_M32_to_XMM(regd, (uptr)&VU_addsub_reg[1][0]);
else SSE_ADDSS_M32_to_XMM(regd, (uptr)&VU_addsub_reg[1][0]);
}
else
{
SSE_ANDPS_M128_to_XMM(regd, (uptr)&VU_addsub_reg[0][0]); //regd contains mask
SSE_ANDPS_M128_to_XMM(regt, (uptr)&VU_addsub_reg[1][0]); //regt contains mask
if (is_sub) SSE_SUBSS_XMM_to_XMM(regd, regt);
else SSE_ADDSS_XMM_to_XMM(regd, regt);
@ -487,27 +751,57 @@ void VU_ADD_SUB_SS(u32 regd, u32 regt, int is_sub, int is_mem, int info)
}
void SSE_ADDPS_XMM_to_XMM_custom(int info, int regd, int regt) {
if (CHECK_VUADDSUBHACK) VU_ADD_SUB(regd, regt, 0, info);
if (CHECK_VUADDSUBHACK) {
if ( cpucaps.hasStreamingSIMD4Extensions )
VU_ADD_SUB_SSE4(regd, regt, 0, info);
else
VU_ADD_SUB(regd, regt, 0, info);
}
else SSE_ADDPS_XMM_to_XMM(regd, regt);
}
void SSE_SUBPS_XMM_to_XMM_custom(int info, int regd, int regt) {
if (CHECK_VUADDSUBHACK) VU_ADD_SUB(regd, regt, 1, info);
if (CHECK_VUADDSUBHACK) {
if ( cpucaps.hasStreamingSIMD4Extensions )
VU_ADD_SUB_SSE4(regd, regt, 1, info);
else
VU_ADD_SUB(regd, regt, 1, info);
}
else SSE_SUBPS_XMM_to_XMM(regd, regt);
}
void SSE_ADDSS_XMM_to_XMM_custom(int info, int regd, int regt) {
if (CHECK_VUADDSUBHACK) VU_ADD_SUB_SS(regd, regt, 0, 0, info);
if (CHECK_VUADDSUBHACK) {
if ( cpucaps.hasStreamingSIMD4Extensions )
VU_ADD_SUB_SS_SSE4(regd, regt, 0, 0, info);
else
VU_ADD_SUB_SS(regd, regt, 0, 0, info);
}
else SSE_ADDSS_XMM_to_XMM(regd, regt);
}
void SSE_SUBSS_XMM_to_XMM_custom(int info, int regd, int regt) {
if (CHECK_VUADDSUBHACK) VU_ADD_SUB_SS(regd, regt, 1, 0, info);
if (CHECK_VUADDSUBHACK) {
if ( cpucaps.hasStreamingSIMD4Extensions )
VU_ADD_SUB_SS_SSE4(regd, regt, 1, 0, info);
else
VU_ADD_SUB_SS(regd, regt, 1, 0, info);
}
else SSE_SUBSS_XMM_to_XMM(regd, regt);
}
void SSE_ADDSS_M32_to_XMM_custom(int info, int regd, int regt) {
if (CHECK_VUADDSUBHACK) VU_ADD_SUB_SS(regd, regt, 0, 1, info);
if (CHECK_VUADDSUBHACK) {
if ( cpucaps.hasStreamingSIMD4Extensions )
VU_ADD_SUB_SS_SSE4(regd, regt, 0, 1, info);
else
VU_ADD_SUB_SS(regd, regt, 0, 1, info);
}
else SSE_ADDSS_M32_to_XMM(regd, regt);
}
void SSE_SUBSS_M32_to_XMM_custom(int info, int regd, int regt) {
if (CHECK_VUADDSUBHACK) VU_ADD_SUB_SS(regd, regt, 1, 1, info);
if (CHECK_VUADDSUBHACK) {
if ( cpucaps.hasStreamingSIMD4Extensions )
VU_ADD_SUB_SS_SSE4(regd, regt, 1, 1, info);
else
VU_ADD_SUB_SS(regd, regt, 1, 1, info);
}
else SSE_SUBSS_M32_to_XMM(regd, regt);
}
//------------------------------------------------------------------
@ -2814,4 +3108,4 @@ void recVUMI_CLIP(VURegs *VU, int info)
_freeX86reg(x86temp1);
_freeX86reg(x86temp2);
}
}

View File

@ -1617,6 +1617,7 @@ extern void SSE4_BLENDPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm
extern void SSE4_BLENDVPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from);
extern void SSE4_BLENDVPS_M128_to_XMM(x86SSERegType to, uptr from);
extern void SSE4_PMOVSXDQ_XMM_to_XMM(x86SSERegType to, x86SSERegType from);
extern void SSE4_PINSRD_R32_to_XMM(x86SSERegType to, x86IntRegType from, u8 imm8);
//*********************
// SSE-X - uses both SSE and SSE2 code and tries to keep consistency between the data

View File

@ -1143,6 +1143,15 @@ __forceinline void SSE4_PMOVSXDQ_XMM_to_XMM(x86SSERegType to, x86SSERegType from
ModRM(3, to, from);
}
__forceinline void SSE4_PINSRD_R32_to_XMM(x86SSERegType to, x86IntRegType from, u8 imm8)
{
write8(0x66);
RexRB(0, to, from);
write24(0x223A0F);
ModRM(3, to, from);
write8(imm8);
}
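PINSRD xmm, r32, imm8 encodes as 66 0F 3A 22 /r ib; write24(0x223A0F) emits the opcode bytes 0F 3A 22 in that order (the constant is byte-swapped because write24 writes little-endian), followed by the ModRM byte and the immediate. As a usage example with hypothetical operands, assuming the emitter's usual XMM1/EAX register constants:

// Emits 66 0F 3A 22 C8 02, i.e. pinsrd xmm1, eax, 2.
SSE4_PINSRD_R32_to_XMM(XMM1, EAX, 2);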
// SSE-X
__forceinline void SSEX_MOVDQA_M128_to_XMM( x86SSERegType to, uptr from )
{