From 2d805b498783efb755a958efaab7d6fb727606fe Mon Sep 17 00:00:00 2001 From: cottonvibes Date: Fri, 5 Sep 2008 21:39:38 +0000 Subject: [PATCH] Recoded some more FPU opcodes, and added 2 new speedhacks: "disable extra VU flags" and "disable extra FPU flags". On the PS2, certain "flags" are set to indicate different statuses. There are flags for overflow, underflow, invalid operation, divide by zero, is Zero, is Negative, etc. Some of these flags are rarely checked by games, so these speedhacks skip the extra code that sets those rarely used flags. git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@95 a6443dda-0b58-4228-96e9-037be469359c --- pcsx2/Misc.h | 3 + pcsx2/windows/WinMain.c | 8 +- pcsx2/windows/pcsx2.rc | 34 +++++---- pcsx2/windows/resource.h | 4 +- pcsx2/x86/iFPU.c | 153 ++++++++++++++++++++++++++------------- pcsx2/x86/iVUmicro.c | 140 +++++++++++++++++------------------ 6 files changed, 203 insertions(+), 139 deletions(-) diff --git a/pcsx2/Misc.h b/pcsx2/Misc.h index e251b4f9eb..f6f2cfcdb3 100644 --- a/pcsx2/Misc.h +++ b/pcsx2/Misc.h @@ -63,6 +63,9 @@ #define CHECK_UNDERFLOW (!(Config.Hacks & 0x8)) //#define CHECK_DENORMALS ((Config.Hacks & 0x400) ? 0xffc0 : 0x7f80) //If enabled, Denormals are Zero for the recs and flush to zero is enabled as well #define CHECK_FASTBRANCHES (Config.Hacks & 0x80) +#define CHECK_VU_EXTRA_FLAGS (!(Config.Hacks & 0x100)) // Sets correct flags in the VU recs +#define CHECK_FPU_EXTRA_FLAGS (!(Config.Hacks & 0x200)) // Sets correct flags in the FPU recs + //------------ SPECIAL GAME FIXES!!! 
--------------- #define CHECK_FPUCLAMPHACK (Config.GameFixes & 0x1) // Special Fix for GT4, different clamping for FPU (Note: sets negative infinity to positive fMax when clamping, which the real ps2 doesn't do) #define CHECK_VUCLIPHACK (Config.GameFixes & 0x2) // Special Fix for GoW, updates the clipflag differently in recVUMI_CLIP() (note: turning this hack on, breaks Rockstar games) diff --git a/pcsx2/windows/WinMain.c b/pcsx2/windows/WinMain.c index 9c8353a4c9..9424cd8fea 100644 --- a/pcsx2/windows/WinMain.c +++ b/pcsx2/windows/WinMain.c @@ -759,8 +759,8 @@ BOOL APIENTRY HacksProc(HWND hDlg, UINT message, WPARAM wParam, LPARAM lParam) { if(Config.Hacks & 0x20) CheckDlgButton(hDlg, IDC_SYNCHACK3, TRUE); if(Config.Hacks & 0x40) CheckDlgButton(hDlg, IDC_VU_OVERFLOWHACK, 2); if(Config.Hacks & 0x80) CheckDlgButton(hDlg, IDC_FASTBRANCHES, TRUE); - //if(Config.Hacks & 0x100) CheckDlgButton(hDlg, IDC_VUCLIPHACK, TRUE); - //if(Config.Hacks & 0x200) CheckDlgButton(hDlg, IDC_FPUCLAMPHACK, TRUE); + if(Config.Hacks & 0x100) CheckDlgButton(hDlg, IDC_VU_FLAGS, TRUE); + if(Config.Hacks & 0x200) CheckDlgButton(hDlg, IDC_FPU_FLAGS, TRUE); //if(Config.Hacks & 0x400) CheckDlgButton(hDlg, IDC_DENORMALS, 2); if(Config.Hacks & 0x800) CheckDlgButton(hDlg, IDC_FPU_OVERFLOWHACK, TRUE); if(Config.Hacks & 0x1000) CheckDlgButton(hDlg, IDC_FPU_OVERFLOWHACK, 2); @@ -778,8 +778,8 @@ BOOL APIENTRY HacksProc(HWND hDlg, UINT message, WPARAM wParam, LPARAM lParam) { Config.Hacks |= IsDlgButtonChecked(hDlg, IDC_SYNCHACK2) ? 0x10 : 0; Config.Hacks |= IsDlgButtonChecked(hDlg, IDC_SYNCHACK3) ? 0x20 : 0; Config.Hacks |= IsDlgButtonChecked(hDlg, IDC_FASTBRANCHES) ? 0x80 : 0; - //Config.Hacks |= IsDlgButtonChecked(hDlg, IDC_VUCLIPHACK) ? 0x100 : 0; - //Config.Hacks |= IsDlgButtonChecked(hDlg, IDC_FPUCLAMPHACK) ? 0x200 : 0; + Config.Hacks |= IsDlgButtonChecked(hDlg, IDC_VU_FLAGS) ? 0x100 : 0; + Config.Hacks |= IsDlgButtonChecked(hDlg, IDC_FPU_FLAGS) ? 
0x200 : 0; //Config.Hacks |= ( IsDlgButtonChecked(hDlg, IDC_DENORMALS) == 2 ) ? 0x408 : (IsDlgButtonChecked(hDlg, IDC_DENORMALS) ? 0x8 : 0); // 0x408 == greyed checkbox (DaZ SSE flag; so the CPU sets denormals to zero) Config.Hacks |= ( IsDlgButtonChecked(hDlg, IDC_FPU_OVERFLOWHACK) == 2 ) ? 0x1000 : (IsDlgButtonChecked(hDlg, IDC_FPU_OVERFLOWHACK) ? 0x800 : 0); // 0x1000 == greyed checkbox (extra overflow checking); 0x800 == checked (disable overflow checking) diff --git a/pcsx2/windows/pcsx2.rc b/pcsx2/windows/pcsx2.rc index 72e4675f58..678f0a0eed 100644 --- a/pcsx2/windows/pcsx2.rc +++ b/pcsx2/windows/pcsx2.rc @@ -1030,34 +1030,39 @@ BEGIN CONTROL 132,IDC_PS2SILVER_RECT,"Static",SS_BITMAP,0,167,70,74 END -IDD_HACKS DIALOGEX 0, 0, 511, 243 +IDD_HACKS DIALOGEX 0, 0, 511, 295 STYLE DS_SETFONT | DS_MODALFRAME | DS_FIXEDSYS | WS_POPUP | WS_CAPTION | WS_SYSMENU CAPTION "PCSX2 Speed Hacks" FONT 8, "MS Shell Dlg", 400, 0, 0x1 BEGIN - DEFPUSHBUTTON "OK",IDOK,205,222,50,14 - PUSHBUTTON "Cancel",IDCANCEL,261,222,50,14 + DEFPUSHBUTTON "OK",IDOK,205,274,50,14 + PUSHBUTTON "Cancel",IDCANCEL,261,274,50,14 CONTROL "EE Sync Hack (x2) - Doubles the cycle rate of the EE. ( Big Speedup in most games! )",IDC_SYNCHACK, - "Button",BS_AUTOCHECKBOX | WS_TABSTOP,15,111,418,10 + "Button",BS_AUTOCHECKBOX | WS_TABSTOP,15,162,418,10 CONTROL "Disable VU Overflow Checks - *Checked = Disables overflow checks. ( Speedup! ) *Greyed = Extra overflow checks. ( Helps SPS, Slow! )",IDC_VU_OVERFLOWHACK, - "Button",BS_AUTO3STATE | WS_TABSTOP,14,49,475,10 + "Button",BS_AUTO3STATE | WS_TABSTOP,15,49,475,10 CTEXT "These hacks will effect the speed of PCSX2 but possibly comprimise on compatability",IDC_HACKDESC,7,7,497,8 CONTROL "Tighter SPU2 Sync ( FFXII vids) - Slower, not very useful anymore.",IDC_SOUNDHACK, - "Button",BS_AUTOCHECKBOX | WS_TABSTOP,15,194,421,10 + "Button",BS_AUTOCHECKBOX | WS_TABSTOP,15,245,421,10 CONTROL "IOP Sync Hack (x2) - Doubles the cycle rate of the IOP. 
( Speedup but breaks some games. )",IDC_SYNCHACK2, - "Button",BS_AUTOCHECKBOX | WS_TABSTOP,15,125,410,10 + "Button",BS_AUTOCHECKBOX | WS_TABSTOP,15,176,410,10 CONTROL "EE/IOP Sync Hack (x3) - Makes EE and IOP hacks triple the cycle rate. ( Sometimes speeds games a bit more, but can break games. )",IDC_SYNCHACK3, - "Button",BS_AUTOCHECKBOX | WS_TABSTOP,14,139,464,11 + "Button",BS_AUTOCHECKBOX | WS_TABSTOP,15,190,464,11 CONTROL "Disable FPU Overflow Checks - *Checked = Disables overflow checks. ( Speedup! ) *Greyed = Extra overflow checks. ( Helps SPS, Slow! )",IDC_FPU_OVERFLOWHACK, - "Button",BS_AUTO3STATE | WS_TABSTOP,14,63,483,10 + "Button",BS_AUTO3STATE | WS_TABSTOP,15,63,483,10 CONTROL "EE/IOP Fast Branches - Quick branching ( Very small speedup; Not Recommended! )",IDC_FASTBRANCHES, - "Button",BS_AUTOCHECKBOX | WS_TABSTOP,15,180,423,10 + "Button",BS_AUTOCHECKBOX | WS_TABSTOP,15,231,423,10 CTEXT "If you have problems, disable all these and try again",IDC_STATIC,7,22,497,8 GROUPBOX "Overflow and Underflow",IDC_STATIC,7,36,497,58 CONTROL "Disable Underflow Checks - *Checked = Disables underflow checks. ( Speedup! )",IDC_DENORMALS, - "Button",BS_AUTOCHECKBOX | WS_TABSTOP,14,77,319,10 - GROUPBOX "Sync Hacks",IDC_STATIC,7,98,497,63 - GROUPBOX "Miscellaneous",IDC_STATIC,7,165,497,50 + "Button",BS_AUTOCHECKBOX | WS_TABSTOP,15,77,319,10 + GROUPBOX "Sync Hacks",IDC_STATIC,7,149,497,63 + GROUPBOX "Miscellaneous",IDC_STATIC,7,216,497,50 + GROUPBOX "Flag Setting",IDC_STATIC,7,100,497,41 + CONTROL "Disable Extra VU Flags - When checked, PCSX2 doesn't set some flags that are rarely used by games. ( Speedup! )",IDC_VU_FLAGS, + "Button",BS_AUTOCHECKBOX | WS_TABSTOP,15,113,442,10 + CONTROL "Disable Extra FPU Flags - When checked, PCSX2 doesn't set some flags that are rarely used by games. ( Speedup! 
)",IDC_FPU_FLAGS, + "Button",BS_AUTOCHECKBOX | WS_TABSTOP,15,126,414,10 END @@ -1079,8 +1084,9 @@ BEGIN BEGIN LEFTMARGIN, 7 RIGHTMARGIN, 504 + VERTGUIDE, 15 TOPMARGIN, 7 - BOTTOMMARGIN, 236 + BOTTOMMARGIN, 288 END END #endif // APSTUDIO_INVOKED diff --git a/pcsx2/windows/resource.h b/pcsx2/windows/resource.h index 3582b5961f..8ee4d72237 100644 --- a/pcsx2/windows/resource.h +++ b/pcsx2/windows/resource.h @@ -614,13 +614,15 @@ #define IDC_GAMEFIX1 1300 #define IDC_DENORMALS 1301 #define IDC_EE_CHECK2 1301 -#define IDC_CHECK3 1301 #define IDC_GAMEFIX2 1301 #define IDC_VUCLIPHACK 1302 #define IDC_VU_CHECK1 1302 +#define IDC_VU_FLAGS 1302 #define IDC_FRAMELIMIT_OPTIONS 1303 #define IDC_FPUCLAMPHACK 1303 #define IDC_VU_CHECK2 1303 +#define IDC_VU_FLAGS2 1303 +#define IDC_FPU_FLAGS 1303 #define IDC_ROUNDMODE 1304 #define IDC_EE_ROUNDMODE0 1305 #define IDC_EE_ROUNDMODE1 1306 diff --git a/pcsx2/x86/iFPU.c b/pcsx2/x86/iFPU.c index 999baedc94..07351194f5 100644 --- a/pcsx2/x86/iFPU.c +++ b/pcsx2/x86/iFPU.c @@ -978,25 +978,27 @@ void recSQRT_S_xmm(int info) int tempReg; u8* pjmp; - SysPrintf("FPU: SQRT \n"); + SysPrintf("FPU: SQRT\n"); tempReg = _allocX86reg(-1, X86TYPE_TEMP, 0, 0); - if (tempReg == -1) {SysPrintf("FPU: SQRT Allocation Error! 
\n"); tempReg = EAX;} + if (tempReg == -1) {SysPrintf("FPU: SQRT Allocation Error!\n"); tempReg = EAX;} if( info & PROCESS_EE_T ) { if ( EEREC_D != EEREC_T ) SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_T); } else SSE_MOVSS_M32_to_XMM(EEREC_D, (uptr)&fpuRegs.fpr[_Ft_]); - AND32ItoM((uptr)&fpuRegs.fprc[31], ~(FPUflagI|FPUflagD)); // Clear I and D flags + if (CHECK_FPU_EXTRA_FLAGS) { + AND32ItoM((uptr)&fpuRegs.fprc[31], ~(FPUflagI|FPUflagD)); // Clear I and D flags - /*--- Check for negative SQRT ---*/ - XOR32RtoR(tempReg, tempReg); - SSE_MOVMSKPS_XMM_to_R32(tempReg, EEREC_D); - AND32ItoR(tempReg, 1); //Check sign - pjmp = JZ8(0); //Skip if none are - OR32ItoM((uptr)&fpuRegs.fprc[31], FPUflagI|FPUflagSI); // Set I and SI flags - SSE_ANDPS_M128_to_XMM(EEREC_D, (uptr)&s_pos[0]); // Make EEREC_D Positive - x86SetJ8(pjmp); + /*--- Check for negative SQRT ---*/ + SSE_MOVMSKPS_XMM_to_R32(tempReg, EEREC_D); + AND32ItoR(tempReg, 1); //Check sign + pjmp = JZ8(0); //Skip if none are + OR32ItoM((uptr)&fpuRegs.fprc[31], FPUflagI|FPUflagSI); // Set I and SI flags + SSE_ANDPS_M128_to_XMM(EEREC_D, (uptr)&s_pos[0]); // Make EEREC_D Positive + x86SetJ8(pjmp); + } + else SSE_ANDPS_M128_to_XMM(EEREC_D, (uptr)&s_pos[0]); // Make EEREC_D Positive if (CHECK_FPU_OVERFLOW) // Only need to do positive clamp, since EEREC_D is positive SSE_MINSS_M32_to_XMM(EEREC_D, (uptr)&g_maxvals[0]); @@ -1051,54 +1053,105 @@ void recNEG_S_xmm(int info) { FPURECOMPILE_CONSTCODE(NEG_S, XMMINFO_WRITED|XMMINFO_READS); -void recRSQRT_S_xmm(int info) -{ - int t0reg = _allocTempXMMreg(XMMT_FPS, -1); +// Preforms the RSQRT function when regd <- Fs and t0reg <- Ft (Sets correct flags) +void recRSQRThelper1(int regd, int t0reg) +{ + u8* pjmp1; + u8* pjmp2; + u32* pjmp32; + int t1reg = _allocTempXMMreg(XMMT_FPS, -1); + int tempReg = _allocX86reg(-1, X86TYPE_TEMP, 0, 0); + if (t1reg == -1) {SysPrintf("FPU: RSQRT Allocation Error!\n");} + if (tempReg == -1) {SysPrintf("FPU: RSQRT Allocation Error!\n"); tempReg = EAX;} + 
AND32ItoM((uptr)&fpuRegs.fprc[31], ~(FPUflagI|FPUflagD)); // Clear I and D flags + + /*--- Check for zero ---*/ + SSE_XORPS_XMM_to_XMM(t1reg, t1reg); + SSE_CMPEQSS_XMM_to_XMM(t1reg, t0reg); + SSE_MOVMSKPS_XMM_to_R32(tempReg, t1reg); + AND32ItoR(tempReg, 1); //Check sign (if t0reg == zero, sign will be set) + pjmp1 = JZ8(0); //Skip if not set + OR32ItoM((uptr)&fpuRegs.fprc[31], FPUflagD|FPUflagSD); // Set D and SD flags + SSE_XORPS_XMM_to_XMM(regd, t0reg); // Make regd Positive or Negative + SSE_ANDPS_M128_to_XMM(regd, (uptr)&s_neg[0]); // Get the sign bit + SSE_ORPS_M128_to_XMM(regd, (uptr)&g_maxvals[0]); // regd = +/- Maximum + pjmp32 = JMP32(0); + x86SetJ8(pjmp1); + + /*--- Check for negative SQRT ---*/ + SSE_MOVMSKPS_XMM_to_R32(tempReg, t0reg); + AND32ItoR(tempReg, 1); //Check sign + pjmp2 = JZ8(0); //Skip if not set + OR32ItoM((uptr)&fpuRegs.fprc[31], FPUflagI|FPUflagSI); // Set I and SI flags + SSE_ANDPS_M128_to_XMM(t0reg, (uptr)&s_pos[0]); // Make t0reg Positive + x86SetJ8(pjmp2); + + if (CHECK_FPU_EXTRA_OVERFLOW) { + SSE_MINSS_M32_to_XMM(t0reg, (uptr)&g_maxvals[0]); // Only need to do positive clamp, since t0reg is positive + ClampValues(regd); + } + + SSE_SQRTSS_XMM_to_XMM(t0reg, t0reg); + SSE_DIVSS_XMM_to_XMM(regd, t0reg); + + ClampValues(regd); + x86SetJ32(pjmp32); + + _freeXMMreg(t1reg); + _freeX86reg(tempReg); +} + +// Preforms the RSQRT function when regd <- Fs and t0reg <- Ft (Doesn't set flags) +void recRSQRThelper2(int regd, int t0reg) +{ + SSE_ANDPS_M128_to_XMM(t0reg, (uptr)&s_pos[0]); // Make t0reg Positive + if (CHECK_FPU_EXTRA_OVERFLOW) { + SSE_MINSS_M32_to_XMM(t0reg, (uptr)&g_maxvals[0]); // Only need to do positive clamp, since t0reg is positive + ClampValues(regd); + } + SSE_SQRTSS_XMM_to_XMM(t0reg, t0reg); + SSE_DIVSS_XMM_to_XMM(regd, t0reg); + ClampValues(regd); +} + +void recRSQRT_S_xmm(int info) +{ + int t0reg = _allocTempXMMreg(XMMT_FPS, -1); + SysPrintf("FPU: RSQRT\n"); + if (t0reg == -1) {SysPrintf("FPU: RSQRT Allocation Error!\n");} + 
switch(info & (PROCESS_EE_S|PROCESS_EE_T) ) { case PROCESS_EE_S: - if( EEREC_D == EEREC_S ) { - SSE_SQRTSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]); - SSE_DIVSS_XMM_to_XMM(EEREC_D, t0reg); - } - else { - SSE_SQRTSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]); - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_DIVSS_XMM_to_XMM(EEREC_D, t0reg); - } - + //SysPrintf("FPU: RSQRT case 1\n"); + if( EEREC_D != EEREC_S ) SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]); + if (CHECK_FPU_EXTRA_FLAGS) recRSQRThelper1(EEREC_D, t0reg); + else recRSQRThelper2(EEREC_D, t0reg); break; - case PROCESS_EE_T: - SSE_SQRTSS_XMM_to_XMM(t0reg, EEREC_T); - SSE_MOVSS_M32_to_XMM(EEREC_D, (uptr)&fpuRegs.fpr[_Fs_]); - - SSE_DIVSS_XMM_to_XMM(EEREC_D, t0reg); + case PROCESS_EE_T: + //SysPrintf("FPU: RSQRT case 2\n"); + SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_T); + SSE_MOVSS_M32_to_XMM(EEREC_D, (uptr)&fpuRegs.fpr[_Fs_]); + if (CHECK_FPU_EXTRA_FLAGS) recRSQRThelper1(EEREC_D, t0reg); + else recRSQRThelper2(EEREC_D, t0reg); + break; + case (PROCESS_EE_S|PROCESS_EE_T): + //SysPrintf("FPU: RSQRT case 3\n"); + SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_T); + if( EEREC_D != EEREC_S ) SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); + if (CHECK_FPU_EXTRA_FLAGS) recRSQRThelper1(EEREC_D, t0reg); + else recRSQRThelper2(EEREC_D, t0reg); break; default: - if( (info & PROCESS_EE_T) && (info & PROCESS_EE_S) ) { - if( EEREC_D == EEREC_T ){ - SSE_SQRTSS_XMM_to_XMM(t0reg, EEREC_T); - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_DIVSS_XMM_to_XMM(EEREC_D, t0reg); - } - else if( EEREC_D == EEREC_S ){ - SSE_SQRTSS_XMM_to_XMM(t0reg, EEREC_T); - SSE_DIVSS_XMM_to_XMM(EEREC_D, t0reg); - } else { - SSE_SQRTSS_XMM_to_XMM(t0reg, EEREC_T); - SSE_MOVSS_XMM_to_XMM(EEREC_D, EEREC_S); - SSE_DIVSS_XMM_to_XMM(EEREC_D, t0reg); - } - }else{ - SSE_SQRTSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]); - SSE_MOVSS_M32_to_XMM(EEREC_D, (uptr)&fpuRegs.fpr[_Fs_]); - SSE_DIVSS_XMM_to_XMM(EEREC_D, t0reg); - } - + 
//SysPrintf("FPU: RSQRT case 4\n"); + SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]); + SSE_MOVSS_M32_to_XMM(EEREC_D, (uptr)&fpuRegs.fpr[_Fs_]); + if (CHECK_FPU_EXTRA_FLAGS) recRSQRThelper1(EEREC_D, t0reg); + else recRSQRThelper2(EEREC_D, t0reg); break; } _freeXMMreg(t0reg); - ClampValues(EEREC_D); } FPURECOMPILE_CONSTCODE(RSQRT_S, XMMINFO_WRITED|XMMINFO_READS|XMMINFO_READT); diff --git a/pcsx2/x86/iVUmicro.c b/pcsx2/x86/iVUmicro.c index a62eb471cf..b745c01fd6 100644 --- a/pcsx2/x86/iVUmicro.c +++ b/pcsx2/x86/iVUmicro.c @@ -1398,53 +1398,53 @@ void recUpdateFlags(VURegs * VU, int reg, int info) if( EEREC_TEMP != reg ) { SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x1B); // Flip wzyx to xyzw - - //-------------------------Check for Overflow flags------------------------------ - - SSE_XORPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); // Clear EEREC_TEMP - SSE_CMPUNORDPS_XMM_to_XMM(EEREC_TEMP, reg); // If reg == NaN then set Vector to 0xFFFFFFFF - XOR32RtoR(x86macflag, x86macflag); // Clear Mac Flag - - SSE_MOVMSKPS_XMM_to_R32(x86newflag, EEREC_TEMP); // Move the sign bits of the previous calculation - XOR32RtoR(x86temp, x86temp); //Clear x86temp - AND32ItoR(x86newflag, 0x0f & _X_Y_Z_W ); // Grab "Has Overflowed" bits from the previous calculation (also make sure we're only grabbing from the XYZW being modified) - pjmp = JZ8(0); // Skip if none are - OR32ItoR(x86temp, 8); // Set if they are - x86SetJ8(pjmp); + if (CHECK_VU_EXTRA_FLAGS) { + //-------------------------Check for Overflow flags------------------------------ - OR32RtoR(x86macflag, x86newflag); - SHL32ItoR(x86macflag, 4); // Shift the Overflow flags left 4 + SSE_XORPS_XMM_to_XMM(EEREC_TEMP, EEREC_TEMP); // Clear EEREC_TEMP + SSE_CMPUNORDPS_XMM_to_XMM(EEREC_TEMP, reg); // If reg == NaN then set Vector to 0xFFFFFFFF - //-------------------------Check for Underflow flags------------------------------ + SSE_MOVMSKPS_XMM_to_R32(x86newflag, EEREC_TEMP); // Move the sign bits of the previous calculation - 
SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, reg); // EEREC_TEMP <- reg + AND32ItoR(x86newflag, 0x0f & _X_Y_Z_W ); // Grab "Has Overflowed" bits from the previous calculation (also make sure we're only grabbing from the XYZW being modified) + pjmp = JZ8(0); // Skip if none are + OR32ItoR(x86temp, 8); // Set if they are + x86SetJ8(pjmp); - SSE_ANDPS_M128_to_XMM(EEREC_TEMP, (uptr)&VU_Underflow_Mask1[ 0 ]); - SSE_CMPEQPS_M128_to_XMM(EEREC_TEMP, (uptr)&VU_Zero_Mask[ 0 ]); // If (EEREC_TEMP == zero exponent) then set Vector to 0xFFFFFFFF + OR32RtoR(x86macflag, x86newflag); + SHL32ItoR(x86macflag, 4); // Shift the Overflow flags left 4 - SSE_ANDPS_XMM_to_XMM(EEREC_TEMP, reg); - SSE_ANDPS_M128_to_XMM(EEREC_TEMP, (uptr)&VU_Underflow_Mask2[ 0 ]); - SSE_CMPNEPS_M128_to_XMM(EEREC_TEMP, (uptr)&VU_Zero_Mask[ 0 ]); // If (EEREC_TEMP != zero mantisa) then set Vector to 0xFFFFFFFF + //-------------------------Check for Underflow flags------------------------------ - SSE_MOVMSKPS_XMM_to_R32(x86newflag, EEREC_TEMP); // Move the sign bits of the previous calculation + SSE_MOVAPS_XMM_to_XMM(EEREC_TEMP, reg); // EEREC_TEMP <- reg - AND32ItoR(x86newflag, 0x0f & _X_Y_Z_W ); // Grab "Has Underflowed" bits from the previous calculation - pjmp = JZ8(0); // Skip if none are - OR32ItoR(x86temp, 4); // Set if they are - x86SetJ8(pjmp); + SSE_ANDPS_M128_to_XMM(EEREC_TEMP, (uptr)&VU_Underflow_Mask1[ 0 ]); + SSE_CMPEQPS_M128_to_XMM(EEREC_TEMP, (uptr)&VU_Zero_Mask[ 0 ]); // If (EEREC_TEMP == zero exponent) then set Vector to 0xFFFFFFFF - OR32RtoR(x86macflag, x86newflag); - SHL32ItoR(x86macflag, 4); // Shift the Overflow and Underflow flags left 4 + SSE_ANDPS_XMM_to_XMM(EEREC_TEMP, reg); + SSE_ANDPS_M128_to_XMM(EEREC_TEMP, (uptr)&VU_Underflow_Mask2[ 0 ]); + SSE_CMPNEPS_M128_to_XMM(EEREC_TEMP, (uptr)&VU_Zero_Mask[ 0 ]); // If (EEREC_TEMP != zero mantisa) then set Vector to 0xFFFFFFFF - //-------------------------Optional Code: Denormals Are Zero------------------------------ - if (CHECK_UNDERFLOW) { // Sets 
underflow/denormals to zero - SSE_ANDNPS_XMM_to_XMM(EEREC_TEMP, reg); // EEREC_TEMP = !EEREC_TEMP & reg - // Now we have Denormals are Positive Zero in EEREC_TEMP; the next two lines take Signed Zero into account - SSE_ANDPS_M128_to_XMM(reg, (uptr)&VU_Signed_Zero_Mask[ 0 ]); - SSE_ORPS_XMM_to_XMM(reg, EEREC_TEMP); + SSE_MOVMSKPS_XMM_to_R32(x86newflag, EEREC_TEMP); // Move the sign bits of the previous calculation + + AND32ItoR(x86newflag, 0x0f & _X_Y_Z_W ); // Grab "Has Underflowed" bits from the previous calculation + pjmp = JZ8(0); // Skip if none are + OR32ItoR(x86temp, 4); // Set if they are + x86SetJ8(pjmp); + + OR32RtoR(x86macflag, x86newflag); + SHL32ItoR(x86macflag, 4); // Shift the Overflow and Underflow flags left 4 + + //-------------------------Optional Code: Denormals Are Zero------------------------------ + if (CHECK_UNDERFLOW) { // Sets underflow/denormals to zero + SSE_ANDNPS_XMM_to_XMM(EEREC_TEMP, reg); // EEREC_TEMP = !EEREC_TEMP & reg + // Now we have Denormals are Positive Zero in EEREC_TEMP; the next two lines take Signed Zero into account + SSE_ANDPS_M128_to_XMM(reg, (uptr)&VU_Signed_Zero_Mask[ 0 ]); + SSE_ORPS_XMM_to_XMM(reg, EEREC_TEMP); + } } //-------------------------Check for Signed flags------------------------------ @@ -1512,53 +1512,53 @@ void recUpdateFlags(VURegs * VU, int reg, int info) } SSE_SHUFPS_XMM_to_XMM(reg, reg, 0x1B); // Flip wzyx to xyzw - - //-------------------------Check for Overflow flags------------------------------ - - SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg - SSE_CMPUNORDPS_XMM_to_XMM(t1reg, reg); // If reg == NaN then set Vector to 0xFFFFFFFF - XOR32RtoR(x86macflag, x86macflag); // Clear Mac Flag - - SSE_MOVMSKPS_XMM_to_R32(x86newflag, t1reg); // Move the sign bits of the previous calculation - XOR32RtoR(x86temp, x86temp); //Clear x86temp - AND32ItoR(x86newflag, 0x0f & _X_Y_Z_W ); // Grab "Has Overflowed" bits from the previous calculation (also make sure we're only grabbing from the XYZW being 
modified) - pjmp = JZ8(0); // Skip if none are - OR32ItoR(x86temp, 8); // Set if they are - x86SetJ8(pjmp); + if (CHECK_VU_EXTRA_FLAGS) { + //-------------------------Check for Overflow flags------------------------------ - OR32RtoR(x86macflag, x86newflag); - SHL32ItoR(x86macflag, 4); // Shift the Overflow flags left 4 + SSE_XORPS_XMM_to_XMM(t1reg, t1reg); // Clear t1reg + SSE_CMPUNORDPS_XMM_to_XMM(t1reg, reg); // If reg == NaN then set Vector to 0xFFFFFFFF - //-------------------------Check for Underflow flags------------------------------ + SSE_MOVMSKPS_XMM_to_R32(x86newflag, t1reg); // Move the sign bits of the previous calculation - SSE_MOVAPS_XMM_to_XMM(t1reg, reg); // t1reg <- reg + AND32ItoR(x86newflag, 0x0f & _X_Y_Z_W ); // Grab "Has Overflowed" bits from the previous calculation (also make sure we're only grabbing from the XYZW being modified) + pjmp = JZ8(0); // Skip if none are + OR32ItoR(x86temp, 8); // Set if they are + x86SetJ8(pjmp); - SSE_ANDPS_M128_to_XMM(t1reg, (uptr)&VU_Underflow_Mask1[ 0 ]); - SSE_CMPEQPS_M128_to_XMM(t1reg, (uptr)&VU_Zero_Mask[ 0 ]); // If (t1reg == zero exponent) then set Vector to 0xFFFFFFFF + OR32RtoR(x86macflag, x86newflag); + SHL32ItoR(x86macflag, 4); // Shift the Overflow flags left 4 - SSE_ANDPS_XMM_to_XMM(t1reg, reg); - SSE_ANDPS_M128_to_XMM(t1reg, (uptr)&VU_Underflow_Mask2[ 0 ]); - SSE_CMPNEPS_M128_to_XMM(t1reg, (uptr)&VU_Zero_Mask[ 0 ]); // If (t1reg != zero mantisa) then set Vector to 0xFFFFFFFF + //-------------------------Check for Underflow flags------------------------------ - SSE_MOVMSKPS_XMM_to_R32(x86newflag, t1reg); // Move the sign bits of the previous calculation + SSE_MOVAPS_XMM_to_XMM(t1reg, reg); // t1reg <- reg - AND32ItoR(x86newflag, 0x0f & _X_Y_Z_W ); // Grab "Has Underflowed" bits from the previous calculation - pjmp = JZ8(0); // Skip if none are - OR32ItoR(x86temp, 4); // Set if they are - x86SetJ8(pjmp); + SSE_ANDPS_M128_to_XMM(t1reg, (uptr)&VU_Underflow_Mask1[ 0 ]); + SSE_CMPEQPS_M128_to_XMM(t1reg, 
(uptr)&VU_Zero_Mask[ 0 ]); // If (t1reg == zero exponent) then set Vector to 0xFFFFFFFF - OR32RtoR(x86macflag, x86newflag); - SHL32ItoR(x86macflag, 4); // Shift the Overflow and Underflow flags left 4 + SSE_ANDPS_XMM_to_XMM(t1reg, reg); + SSE_ANDPS_M128_to_XMM(t1reg, (uptr)&VU_Underflow_Mask2[ 0 ]); + SSE_CMPNEPS_M128_to_XMM(t1reg, (uptr)&VU_Zero_Mask[ 0 ]); // If (t1reg != zero mantisa) then set Vector to 0xFFFFFFFF - //-------------------------Optional Code: Denormals Are Zero------------------------------ - if (CHECK_UNDERFLOW) { // Sets underflow/denormals to zero - SSE_ANDNPS_XMM_to_XMM(t1reg, reg); // t1reg = !t1reg & reg - // Now we have Denormals are Positive Zero in t1reg; the next two lines take Signed Zero into account - SSE_ANDPS_M128_to_XMM(reg, (uptr)&VU_Signed_Zero_Mask[ 0 ]); - SSE_ORPS_XMM_to_XMM(reg, t1reg); + SSE_MOVMSKPS_XMM_to_R32(x86newflag, t1reg); // Move the sign bits of the previous calculation + + AND32ItoR(x86newflag, 0x0f & _X_Y_Z_W ); // Grab "Has Underflowed" bits from the previous calculation + pjmp = JZ8(0); // Skip if none are + OR32ItoR(x86temp, 4); // Set if they are + x86SetJ8(pjmp); + + OR32RtoR(x86macflag, x86newflag); + SHL32ItoR(x86macflag, 4); // Shift the Overflow and Underflow flags left 4 + + //-------------------------Optional Code: Denormals Are Zero------------------------------ + if (CHECK_UNDERFLOW) { // Sets underflow/denormals to zero + SSE_ANDNPS_XMM_to_XMM(t1reg, reg); // t1reg = !t1reg & reg + // Now we have Denormals are Positive Zero in t1reg; the next two lines take Signed Zero into account + SSE_ANDPS_M128_to_XMM(reg, (uptr)&VU_Signed_Zero_Mask[ 0 ]); + SSE_ORPS_XMM_to_XMM(reg, t1reg); + } } //-------------------------Check for Signed flags------------------------------