From aef3af8a29fc9d13c18f2fc6b8d14299680eb11c Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Mon, 9 Mar 2009 23:54:38 +0000 Subject: [PATCH] nneeve improves the software-emulated FPU accuracy ("Full" mode in Advanced Dialog). Appended notes: * ADD in iFPUd should be bit accurate (unless it isn't. needs TESTING) * MUL in iFPUd with Software Emulate MUL is as much as I could get near bit accurate (not quite enough, probably. needs TESTING) git-svn-id: http://pcsx2.googlecode.com/svn/trunk@728 96395faa-99c1-11dd-bbfe-3dabce05a288 --- common/include/Pcsx2Config.h | 1 + pcsx2/windows/AdvancedDlg.cpp | 4 ++ pcsx2/windows/pcsx2.rc | 33 ++++----- pcsx2/windows/resource.h | 1 + pcsx2/x86/iFPU.cpp | 131 +++++++++++++++++++++++++++------- pcsx2/x86/iFPUd.cpp | 52 +++++++++++--- 6 files changed, 173 insertions(+), 49 deletions(-) diff --git a/common/include/Pcsx2Config.h b/common/include/Pcsx2Config.h index eeb238e302..a5326d388c 100644 --- a/common/include/Pcsx2Config.h +++ b/common/include/Pcsx2Config.h @@ -77,6 +77,7 @@ extern SessionOverrideFlags g_Session; #define CHECK_FPU_EXTRA_OVERFLOW (Config.eeOptions & 0x2) // If enabled, Operands are checked for infinities before being used in the FPU recs #define CHECK_FPU_EXTRA_FLAGS 1 // Always enabled now // Sets D/I flags on FPU instructions #define CHECK_FPU_FULL (Config.eeOptions & 0x4) +#define CHECK_FPU_ATTEMPT_MUL (Config.eeOptions & 0x8) #define DEFAULT_eeOptions 0x01 #define DEFAULT_vuOptions 0x01 //------------ DEFAULT sseMXCSR VALUES!!! 
--------------- diff --git a/pcsx2/windows/AdvancedDlg.cpp b/pcsx2/windows/AdvancedDlg.cpp index 16a6f0d3aa..1546ae31e7 100644 --- a/pcsx2/windows/AdvancedDlg.cpp +++ b/pcsx2/windows/AdvancedDlg.cpp @@ -35,6 +35,8 @@ static void InitRoundClampModes( HWND hDlg, u32 new_eeopt, u32 new_vuopt ) else if (new_vuopt & 0x1) CheckRadioButton(hDlg, IDC_VU_CLAMPMODE0, IDC_VU_CLAMPMODE3, IDC_VU_CLAMPMODE0 + 1); else CheckRadioButton(hDlg, IDC_VU_CLAMPMODE0, IDC_VU_CLAMPMODE3, IDC_VU_CLAMPMODE0 + 0); + CheckDlgButton(hDlg, IDC_EE_CHECK3, (new_eeopt & 0x8) ? TRUE : FALSE); + if (new_eeopt & 0x4) CheckRadioButton(hDlg, IDC_EE_CLAMPMODE0, IDC_EE_CLAMPMODE3, IDC_EE_CLAMPMODE0 + 3); else if (new_eeopt & 0x2) CheckRadioButton(hDlg, IDC_EE_CLAMPMODE0, IDC_EE_CLAMPMODE3, IDC_EE_CLAMPMODE0 + 2); else if (new_eeopt & 0x1) CheckRadioButton(hDlg, IDC_EE_CLAMPMODE0, IDC_EE_CLAMPMODE3, IDC_EE_CLAMPMODE0 + 1); @@ -94,6 +96,8 @@ BOOL APIENTRY AdvancedOptionsProc(HWND hDlg, UINT message, WPARAM wParam, LPARAM new_eeopt |= IsDlgButtonChecked(hDlg, IDC_EE_CLAMPMODE2) ? 0x3 : 0; new_eeopt |= IsDlgButtonChecked(hDlg, IDC_EE_CLAMPMODE3) ? 0x7 : 0; + new_eeopt |= IsDlgButtonChecked(hDlg, IDC_EE_CHECK3) ? 0x8 : 0; + new_vuopt |= IsDlgButtonChecked(hDlg, IDC_VU_CLAMPMODE0) ? 0x0 : 0; new_vuopt |= IsDlgButtonChecked(hDlg, IDC_VU_CLAMPMODE1) ? 0x1 : 0; new_vuopt |= IsDlgButtonChecked(hDlg, IDC_VU_CLAMPMODE2) ? 
0x3 : 0; diff --git a/pcsx2/windows/pcsx2.rc b/pcsx2/windows/pcsx2.rc index 89a7f40458..bd892dca42 100644 --- a/pcsx2/windows/pcsx2.rc +++ b/pcsx2/windows/pcsx2.rc @@ -195,30 +195,31 @@ BEGIN RADIOBUTTON "Chop / Zero",IDC_EE_ROUNDMODE3,156,36,54,16 CONTROL " Flush to Zero",IDC_EE_CHECK1,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,20,105,58,13 CONTROL " Denormals are Zero",IDC_EE_CHECK2,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,111,105,79,13 - CONTROL " Flush to Zero",IDC_VU_CHECK1,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,20,214,58,13 - CONTROL " Denormals are Zero",IDC_VU_CHECK2,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,111,214,81,13 - RADIOBUTTON "Nearest",IDC_VU_ROUNDMODE0,20,154,44,12 - RADIOBUTTON "Negative",IDC_VU_ROUNDMODE1,64,154,47,12 - RADIOBUTTON "Positive",IDC_VU_ROUNDMODE2,111,154,45,12 - RADIOBUTTON "Chop / Zero",IDC_VU_ROUNDMODE3,156,154,52,12 + CONTROL " Flush to Zero",IDC_VU_CHECK1,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,20,232,58,13 + CONTROL " Denormals are Zero",IDC_VU_CHECK2,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,111,232,81,13 + RADIOBUTTON "Nearest",IDC_VU_ROUNDMODE0,20,172,44,12 + RADIOBUTTON "Negative",IDC_VU_ROUNDMODE1,64,172,47,12 + RADIOBUTTON "Positive",IDC_VU_ROUNDMODE2,111,172,45,12 + RADIOBUTTON "Chop / Zero",IDC_VU_ROUNDMODE3,156,172,52,12 PUSHBUTTON "Defaults",IDDEFAULT,346,254,50,14 - GROUPBOX "VU Recs Options",IDC_STATIC,7,128,250,122,BS_CENTER - GROUPBOX "EE Recs Options",IDC_STATIC,7,12,251,111,BS_CENTER - GROUPBOX "Round Mode",IDC_STATIC,11,141,236,32 + GROUPBOX "VU Recs Options",IDC_STATIC,7,146,250,122,BS_CENTER + GROUPBOX "EE Recs Options",IDC_STATIC,7,12,250,129,BS_CENTER + GROUPBOX "Round Mode",IDC_STATIC,11,159,236,32 GROUPBOX "Round Mode",IDC_STATIC,11,26,236,36 GROUPBOX "Help",IDC_STATIC,271,12,251,238,BS_CENTER - GROUPBOX "Clamp Mode",IDC_STATIC,11,178,236,31 - RADIOBUTTON "None",IDC_VU_CLAMPMODE0,20,189,44,12 - RADIOBUTTON "Normal",IDC_VU_CLAMPMODE1,64,189,47,12 - RADIOBUTTON "Extra",IDC_VU_CLAMPMODE2,111,189,45,12 - RADIOBUTTON "Extra + 
Preserve Sign",IDC_VU_CLAMPMODE3,156,189,85,12 - CONTROL " Set O and U Flags",IDC_VU_CHECK3,"Button",BS_AUTOCHECKBOX | WS_DISABLED | WS_TABSTOP,20,232,91,13 - CONTROL " Software Emulate DaZ",IDC_VU_CHECK4,"Button",BS_AUTOCHECKBOX | WS_DISABLED | WS_TABSTOP,111,232,116,13 + GROUPBOX "Clamp Mode",IDC_STATIC,11,196,236,31 + RADIOBUTTON "None",IDC_VU_CLAMPMODE0,20,207,44,12 + RADIOBUTTON "Normal",IDC_VU_CLAMPMODE1,64,207,47,12 + RADIOBUTTON "Extra",IDC_VU_CLAMPMODE2,111,207,45,12 + RADIOBUTTON "Extra + Preserve Sign",IDC_VU_CLAMPMODE3,156,207,85,12 + CONTROL " Set O and U Flags",IDC_VU_CHECK3,"Button",BS_AUTOCHECKBOX | WS_DISABLED | WS_TABSTOP,20,250,91,13 + CONTROL " Software Emulate DaZ",IDC_VU_CHECK4,"Button",BS_AUTOCHECKBOX | WS_DISABLED | WS_TABSTOP,111,250,116,13 GROUPBOX "Clamp Mode",IDC_STATIC,11,67,236,31 RADIOBUTTON "None",IDC_EE_CLAMPMODE0,20,76,44,16 RADIOBUTTON "Normal",IDC_EE_CLAMPMODE1,64,76,47,16 RADIOBUTTON "Extra + Preserve Sign",IDC_EE_CLAMPMODE2,111,76,91,16 RADIOBUTTON "Full",IDC_EE_CLAMPMODE3,202,76,38,16 + CONTROL " Software Emulate MUL",IDC_EE_CHECK3,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,20,123,116,13 LTEXT "These options specify how your CPU rounds floating point values.\n\nTry changing the roundmode for EE if your game hangs, it could make it work again.",IDC_STATIC,287,33,216,35 GROUPBOX "Round Mode",IDC_STATIC,281,22,235,51,BS_LEFT GROUPBOX "Clamp Mode",IDC_STATIC,281,80,236,127,BS_LEFT diff --git a/pcsx2/windows/resource.h b/pcsx2/windows/resource.h index 6176504618..6e9f1ab34c 100644 --- a/pcsx2/windows/resource.h +++ b/pcsx2/windows/resource.h @@ -271,6 +271,7 @@ #define IDC_MCD_LABEL1 1324 #define IDC_MCD_LABEL2 1325 #define IDC_INTCSTATHACK 1326 +#define IDC_EE_CHECK3 1327 #define IDC_CPULOG 1500 #define IDC_MEMLOG 1501 #define IDC_HWLOG 1502 diff --git a/pcsx2/x86/iFPU.cpp b/pcsx2/x86/iFPU.cpp index 4623989b62..fcfdc6a140 100644 --- a/pcsx2/x86/iFPU.cpp +++ b/pcsx2/x86/iFPU.cpp @@ -588,12 +588,92 @@ void FPU_SUB(int regd, int regt) { 
else SSE_SUBSS_XMM_to_XMM(regd, regt); } +//------------------------------------------------------------------ +// FPU_MUL (Used to approximate PS2's FPU mul behavior) +//------------------------------------------------------------------ +// PS2's multiplication uses some modification (possibly not the one used in this function) +// of Booth multiplication with Wallace trees (not used in this function) +// it cuts off some bits, resulting in inaccurate and non-commutative results. +// This function attempts to replicate this. It is currently inaccurate. But still not too bad. +//------------------------------------------------------------------ +// Tales of Destiny hangs in a (very) specific place without this function. Probably its only use. +// Can be optimized, of course. +// shouldn't be compiled with SSE/MMX optimizations (but none of PCSX2 should be, right?) +u32 __fastcall FPU_MUL_MANTISSA(u32 s, u32 t) +{ + s = (s & 0x7fffff) | 0x800000; + t = (t & 0x7fffff) | 0x800000; + t<<=1; + u32 part[13]; //partial products + u32 bit[13]; //more partial products. 0 or 1. 
+ for (int i = 0; i <= 12; i++, t>>=2) + { + u32 test = t & 7; + if (test == 0 || test == 7) + { + part[i] = 0; + bit[i] = 0; + } + else if (test == 3) + { + part[i] = (s<<1); + bit[i] = 0; + } + else if (test == 4) + { + part[i] = ~(s<<1); + bit[i] = 1; + } + else if (test < 4) + { + part[i] = s; + bit[i] = 0; + } + else + { + part[i] = ~s; + bit[i] = 1; + } + } + s64 res = 0; + u64 mask = 0; + mask = (~mask) << 12; //mask + for (int i=0; i<=12; i++) + { + res += (s64)(s32)part[i]<<(i*2); + res &= mask; + res += bit[i]<<(i*2); + } + u32 man_res = (res >> 23); + if (man_res & (1 << 24)) + man_res >>= 1; + man_res &= 0x7fffff; + return man_res; +} + +void FPU_MUL(int regd, int regt) +{ + if (CHECK_FPU_ATTEMPT_MUL) + { + SSE2_MOVD_XMM_to_R(ECX, regd); + SSE2_MOVD_XMM_to_R(EDX, regt); + SSE_MULSS_XMM_to_XMM(regd, regt); + CALLFunc( (uptr)&FPU_MUL_MANTISSA ); + SSE2_MOVD_XMM_to_R(ECX, regd); + AND32ItoR(ECX, 0xff800000); + OR32RtoR(EAX, ECX); + SSE2_MOVD_R_to_XMM(regd, EAX); + } + else + SSE_MULSS_XMM_to_XMM(regd, regt); +} + //------------------------------------------------------------------ // CommutativeOp XMM (used for ADD, MUL, MAX, and MIN opcodes) //------------------------------------------------------------------ static void (*recComOpXMM_to_XMM[] )(x86SSERegType, x86SSERegType) = { - FPU_ADD, SSE_MULSS_XMM_to_XMM, SSE_MAXSS_XMM_to_XMM, SSE_MINSS_XMM_to_XMM }; + FPU_ADD, FPU_MUL, SSE_MAXSS_XMM_to_XMM, SSE_MINSS_XMM_to_XMM }; //static void (*recComOpM32_to_XMM[] )(x86SSERegType, uptr) = { // SSE_ADDSS_M32_to_XMM, SSE_MULSS_M32_to_XMM, SSE_MAXSS_M32_to_XMM, SSE_MINSS_M32_to_XMM }; @@ -1125,6 +1205,7 @@ FPURECOMPILE_CONSTCODE(DIV_S, XMMINFO_WRITED|XMMINFO_READS|XMMINFO_READT); //------------------------------------------------------------------ + //------------------------------------------------------------------ // MADD XMM //------------------------------------------------------------------ @@ -1138,7 +1219,7 @@ void recMADDtemp(int info, int regd) if(regd == 
EEREC_S) { SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(t0reg); } - SSE_MULSS_XMM_to_XMM(regd, t0reg); + FPU_MUL(regd, t0reg); if (info & PROCESS_EE_ACC) { if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(EEREC_ACC); fpuFloat(regd); } FPU_ADD(regd, EEREC_ACC); @@ -1152,14 +1233,14 @@ void recMADDtemp(int info, int regd) else if (regd == EEREC_ACC){ SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(EEREC_S); fpuFloat2(t0reg); } - SSE_MULSS_XMM_to_XMM(t0reg, EEREC_S); + FPU_MUL(t0reg, EEREC_S); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); } FPU_ADD(regd, t0reg); } else { SSE_MOVSS_M32_to_XMM(regd, (uptr)&fpuRegs.fpr[_Ft_]); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_S); } - SSE_MULSS_XMM_to_XMM(regd, EEREC_S); + FPU_MUL(regd, EEREC_S); if (info & PROCESS_EE_ACC) { if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(EEREC_ACC); fpuFloat(regd); } FPU_ADD(regd, EEREC_ACC); @@ -1175,7 +1256,7 @@ void recMADDtemp(int info, int regd) if(regd == EEREC_T) { SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Fs_]); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(t0reg); } - SSE_MULSS_XMM_to_XMM(regd, t0reg); + FPU_MUL(regd, t0reg); if (info & PROCESS_EE_ACC) { if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(EEREC_ACC); fpuFloat(regd); } FPU_ADD(regd, EEREC_ACC); @@ -1189,14 +1270,14 @@ void recMADDtemp(int info, int regd) else if (regd == EEREC_ACC){ SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Fs_]); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(EEREC_T); fpuFloat2(t0reg); } - SSE_MULSS_XMM_to_XMM(t0reg, EEREC_T); + FPU_MUL(t0reg, EEREC_T); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); } FPU_ADD(regd, t0reg); } else { SSE_MOVSS_M32_to_XMM(regd, (uptr)&fpuRegs.fpr[_Fs_]); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_T); } - SSE_MULSS_XMM_to_XMM(regd, EEREC_T); + FPU_MUL(regd, EEREC_T); if (info & 
PROCESS_EE_ACC) { if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(EEREC_ACC); fpuFloat(regd); } FPU_ADD(regd, EEREC_ACC); @@ -1211,7 +1292,7 @@ void recMADDtemp(int info, int regd) case (PROCESS_EE_S|PROCESS_EE_T): if(regd == EEREC_S) { if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_T); } - SSE_MULSS_XMM_to_XMM(regd, EEREC_T); + FPU_MUL(regd, EEREC_T); if (info & PROCESS_EE_ACC) { if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(EEREC_ACC); } FPU_ADD(regd, EEREC_ACC); @@ -1224,7 +1305,7 @@ void recMADDtemp(int info, int regd) } else if(regd == EEREC_T) { if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_S); } - SSE_MULSS_XMM_to_XMM(regd, EEREC_S); + FPU_MUL(regd, EEREC_S); if (info & PROCESS_EE_ACC) { if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(EEREC_ACC); } FPU_ADD(regd, EEREC_ACC); @@ -1238,14 +1319,14 @@ void recMADDtemp(int info, int regd) else if(regd == EEREC_ACC) { SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_S); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(t0reg); fpuFloat2(EEREC_T); } - SSE_MULSS_XMM_to_XMM(t0reg, EEREC_T); + FPU_MUL(t0reg, EEREC_T); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); } FPU_ADD(regd, t0reg); } else { SSE_MOVSS_XMM_to_XMM(regd, EEREC_S); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_T); } - SSE_MULSS_XMM_to_XMM(regd, EEREC_T); + FPU_MUL(regd, EEREC_T); if (info & PROCESS_EE_ACC) { if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(EEREC_ACC); } FPU_ADD(regd, EEREC_ACC); @@ -1263,7 +1344,7 @@ void recMADDtemp(int info, int regd) SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Fs_]); SSE_MOVSS_M32_to_XMM(t1reg, (uptr)&fpuRegs.fpr[_Ft_]); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(t0reg); fpuFloat2(t1reg); } - SSE_MULSS_XMM_to_XMM(t0reg, t1reg); + FPU_MUL(t0reg, t1reg); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); } FPU_ADD(regd, t0reg); _freeXMMreg(t1reg); @@ -1273,7 +1354,7 @@ void recMADDtemp(int info, int regd) 
SSE_MOVSS_M32_to_XMM(regd, (uptr)&fpuRegs.fpr[_Fs_]); SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(t0reg); } - SSE_MULSS_XMM_to_XMM(regd, t0reg); + FPU_MUL(regd, t0reg); if (info & PROCESS_EE_ACC) { if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(EEREC_ACC); } FPU_ADD(regd, EEREC_ACC); @@ -1356,7 +1437,7 @@ int t1reg; if(regd == EEREC_S) { SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(t0reg); } - SSE_MULSS_XMM_to_XMM(regd, t0reg); + FPU_MUL(regd, t0reg); if (info & PROCESS_EE_ACC) { SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_ACC); } else { SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.ACC); } if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); } @@ -1366,14 +1447,14 @@ int t1reg; else if (regd == EEREC_ACC){ SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(EEREC_S); fpuFloat2(t0reg); } - SSE_MULSS_XMM_to_XMM(t0reg, EEREC_S); + FPU_MUL(t0reg, EEREC_S); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); } FPU_SUB(regd, t0reg); } else { SSE_MOVSS_M32_to_XMM(regd, (uptr)&fpuRegs.fpr[_Ft_]); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_S); } - SSE_MULSS_XMM_to_XMM(regd, EEREC_S); + FPU_MUL(regd, EEREC_S); if (info & PROCESS_EE_ACC) { SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_ACC); } else { SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.ACC); } if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); } @@ -1385,7 +1466,7 @@ int t1reg; if(regd == EEREC_T) { SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Fs_]); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(t0reg); } - SSE_MULSS_XMM_to_XMM(regd, t0reg); + FPU_MUL(regd, t0reg); if (info & PROCESS_EE_ACC) { SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_ACC); } else { SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.ACC); } if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); } @@ -1395,14 +1476,14 
@@ int t1reg; else if (regd == EEREC_ACC){ SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Fs_]); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(EEREC_T); fpuFloat2(t0reg); } - SSE_MULSS_XMM_to_XMM(t0reg, EEREC_T); + FPU_MUL(t0reg, EEREC_T); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); } FPU_SUB(regd, t0reg); } else { SSE_MOVSS_M32_to_XMM(regd, (uptr)&fpuRegs.fpr[_Fs_]); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_T); } - SSE_MULSS_XMM_to_XMM(regd, EEREC_T); + FPU_MUL(regd, EEREC_T); if (info & PROCESS_EE_ACC) { SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_ACC); } else { SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.ACC); } if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); } @@ -1413,7 +1494,7 @@ int t1reg; case (PROCESS_EE_S|PROCESS_EE_T): if(regd == EEREC_S) { if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_T); } - SSE_MULSS_XMM_to_XMM(regd, EEREC_T); + FPU_MUL(regd, EEREC_T); if (info & PROCESS_EE_ACC) { SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_ACC); } else { SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.ACC); } if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); } @@ -1422,7 +1503,7 @@ int t1reg; } else if(regd == EEREC_T) { if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_S); } - SSE_MULSS_XMM_to_XMM(regd, EEREC_S); + FPU_MUL(regd, EEREC_S); if (info & PROCESS_EE_ACC) { SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_ACC); } else { SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.ACC); } if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); } @@ -1432,14 +1513,14 @@ int t1reg; else if(regd == EEREC_ACC) { SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_S); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(t0reg); fpuFloat2(EEREC_T); } - SSE_MULSS_XMM_to_XMM(t0reg, EEREC_T); + FPU_MUL(t0reg, EEREC_T); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); } FPU_SUB(regd, t0reg); } else { SSE_MOVSS_XMM_to_XMM(regd, EEREC_S); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_T); } - 
SSE_MULSS_XMM_to_XMM(regd, EEREC_T); + FPU_MUL(regd, EEREC_T); if (info & PROCESS_EE_ACC) { SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_ACC); } else { SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.ACC); } if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); } @@ -1453,7 +1534,7 @@ int t1reg; SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Fs_]); SSE_MOVSS_M32_to_XMM(t1reg, (uptr)&fpuRegs.fpr[_Ft_]); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(t0reg); fpuFloat2(t1reg); } - SSE_MULSS_XMM_to_XMM(t0reg, t1reg); + FPU_MUL(t0reg, t1reg); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); } FPU_SUB(regd, t0reg); _freeXMMreg(t1reg); @@ -1463,7 +1544,7 @@ int t1reg; SSE_MOVSS_M32_to_XMM(regd, (uptr)&fpuRegs.fpr[_Fs_]); SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]); if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(t0reg); } - SSE_MULSS_XMM_to_XMM(regd, t0reg); + FPU_MUL(regd, t0reg); if (info & PROCESS_EE_ACC) { SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_ACC); } else { SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.ACC); } if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); } diff --git a/pcsx2/x86/iFPUd.cpp b/pcsx2/x86/iFPUd.cpp index d563881610..641d601ebe 100644 --- a/pcsx2/x86/iFPUd.cpp +++ b/pcsx2/x86/iFPUd.cpp @@ -76,6 +76,9 @@ namespace R5900 { namespace Dynarec { namespace OpcodeImpl { namespace COP1 { + +u32 __fastcall FPU_MUL_MANTISSA(u32 s, u32 t); + namespace DOUBLE { //------------------------------------------------------------------ @@ -408,6 +411,8 @@ static u64 PCSX2_ALIGNED16(dbl_s_neg[2]) = {0x8000000000000000ULL, 0}; // converts small normal numbers to double equivalent // converts large normal numbers (which represent NaN/inf in IEEE) to double equivalent + +//mustn't use EAX/ECX/EDX/x86regs (MUL) void ToDouble(int reg) { SSE_UCOMISS_M32_to_XMM(reg, (uptr)&pos_inf); //sets ZF if equal or uncomparable @@ -439,6 +444,7 @@ void ToDouble(int reg) otherwise, results are still usually better than iFPU.cpp. 
*/ +//mustn't use EAX/ECX/EDX/x86regs (MUL) // converts small normal numbers to PS2 equivalent // converts large normal numbers to PS2 equivalent (which represent NaN/inf in IEEE) @@ -501,6 +507,7 @@ void ToPS2FPU_Full(int reg, bool flags, int absreg, bool acc) x86SetJ8(end3); } +//mustn't use EAX/ECX/EDX/x86regs (MUL) void ToPS2FPU(int reg, bool flags, int absreg, bool acc) { if (FPU_RESULT) @@ -642,12 +649,36 @@ void FPU_ADD_SUB(int tempd, int tempt) //tempd and tempt are overwritten, they a } - +void FPU_MUL(int info, int regd, int sreg, int treg, bool acc) +{ + if (CHECK_FPU_ATTEMPT_MUL) + { + SSE2_MOVD_XMM_to_R(ECX, sreg); + SSE2_MOVD_XMM_to_R(EDX, treg); + CALLFunc( (uptr)&FPU_MUL_MANTISSA ); + ToDouble(sreg); ToDouble(treg); + SSE2_MULSD_XMM_to_XMM(sreg, treg); + ToPS2FPU(sreg, true, treg, acc); + SSE_MOVSS_XMM_to_XMM(regd, sreg); + SSE2_MOVD_XMM_to_R(ECX, regd); + AND32ItoR(ECX, 0xff800000); + OR32RtoR(EAX, ECX); + SSE2_MOVD_R_to_XMM(regd, EAX); + } + else + { + ToDouble(sreg); ToDouble(treg); + SSE2_MULSD_XMM_to_XMM(sreg, treg); + ToPS2FPU(sreg, true, treg, acc); + SSE_MOVSS_XMM_to_XMM(regd, sreg); + } +} + //------------------------------------------------------------------ // CommutativeOp XMM (used for ADD, MUL, MAX, MIN and SUB opcodes) //------------------------------------------------------------------ static void (*recFPUOpXMM_to_XMM[] )(x86SSERegType, x86SSERegType) = { - SSE2_ADDSD_XMM_to_XMM, SSE2_MULSD_XMM_to_XMM, SSE2_MAXSD_XMM_to_XMM, SSE2_MINSD_XMM_to_XMM, SSE2_SUBSD_XMM_to_XMM }; + SSE2_ADDSD_XMM_to_XMM, NULL, SSE2_MAXSD_XMM_to_XMM, SSE2_MINSD_XMM_to_XMM, SSE2_SUBSD_XMM_to_XMM }; void recFPUOp(int info, int regd, int op, bool acc) { @@ -951,13 +982,10 @@ FPURECOMPILE_CONSTCODE(DIV_S, XMMINFO_WRITED|XMMINFO_READS|XMMINFO_READT); void recMaddsub(int info, int regd, int op, bool acc) { int sreg, treg; - ALLOC_S(sreg); ALLOC_T(treg); - ToDouble(sreg); ToDouble(treg); - SSE2_MULSD_XMM_to_XMM(sreg, treg); + FPU_MUL(info, sreg, sreg, treg, false); 
- ToPS2FPU(sreg, true, treg, false); GET_ACC(treg); if (FPU_ADD_SUB_HACK) //ADD or SUB @@ -1077,14 +1105,22 @@ FPURECOMPILE_CONSTCODE(MSUBA_S, XMMINFO_WRITEACC|XMMINFO_READACC|XMMINFO_READS|X //------------------------------------------------------------------ void recMUL_S_xmm(int info) { - recFPUOp(info, EEREC_D, 1, false); + int sreg, treg; + ALLOC_S(sreg); ALLOC_T(treg); + + FPU_MUL(info, EEREC_D, sreg, treg, false); + _freeXMMreg(sreg); _freeXMMreg(treg); } FPURECOMPILE_CONSTCODE(MUL_S, XMMINFO_WRITED|XMMINFO_READS|XMMINFO_READT); void recMULA_S_xmm(int info) { - recFPUOp(info, EEREC_ACC, 1, true); + int sreg, treg; + ALLOC_S(sreg); ALLOC_T(treg); + + FPU_MUL(info, EEREC_ACC, sreg, treg, true); + _freeXMMreg(sreg); _freeXMMreg(treg); } FPURECOMPILE_CONSTCODE(MULA_S, XMMINFO_WRITEACC|XMMINFO_READS|XMMINFO_READT);