Changed the FPU MUL hack for Tales of Destiny. (Doesn't break under most settings now).

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2324 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
Nneeve 2009-12-07 17:33:32 +00:00
parent 611444b9d5
commit 00f14b5760
2 changed files with 119 additions and 145 deletions

View File

@ -75,7 +75,8 @@ void recRSQRT_S_xmm(int info);
#define FPUflagSO 0X00000010
#define FPUflagSU 0X00000008
#define FPU_ADD_SUB_HACK 1 // Add/Sub opcodes produce more ps2-like results if set to 1
// Add/Sub opcodes produce the same results as the ps2
#define FPU_CORRECT_ADD_SUB 1
static const __aligned16 u32 s_neg[4] = { 0x80000000, 0xffffffff, 0xffffffff, 0xffffffff };
static const __aligned16 u32 s_pos[4] = { 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff };
@ -546,95 +547,54 @@ void FPU_ADD_SUB(int regd, int regt, int issub)
}
void FPU_ADD(int regd, int regt) {
if (FPU_ADD_SUB_HACK) FPU_ADD_SUB(regd, regt, 0);
if (FPU_CORRECT_ADD_SUB) FPU_ADD_SUB(regd, regt, 0);
else SSE_ADDSS_XMM_to_XMM(regd, regt);
}
void FPU_SUB(int regd, int regt) {
if (FPU_ADD_SUB_HACK) FPU_ADD_SUB(regd, regt, 1);
if (FPU_CORRECT_ADD_SUB) FPU_ADD_SUB(regd, regt, 1);
else SSE_SUBSS_XMM_to_XMM(regd, regt);
}
//------------------------------------------------------------------
// FPU_MUL (Used to approximate PS2's FPU mul behavior)
// Note: PS2's multiplication uses some variant of booth multiplication with wallace trees:
// It cuts off some bits, resulting in inaccurate and non-commutative results.
// The PS2's result mantissa is either equal to x86's round-to-zero result mantissa
// or SMALLER (by 0x1). (This means that x86's other rounding modes are even less similar to PS2's mul.)
//------------------------------------------------------------------
// PS2's multiplication uses some modification (possibly not the one used in this function)
// of booth multiplication with wallace trees (not used in this function)
// it cuts of some bits, resulting in inaccurate and non-commutative results.
// This function attempts to replicate this. It is currently inaccurate. But still not too bad.
//------------------------------------------------------------------
// Tales of Destiny hangs in a (very) certain place without this function. Probably its only use.
// Can be optimized, of course.
// shouldn't be compiled with SSE/MMX optimizations (but none of PCSX2 should be, right?)
u32 __fastcall FPU_MUL_MANTISSA(u32 s, u32 t)
u32 __fastcall FPU_MUL_HACK(u32 s, u32 t)
{
s = (s & 0x7fffff) | 0x800000;
t = (t & 0x7fffff) | 0x800000;
t<<=1;
u32 part[13]; //partial products
u32 bit[13]; //more partial products. 0 or 1.
for (int i = 0; i <= 12; i++, t>>=2)
{
u32 test = t & 7;
if (test == 0 || test == 7)
{
part[i] = 0;
bit[i] = 0;
}
else if (test == 3)
{
part[i] = (s<<1);
bit[i] = 0;
}
else if (test == 4)
{
part[i] = ~(s<<1);
bit[i] = 1;
}
else if (test < 4)
{
part[i] = s;
bit[i] = 0;
}
else
{
part[i] = ~s;
bit[i] = 1;
}
}
s64 res = 0;
u64 mask = 0;
mask = (~mask) << 12; //mask
for (int i=0; i<=12; i++)
{
res += (s64)(s32)part[i]<<(i*2);
res &= mask;
res += bit[i]<<(i*2);
}
u32 man_res = (res >> 23);
if (man_res & (1 << 24))
man_res >>= 1;
man_res &= 0x7fffff;
return man_res;
if ((s == 0x3e800000) && (t == 0x40490fdb))
return 0x3f490fda; // needed for Tales of Destiny Remake (only in a very specific room late-game)
else
return 0;
}
void FPU_MUL(int regd, int regt)
void FPU_MUL(int regd, int regt, bool reverseOperands)
{
u8 *noHack, *endMul;
if (CHECK_FPUMULHACK)
{
SSE2_MOVD_XMM_to_R(ECX, regd);
SSE2_MOVD_XMM_to_R(EDX, regt);
SSE_MULSS_XMM_to_XMM(regd, regt);
CALLFunc( (uptr)&FPU_MUL_MANTISSA );
SSE2_MOVD_XMM_to_R(ECX, regd);
AND32ItoR(ECX, 0xff800000);
OR32RtoR(EAX, ECX);
SSE2_MOVD_R_to_XMM(regd, EAX);
SSE2_MOVD_XMM_to_R(ECX, reverseOperands ? regt : regd);
SSE2_MOVD_XMM_to_R(EDX, reverseOperands ? regd : regt);
CALLFunc( (uptr)&FPU_MUL_HACK ); //returns the hacked result or 0
TEST32RtoR(EAX, EAX);
noHack = JZ8(0);
SSE2_MOVD_R_to_XMM(regd, EAX);
endMul = JMP8(0);
x86SetJ8(noHack);
}
else
SSE_MULSS_XMM_to_XMM(regd, regt);
SSE_MULSS_XMM_to_XMM(regd, regt);
if (CHECK_FPUMULHACK)
x86SetJ8(endMul);
}
void FPU_MUL(int regd, int regt) { FPU_MUL(regd, regt, false); }
void FPU_MUL_REV(int regd, int regt) { FPU_MUL(regd, regt, true); } //reversed operands
//------------------------------------------------------------------
// CommutativeOp XMM (used for ADD, MUL, MAX, and MIN opcodes)
@ -642,6 +602,9 @@ void FPU_MUL(int regd, int regt)
static void (*recComOpXMM_to_XMM[] )(x86SSERegType, x86SSERegType) = {
FPU_ADD, FPU_MUL, SSE_MAXSS_XMM_to_XMM, SSE_MINSS_XMM_to_XMM };
static void (*recComOpXMM_to_XMM_REV[] )(x86SSERegType, x86SSERegType) = { //reversed operands
FPU_ADD, FPU_MUL_REV, SSE_MAXSS_XMM_to_XMM, SSE_MINSS_XMM_to_XMM };
//static void (*recComOpM32_to_XMM[] )(x86SSERegType, uptr) = {
// SSE_ADDSS_M32_to_XMM, SSE_MULSS_M32_to_XMM, SSE_MAXSS_M32_to_XMM, SSE_MINSS_M32_to_XMM };
@ -660,14 +623,14 @@ int recCommutativeOp(int info, int regd, int op)
else {
SSE_MOVSS_M32_to_XMM(regd, (uptr)&fpuRegs.fpr[_Ft_]);
if (CHECK_FPU_EXTRA_OVERFLOW || (op >= 2)) { fpuFloat2(regd); fpuFloat2(EEREC_S); }
recComOpXMM_to_XMM[op](regd, EEREC_S);
recComOpXMM_to_XMM_REV[op](regd, EEREC_S);
}
break;
case PROCESS_EE_T:
if (regd == EEREC_T) {
SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Fs_]);
if (CHECK_FPU_EXTRA_OVERFLOW || (op >= 2)) { fpuFloat2(regd); fpuFloat2(t0reg); }
recComOpXMM_to_XMM[op](regd, t0reg);
recComOpXMM_to_XMM_REV[op](regd, t0reg);
}
else {
SSE_MOVSS_M32_to_XMM(regd, (uptr)&fpuRegs.fpr[_Fs_]);
@ -678,7 +641,7 @@ int recCommutativeOp(int info, int regd, int op)
case (PROCESS_EE_S|PROCESS_EE_T):
if (regd == EEREC_T) {
if (CHECK_FPU_EXTRA_OVERFLOW || (op >= 2)) { fpuFloat2(regd); fpuFloat2(EEREC_S); }
recComOpXMM_to_XMM[op](regd, EEREC_S);
recComOpXMM_to_XMM_REV[op](regd, EEREC_S);
}
else {
SSE_MOVSS_XMM_to_XMM(regd, EEREC_S);
@ -1204,7 +1167,7 @@ void recMADDtemp(int info, int regd)
if(regd == EEREC_S) {
SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(t0reg); }
FPU_MUL(regd, t0reg);
SSE_MULSS_XMM_to_XMM(regd, t0reg);
if (info & PROCESS_EE_ACC) {
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(EEREC_ACC); fpuFloat(regd); }
FPU_ADD(regd, EEREC_ACC);
@ -1218,14 +1181,14 @@ void recMADDtemp(int info, int regd)
else if (regd == EEREC_ACC){
SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(EEREC_S); fpuFloat2(t0reg); }
FPU_MUL(t0reg, EEREC_S);
SSE_MULSS_XMM_to_XMM(t0reg, EEREC_S);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
FPU_ADD(regd, t0reg);
}
else {
SSE_MOVSS_M32_to_XMM(regd, (uptr)&fpuRegs.fpr[_Ft_]);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_S); }
FPU_MUL(regd, EEREC_S);
SSE_MULSS_XMM_to_XMM(regd, EEREC_S);
if (info & PROCESS_EE_ACC) {
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(EEREC_ACC); fpuFloat(regd); }
FPU_ADD(regd, EEREC_ACC);
@ -1241,7 +1204,7 @@ void recMADDtemp(int info, int regd)
if(regd == EEREC_T) {
SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Fs_]);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(t0reg); }
FPU_MUL(regd, t0reg);
SSE_MULSS_XMM_to_XMM(regd, t0reg);
if (info & PROCESS_EE_ACC) {
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(EEREC_ACC); fpuFloat(regd); }
FPU_ADD(regd, EEREC_ACC);
@ -1255,14 +1218,14 @@ void recMADDtemp(int info, int regd)
else if (regd == EEREC_ACC){
SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Fs_]);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(EEREC_T); fpuFloat2(t0reg); }
FPU_MUL(t0reg, EEREC_T);
SSE_MULSS_XMM_to_XMM(t0reg, EEREC_T);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
FPU_ADD(regd, t0reg);
}
else {
SSE_MOVSS_M32_to_XMM(regd, (uptr)&fpuRegs.fpr[_Fs_]);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_T); }
FPU_MUL(regd, EEREC_T);
SSE_MULSS_XMM_to_XMM(regd, EEREC_T);
if (info & PROCESS_EE_ACC) {
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(EEREC_ACC); fpuFloat(regd); }
FPU_ADD(regd, EEREC_ACC);
@ -1277,7 +1240,7 @@ void recMADDtemp(int info, int regd)
case (PROCESS_EE_S|PROCESS_EE_T):
if(regd == EEREC_S) {
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_T); }
FPU_MUL(regd, EEREC_T);
SSE_MULSS_XMM_to_XMM(regd, EEREC_T);
if (info & PROCESS_EE_ACC) {
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(EEREC_ACC); }
FPU_ADD(regd, EEREC_ACC);
@ -1290,7 +1253,7 @@ void recMADDtemp(int info, int regd)
}
else if(regd == EEREC_T) {
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_S); }
FPU_MUL(regd, EEREC_S);
SSE_MULSS_XMM_to_XMM(regd, EEREC_S);
if (info & PROCESS_EE_ACC) {
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(EEREC_ACC); }
FPU_ADD(regd, EEREC_ACC);
@ -1304,14 +1267,14 @@ void recMADDtemp(int info, int regd)
else if(regd == EEREC_ACC) {
SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_S);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(t0reg); fpuFloat2(EEREC_T); }
FPU_MUL(t0reg, EEREC_T);
SSE_MULSS_XMM_to_XMM(t0reg, EEREC_T);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
FPU_ADD(regd, t0reg);
}
else {
SSE_MOVSS_XMM_to_XMM(regd, EEREC_S);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_T); }
FPU_MUL(regd, EEREC_T);
SSE_MULSS_XMM_to_XMM(regd, EEREC_T);
if (info & PROCESS_EE_ACC) {
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(EEREC_ACC); }
FPU_ADD(regd, EEREC_ACC);
@ -1329,7 +1292,7 @@ void recMADDtemp(int info, int regd)
SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Fs_]);
SSE_MOVSS_M32_to_XMM(t1reg, (uptr)&fpuRegs.fpr[_Ft_]);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(t0reg); fpuFloat2(t1reg); }
FPU_MUL(t0reg, t1reg);
SSE_MULSS_XMM_to_XMM(t0reg, t1reg);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
FPU_ADD(regd, t0reg);
_freeXMMreg(t1reg);
@ -1339,7 +1302,7 @@ void recMADDtemp(int info, int regd)
SSE_MOVSS_M32_to_XMM(regd, (uptr)&fpuRegs.fpr[_Fs_]);
SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(t0reg); }
FPU_MUL(regd, t0reg);
SSE_MULSS_XMM_to_XMM(regd, t0reg);
if (info & PROCESS_EE_ACC) {
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(EEREC_ACC); }
FPU_ADD(regd, EEREC_ACC);
@ -1422,7 +1385,7 @@ int t1reg;
if(regd == EEREC_S) {
SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(t0reg); }
FPU_MUL(regd, t0reg);
SSE_MULSS_XMM_to_XMM(regd, t0reg);
if (info & PROCESS_EE_ACC) { SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_ACC); }
else { SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.ACC); }
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
@ -1432,14 +1395,14 @@ int t1reg;
else if (regd == EEREC_ACC){
SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(EEREC_S); fpuFloat2(t0reg); }
FPU_MUL(t0reg, EEREC_S);
SSE_MULSS_XMM_to_XMM(t0reg, EEREC_S);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
FPU_SUB(regd, t0reg);
}
else {
SSE_MOVSS_M32_to_XMM(regd, (uptr)&fpuRegs.fpr[_Ft_]);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_S); }
FPU_MUL(regd, EEREC_S);
SSE_MULSS_XMM_to_XMM(regd, EEREC_S);
if (info & PROCESS_EE_ACC) { SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_ACC); }
else { SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.ACC); }
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
@ -1451,7 +1414,7 @@ int t1reg;
if(regd == EEREC_T) {
SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Fs_]);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(t0reg); }
FPU_MUL(regd, t0reg);
SSE_MULSS_XMM_to_XMM(regd, t0reg);
if (info & PROCESS_EE_ACC) { SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_ACC); }
else { SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.ACC); }
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
@ -1461,14 +1424,14 @@ int t1reg;
else if (regd == EEREC_ACC){
SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Fs_]);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(EEREC_T); fpuFloat2(t0reg); }
FPU_MUL(t0reg, EEREC_T);
SSE_MULSS_XMM_to_XMM(t0reg, EEREC_T);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
FPU_SUB(regd, t0reg);
}
else {
SSE_MOVSS_M32_to_XMM(regd, (uptr)&fpuRegs.fpr[_Fs_]);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_T); }
FPU_MUL(regd, EEREC_T);
SSE_MULSS_XMM_to_XMM(regd, EEREC_T);
if (info & PROCESS_EE_ACC) { SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_ACC); }
else { SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.ACC); }
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
@ -1479,7 +1442,7 @@ int t1reg;
case (PROCESS_EE_S|PROCESS_EE_T):
if(regd == EEREC_S) {
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_T); }
FPU_MUL(regd, EEREC_T);
SSE_MULSS_XMM_to_XMM(regd, EEREC_T);
if (info & PROCESS_EE_ACC) { SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_ACC); }
else { SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.ACC); }
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
@ -1488,7 +1451,7 @@ int t1reg;
}
else if(regd == EEREC_T) {
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_S); }
FPU_MUL(regd, EEREC_S);
SSE_MULSS_XMM_to_XMM(regd, EEREC_S);
if (info & PROCESS_EE_ACC) { SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_ACC); }
else { SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.ACC); }
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
@ -1498,14 +1461,14 @@ int t1reg;
else if(regd == EEREC_ACC) {
SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_S);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(t0reg); fpuFloat2(EEREC_T); }
FPU_MUL(t0reg, EEREC_T);
SSE_MULSS_XMM_to_XMM(t0reg, EEREC_T);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
FPU_SUB(regd, t0reg);
}
else {
SSE_MOVSS_XMM_to_XMM(regd, EEREC_S);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_T); }
FPU_MUL(regd, EEREC_T);
SSE_MULSS_XMM_to_XMM(regd, EEREC_T);
if (info & PROCESS_EE_ACC) { SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_ACC); }
else { SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.ACC); }
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
@ -1519,7 +1482,7 @@ int t1reg;
SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Fs_]);
SSE_MOVSS_M32_to_XMM(t1reg, (uptr)&fpuRegs.fpr[_Ft_]);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(t0reg); fpuFloat2(t1reg); }
FPU_MUL(t0reg, t1reg);
SSE_MULSS_XMM_to_XMM(t0reg, t1reg);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
FPU_SUB(regd, t0reg);
_freeXMMreg(t1reg);
@ -1529,7 +1492,7 @@ int t1reg;
SSE_MOVSS_M32_to_XMM(regd, (uptr)&fpuRegs.fpr[_Fs_]);
SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.fpr[_Ft_]);
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(t0reg); }
FPU_MUL(regd, t0reg);
SSE_MULSS_XMM_to_XMM(regd, t0reg);
if (info & PROCESS_EE_ACC) { SSE_MOVSS_XMM_to_XMM(t0reg, EEREC_ACC); }
else { SSE_MOVSS_M32_to_XMM(t0reg, (uptr)&fpuRegs.ACC); }
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }

View File

@ -22,24 +22,38 @@
#include "iR5900.h"
#include "iFPU.h"
/* Version of the FPU that emulates an exponent of 0xff and overflow/underflow flags */
/* This is a version of the FPU that emulates an exponent of 0xff and overflow/underflow flags */
/* Can be made faster by not converting stuff back and forth between instructions. */
//----------------------------------------------------------------
// FPU emulation status:
// ADD, SUB (incl. accumulation stage of MADD/MSUB) - no known problems.
// MUL (incl. multiplication stage of MADD/MSUB) - incorrect. PS2's result mantissa is sometimes
// smaller by 0x1 than IEEE's result (with round to zero).
// DIV, SQRT, RSQRT - incorrect. PS2's result varies between IEEE's result with round to zero
// and IEEE's result with round to +/-infinity.
// Other operations - no known problems.
//----------------------------------------------------------------
using namespace x86Emitter;
//set overflow flag (set only if FPU_RESULT is 1)
// Set overflow flag (define only if FPU_RESULT is 1)
#define FPU_FLAGS_OVERFLOW 1
//set underflow flag (set only if FPU_RESULT is 1)
// Set underflow flag (define only if FPU_RESULT is 1)
#define FPU_FLAGS_UNDERFLOW 1
//if 1, result is not clamped (Gives correct results as in PS2,
//but can cause problems due to insufficient clamping levels in the VUs)
// If 1, result is not clamped (Gives correct results as in PS2,
// but can cause problems due to insufficient clamping levels in the VUs)
#define FPU_RESULT 1
//set I&D flags. also impacts other aspects of DIV/R/SQRT correctness
// Set I&D flags. Also impacts other aspects of DIV/SQRT/RSQRT correctness
#define FPU_FLAGS_ID 1
// Add/Sub opcodes produce the same results as the ps2
#define FPU_CORRECT_ADD_SUB 1
#ifdef FPU_RECOMPILE
@ -48,8 +62,8 @@ namespace R5900 {
namespace Dynarec {
namespace OpcodeImpl {
namespace COP1 {
u32 __fastcall FPU_MUL_MANTISSA(u32 s, u32 t);
u32 __fastcall FPU_MUL_HACK(u32 s, u32 t);
namespace DOUBLE {
@ -71,8 +85,6 @@ namespace DOUBLE {
#define FPUflagSO 0X00000010
#define FPUflagSU 0X00000008
#define FPU_ADD_SUB_HACK 1 // Add/Sub opcodes produce more ps2-like results if set to 1
#define REC_FPUBRANCH(f) \
void f(); \
void rec##f() { \
@ -145,26 +157,25 @@ static const __aligned(32) FPUd_Globals s_const =
};
// converts small normal numbers to double equivalent
// converts large normal numbers (which represent NaN/inf in IEEE) to double equivalent
// ToDouble : converts single-precision PS2 float to double-precision IEEE float
//mustn't use EAX/ECX/EDX/x86regs (MUL)
void ToDouble(int reg)
{
SSE_UCOMISS_M32_to_XMM(reg, (uptr)s_const.pos_inf); //sets ZF if equal or incomparable
u8 *to_complex = JE8(0); //complex conversion if positive infinity or NaN
SSE_UCOMISS_M32_to_XMM(reg, (uptr)s_const.pos_inf); // Sets ZF if reg is equal or incomparable to pos_inf
u8 *to_complex = JE8(0); // Complex conversion if positive infinity or NaN
SSE_UCOMISS_M32_to_XMM(reg, (uptr)s_const.neg_inf);
u8 *to_complex2 = JE8(0); //complex conversion if negative infinity
u8 *to_complex2 = JE8(0); // Complex conversion if negative infinity
SSE2_CVTSS2SD_XMM_to_XMM(reg, reg); //simply convert
SSE2_CVTSS2SD_XMM_to_XMM(reg, reg); // Simply convert
u8 *end = JMP8(0);
x86SetJ8(to_complex);
x86SetJ8(to_complex2);
SSE2_PSUBD_M128_to_XMM(reg, (uptr)s_const.one_exp); //lower exponent
// Special conversion for when IEEE sees the value in reg as an INF/NaN
SSE2_PSUBD_M128_to_XMM(reg, (uptr)s_const.one_exp); // Lower exponent by one
SSE2_CVTSS2SD_XMM_to_XMM(reg, reg);
SSE2_PADDQ_M128_to_XMM(reg, (uptr)s_const.dbl_one_exp); //raise exponent
SSE2_PADDQ_M128_to_XMM(reg, (uptr)s_const.dbl_one_exp); // Raise exponent by one
x86SetJ8(end);
}
@ -174,19 +185,20 @@ void ToDouble(int reg)
//------------------------------------------------------------------
/*
if FPU_RESULT, results are more like the real PS2's FPU. But if the VU doesn't clamp all operands,
new issues may happen if a game transfers the FPU results into the VU and continues operations there.
ar tonelico 1 does this with the result from DIV/RSQRT (when a division by zero occurs)
If FPU_RESULT is nonzero, results are more like the real PS2's FPU. But new issues may happen if
the VU isn't clamping all operands, since games may transfer FPU results into the VU.
Ar tonelico 1 does this with the result from DIV/RSQRT (when a division by zero occurs).
Otherwise, results are still usually better than iFPU.cpp.
*/
//mustn't use EAX/ECX/EDX/x86regs (MUL)
// ToPS2FPU_Full - converts double-precision IEEE float to single-precision PS2 float
// converts small normal numbers to PS2 equivalent
// converts large normal numbers to PS2 equivalent (which represent NaN/inf in IEEE)
// converts really large normal numbers to PS2 signed max
// converts really small normal numbers to zero (flush)
// doesn't handle inf/nan/denormal
void ToPS2FPU_Full(int reg, bool flags, int absreg, bool acc, bool addsub)
{
if (flags)
@ -259,7 +271,6 @@ void ToPS2FPU_Full(int reg, bool flags, int absreg, bool acc, bool addsub)
x86SetJ8(end4);
}
//mustn't use EAX/ECX/EDX/x86regs (MUL)
void ToPS2FPU(int reg, bool flags, int absreg, bool acc, bool addsub = false)
{
if (FPU_RESULT)
@ -399,31 +410,31 @@ void FPU_ADD_SUB(int tempd, int tempt) //tempd and tempt are overwritten, they a
_freeX86reg(temp2);
_freeX86reg(tempecx);
}
void FPU_MUL(int info, int regd, int sreg, int treg, bool acc)
{
u8 *noHack;
u32 *endMul;
if (CHECK_FPUMULHACK)
{
SSE2_MOVD_XMM_to_R(ECX, sreg);
SSE2_MOVD_XMM_to_R(EDX, treg);
CALLFunc( (uptr)&FPU_MUL_MANTISSA );
ToDouble(sreg); ToDouble(treg);
SSE2_MULSD_XMM_to_XMM(sreg, treg);
ToPS2FPU(sreg, true, treg, acc);
SSE_MOVSS_XMM_to_XMM(regd, sreg);
SSE2_MOVD_XMM_to_R(ECX, regd);
AND32ItoR(ECX, 0xff800000);
OR32RtoR(EAX, ECX);
SSE2_MOVD_R_to_XMM(regd, EAX);
}
else
{
ToDouble(sreg); ToDouble(treg);
SSE2_MULSD_XMM_to_XMM(sreg, treg);
ToPS2FPU(sreg, true, treg, acc);
SSE_MOVSS_XMM_to_XMM(regd, sreg);
CALLFunc( (uptr)&FPU_MUL_HACK ); //returns the hacked result or 0
TEST32RtoR(EAX, EAX);
noHack = JZ8(0);
SSE2_MOVD_R_to_XMM(regd, EAX);
endMul = JMP32(0);
x86SetJ8(noHack);
}
ToDouble(sreg); ToDouble(treg);
SSE2_MULSD_XMM_to_XMM(sreg, treg);
ToPS2FPU(sreg, true, treg, acc);
SSE_MOVSS_XMM_to_XMM(regd, sreg);
if (CHECK_FPUMULHACK)
x86SetJ32(endMul);
}
//------------------------------------------------------------------
@ -437,7 +448,7 @@ void recFPUOp(int info, int regd, int op, bool acc)
int sreg, treg;
ALLOC_S(sreg); ALLOC_T(treg);
if (FPU_ADD_SUB_HACK) //ADD or SUB
if (FPU_CORRECT_ADD_SUB)
FPU_ADD_SUB(sreg, treg);
ToDouble(sreg); ToDouble(treg);
@ -709,7 +720,7 @@ void recMaddsub(int info, int regd, int op, bool acc)
GET_ACC(treg);
if (FPU_ADD_SUB_HACK) //ADD or SUB
if (FPU_CORRECT_ADD_SUB)
FPU_ADD_SUB(treg, sreg); //might be problematic for something!!!!
// TEST FOR ACC/MUL OVERFLOWS, PROPAGATE THEM IF THEY OCCUR