diff --git a/core/hw/sh4/dyna/shil_canonical.h b/core/hw/sh4/dyna/shil_canonical.h
index 8204b1ab0..3d171016f 100644
--- a/core/hw/sh4/dyna/shil_canonical.h
+++ b/core/hw/sh4/dyna/shil_canonical.h
@@ -911,7 +911,6 @@ shil_opc(fsqrt)
 UN_OP_F(sqrtf)
 shil_opc_end()
-
 //shop_fipr
 shil_opc(fipr)
@@ -920,13 +919,13 @@ shil_canonical
 (
 f32,f1,(float* fn, float* fm),
-	// Using double for better precision on x86 (Sonic Adventure)
-	double idp = (double)fn[0] * fm[0];
-	idp += (double)fn[1] * fm[1];
-	idp += (double)fn[2] * fm[2];
-	idp += (double)fn[3] * fm[3];
+	// multiplications are done with 28 bits of precision (53 - 25) and the final sum at 30 bits
+	double idp = reduce_precision<25>((double)fn[0] * fm[0]);
+	idp += reduce_precision<25>((double)fn[1] * fm[1]);
+	idp += reduce_precision<25>((double)fn[2] * fm[2]);
+	idp += reduce_precision<25>((double)fn[3] * fm[3]);
 
-	return fixNaN((float)idp);
+	return (float)fixNaN64(idp);
 )
 #else
 shil_canonical
@@ -956,39 +955,67 @@ shil_opc_end()
 //shop_ftrv
 shil_opc(ftrv)
+#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
 shil_canonical
 (
 void,f1,(float* fd,float* fn, float* fm),
-	float v1;
-	float v2;
-	float v3;
-	float v4;
-	v1 = fm[0] * fn[0] +
-		fm[4] * fn[1] +
-		fm[8] * fn[2] +
-		fm[12] * fn[3];
+	double v1 = reduce_precision<25>((double)fm[0] * fn[0])
+		+ reduce_precision<25>((double)fm[4] * fn[1])
+		+ reduce_precision<25>((double)fm[8] * fn[2])
+		+ reduce_precision<25>((double)fm[12] * fn[3]);
 
-	v2 = fm[1] * fn[0] +
-		fm[5] * fn[1] +
-		fm[9] * fn[2] +
-		fm[13] * fn[3];
+	double v2 = reduce_precision<25>((double)fm[1] * fn[0])
+		+ reduce_precision<25>((double)fm[5] * fn[1])
+		+ reduce_precision<25>((double)fm[9] * fn[2])
+		+ reduce_precision<25>((double)fm[13] * fn[3]);
 
-	v3 = fm[2] * fn[0] +
-		fm[6] * fn[1] +
-		fm[10] * fn[2] +
-		fm[14] * fn[3];
+	double v3 = reduce_precision<25>((double)fm[2] * fn[0])
+		+ reduce_precision<25>((double)fm[6] * fn[1])
+		+ reduce_precision<25>((double)fm[10] * fn[2])
+		+ reduce_precision<25>((double)fm[14] * fn[3]);
 
-	v4 = fm[3] * fn[0] +
-		fm[7] * fn[1] +
-		fm[11] * fn[2] +
-		fm[15] * fn[3];
+	double v4 = reduce_precision<25>((double)fm[3] * fn[0])
+		+ reduce_precision<25>((double)fm[7] * fn[1])
+		+ reduce_precision<25>((double)fm[11] * fn[2])
+		+ reduce_precision<25>((double)fm[15] * fn[3]);
+
+	fd[0] = (float)fixNaN64(v1);
+	fd[1] = (float)fixNaN64(v2);
+	fd[2] = (float)fixNaN64(v3);
+	fd[3] = (float)fixNaN64(v4);
+)
+#else
+shil_canonical
+(
+void,f1,(float* fd,float* fn, float* fm),
+
+	float v1 = fm[0] * fn[0] +
+		fm[4] * fn[1] +
+		fm[8] * fn[2] +
+		fm[12] * fn[3];
+
+	float v2 = fm[1] * fn[0] +
+		fm[5] * fn[1] +
+		fm[9] * fn[2] +
+		fm[13] * fn[3];
+
+	float v3 = fm[2] * fn[0] +
+		fm[6] * fn[1] +
+		fm[10] * fn[2] +
+		fm[14] * fn[3];
+
+	float v4 = fm[3] * fn[0] +
+		fm[7] * fn[1] +
+		fm[11] * fn[2] +
+		fm[15] * fn[3];
 	fd[0] = fixNaN(v1);
 	fd[1] = fixNaN(v2);
 	fd[2] = fixNaN(v3);
 	fd[3] = fixNaN(v4);
 )
+#endif
 shil_compile
 (
 	shil_cf_arg_ptr(rs2);
diff --git a/core/hw/sh4/interpr/sh4_fpu.cpp b/core/hw/sh4/interpr/sh4_fpu.cpp
index 5551bcde2..e3ad0ccda 100644
--- a/core/hw/sh4/interpr/sh4_fpu.cpp
+++ b/core/hw/sh4/interpr/sh4_fpu.cpp
@@ -504,23 +504,25 @@ sh4op(i1111_nnmm_1110_1101)
 {
 	int n=GetN(op)&0xC;
 	int m=(GetN(op)&0x3)<<2;
-	if(fpscr.PR ==0)
+	if (fpscr.PR == 0)
 	{
 #if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
-		double idp = (double)fr[n + 0] * fr[m + 0];
-		idp += (double)fr[n + 1] * fr[m + 1];
-		idp += (double)fr[n + 2] * fr[m + 2];
-		idp += (double)fr[n + 3] * fr[m + 3];
-		float rv = (float)idp;
+		// multiplications are done with 28 bits of precision (53 - 25) and the final sum at 30 bits
+		double idp = reduce_precision<25>((double)fr[n + 0] * fr[m + 0]);
+		idp += reduce_precision<25>((double)fr[n + 1] * fr[m + 1]);
+		idp += reduce_precision<25>((double)fr[n + 2] * fr[m + 2]);
+		idp += reduce_precision<25>((double)fr[n + 3] * fr[m + 3]);
+
+		fr[n + 3] = (float)fixNaN64(idp);
 #else
 		float rv = fr[n + 0] * fr[m + 0];
 		rv += fr[n + 1] * fr[m + 1];
 		rv += fr[n + 2] * fr[m + 2];
 		rv += fr[n + 3] * fr[m + 3];
-#endif
 		CHECK_FPU_32(rv);
 		fr[n + 3] = rv;
+#endif
 	}
 	else
 	{
@@ -697,7 +699,32 @@ sh4op(i1111_nn01_1111_1101)
 	if (fpscr.PR==0)
 	{
+#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
+		double v1 = reduce_precision<25>((double)xf[0] * fr[n + 0])
+			+ reduce_precision<25>((double)xf[4] * fr[n + 1])
+			+ reduce_precision<25>((double)xf[8] * fr[n + 2])
+			+ reduce_precision<25>((double)xf[12] * fr[n + 3]);
+		double v2 = reduce_precision<25>((double)xf[1] * fr[n + 0])
+			+ reduce_precision<25>((double)xf[5] * fr[n + 1])
+			+ reduce_precision<25>((double)xf[9] * fr[n + 2])
+			+ reduce_precision<25>((double)xf[13] * fr[n + 3]);
+
+		double v3 = reduce_precision<25>((double)xf[2] * fr[n + 0])
+			+ reduce_precision<25>((double)xf[6] * fr[n + 1])
+			+ reduce_precision<25>((double)xf[10] * fr[n + 2])
+			+ reduce_precision<25>((double)xf[14] * fr[n + 3]);
+
+		double v4 = reduce_precision<25>((double)xf[3] * fr[n + 0])
+			+ reduce_precision<25>((double)xf[7] * fr[n + 1])
+			+ reduce_precision<25>((double)xf[11] * fr[n + 2])
+			+ reduce_precision<25>((double)xf[15] * fr[n + 3]);
+
+		fr[n + 0] = (float)fixNaN64(v1);
+		fr[n + 1] = (float)fixNaN64(v2);
+		fr[n + 2] = (float)fixNaN64(v3);
+		fr[n + 3] = (float)fixNaN64(v4);
+#else
 		float v1, v2, v3, v4;
 		v1 = xf[0] * fr[n + 0] +
@@ -729,7 +756,7 @@ sh4op(i1111_nn01_1111_1101)
 		fr[n + 1] = v2;
 		fr[n + 2] = v3;
 		fr[n + 3] = v4;
-
+#endif
 	}
 	else
 	{
diff --git a/core/hw/sh4/sh4_core.h b/core/hw/sh4/sh4_core.h
index 42eb69dac..f97ae5d89 100644
--- a/core/hw/sh4/sh4_core.h
+++ b/core/hw/sh4/sh4_core.h
@@ -152,3 +152,12 @@ static INLINE f64 fixNaN64(f64 f)
 //	return (*(u64 *)&f & 0x7fffffffffffffffll) <= 0x7f80000000000000ll ? f : 0x7ff7ffffffffffffll;
 	return f;
 }
+
+// Reduces the precision of the argument f by a given number of bits
+// doubles have 53 bits of precision, so the returned result will have a precision of 53 - bits
+template<int bits>
+static INLINE double reduce_precision(double f)
+{
+	double c = (double)((1 << bits) + 1) * f;
+	return c - (c - f);
+}
diff --git a/core/rec-x64/rec_x64.cpp b/core/rec-x64/rec_x64.cpp
index 88d454bab..8d588ed76 100644
--- a/core/rec-x64/rec_x64.cpp
+++ b/core/rec-x64/rec_x64.cpp
@@ -1086,7 +1086,7 @@ public:
 			mov(qword[rdx], rcx);
 #endif
 			break;
-
+/*
 		case shop_fipr:
 		{
 			// Using doubles for better precision
@@ -1164,7 +1164,7 @@ public:
 		}
 		break;
-
+*/
 		case shop_frswap:
 			mov(rax, (uintptr_t)op.rs1.reg_ptr());
 			mov(rcx, (uintptr_t)op.rd.reg_ptr());
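
A note on the new helper: reduce_precision<bits> relies on the rounding identity c = f * (2^bits + 1), result = c - (c - f), essentially the splitting step from Veltkamp/Dekker double-double arithmetic. With round-to-nearest double arithmetic it rounds the 53-bit mantissa to roughly 53 - bits significant bits, i.e. about 28 bits per product when bits = 25, which is what the comments in the patch state. Below is a minimal standalone sketch of that behaviour; the main(), the sample values and the plain (non-INLINE) copy of the template are illustrative assumptions, not taken from the patch.

#include <cmath>
#include <cstdio>

// Standalone copy of the reduce_precision<bits> template added to sh4_core.h
// (INLINE dropped so this builds on its own). Assumes round-to-nearest double
// arithmetic, e.g. SSE2 rather than 80-bit x87 intermediates.
template<int bits>
static double reduce_precision(double f)
{
	double c = (double)((1 << bits) + 1) * f;
	return c - (c - f);
}

int main()
{
	// 1 + 2^-30 needs 31 mantissa bits; after reduce_precision<25> only
	// about 53 - 25 = 28 bits survive, so the 2^-30 term is rounded away.
	double a = 1.0 + std::ldexp(1.0, -30);
	// 1 + 2^-20 fits in 28 bits and comes back unchanged.
	double b = 1.0 + std::ldexp(1.0, -20);
	printf("%.17g -> %.17g\n", a, reduce_precision<25>(a));
	printf("%.17g -> %.17g\n", b, reduce_precision<25>(b));
	return 0;
}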