reduce precision of FIPR and FTRV sh4 ops on x86 platforms

Flyinghead 2019-12-17 20:23:58 +01:00
parent 7ab275d4b5
commit 12b9b7254a
4 changed files with 100 additions and 37 deletions


@@ -911,7 +911,6 @@ shil_opc(fsqrt)
 UN_OP_F(sqrtf)
 shil_opc_end()
 //shop_fipr
 shil_opc(fipr)
@@ -920,13 +919,13 @@ shil_canonical
 (
     f32,f1,(float* fn, float* fm),
     // Using double for better precision on x86 (Sonic Adventure)
-    double idp = (double)fn[0] * fm[0];
-    idp += (double)fn[1] * fm[1];
-    idp += (double)fn[2] * fm[2];
-    idp += (double)fn[3] * fm[3];
+    // multiplications are done with 28 bits of precision (53 - 25) and the final sum at 30 bits
+    double idp = reduce_precision<25>((double)fn[0] * fm[0]);
+    idp += reduce_precision<25>((double)fn[1] * fm[1]);
+    idp += reduce_precision<25>((double)fn[2] * fm[2]);
+    idp += reduce_precision<25>((double)fn[3] * fm[3]);
-    return fixNaN((float)idp);
+    return (float)fixNaN64(idp);
 )
 #else
 shil_canonical
@@ -956,39 +955,67 @@ shil_opc_end()
 //shop_ftrv
 shil_opc(ftrv)
+#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
 shil_canonical
 (
     void,f1,(float* fd,float* fn, float* fm),
-    float v1;
-    float v2;
-    float v3;
-    float v4;
-    v1 = fm[0] * fn[0] +
-        fm[4] * fn[1] +
-        fm[8] * fn[2] +
-        fm[12] * fn[3];
+    double v1 = reduce_precision<25>((double)fm[0] * fn[0]) +
+            reduce_precision<25>((double)fm[4] * fn[1]) +
+            reduce_precision<25>((double)fm[8] * fn[2]) +
+            reduce_precision<25>((double)fm[12] * fn[3]);
-    v2 = fm[1] * fn[0] +
-        fm[5] * fn[1] +
-        fm[9] * fn[2] +
-        fm[13] * fn[3];
+    double v2 = reduce_precision<25>((double)fm[1] * fn[0]) +
+            reduce_precision<25>((double)fm[5] * fn[1]) +
+            reduce_precision<25>((double)fm[9] * fn[2]) +
+            reduce_precision<25>((double)fm[13] * fn[3]);
-    v3 = fm[2] * fn[0] +
-        fm[6] * fn[1] +
-        fm[10] * fn[2] +
-        fm[14] * fn[3];
+    double v3 = reduce_precision<25>((double)fm[2] * fn[0]) +
+            reduce_precision<25>((double)fm[6] * fn[1]) +
+            reduce_precision<25>((double)fm[10] * fn[2]) +
+            reduce_precision<25>((double)fm[14] * fn[3]);
-    v4 = fm[3] * fn[0] +
-        fm[7] * fn[1] +
-        fm[11] * fn[2] +
-        fm[15] * fn[3];
+    double v4 = reduce_precision<25>((double)fm[3] * fn[0]) +
+            reduce_precision<25>((double)fm[7] * fn[1]) +
+            reduce_precision<25>((double)fm[11] * fn[2]) +
+            reduce_precision<25>((double)fm[15] * fn[3]);
+    fd[0] = (float)fixNaN64(v1);
+    fd[1] = (float)fixNaN64(v2);
+    fd[2] = (float)fixNaN64(v3);
+    fd[3] = (float)fixNaN64(v4);
+)
+#else
+shil_canonical
+(
+    void,f1,(float* fd,float* fn, float* fm),
+    float v1 = fm[0] * fn[0] +
+        fm[4] * fn[1] +
+        fm[8] * fn[2] +
+        fm[12] * fn[3];
+    float v2 = fm[1] * fn[0] +
+        fm[5] * fn[1] +
+        fm[9] * fn[2] +
+        fm[13] * fn[3];
+    float v3 = fm[2] * fn[0] +
+        fm[6] * fn[1] +
+        fm[10] * fn[2] +
+        fm[14] * fn[3];
+    float v4 = fm[3] * fn[0] +
+        fm[7] * fn[1] +
+        fm[11] * fn[2] +
+        fm[15] * fn[3];
     fd[0] = fixNaN(v1);
     fd[1] = fixNaN(v2);
     fd[2] = fixNaN(v3);
     fd[3] = fixNaN(v4);
 )
+#endif
 shil_compile
 (
     shil_cf_arg_ptr(rs2);
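The idea in both ops: keep the multiply-accumulate in double, but round every product down to 28 significant bits (53 - 25) before it enters the sum, which lands closer to the SH4 FPU's behavior of computing FIPR/FTRV at reduced accuracy in exchange for speed. Stripped of the shil macro plumbing, the new FIPR path amounts to this minimal standalone sketch (fipr_reduced is an illustrative name, and the fixNaN64() step is omitted):

// Rounding helper as added by this commit: keeps the top 53 - bits
// significant bits of f (28 bits for bits = 25).
template<int bits>
static double reduce_precision(double f)
{
    double c = (double)((1 << bits) + 1) * f;
    return c - (c - f);
}

// Sketch of the new x86/x64 FIPR path: each product is rounded to
// 28 bits before joining the running sum, instead of being
// accumulated at full double precision.
static float fipr_reduced(const float* fn, const float* fm)
{
    double idp = reduce_precision<25>((double)fn[0] * fm[0]);
    idp += reduce_precision<25>((double)fn[1] * fm[1]);
    idp += reduce_precision<25>((double)fn[2] * fm[2]);
    idp += reduce_precision<25>((double)fn[3] * fm[3]);
    return (float)idp; // the real op also runs fixNaN64() on idp
}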


@@ -504,23 +504,25 @@ sh4op(i1111_nnmm_1110_1101)
 {
     int n=GetN(op)&0xC;
     int m=(GetN(op)&0x3)<<2;
-    if(fpscr.PR ==0)
+    if (fpscr.PR == 0)
     {
 #if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
-        double idp = (double)fr[n + 0] * fr[m + 0];
-        idp += (double)fr[n + 1] * fr[m + 1];
-        idp += (double)fr[n + 2] * fr[m + 2];
-        idp += (double)fr[n + 3] * fr[m + 3];
-        float rv = (float)idp;
+        // multiplications are done with 28 bits of precision (53 - 25) and the final sum at 30 bits
+        double idp = reduce_precision<25>((double)fr[n + 0] * fr[m + 0]);
+        idp += reduce_precision<25>((double)fr[n + 1] * fr[m + 1]);
+        idp += reduce_precision<25>((double)fr[n + 2] * fr[m + 2]);
+        idp += reduce_precision<25>((double)fr[n + 3] * fr[m + 3]);
+        fr[n + 3] = (float)fixNaN64(idp);
 #else
         float rv = fr[n + 0] * fr[m + 0];
         rv += fr[n + 1] * fr[m + 1];
         rv += fr[n + 2] * fr[m + 2];
         rv += fr[n + 3] * fr[m + 3];
-#endif
         CHECK_FPU_32(rv);
         fr[n + 3] = rv;
+#endif
     }
     else
     {
@@ -697,7 +699,32 @@ sh4op(i1111_nn01_1111_1101)
     if (fpscr.PR==0)
     {
+#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
+        double v1 = reduce_precision<25>((double)xf[0] * fr[n + 0]) +
+                reduce_precision<25>((double)xf[4] * fr[n + 1]) +
+                reduce_precision<25>((double)xf[8] * fr[n + 2]) +
+                reduce_precision<25>((double)xf[12] * fr[n + 3]);
+        double v2 = reduce_precision<25>((double)xf[1] * fr[n + 0]) +
+                reduce_precision<25>((double)xf[5] * fr[n + 1]) +
+                reduce_precision<25>((double)xf[9] * fr[n + 2]) +
+                reduce_precision<25>((double)xf[13] * fr[n + 3]);
+        double v3 = reduce_precision<25>((double)xf[2] * fr[n + 0]) +
+                reduce_precision<25>((double)xf[6] * fr[n + 1]) +
+                reduce_precision<25>((double)xf[10] * fr[n + 2]) +
+                reduce_precision<25>((double)xf[14] * fr[n + 3]);
+        double v4 = reduce_precision<25>((double)xf[3] * fr[n + 0]) +
+                reduce_precision<25>((double)xf[7] * fr[n + 1]) +
+                reduce_precision<25>((double)xf[11] * fr[n + 2]) +
+                reduce_precision<25>((double)xf[15] * fr[n + 3]);
+        fr[n + 0] = (float)fixNaN64(v1);
+        fr[n + 1] = (float)fixNaN64(v2);
+        fr[n + 2] = (float)fixNaN64(v3);
+        fr[n + 3] = (float)fixNaN64(v4);
+#else
         float v1, v2, v3, v4;
         v1 = xf[0] * fr[n + 0] +
@@ -729,7 +756,7 @@ sh4op(i1111_nn01_1111_1101)
         fr[n + 1] = v2;
         fr[n + 2] = v3;
         fr[n + 3] = v4;
+#endif
     }
     else
     {
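The FTRV half is the same trick applied to a 4x4 matrix-vector product. The unrolled interpreter code above is equivalent to this loop-form sketch (ftrv_reduced is an illustrative name, reduce_precision is the helper added in the next hunk, and fixNaN64() is again left out):

static void ftrv_reduced(const float xf[16], float fv[4])
{
    // xf is column-major: xf[i + 4 * j] is row i of column j.
    double v[4];
    for (int i = 0; i < 4; i++)
        v[i] = reduce_precision<25>((double)xf[i] * fv[0])
             + reduce_precision<25>((double)xf[i + 4] * fv[1])
             + reduce_precision<25>((double)xf[i + 8] * fv[2])
             + reduce_precision<25>((double)xf[i + 12] * fv[3]);
    for (int i = 0; i < 4; i++)
        fv[i] = (float)v[i]; // the real op also runs fixNaN64() on each v[i]
}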


@@ -152,3 +152,12 @@ static INLINE f64 fixNaN64(f64 f)
 //    return (*(u64 *)&f & 0x7fffffffffffffffll) <= 0x7f80000000000000ll ? f : 0x7ff7ffffffffffffll;
     return f;
 }
+// Reduces the precision of the argument f by a given number of bits.
+// Doubles have 53 bits of precision, so the returned result will have a precision of 53 - bits.
+template<int bits>
+static INLINE double reduce_precision(double f)
+{
+    double c = (double)((1 << bits) + 1) * f;
+    return c - (c - f);
+}
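The helper is a Veltkamp-style splitting step: multiplying by 2^bits + 1 and taking c - (c - f) forces the low bits out of the 53-bit mantissa, leaving f rounded to 53 - bits significant bits. A quick check of the effect (assuming round-to-nearest and no FMA contraction, i.e. no -ffast-math):

#include <cstdio>

template<int bits>
static double reduce_precision(double f)
{
    double c = (double)((1 << bits) + 1) * f;
    return c - (c - f);
}

int main()
{
    // 2^-40 needs more than 28 significant bits next to 1.0, so it is
    // rounded away; 2^-20 fits within 28 bits and survives unchanged.
    printf("%.17g\n", reduce_precision<25>(1.0 + 0x1p-40)); // prints 1
    printf("%.17g\n", reduce_precision<25>(1.0 + 0x1p-20)); // prints 1.0000009536743164
    return 0;
}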


@@ -1086,7 +1086,7 @@ public:
             mov(qword[rdx], rcx);
 #endif
             break;
+/*
         case shop_fipr:
         {
             // Using doubles for better precision
@@ -1164,7 +1164,7 @@
         }
         break;
+*/
         case shop_frswap:
             mov(rax, (uintptr_t)op.rs1.reg_ptr());
             mov(rcx, (uintptr_t)op.rd.reg_ptr());