reduce precision of FIPR and FTRV sh4 ops on x86 platforms
parent 7ab275d4b5
commit 12b9b7254a
@@ -911,7 +911,6 @@ shil_opc(fsqrt)
 UN_OP_F(sqrtf)
 shil_opc_end()
 
 
 //shop_fipr
 shil_opc(fipr)
-
@@ -920,13 +919,13 @@ shil_canonical
 (
 f32,f1,(float* fn, float* fm),
 
-    // Using double for better precision on x86 (Sonic Adventure)
-    double idp = (double)fn[0] * fm[0];
-    idp += (double)fn[1] * fm[1];
-    idp += (double)fn[2] * fm[2];
-    idp += (double)fn[3] * fm[3];
+    // multiplications are done with 28 bits of precision (53 - 25) and the final sum at 30 bits
+    double idp = reduce_precision<25>((double)fn[0] * fm[0]);
+    idp += reduce_precision<25>((double)fn[1] * fm[1]);
+    idp += reduce_precision<25>((double)fn[2] * fm[2]);
+    idp += reduce_precision<25>((double)fn[3] * fm[3]);
 
-    return fixNaN((float)idp);
+    return (float)fixNaN64(idp);
 )
 #else
 shil_canonical
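Note: the old x86 path computed the dot product in full double precision (added for Sonic Adventure); the new path still widens each product to double but then rounds it to 28 significant bits, presumably to better match the SH4's reduced-precision FIPR. A minimal standalone sketch of the new behavior, assuming round-to-nearest and strict IEEE arithmetic; the fipr_x86 name and the main harness are illustrative only, and fixNaN64 is omitted because (as the sh4_core.h hunk below shows) it is currently a pass-through:

#include <cstdio>

// Same trick as the reduce_precision<bits> helper this commit adds to
// sh4_core.h: rounds f to (53 - bits) significant bits.
template<int bits>
static double reduce_precision(double f)
{
    double c = (double)((1 << bits) + 1) * f;
    return c - (c - f);
}

// Dot product of two 4-float vectors with each product rounded to
// 53 - 25 = 28 significant bits before accumulation.
static float fipr_x86(const float* fn, const float* fm)
{
    double idp = reduce_precision<25>((double)fn[0] * fm[0]);
    idp += reduce_precision<25>((double)fn[1] * fm[1]);
    idp += reduce_precision<25>((double)fn[2] * fm[2]);
    idp += reduce_precision<25>((double)fn[3] * fm[3]);
    return (float)idp;
}

int main()
{
    const float a[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    const float b[4] = { 0.1f, 0.2f, 0.3f, 0.4f };
    printf("fipr = %.9g\n", fipr_x86(a, b));
    return 0;
}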
@@ -956,39 +955,67 @@ shil_opc_end()
 
 //shop_ftrv
 shil_opc(ftrv)
+#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
 shil_canonical
 (
 void,f1,(float* fd,float* fn, float* fm),
-    float v1;
-    float v2;
-    float v3;
-    float v4;
-
-    v1 = fm[0] * fn[0] +
-         fm[4] * fn[1] +
-         fm[8] * fn[2] +
-         fm[12] * fn[3];
+    double v1 = reduce_precision<25>((double)fm[0] * fn[0]) +
+            reduce_precision<25>((double)fm[4] * fn[1]) +
+            reduce_precision<25>((double)fm[8] * fn[2]) +
+            reduce_precision<25>((double)fm[12] * fn[3]);
 
-    v2 = fm[1] * fn[0] +
-         fm[5] * fn[1] +
-         fm[9] * fn[2] +
-         fm[13] * fn[3];
+    double v2 = reduce_precision<25>((double)fm[1] * fn[0]) +
+            reduce_precision<25>((double)fm[5] * fn[1]) +
+            reduce_precision<25>((double)fm[9] * fn[2]) +
+            reduce_precision<25>((double)fm[13] * fn[3]);
 
-    v3 = fm[2] * fn[0] +
-         fm[6] * fn[1] +
-         fm[10] * fn[2] +
-         fm[14] * fn[3];
+    double v3 = reduce_precision<25>((double)fm[2] * fn[0]) +
+            reduce_precision<25>((double)fm[6] * fn[1]) +
+            reduce_precision<25>((double)fm[10] * fn[2]) +
+            reduce_precision<25>((double)fm[14] * fn[3]);
 
-    v4 = fm[3] * fn[0] +
-         fm[7] * fn[1] +
-         fm[11] * fn[2] +
-         fm[15] * fn[3];
+    double v4 = reduce_precision<25>((double)fm[3] * fn[0]) +
+            reduce_precision<25>((double)fm[7] * fn[1]) +
+            reduce_precision<25>((double)fm[11] * fn[2]) +
+            reduce_precision<25>((double)fm[15] * fn[3]);
+
+    fd[0] = (float)fixNaN64(v1);
+    fd[1] = (float)fixNaN64(v2);
+    fd[2] = (float)fixNaN64(v3);
+    fd[3] = (float)fixNaN64(v4);
+)
+#else
+shil_canonical
+(
+void,f1,(float* fd,float* fn, float* fm),
+
+    float v1 = fm[0] * fn[0] +
+         fm[4] * fn[1] +
+         fm[8] * fn[2] +
+         fm[12] * fn[3];
+
+    float v2 = fm[1] * fn[0] +
+         fm[5] * fn[1] +
+         fm[9] * fn[2] +
+         fm[13] * fn[3];
+
+    float v3 = fm[2] * fn[0] +
+         fm[6] * fn[1] +
+         fm[10] * fn[2] +
+         fm[14] * fn[3];
+
+    float v4 = fm[3] * fn[0] +
+         fm[7] * fn[1] +
+         fm[11] * fn[2] +
+         fm[15] * fn[3];
 
     fd[0] = fixNaN(v1);
     fd[1] = fixNaN(v2);
     fd[2] = fixNaN(v3);
     fd[3] = fixNaN(v4);
 )
+#endif
 shil_compile
 (
 shil_cf_arg_ptr(rs2);
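FTRV is the same reduction applied four times: fn is transformed by the 4x4 matrix held in fm (XMTRX in the interpreter hunks below), stored column by column, so element (i, j) sits at fm[i + 4*j]. Purely as an illustration (the commit keeps the code unrolled), the new x86 path collapses to the loop below; reduce_precision is repeated so the sketch compiles on its own, and fixNaN64 is again omitted as a pass-through:

template<int bits>
static double reduce_precision(double f)
{
    double c = (double)((1 << bits) + 1) * f;
    return c - (c - f);
}

// Sketch only: fd = M * fn, where M is the 4x4 matrix stored
// column-major in fm, with each product rounded to 28 bits.
static void ftrv_x86(float* fd, const float* fn, const float* fm)
{
    double v[4];
    for (int i = 0; i < 4; i++)
    {
        v[i] = reduce_precision<25>((double)fm[i] * fn[0]);
        for (int j = 1; j < 4; j++)
            v[i] += reduce_precision<25>((double)fm[i + 4 * j] * fn[j]);
    }
    // Write back only after all four sums are done: on the SH4 the
    // destination vector of FTRV is also one of its sources.
    for (int i = 0; i < 4; i++)
        fd[i] = (float)v[i];
}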
@@ -504,23 +504,25 @@ sh4op(i1111_nnmm_1110_1101)
 {
     int n=GetN(op)&0xC;
     int m=(GetN(op)&0x3)<<2;
-    if(fpscr.PR ==0)
+    if (fpscr.PR == 0)
     {
 #if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
-        double idp = (double)fr[n + 0] * fr[m + 0];
-        idp += (double)fr[n + 1] * fr[m + 1];
-        idp += (double)fr[n + 2] * fr[m + 2];
-        idp += (double)fr[n + 3] * fr[m + 3];
-        float rv = (float)idp;
+        // multiplications are done with 28 bits of precision (53 - 25) and the final sum at 30 bits
+        double idp = reduce_precision<25>((double)fr[n + 0] * fr[m + 0]);
+        idp += reduce_precision<25>((double)fr[n + 1] * fr[m + 1]);
+        idp += reduce_precision<25>((double)fr[n + 2] * fr[m + 2]);
+        idp += reduce_precision<25>((double)fr[n + 3] * fr[m + 3]);
+
+        fr[n + 3] = (float)fixNaN64(idp);
 #else
         float rv = fr[n + 0] * fr[m + 0];
         rv += fr[n + 1] * fr[m + 1];
         rv += fr[n + 2] * fr[m + 2];
         rv += fr[n + 3] * fr[m + 3];
-#endif
 
         CHECK_FPU_32(rv);
         fr[n + 3] = rv;
+#endif
     }
     else
     {
@@ -697,7 +699,32 @@ sh4op(i1111_nn01_1111_1101)
 
     if (fpscr.PR==0)
     {
+#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
+        double v1 = reduce_precision<25>((double)xf[0] * fr[n + 0]) +
+                reduce_precision<25>((double)xf[4] * fr[n + 1]) +
+                reduce_precision<25>((double)xf[8] * fr[n + 2]) +
+                reduce_precision<25>((double)xf[12] * fr[n + 3]);
+
+        double v2 = reduce_precision<25>((double)xf[1] * fr[n + 0]) +
+                reduce_precision<25>((double)xf[5] * fr[n + 1]) +
+                reduce_precision<25>((double)xf[9] * fr[n + 2]) +
+                reduce_precision<25>((double)xf[13] * fr[n + 3]);
+
+        double v3 = reduce_precision<25>((double)xf[2] * fr[n + 0]) +
+                reduce_precision<25>((double)xf[6] * fr[n + 1]) +
+                reduce_precision<25>((double)xf[10] * fr[n + 2]) +
+                reduce_precision<25>((double)xf[14] * fr[n + 3]);
+
+        double v4 = reduce_precision<25>((double)xf[3] * fr[n + 0]) +
+                reduce_precision<25>((double)xf[7] * fr[n + 1]) +
+                reduce_precision<25>((double)xf[11] * fr[n + 2]) +
+                reduce_precision<25>((double)xf[15] * fr[n + 3]);
+
+        fr[n + 0] = (float)fixNaN64(v1);
+        fr[n + 1] = (float)fixNaN64(v2);
+        fr[n + 2] = (float)fixNaN64(v3);
+        fr[n + 3] = (float)fixNaN64(v4);
+#else
         float v1, v2, v3, v4;
 
         v1 = xf[0] * fr[n + 0] +
@@ -729,7 +756,7 @@ sh4op(i1111_nn01_1111_1101)
         fr[n + 1] = v2;
         fr[n + 2] = v3;
         fr[n + 3] = v4;
-
+#endif
     }
     else
     {
@@ -152,3 +152,12 @@ static INLINE f64 fixNaN64(f64 f)
 //  return (*(u64 *)&f & 0x7fffffffffffffffll) <= 0x7f80000000000000ll ? f : 0x7ff7ffffffffffffll;
     return f;
 }
+
+// Reduces the precision of the argument f by a given number of bits
+// doubles have 53 bits of precision, so the returned result will have a precision of 53 - bits
+template<int bits>
+static INLINE double reduce_precision(double f)
+{
+    double c = (double)((1 << bits) + 1) * f;
+    return c - (c - f);
+}
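This helper is essentially the classic Veltkamp/Dekker splitting step: multiplying by 2^bits + 1 forces the low bits of the significand to be rounded away, and c - (c - f) recovers f with only the top 53 - bits significant bits left. It relies on the default round-to-nearest mode and on the compiler preserving the expression as written (fine normally, but something like -ffast-math could reassociate c - (c - f) back to f). A small demo of the effect, illustrative only:

#include <cstdio>
#include <cmath>

template<int bits>
static double reduce_precision(double f)
{
    double c = (double)((1 << bits) + 1) * f;
    return c - (c - f);
}

int main()
{
    double x = 1.0 / 3.0;
    double r = reduce_precision<25>(x);
    printf("full double  : %.17g\n", x);
    printf("28-bit result: %.17g\n", r);

    // Scaling r so its leading bit has weight 2^27 must yield an exact
    // integer if at most 28 significant bits remain.
    double sig = ldexp(r, 27 - ilogb(r));
    printf("scaled significand %.17g is %s an integer\n",
           sig, sig == floor(sig) ? "exactly" : "not");
    return 0;
}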
@@ -1086,7 +1086,7 @@ public:
         mov(qword[rdx], rcx);
 #endif
             break;
-
+/*
         case shop_fipr:
         {
             // Using doubles for better precision
@@ -1164,7 +1164,7 @@ public:
 
         }
         break;
-
+*/
         case shop_frswap:
             mov(rax, (uintptr_t)op.rs1.reg_ptr());
             mov(rcx, (uintptr_t)op.rd.reg_ptr());