dynarec: proper ftrc implem. (arm32/64) Use double for fipr/ftrv

Use double for canonical FIPR and FTRV on all platforms.
Fix interpreter implementation of FTRC.
Fix canonical implementation of cvt_f2i (FTRC).
arm32: use Vfma instead of Vmla for FMAC. Vfma does a fused
multiply-add, Vmla doesn't.
arm32: Use canonical implementations of FIPR and FTRV.
arm32,arm64: Correct implementation of cvt_f2i (FTRC)

Fixes desync with NBA 2k1/2k2 online games.
This commit is contained in:
Flyinghead 2025-01-27 17:30:06 +01:00
parent ccdd53ee0e
commit b70c2791b2
4 changed files with 65 additions and 50 deletions

View File

@ -141,19 +141,11 @@ shil_compile( \
// Computes the 4-element inner product of two float vectors.
//
// f1 is read at consecutive indices 0..3; f2 is read at indices
// 0, Stride, 2 * Stride and 3 * Stride, so a Stride other than 1 lets the
// caller walk f2 with a fixed step between elements.
//
// The products and the sum are evaluated in double precision on every host
// CPU, and the result is narrowed to float only once at the end. Using the
// same intermediate precision everywhere keeps the result independent of the
// platform the code is built for.
//
// The narrowed value is passed through fixNaN() (defined elsewhere) before
// being returned.
template<int Stride = 1>
static inline float innerProduct(const float *f1, const float *f2)
{
	const double f = static_cast<double>(f1[0]) * f2[Stride * 0]
		+ static_cast<double>(f1[1]) * f2[Stride * 1]
		+ static_cast<double>(f1[2]) * f2[Stride * 2]
		+ static_cast<double>(f1[3]) * f2[Stride * 3];
	return fixNaN(static_cast<float>(f));
}
#endif
@ -723,33 +715,27 @@ shil_opc_end()
//shop_cvt_f2i_t //float to integer : truncate
shil_opc(cvt_f2i_t)
#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
shil_canonical
(
u32,f1,(f32 f1),
if (f1 > 2147483520.0f) // IEEE 754: 0x4effffff
return 0x7fffffff;
else
{
s32 res = (s32)f1;
// Fix result sign for Intel CPUs
if ((u32)res == 0x80000000 && f1 == f1 && *(s32 *)&f1 > 0)
res = 0x7fffffff;
return res;
s32 res;
if (f1 > 2147483520.0f) { // IEEE 754: 0x4effffff
res = 0x7fffffff;
}
)
#else
shil_canonical
(
u32,f1,(f32 f1),
if (f1 > 2147483520.0f) // IEEE 754: 0x4effffff
return 0x7fffffff;
else
return (s32)f1;
)
else {
res = (s32)f1;
#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
// Fix result sign for Intel CPUs
if ((u32)res == 0x80000000 && f1 > 0)
res = 0x7fffffff;
#elif HOST_CPU == CPU_ARM || HOST_CPU == CPU_ARM64
// conversion of NaN returns 0 on ARM
if (std::isnan(f1))
res = 0x80000000;
#endif
}
return res;
)
shil_compile
(

View File

@ -519,28 +519,35 @@ sh4op(i1111_nnnn_0011_1101)
if (ctx->fpscr.PR == 0)
{
u32 n = GetN(op);
ctx->fpul = (u32)(s32)ctx->fr[n];
if ((s32)ctx->fpul > 0x7fffff80)
ctx->fpul = 0x7fffffff;
// Intel CPUs convert out of range float numbers to 0x80000000. Manually set the correct sign
else if (ctx->fpul == 0x80000000 && ctx->fr[n] == ctx->fr[n])
if (std::isnan(ctx->fr[n])) {
ctx->fpul = 0x80000000;
}
else
{
if (*(int *)&ctx->fr[n] > 0) // Using integer math to avoid issues with Inf and NaN
ctx->fpul = (u32)(s32)ctx->fr[n];
if ((s32)ctx->fpul > 0x7fffff80)
ctx->fpul = 0x7fffffff;
#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
// Intel CPUs convert out of range float numbers to 0x80000000. Manually set the correct sign
else if (ctx->fpul == 0x80000000 && ctx->fr[n] > 0)
ctx->fpul--;
#endif
}
}
else
{
f64 f = getDRn(ctx, op);
ctx->fpul = (u32)(s32)f;
// TODO saturate
// Intel CPUs convert out of range float numbers to 0x80000000. Manually set the correct sign
if (ctx->fpul == 0x80000000 && f == f)
if (std::isnan(f)) {
ctx->fpul = 0x80000000;
}
else
{
if (*(s64 *)&f > 0) // Using integer math to avoid issues with Inf and NaN
ctx->fpul = (u32)(s32)f;
#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
// Intel CPUs convert out of range float numbers to 0x80000000. Manually set the correct sign
if (ctx->fpul == 0x80000000 && f > 0)
ctx->fpul--;
#endif
}
}
}

View File

@ -1901,7 +1901,6 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op
unaryFpOp(op, &MacroAssembler::Vsqrt);
break;
case shop_fmac:
{
SRegister rd = reg.mapFReg(op->rd);
@ -1945,7 +1944,7 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op
}
if (!rd.Is(rs1))
Vmov(rd, rs1);
Vmla(rd, rs2, rs3);
Vfma(rd, rs2, rs3);
}
break;
@ -2001,7 +2000,7 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op
Vstr(d0, MemOperand(r8, op->rd.reg_nofs()));
}
break;
/* fall back to the canonical implementations for better precision
case shop_fipr:
{
QRegister _r1 = q0;
@ -2098,7 +2097,7 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op
#endif
}
break;
*/
case shop_frswap:
Sub(r0, r8, -op->rs1.reg_nofs());
Sub(r1, r8, -op->rd.reg_nofs());
@ -2111,8 +2110,19 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op
break;
case shop_cvt_f2i_t:
Vcvt(S32, F32, s0, reg.mapFReg(op->rs1));
Vmov(reg.mapReg(op->rd), s0);
{
SRegister from = reg.mapFReg(op->rs1);
Register to = reg.mapReg(op->rd);
Vcvt(S32, F32, s0, from);
Vmov(to, s0);
Mvn(r0, 127);
Sub(r0, r0, 0x80000000);
Cmp(to, r0);
Mvn(gt, to, 0xf8000000);
Vcmp(from, from);
Vmrs(RegisterOrAPSR_nzcv(APSR_nzcv), FPSCR);
Mov(ne, to, 0x80000000);
}
break;
case shop_cvt_i2f_n: // may be some difference should be made ?

View File

@ -952,8 +952,20 @@ public:
break;
case shop_cvt_f2i_t:
Fcvtzs(regalloc.MapRegister(op.rd), regalloc.MapVRegister(op.rs1));
{
const VRegister& from = regalloc.MapVRegister(op.rs1);
const Register& to = regalloc.MapRegister(op.rd);
Fcvtzs(to, from);
Mov(w0, 0x7FFFFF80);
Cmp(to, w0);
Mov(w0, 0x7FFFFFF);
Csel(to, w0, to, gt);
Fcmp(from, from);
Mov(w0, 0x80000000);
Csel(to, to, w0, vc);
}
break;
case shop_cvt_i2f_n:
case shop_cvt_i2f_z:
Scvtf(regalloc.MapVRegister(op.rd), regalloc.MapRegister(op.rs1));