dynarec: proper ftrc implementation (arm32/64). Use double for fipr/ftrv
Use double for canonical FIPR and FTRV on all platforms. Fix interpreter implementation of FTRC. Fix canonical implementation of cvt_f2i (FTRC). arm32: use Vfma instead of Vmla for FMAC. Vfma performs a fused multiply-add, Vmla doesn't. arm32: Use canonical implementations of FIPR and FTRV. arm32,arm64: Correct implementation of cvt_f2i (FTRC). Fixes desync with NBA 2k1/2k2 online games.
This commit is contained in:
parent
ccdd53ee0e
commit
b70c2791b2
|
@ -141,19 +141,11 @@ shil_compile( \
|
|||
template<int Stride = 1>
|
||||
static inline float innerProduct(const float *f1, const float *f2)
|
||||
{
|
||||
#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64 || HOST_CPU == CPU_ARM64
|
||||
const double f = (double)f1[0] * f2[Stride * 0]
|
||||
+ (double)f1[1] * f2[Stride * 1]
|
||||
+ (double)f1[2] * f2[Stride * 2]
|
||||
+ (double)f1[3] * f2[Stride * 3];
|
||||
return fixNaN((float)f);
|
||||
#else
|
||||
const float f = f1[0] * f2[Stride * 0]
|
||||
+ f1[1] * f2[Stride * 1]
|
||||
+ f1[2] * f2[Stride * 2]
|
||||
+ f1[3] * f2[Stride * 3];
|
||||
return fixNaN(f);
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -723,33 +715,27 @@ shil_opc_end()
|
|||
//shop_cvt_f2i_t //float to integer : truncate
|
||||
shil_opc(cvt_f2i_t)
|
||||
|
||||
#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
|
||||
shil_canonical
|
||||
(
|
||||
u32,f1,(f32 f1),
|
||||
if (f1 > 2147483520.0f) // IEEE 754: 0x4effffff
|
||||
return 0x7fffffff;
|
||||
else
|
||||
{
|
||||
s32 res = (s32)f1;
|
||||
|
||||
// Fix result sign for Intel CPUs
|
||||
if ((u32)res == 0x80000000 && f1 == f1 && *(s32 *)&f1 > 0)
|
||||
res = 0x7fffffff;
|
||||
|
||||
return res;
|
||||
s32 res;
|
||||
if (f1 > 2147483520.0f) { // IEEE 754: 0x4effffff
|
||||
res = 0x7fffffff;
|
||||
}
|
||||
)
|
||||
#else
|
||||
shil_canonical
|
||||
(
|
||||
u32,f1,(f32 f1),
|
||||
if (f1 > 2147483520.0f) // IEEE 754: 0x4effffff
|
||||
return 0x7fffffff;
|
||||
else
|
||||
return (s32)f1;
|
||||
)
|
||||
else {
|
||||
res = (s32)f1;
|
||||
#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
|
||||
// Fix result sign for Intel CPUs
|
||||
if ((u32)res == 0x80000000 && f1 > 0)
|
||||
res = 0x7fffffff;
|
||||
#elif HOST_CPU == CPU_ARM || HOST_CPU == CPU_ARM64
|
||||
// conversion of NaN returns 0 on ARM
|
||||
if (std::isnan(f1))
|
||||
res = 0x80000000;
|
||||
#endif
|
||||
}
|
||||
return res;
|
||||
)
|
||||
|
||||
shil_compile
|
||||
(
|
||||
|
|
|
@ -519,28 +519,35 @@ sh4op(i1111_nnnn_0011_1101)
|
|||
if (ctx->fpscr.PR == 0)
|
||||
{
|
||||
u32 n = GetN(op);
|
||||
ctx->fpul = (u32)(s32)ctx->fr[n];
|
||||
|
||||
if ((s32)ctx->fpul > 0x7fffff80)
|
||||
ctx->fpul = 0x7fffffff;
|
||||
// Intel CPUs convert out of range float numbers to 0x80000000. Manually set the correct sign
|
||||
else if (ctx->fpul == 0x80000000 && ctx->fr[n] == ctx->fr[n])
|
||||
if (std::isnan(ctx->fr[n])) {
|
||||
ctx->fpul = 0x80000000;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (*(int *)&ctx->fr[n] > 0) // Using integer math to avoid issues with Inf and NaN
|
||||
ctx->fpul = (u32)(s32)ctx->fr[n];
|
||||
if ((s32)ctx->fpul > 0x7fffff80)
|
||||
ctx->fpul = 0x7fffffff;
|
||||
#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
|
||||
// Intel CPUs convert out of range float numbers to 0x80000000. Manually set the correct sign
|
||||
else if (ctx->fpul == 0x80000000 && ctx->fr[n] > 0)
|
||||
ctx->fpul--;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
f64 f = getDRn(ctx, op);
|
||||
ctx->fpul = (u32)(s32)f;
|
||||
|
||||
// TODO saturate
|
||||
// Intel CPUs convert out of range float numbers to 0x80000000. Manually set the correct sign
|
||||
if (ctx->fpul == 0x80000000 && f == f)
|
||||
if (std::isnan(f)) {
|
||||
ctx->fpul = 0x80000000;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (*(s64 *)&f > 0) // Using integer math to avoid issues with Inf and NaN
|
||||
ctx->fpul = (u32)(s32)f;
|
||||
#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
|
||||
// Intel CPUs convert out of range float numbers to 0x80000000. Manually set the correct sign
|
||||
if (ctx->fpul == 0x80000000 && f > 0)
|
||||
ctx->fpul--;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1901,7 +1901,6 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op
|
|||
unaryFpOp(op, &MacroAssembler::Vsqrt);
|
||||
break;
|
||||
|
||||
|
||||
case shop_fmac:
|
||||
{
|
||||
SRegister rd = reg.mapFReg(op->rd);
|
||||
|
@ -1945,7 +1944,7 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op
|
|||
}
|
||||
if (!rd.Is(rs1))
|
||||
Vmov(rd, rs1);
|
||||
Vmla(rd, rs2, rs3);
|
||||
Vfma(rd, rs2, rs3);
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -2001,7 +2000,7 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op
|
|||
Vstr(d0, MemOperand(r8, op->rd.reg_nofs()));
|
||||
}
|
||||
break;
|
||||
|
||||
/* fall back to the canonical implementations for better precision
|
||||
case shop_fipr:
|
||||
{
|
||||
QRegister _r1 = q0;
|
||||
|
@ -2098,7 +2097,7 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op
|
|||
#endif
|
||||
}
|
||||
break;
|
||||
|
||||
*/
|
||||
case shop_frswap:
|
||||
Sub(r0, r8, -op->rs1.reg_nofs());
|
||||
Sub(r1, r8, -op->rd.reg_nofs());
|
||||
|
@ -2111,8 +2110,19 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op
|
|||
break;
|
||||
|
||||
case shop_cvt_f2i_t:
|
||||
Vcvt(S32, F32, s0, reg.mapFReg(op->rs1));
|
||||
Vmov(reg.mapReg(op->rd), s0);
|
||||
{
// FTRC: truncating float -> s32 conversion, followed by fix-ups so the
// result agrees with the canonical cvt_f2i_t implementation.
SRegister from = reg.mapFReg(op->rs1);
Register to = reg.mapReg(op->rd);
Vcvt(S32, F32, s0, from);
Vmov(to, s0);
// r0 = ~127 - 0x80000000 = 0x7FFFFF80, the saturation threshold
Mvn(r0, 127);
Sub(r0, r0, 0x80000000);
Cmp(to, r0);
// Above the threshold: force the result to ~0x80000000 = 0x7FFFFFFF
Mvn(gt, to, 0x80000000);
// A value compares unequal to itself only when it is NaN
Vcmp(from, from);
Vmrs(RegisterOrAPSR_nzcv(APSR_nzcv), FPSCR);
// NaN input: result is 0x80000000
Mov(ne, to, 0x80000000);
}
|
||||
break;
|
||||
|
||||
case shop_cvt_i2f_n: // may be some difference should be made ?
|
||||
|
|
|
@ -952,8 +952,20 @@ public:
|
|||
break;
|
||||
|
||||
case shop_cvt_f2i_t:
|
||||
Fcvtzs(regalloc.MapRegister(op.rd), regalloc.MapVRegister(op.rs1));
|
||||
{
// FTRC: truncating float -> s32 conversion, followed by fix-ups so the
// result agrees with the canonical cvt_f2i_t implementation.
const VRegister& from = regalloc.MapVRegister(op.rs1);
const Register& to = regalloc.MapRegister(op.rd);
Fcvtzs(to, from);
// Results above 0x7FFFFF80 are saturated to 0x7FFFFFFF
Mov(w0, 0x7FFFFF80);
Cmp(to, w0);
Mov(w0, 0x7FFFFFFF);
Csel(to, w0, to, gt);
// Comparing the source with itself is unordered (V set) only for NaN
Fcmp(from, from);
// NaN input: result is 0x80000000; otherwise keep the converted value
Mov(w0, 0x80000000);
Csel(to, to, w0, vc);
}
|
||||
break;
|
||||
|
||||
case shop_cvt_i2f_n:
|
||||
case shop_cvt_i2f_z:
|
||||
Scvtf(regalloc.MapVRegister(op.rd), regalloc.MapRegister(op.rs1));
|
||||
|
|
Loading…
Reference in New Issue