diff --git a/core/hw/sh4/dyna/shil_canonical.h b/core/hw/sh4/dyna/shil_canonical.h
index 21e1a8230..fd9918c22 100644
--- a/core/hw/sh4/dyna/shil_canonical.h
+++ b/core/hw/sh4/dyna/shil_canonical.h
@@ -141,19 +141,11 @@ shil_compile( \
 template
 static inline float innerProduct(const float *f1, const float *f2)
 {
-#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64 || HOST_CPU == CPU_ARM64
 	const double f = (double)f1[0] * f2[Stride * 0]
 				   + (double)f1[1] * f2[Stride * 1]
 				   + (double)f1[2] * f2[Stride * 2]
 				   + (double)f1[3] * f2[Stride * 3];
 	return fixNaN((float)f);
-#else
-	const float f = f1[0] * f2[Stride * 0]
-				  + f1[1] * f2[Stride * 1]
-				  + f1[2] * f2[Stride * 2]
-				  + f1[3] * f2[Stride * 3];
-	return fixNaN(f);
-#endif
 }
 #endif
 
@@ -723,33 +715,27 @@ shil_opc_end()
 //shop_cvt_f2i_t
 //float to integer : truncate
 shil_opc(cvt_f2i_t)
-#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
 shil_canonical
 (
 u32,f1,(f32 f1),
-	if (f1 > 2147483520.0f) // IEEE 754: 0x4effffff
-		return 0x7fffffff;
-	else
-	{
-		s32 res = (s32)f1;
-
-		// Fix result sign for Intel CPUs
-		if ((u32)res == 0x80000000 && f1 == f1 && *(s32 *)&f1 > 0)
-			res = 0x7fffffff;
-
-		return res;
+	s32 res;
+	if (f1 > 2147483520.0f) { // IEEE 754: 0x4effffff
+		res = 0x7fffffff;
 	}
-)
-#else
-shil_canonical
-(
-u32,f1,(f32 f1),
-	if (f1 > 2147483520.0f) // IEEE 754: 0x4effffff
-		return 0x7fffffff;
-	else
-		return (s32)f1;
-)
+	else {
+		res = (s32)f1;
+#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
+		// Fix result sign for Intel CPUs
+		if ((u32)res == 0x80000000 && f1 > 0)
+			res = 0x7fffffff;
+#elif HOST_CPU == CPU_ARM || HOST_CPU == CPU_ARM64
+		// conversion of NaN returns 0 on ARM
+		if (std::isnan(f1))
+			res = 0x80000000;
 #endif
+	}
+	return res;
+)
 
 shil_compile
 (
diff --git a/core/hw/sh4/interpr/sh4_fpu.cpp b/core/hw/sh4/interpr/sh4_fpu.cpp
index 231f45fe3..c05951102 100644
--- a/core/hw/sh4/interpr/sh4_fpu.cpp
+++ b/core/hw/sh4/interpr/sh4_fpu.cpp
@@ -519,28 +519,35 @@ sh4op(i1111_nnnn_0011_1101)
 	if (ctx->fpscr.PR == 0)
 	{
 		u32 n = GetN(op);
-		ctx->fpul = (u32)(s32)ctx->fr[n];
-
-		if ((s32)ctx->fpul > 0x7fffff80)
-			ctx->fpul = 0x7fffffff;
-		// Intel CPUs convert out of range float numbers to 0x80000000. Manually set the correct sign
-		else if (ctx->fpul == 0x80000000 && ctx->fr[n] == ctx->fr[n])
+		if (std::isnan(ctx->fr[n])) {
+			ctx->fpul = 0x80000000;
+		}
+		else
 		{
-			if (*(int *)&ctx->fr[n] > 0) // Using integer math to avoid issues with Inf and NaN
+			ctx->fpul = (u32)(s32)ctx->fr[n];
+			if ((s32)ctx->fpul > 0x7fffff80)
+				ctx->fpul = 0x7fffffff;
+#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
+			// Intel CPUs convert out of range float numbers to 0x80000000. Manually set the correct sign
+			else if (ctx->fpul == 0x80000000 && ctx->fr[n] > 0)
 				ctx->fpul--;
+#endif
 		}
 	}
 	else
 	{
 		f64 f = getDRn(ctx, op);
-		ctx->fpul = (u32)(s32)f;
-
-		// TODO saturate
-		// Intel CPUs convert out of range float numbers to 0x80000000. Manually set the correct sign
-		if (ctx->fpul == 0x80000000 && f == f)
+		if (std::isnan(f)) {
+			ctx->fpul = 0x80000000;
+		}
+		else
 		{
-			if (*(s64 *)&f > 0) // Using integer math to avoid issues with Inf and NaN
+			ctx->fpul = (u32)(s32)f;
+#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
+			// Intel CPUs convert out of range float numbers to 0x80000000. Manually set the correct sign
+			if (ctx->fpul == 0x80000000 && f > 0)
 				ctx->fpul--;
+#endif
 		}
 	}
 }
diff --git a/core/rec-ARM/rec_arm.cpp b/core/rec-ARM/rec_arm.cpp
index 4552a3c5a..878a8a2fc 100644
--- a/core/rec-ARM/rec_arm.cpp
+++ b/core/rec-ARM/rec_arm.cpp
@@ -1901,7 +1901,6 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op
 			unaryFpOp(op, &MacroAssembler::Vsqrt);
 			break;
 
-
 		case shop_fmac:
 			{
 				SRegister rd = reg.mapFReg(op->rd);
@@ -1945,7 +1944,7 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op
 				}
 				if (!rd.Is(rs1))
 					Vmov(rd, rs1);
-				Vmla(rd, rs2, rs3);
+				Vfma(rd, rs2, rs3);
 			}
 			break;
 
@@ -2001,7 +2000,7 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op
 				Vstr(d0, MemOperand(r8, op->rd.reg_nofs()));
 			}
 			break;
-
+		/* fall back to the canonical implementations for better precision
 		case shop_fipr:
 			{
 				QRegister _r1 = q0;
@@ -2098,7 +2097,7 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op
 #endif
 			}
 			break;
-
+		*/
 		case shop_frswap:
 			Sub(r0, r8, -op->rs1.reg_nofs());
 			Sub(r1, r8, -op->rd.reg_nofs());
@@ -2111,8 +2110,19 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op
 			break;
 
 		case shop_cvt_f2i_t:
-			Vcvt(S32, F32, s0, reg.mapFReg(op->rs1));
-			Vmov(reg.mapReg(op->rd), s0);
+			{
+				SRegister from = reg.mapFReg(op->rs1);
+				Register to = reg.mapReg(op->rd);
+				Vcvt(S32, F32, s0, from);
+				Vmov(to, s0);
+				Mvn(r0, 127);				// r0 = ~127 = 0xffffff80
+				Sub(r0, r0, 0x80000000);	// r0 = 0x7fffff80, the saturation threshold
+				Cmp(to, r0);
+				Mvn(gt, to, 0x80000000);	// above the threshold: to = ~0x80000000 = 0x7fffffff, same as the canonical implementation
+				Vcmp(from, from);			// self-compare is unordered only for NaN, which leaves Z clear
+				Vmrs(RegisterOrAPSR_nzcv(APSR_nzcv), FPSCR);
+				Mov(ne, to, 0x80000000);	// NaN converts to 0x80000000
+			}
 			break;
 
 		case shop_cvt_i2f_n:	// may be some difference should be made ?
diff --git a/core/rec-ARM64/rec_arm64.cpp b/core/rec-ARM64/rec_arm64.cpp
index c9fbe7f81..9fff1759e 100644
--- a/core/rec-ARM64/rec_arm64.cpp
+++ b/core/rec-ARM64/rec_arm64.cpp
@@ -952,8 +952,20 @@ public:
 				break;
 
 			case shop_cvt_f2i_t:
-				Fcvtzs(regalloc.MapRegister(op.rd), regalloc.MapVRegister(op.rs1));
+				{
+					const VRegister& from = regalloc.MapVRegister(op.rs1);
+					const Register& to = regalloc.MapRegister(op.rd);
+					Fcvtzs(to, from);
+					Mov(w0, 0x7FFFFF80);	// saturation threshold
+					Cmp(to, w0);
+					Mov(w0, 0x7FFFFFFF);	// full 32-bit max: eight hex digits, not 0x07FFFFFF
+					Csel(to, w0, to, gt);	// above the threshold: saturate to 0x7fffffff
+					Fcmp(from, from);		// self-compare is unordered only for NaN, which sets V
+					Mov(w0, 0x80000000);
+					Csel(to, to, w0, vc);	// keep the result unless NaN, which converts to 0x80000000
+				}
 				break;
+
 			case shop_cvt_i2f_n:
 			case shop_cvt_i2f_z:
 				Scvtf(regalloc.MapVRegister(op.rd), regalloc.MapRegister(op.rs1));