diff --git a/core/hw/sh4/dyna/shil_canonical.h b/core/hw/sh4/dyna/shil_canonical.h
index 21e1a8230..fd9918c22 100644
--- a/core/hw/sh4/dyna/shil_canonical.h
+++ b/core/hw/sh4/dyna/shil_canonical.h
@@ -141,19 +141,11 @@ shil_compile( \
 template<int Stride = 1>
 static inline float innerProduct(const float *f1, const float *f2)
 {
-#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64 || HOST_CPU == CPU_ARM64
 	const double f = (double)f1[0] * f2[Stride * 0]
 				   + (double)f1[1] * f2[Stride * 1]
 				   + (double)f1[2] * f2[Stride * 2]
 				   + (double)f1[3] * f2[Stride * 3];
 	return fixNaN((float)f);
-#else
-	const float f = f1[0] * f2[Stride * 0]
-				  + f1[1] * f2[Stride * 1]
-				  + f1[2] * f2[Stride * 2]
-				  + f1[3] * f2[Stride * 3];
-	return fixNaN(f);
-#endif
 }
 
 #endif
@@ -723,33 +715,27 @@ shil_opc_end()
 //shop_cvt_f2i_t	//float to integer : truncate
 shil_opc(cvt_f2i_t)
 
-#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
 shil_canonical
 (
 u32,f1,(f32 f1),
-	if (f1 > 2147483520.0f) // IEEE 754: 0x4effffff
-		return 0x7fffffff;
-	else
-	{
-		s32 res = (s32)f1;
-
-		// Fix result sign for Intel CPUs
-		if ((u32)res == 0x80000000 && f1 == f1 && *(s32 *)&f1 > 0)
-			res = 0x7fffffff;
-
-		return res;
+	s32 res;
+	if (f1 > 2147483520.0f) { // 2147483520 (IEEE 754: 0x4effffff) is the largest float below 2^31; anything above saturates
+		res = 0x7fffffff;
 	}
-)
-#else
-shil_canonical
-(
-u32,f1,(f32 f1),
-	if (f1 > 2147483520.0f) // IEEE 754: 0x4effffff
-		return 0x7fffffff;
-	else
-		return (s32)f1;
-)
+	else {
+		res = (s32)f1; // truncating conversion; the host-specific fixes below patch the cases where hosts disagree
+#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
+		// Fix result sign for Intel CPUs: a positive input must never produce 0x80000000
+		if ((u32)res == 0x80000000 && f1 > 0) // NOTE(review): f1 <= 2147483520.0f in this branch so this looks unreachable - confirm
+			res = 0x7fffffff;
+#elif HOST_CPU == CPU_ARM || HOST_CPU == CPU_ARM64
+		// conversion of NaN returns 0 on ARM
+		if (std::isnan(f1))
+			res = 0x80000000; // force the NaN result to 0x80000000 instead of 0
 #endif
+	}
+	return res;
+)
 
 shil_compile
 (
diff --git a/core/hw/sh4/interpr/sh4_fpu.cpp b/core/hw/sh4/interpr/sh4_fpu.cpp
index 231f45fe3..c05951102 100644
--- a/core/hw/sh4/interpr/sh4_fpu.cpp
+++ b/core/hw/sh4/interpr/sh4_fpu.cpp
@@ -519,28 +519,35 @@ sh4op(i1111_nnnn_0011_1101)
 	if (ctx->fpscr.PR == 0)
 	{
 		u32 n = GetN(op);
-		ctx->fpul = (u32)(s32)ctx->fr[n];
-
-		if ((s32)ctx->fpul > 0x7fffff80)
-			ctx->fpul = 0x7fffffff;
-		// Intel CPUs convert out of range float numbers to 0x80000000. Manually set the correct sign
-		else if (ctx->fpul == 0x80000000 && ctx->fr[n] == ctx->fr[n])
+		if (std::isnan(ctx->fr[n])) { // NaN is handled up front so the result does not depend on the host conversion
+			ctx->fpul = 0x80000000;
+		}
+		else
 		{
-			if (*(int *)&ctx->fr[n] > 0) // Using integer math to avoid issues with Inf and NaN
+			ctx->fpul = (u32)(s32)ctx->fr[n]; // single precision: truncating float to s32 conversion
+			if ((s32)ctx->fpul > 0x7fffff80) // 0x7fffff80 (2147483520) is the largest float value below 2^31
+				ctx->fpul = 0x7fffffff;
+#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
+			// Intel CPUs convert out of range float numbers to 0x80000000. For a positive input, decrement to get 0x7fffffff
+			else if (ctx->fpul == 0x80000000 && ctx->fr[n] > 0)
 				ctx->fpul--;
+#endif
 		}
 	}
 	else
 	{
 		f64 f = getDRn(ctx, op);
-		ctx->fpul = (u32)(s32)f;
-
-		// TODO saturate
-		// Intel CPUs convert out of range float numbers to 0x80000000. Manually set the correct sign
-		if (ctx->fpul == 0x80000000 && f == f)
+		if (std::isnan(f)) { // NaN is handled up front so the result does not depend on the host conversion
+			ctx->fpul = 0x80000000;
+		}
+		else
 		{
-			if (*(s64 *)&f > 0)     // Using integer math to avoid issues with Inf and NaN
+			ctx->fpul = (u32)(s32)f; // NOTE(review): no explicit saturation on this double path; non-x86 hosts rely on the cast itself - confirm
+#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
+			// Intel CPUs convert out of range float numbers to 0x80000000. For a positive input, decrement to get 0x7fffffff
+			if (ctx->fpul == 0x80000000 && f > 0)
 				ctx->fpul--;
+#endif
 		}
 	}
 }
diff --git a/core/rec-ARM/rec_arm.cpp b/core/rec-ARM/rec_arm.cpp
index 4552a3c5a..878a8a2fc 100644
--- a/core/rec-ARM/rec_arm.cpp
+++ b/core/rec-ARM/rec_arm.cpp
@@ -1901,7 +1901,6 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op
 			unaryFpOp(op, &MacroAssembler::Vsqrt);
 			break;
 
-		
 		case shop_fmac:
 			{
 				SRegister rd = reg.mapFReg(op->rd);
@@ -1945,7 +1944,7 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op
 				}
 				if (!rd.Is(rs1))
 					Vmov(rd, rs1);
-				Vmla(rd, rs2, rs3);
+				Vfma(rd, rs2, rs3); // rd += rs2 * rs3, fused. NOTE(review): VFMA requires VFPv4 - confirm every supported ARMv7 host has it
 			}
 			break;
 
@@ -2001,7 +2000,7 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op
 				Vstr(d0, MemOperand(r8, op->rd.reg_nofs()));
 			}
 			break;
-
+		/* fall back to the canonical implementations for better precision
 		case shop_fipr:
 			{
 				QRegister _r1 = q0;
@@ -2098,7 +2097,7 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op
 #endif
 			}
 			break;
-
+			*/
 		case shop_frswap:
 			Sub(r0, r8, -op->rs1.reg_nofs());
 			Sub(r1, r8, -op->rd.reg_nofs());
@@ -2111,8 +2110,19 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op
 			break;
 
 		case shop_cvt_f2i_t:
-			Vcvt(S32, F32, s0, reg.mapFReg(op->rs1));
-			Vmov(reg.mapReg(op->rd), s0);
+			{
+				// float -> s32 truncation; large positive results become 0x7fffffff and NaN becomes 0x80000000
+				SRegister from = reg.mapFReg(op->rs1);
+				Register to = reg.mapReg(op->rd);
+				Vcvt(S32, F32, s0, from);
+				Vmov(to, s0);
+				Mvn(r0, 127); // r0 = ~127 = 0xffffff80
+				Sub(r0, r0, 0x80000000); // r0 = 0x7fffff80, the saturation threshold
+				Cmp(to, r0);
+				Mvn(gt, to, 0x80000000); // above the threshold: to = ~0x80000000 = 0x7fffffff (was 0xf8000000, which gave 0x07ffffff)
+				Vcmp(from, from); // a NaN compares unequal to itself
+				Vmrs(RegisterOrAPSR_nzcv(APSR_nzcv), FPSCR); // copy the FP compare flags to the integer flags
+				Mov(ne, to, 0x80000000); // NaN input: result is 0x80000000
+			}
 			break;
 
 		case shop_cvt_i2f_n:	// may be some difference should be made ?
diff --git a/core/rec-ARM64/rec_arm64.cpp b/core/rec-ARM64/rec_arm64.cpp
index c9fbe7f81..9fff1759e 100644
--- a/core/rec-ARM64/rec_arm64.cpp
+++ b/core/rec-ARM64/rec_arm64.cpp
@@ -952,8 +952,20 @@ public:
 				break;
 
 			case shop_cvt_f2i_t:
-				Fcvtzs(regalloc.MapRegister(op.rd), regalloc.MapVRegister(op.rs1));
+				{ // float -> s32 truncation; large positive results become 0x7fffffff and NaN becomes 0x80000000
+					const VRegister& from = regalloc.MapVRegister(op.rs1);
+					const Register& to = regalloc.MapRegister(op.rd);
+					Fcvtzs(to, from);
+					Mov(w0, 0x7FFFFF80); // saturation threshold
+					Cmp(to, w0);
+					Mov(w0, 0x7FFFFFFF); // saturated value (was 0x7FFFFFF, one digit short, i.e. 0x07ffffff)
+					Csel(to, w0, to, gt); // above the threshold: take the saturated value
+					Fcmp(from, from); // comparing the input with itself is unordered only for NaN
+					Mov(w0, 0x80000000);
+					Csel(to, to, w0, vc); // keep the result when ordered, otherwise (NaN) use 0x80000000
+				}
 				break;
+
 			case shop_cvt_i2f_n:
 			case shop_cvt_i2f_z:
 				Scvtf(regalloc.MapVRegister(op.rd), regalloc.MapRegister(op.rs1));