reduce precision of FIPR and FTRV sh4 ops on x86 platforms

Flyinghead 2019-12-17 20:23:58 +01:00
parent 7ab275d4b5
commit 12b9b7254a
4 changed files with 100 additions and 37 deletions


@@ -911,7 +911,6 @@ shil_opc(fsqrt)
 UN_OP_F(sqrtf)
 shil_opc_end()
 //shop_fipr
 shil_opc(fipr)
@@ -920,13 +919,13 @@ shil_canonical
 (
     f32,f1,(float* fn, float* fm),
     // Using double for better precision on x86 (Sonic Adventure)
-    double idp = (double)fn[0] * fm[0];
-    idp += (double)fn[1] * fm[1];
-    idp += (double)fn[2] * fm[2];
-    idp += (double)fn[3] * fm[3];
+    // multiplications are done with 28 bits of precision (53 - 25) and the final sum at 30 bits
+    double idp = reduce_precision<25>((double)fn[0] * fm[0]);
+    idp += reduce_precision<25>((double)fn[1] * fm[1]);
+    idp += reduce_precision<25>((double)fn[2] * fm[2]);
+    idp += reduce_precision<25>((double)fn[3] * fm[3]);
-    return fixNaN((float)idp);
+    return (float)fixNaN64(idp);
 )
 #else
 shil_canonical
@@ -956,39 +955,67 @@ shil_opc_end()
 //shop_ftrv
 shil_opc(ftrv)
+#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
 shil_canonical
 (
     void,f1,(float* fd,float* fn, float* fm),
-    float v1;
-    float v2;
-    float v3;
-    float v4;
-    v1 = fm[0] * fn[0] +
-        fm[4] * fn[1] +
-        fm[8] * fn[2] +
-        fm[12] * fn[3];
+    double v1 = reduce_precision<25>((double)fm[0] * fn[0]) +
+            reduce_precision<25>((double)fm[4] * fn[1]) +
+            reduce_precision<25>((double)fm[8] * fn[2]) +
+            reduce_precision<25>((double)fm[12] * fn[3]);
-    v2 = fm[1] * fn[0] +
-        fm[5] * fn[1] +
-        fm[9] * fn[2] +
-        fm[13] * fn[3];
+    double v2 = reduce_precision<25>((double)fm[1] * fn[0]) +
+            reduce_precision<25>((double)fm[5] * fn[1]) +
+            reduce_precision<25>((double)fm[9] * fn[2]) +
+            reduce_precision<25>((double)fm[13] * fn[3]);
-    v3 = fm[2] * fn[0] +
-        fm[6] * fn[1] +
-        fm[10] * fn[2] +
-        fm[14] * fn[3];
+    double v3 = reduce_precision<25>((double)fm[2] * fn[0]) +
+            reduce_precision<25>((double)fm[6] * fn[1]) +
+            reduce_precision<25>((double)fm[10] * fn[2]) +
+            reduce_precision<25>((double)fm[14] * fn[3]);
-    v4 = fm[3] * fn[0] +
-        fm[7] * fn[1] +
-        fm[11] * fn[2] +
-        fm[15] * fn[3];
+    double v4 = reduce_precision<25>((double)fm[3] * fn[0]) +
+            reduce_precision<25>((double)fm[7] * fn[1]) +
+            reduce_precision<25>((double)fm[11] * fn[2]) +
+            reduce_precision<25>((double)fm[15] * fn[3]);
+    fd[0] = (float)fixNaN64(v1);
+    fd[1] = (float)fixNaN64(v2);
+    fd[2] = (float)fixNaN64(v3);
+    fd[3] = (float)fixNaN64(v4);
+)
+#else
+shil_canonical
+(
+    void,f1,(float* fd,float* fn, float* fm),
+    float v1 = fm[0] * fn[0] +
+        fm[4] * fn[1] +
+        fm[8] * fn[2] +
+        fm[12] * fn[3];
+    float v2 = fm[1] * fn[0] +
+        fm[5] * fn[1] +
+        fm[9] * fn[2] +
+        fm[13] * fn[3];
+    float v3 = fm[2] * fn[0] +
+        fm[6] * fn[1] +
+        fm[10] * fn[2] +
+        fm[14] * fn[3];
+    float v4 = fm[3] * fn[0] +
+        fm[7] * fn[1] +
+        fm[11] * fn[2] +
+        fm[15] * fn[3];
     fd[0] = fixNaN(v1);
     fd[1] = fixNaN(v2);
     fd[2] = fixNaN(v3);
     fd[3] = fixNaN(v4);
 )
+#endif
 shil_compile
 (
     shil_cf_arg_ptr(rs2);
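The idea in both ops: keep the multiply-accumulate in double, but round every product down to 28 significant bits (53 - 25) before it enters the sum, which lands closer to the SH4 FPU's behavior of computing FIPR/FTRV at reduced accuracy in exchange for speed. Stripped of the shil macro plumbing, the new FIPR path amounts to this minimal standalone sketch (fipr_reduced is an illustrative name, and the fixNaN64() step is omitted):

// Rounding helper as added by this commit: keeps the top 53 - bits
// significant bits of f (28 bits for bits = 25).
template<int bits>
static double reduce_precision(double f)
{
    double c = (double)((1 << bits) + 1) * f;
    return c - (c - f);
}

// Sketch of the new x86/x64 FIPR path: each product is rounded to
// 28 bits before joining the running sum, instead of being
// accumulated at full double precision.
static float fipr_reduced(const float* fn, const float* fm)
{
    double idp = reduce_precision<25>((double)fn[0] * fm[0]);
    idp += reduce_precision<25>((double)fn[1] * fm[1]);
    idp += reduce_precision<25>((double)fn[2] * fm[2]);
    idp += reduce_precision<25>((double)fn[3] * fm[3]);
    return (float)idp; // the real op also runs fixNaN64() on idp
}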


@@ -504,23 +504,25 @@ sh4op(i1111_nnmm_1110_1101)
 {
     int n=GetN(op)&0xC;
     int m=(GetN(op)&0x3)<<2;
-    if(fpscr.PR ==0)
+    if (fpscr.PR == 0)
     {
 #if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
-        double idp = (double)fr[n + 0] * fr[m + 0];
-        idp += (double)fr[n + 1] * fr[m + 1];
-        idp += (double)fr[n + 2] * fr[m + 2];
-        idp += (double)fr[n + 3] * fr[m + 3];
-        float rv = (float)idp;
+        // multiplications are done with 28 bits of precision (53 - 25) and the final sum at 30 bits
+        double idp = reduce_precision<25>((double)fr[n + 0] * fr[m + 0]);
+        idp += reduce_precision<25>((double)fr[n + 1] * fr[m + 1]);
+        idp += reduce_precision<25>((double)fr[n + 2] * fr[m + 2]);
+        idp += reduce_precision<25>((double)fr[n + 3] * fr[m + 3]);
+        fr[n + 3] = (float)fixNaN64(idp);
 #else
         float rv = fr[n + 0] * fr[m + 0];
         rv += fr[n + 1] * fr[m + 1];
         rv += fr[n + 2] * fr[m + 2];
         rv += fr[n + 3] * fr[m + 3];
-#endif
         CHECK_FPU_32(rv);
         fr[n + 3] = rv;
+#endif
     }
     else
     {
@@ -697,7 +699,32 @@ sh4op(i1111_nn01_1111_1101)
     if (fpscr.PR==0)
     {
+#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
+        double v1 = reduce_precision<25>((double)xf[0] * fr[n + 0]) +
+                reduce_precision<25>((double)xf[4] * fr[n + 1]) +
+                reduce_precision<25>((double)xf[8] * fr[n + 2]) +
+                reduce_precision<25>((double)xf[12] * fr[n + 3]);
+        double v2 = reduce_precision<25>((double)xf[1] * fr[n + 0]) +
+                reduce_precision<25>((double)xf[5] * fr[n + 1]) +
+                reduce_precision<25>((double)xf[9] * fr[n + 2]) +
+                reduce_precision<25>((double)xf[13] * fr[n + 3]);
+        double v3 = reduce_precision<25>((double)xf[2] * fr[n + 0]) +
+                reduce_precision<25>((double)xf[6] * fr[n + 1]) +
+                reduce_precision<25>((double)xf[10] * fr[n + 2]) +
+                reduce_precision<25>((double)xf[14] * fr[n + 3]);
+        double v4 = reduce_precision<25>((double)xf[3] * fr[n + 0]) +
+                reduce_precision<25>((double)xf[7] * fr[n + 1]) +
+                reduce_precision<25>((double)xf[11] * fr[n + 2]) +
+                reduce_precision<25>((double)xf[15] * fr[n + 3]);
+        fr[n + 0] = (float)fixNaN64(v1);
+        fr[n + 1] = (float)fixNaN64(v2);
+        fr[n + 2] = (float)fixNaN64(v3);
+        fr[n + 3] = (float)fixNaN64(v4);
+#else
         float v1, v2, v3, v4;
         v1 = xf[0] * fr[n + 0] +
@@ -729,7 +756,7 @@ sh4op(i1111_nn01_1111_1101)
         fr[n + 1] = v2;
         fr[n + 2] = v3;
         fr[n + 3] = v4;
+#endif
     }
     else
     {
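The FTRV half is the same trick applied to a 4x4 matrix-vector product. The unrolled interpreter code above is equivalent to this loop-form sketch (ftrv_reduced is an illustrative name, reduce_precision is the helper added in the next hunk, and fixNaN64() is again left out):

static void ftrv_reduced(const float xf[16], float fv[4])
{
    // xf is column-major: xf[i + 4 * j] is row i of column j.
    double v[4];
    for (int i = 0; i < 4; i++)
        v[i] = reduce_precision<25>((double)xf[i] * fv[0])
             + reduce_precision<25>((double)xf[i + 4] * fv[1])
             + reduce_precision<25>((double)xf[i + 8] * fv[2])
             + reduce_precision<25>((double)xf[i + 12] * fv[3]);
    for (int i = 0; i < 4; i++)
        fv[i] = (float)v[i]; // the real op also runs fixNaN64() on each v[i]
}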


@@ -152,3 +152,12 @@ static INLINE f64 fixNaN64(f64 f)
 //    return (*(u64 *)&f & 0x7fffffffffffffffll) <= 0x7f80000000000000ll ? f : 0x7ff7ffffffffffffll;
     return f;
 }
+// Reduces the precision of the argument f by a given number of bits.
+// Doubles have 53 bits of precision, so the returned result will have a precision of 53 - bits.
+template<int bits>
+static INLINE double reduce_precision(double f)
+{
+    double c = (double)((1 << bits) + 1) * f;
+    return c - (c - f);
+}
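The helper is a Veltkamp-style splitting step: multiplying by 2^bits + 1 and taking c - (c - f) forces the low bits out of the 53-bit mantissa, leaving f rounded to 53 - bits significant bits. A quick check of the effect (assuming round-to-nearest and no FMA contraction, i.e. no -ffast-math):

#include <cstdio>

template<int bits>
static double reduce_precision(double f)
{
    double c = (double)((1 << bits) + 1) * f;
    return c - (c - f);
}

int main()
{
    // 2^-40 needs more than 28 significant bits next to 1.0, so it is
    // rounded away; 2^-20 fits within 28 bits and survives unchanged.
    printf("%.17g\n", reduce_precision<25>(1.0 + 0x1p-40)); // prints 1
    printf("%.17g\n", reduce_precision<25>(1.0 + 0x1p-20)); // prints 1.0000009536743164
    return 0;
}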


@@ -1086,7 +1086,7 @@ public:
             mov(qword[rdx], rcx);
 #endif
             break;
+/*
         case shop_fipr:
         {
             // Using doubles for better precision
@@ -1164,7 +1164,7 @@
         }
         break;
+*/
         case shop_frswap:
             mov(rax, (uintptr_t)op.rs1.reg_ptr());
             mov(rcx, (uintptr_t)op.rd.reg_ptr());