GFX3D: GEM_TransformVertex() now uses the SIMD-optimized generic matrix functions instead of using its own scalar-only version.

- This change results in a small, yet measurable, performance improvement.
- Note that this change has the side effect of enabling saturation logic for the following functions: MatrixMultVec3x3(), MatrixTranslate(), MatrixScale(). This is a change in their behavior, since these functions did not perform saturation logic before. This will need additional testing.
2022-05-23 15:53:05 -07:00 · 2022-05-23 15:53:05 -07:00 · 356fe47df7
parent d757d83b3d
commit 356fe47df7
2 changed files with 11 additions and 48 deletions
--- a/desmume/src/gfx3d.cpp
+++ b/desmume/src/gfx3d.cpp
@ -691,35 +691,9 @@ FORCEINLINE s64 GEM_Mul32x32To64(const s32 a, const s32 b)
 #endif
 }
 // Saturating conversion of a wide 64-bit accumulator down to an s32.
 // - Accumulators greater than 0x000007FFFFFFFFFF clamp to 0x7FFFFFFF.
 // - Accumulators less than 0xFFFFF80000000000 (read as a signed value) clamp to 0x80000000.
 // - Everything in between is handed to fx32_shiftdown() (defined elsewhere;
 //   presumably the 12-bit fixed-point shift -- confirm against its definition).
 static s32 GEM_SaturateAndShiftdown36To32(const s64 val)
 {
 	const s64 accumCeiling = (s64)0x000007FFFFFFFFFFULL;
 	const s64 accumFloor = (s64)0xFFFFF80000000000ULL;
 	if (val > accumCeiling)
 	{
 		return (s32)0x7FFFFFFFU;
 	}
 	else if (val < accumFloor)
 	{
 		return (s32)0x80000000U;
 	}
 	return fx32_shiftdown(val);
 }
 // Transforms the 4-component vector 'vec' in place using the 4x4 matrix 'mtx'.
 // In the scalar form shown below, output component i is the 64-bit sum of
 // vec[j] * mtx[j*4 + i] for j = 0..3, passed through
 // GEM_SaturateAndShiftdown36To32() before being stored back into vec[i].
 static void GEM_TransformVertex(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4])
 {
-	const s32 x = vec[0];
+	MatrixMultVec4x4(mtx, vec);
 	const s32 y = vec[1];
 	const s32 z = vec[2];
 	const s32 w = vec[3];
 	// The saturation logic is most carefully tested by:
 	// + Spectrobes: Beyond the Portals (excavation blower and drill tools): sets very large, overflowing +x,+y values in the modelview matrix to push things offscreen.
 	// You can see this happening quite clearly: vertices get translated to extreme values and overflow from a 7FFF-like value to an 8000-like value.
 	// But if it is done wrongly, you can get bugs in:
 	// + Kingdom Hearts Re:coded: the first conversation with cast characters will place them oddly, with something overflowing to about 0xA???????
 	// Other test cases that cropped up during this development, but are probably not actually related to this after all:
 	// + SM64: outside castle skybox
 	// + NSMB: Mario head screen wipe
 	vec[0] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtx[0]) + GEM_Mul32x32To64(y,mtx[4]) + GEM_Mul32x32To64(z,mtx[ 8]) + GEM_Mul32x32To64(w,mtx[12]) );
 	vec[1] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtx[1]) + GEM_Mul32x32To64(y,mtx[5]) + GEM_Mul32x32To64(z,mtx[ 9]) + GEM_Mul32x32To64(w,mtx[13]) );
 	vec[2] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtx[2]) + GEM_Mul32x32To64(y,mtx[6]) + GEM_Mul32x32To64(z,mtx[10]) + GEM_Mul32x32To64(w,mtx[14]) );
 	vec[3] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtx[3]) + GEM_Mul32x32To64(y,mtx[7]) + GEM_Mul32x32To64(z,mtx[11]) + GEM_Mul32x32To64(w,mtx[15]) );
 }
 //---------------
--- a/desmume/src/matrix.cpp
+++ b/desmume/src/matrix.cpp
@ -24,20 +24,6 @@
 #include "matrix.h"
 #include "MMU.h"
 // NDS matrix math functions use 20.12 fixed-point for calculations. According to
 // GEM_TransformVertex(), dot product calculations use accumulation that goes beyond
 // 32 bits and then saturates. Therefore, all fixed-point math functions (with the
 // exception of matrix-by-matrix multiplication) will also support that feature here.
 //
 // But for historical reasons, we can't enable this right away. Therefore, the scalar
 // function GEM_TransformVertex() will continue to be used for SetVertex() while these
 // fixed-point functions will remain as they are. In order to document the future
 // intent of the fixed-point functions while retaining the existing functionality, the
 // saturate code will be hidden by this macro.
 //
 // Testing is highly encouraged! Simply uncomment to try out this feature.
 //#define FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
 // The following floating-point functions exist for historical reasons and are deprecated.
 // They should be obsoleted and removed as more of the geometry engine moves to fixed-point.
@ -575,7 +561,6 @@ static FORCEINLINE void __mtx4_translate_vec3_float_NEON(float (&__restrict inou
 static FORCEINLINE s32 ___s32_saturate_shiftdown_accum64_fixed(s64 inAccum)
 {
 #ifdef FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
 	if (inAccum > (s64)0x000007FFFFFFFFFFULL)
 	{
 		return (s32)0x7FFFFFFFU;
@ -584,7 +569,6 @@ static FORCEINLINE s32 ___s32_saturate_shiftdown_accum64_fixed(s64 inAccum)
 	{
 		return (s32)0x80000000U;
 	}
 #endif // FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
 	return sfx32_shiftdown(inAccum);
 }
@ -596,6 +580,16 @@ static FORCEINLINE s32 __vec4_dotproduct_vec4_fixed(const s32 (&__restrict vecA)
 static FORCEINLINE void __vec4_multiply_mtx4_fixed(s32 (&__restrict inoutVec)[4], const s32 (&__restrict inMtx)[16])
 {
 	//saturation logic is most carefully tested by:
 	//+ spectrobes beyond the portals excavation blower and drill tools: sets very large overflowing +x,+y in the modelview matrix to push things offscreen
 	//You can see this happening quite clearly: vertices will get translated to extreme values and overflow from a 7FFF-like to an 8000-like
 	//but if it's done wrongly, you can get bugs in:
 	//+ kingdom hearts re-coded: first conversation with cast characters will place them oddly with something overflowing to about 0xA???????
 	//other test cases that cropped up during this development, but are probably not actually related to this after all
 	//+ SM64: outside castle skybox
 	//+ NSMB: mario head screen wipe
 	const CACHE_ALIGN s32 v[4] = {inoutVec[0], inoutVec[1], inoutVec[2], inoutVec[3]};
 	inoutVec[0] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inMtx[0],v[0]) + fx32_mul(inMtx[4],v[1]) + fx32_mul(inMtx[ 8],v[2]) + fx32_mul(inMtx[12],v[3]) );
@ -675,7 +669,6 @@ static FORCEINLINE void __mtx4_translate_vec3_fixed(s32 (&__restrict inoutMtx)[1
 static FORCEINLINE void ___s32_saturate_shiftdown_accum64_fixed_SSE4(__m128i &inoutAccum)
 {
 #ifdef FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
 	v128u8 outVecMask;
 #if defined(ENABLE_SSE4_2)
@ -697,8 +690,6 @@ static FORCEINLINE void ___s32_saturate_shiftdown_accum64_fixed_SSE4(__m128i &in
 	inoutAccum = _mm_blendv_epi8(outVecNeg, outVecPos, outVecSignMask);
 #endif // ENABLE_SSE4_2
 #endif // FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
 	inoutAccum = _mm_srli_epi64(inoutAccum, 12);
 	inoutAccum = _mm_shuffle_epi32(inoutAccum, 0xD8);
 }
@ -967,7 +958,6 @@ static FORCEINLINE void __mtx4_translate_vec3_fixed_SSE4(s32 (&__restrict inoutM
 static FORCEINLINE void ___s32_saturate_shiftdown_accum64_fixed_NEON(int64x2_t &inoutAccum)
 {
 #ifdef FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
 	int64x2_t outVecMask;
 	outVecMask = vcgtq_s64( inoutAccum, vdupq_n_s64((s64)0x000007FFFFFFFFFFULL) );
@ -975,7 +965,6 @@ static FORCEINLINE void ___s32_saturate_shiftdown_accum64_fixed_NEON(int64x2_t &
 	outVecMask = vcltq_s64( inoutAccum, vdupq_n_s64((s64)0xFFFFF80000000000ULL) );
 	inoutAccum = vbslq_s64( outVecMask, vdupq_n_s64((s64)0xFFFFF80000000000ULL), inoutAccum );
 #endif // FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
 	inoutAccum = vshrq_n_s64(inoutAccum, 12);
 	inoutAccum = vreinterpretq_s64_s32( vuzp1q_s32(vreinterpretq_s32_s64(inoutAccum), vdupq_n_s32(0)) );