GFX3D: GEM_TransformVertex() now uses the SIMD-optimized generic matrix functions instead of using its own scalar-only version.

- This change results in a small, yet measurable, performance improvement.
- Note that this change has the side effect of enabling saturation logic for the following functions: MatrixMultVec3x3(), MatrixTranslate(), MatrixScale(). This is a change in their behavior, since these functions did not perform saturation logic before. This will need additional testing.
This commit is contained in:
rogerman 2022-05-23 15:53:05 -07:00
parent d757d83b3d
commit 356fe47df7
2 changed files with 11 additions and 48 deletions

View File

@ -691,35 +691,9 @@ FORCEINLINE s64 GEM_Mul32x32To64(const s32 a, const s32 b)
#endif
}
// Converts a 64-bit accumulator to a 32-bit result, clamping out-of-range input.
//
// val: the 64-bit accumulated value to convert.
//
// Returns 0x80000000 (as s32) when val is below 0xFFFFF80000000000,
// 0x7FFFFFFF (as s32) when val is above 0x000007FFFFFFFFFF, and otherwise
// whatever fx32_shiftdown() produces for val.
static s32 GEM_SaturateAndShiftdown36To32(const s64 val)
{
	// The two range checks are mutually exclusive, so their order does not matter.
	if (val < (s64)0xFFFFF80000000000ULL)
	{
		// Below the representable range: clamp to the most negative result.
		return (s32)0x80000000U;
	}
	else if (val > (s64)0x000007FFFFFFFFFFULL)
	{
		// Above the representable range: clamp to the most positive result.
		return (s32)0x7FFFFFFFU;
	}

	// In range: defer to the regular fixed-point shift-down helper.
	return fx32_shiftdown(val);
}
// Transforms the 4-component vector `vec` in place by the 16-element matrix `mtx`.
//
// mtx: the matrix; output component i is built from mtx[i], mtx[4+i], mtx[8+i], mtx[12+i].
// vec: the input vector (x, y, z, w); overwritten with the result.
//
// NOTE(review): this listing appears to be a diff rendered without its +/- markers.
// The commit description says this function now delegates to the generic matrix
// routines, so the scalar code below and the MatrixMultVec4x4() call are presumably
// the old and new implementations respectively, not two sequential steps. Read
// literally, `vec` would be transformed twice -- confirm against the actual file.
static void GEM_TransformVertex(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4])
{
// Snapshot the inputs first, because vec[] is overwritten component by component below.
const s32 x = vec[0];
const s32 y = vec[1];
const s32 z = vec[2];
const s32 w = vec[3];
//saturation logic is most carefully tested by:
//+ spectrobes beyond the portals excavation blower and drill tools: sets very large overflowing +x,+y in the modelview matrix to push things offscreen
//You can see this happening quite clearly: vertices will get translated to extreme values and overflow from a 7FFF-like to an 8000-like
//but if it's done wrongly, you can get bugs in:
//+ kingdom hearts re-coded: first conversation with cast characters will place them oddly with something overflowing to about 0xA???????
//other test cases that cropped up during this development, but are probably not actually related to this after all
//+ SM64: outside castle skybox
//+ NSMB: mario head screen wipe
// Each output component is a 4-term sum of 64-bit products, passed through
// GEM_SaturateAndShiftdown36To32() to produce the 32-bit result.
vec[0] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtx[0]) + GEM_Mul32x32To64(y,mtx[4]) + GEM_Mul32x32To64(z,mtx[ 8]) + GEM_Mul32x32To64(w,mtx[12]) );
vec[1] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtx[1]) + GEM_Mul32x32To64(y,mtx[5]) + GEM_Mul32x32To64(z,mtx[ 9]) + GEM_Mul32x32To64(w,mtx[13]) );
vec[2] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtx[2]) + GEM_Mul32x32To64(y,mtx[6]) + GEM_Mul32x32To64(z,mtx[10]) + GEM_Mul32x32To64(w,mtx[14]) );
vec[3] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtx[3]) + GEM_Mul32x32To64(y,mtx[7]) + GEM_Mul32x32To64(z,mtx[11]) + GEM_Mul32x32To64(w,mtx[15]) );
// Generic matrix-by-vector routine (presumably the replacement for the scalar code above; see NOTE).
MatrixMultVec4x4(mtx, vec);
}
//---------------

View File

@ -24,20 +24,6 @@
#include "matrix.h"
#include "MMU.h"
// NDS matrix math functions use 20.12 fixed-point for calculations. According to
// GEM_TransformVertex(), dot product calculations use accumulation that goes beyond
// 32 bits and then saturates. Therefore, all fixed-point math functions (with the
// exception of matrix-by-matrix multiplication) will also support that feature here.
//
// But for historical reasons, we can't enable this right away. Therefore, the scalar
// function GEM_TransformVertex() will continue to be used for SetVertex() while these
// fixed-point functions will remain as they are. In order to document the future
// intent of the fixed-point functions while retaining the existing functionality, the
// saturate code will be hidden by this macro.
//
// Testing is highly encouraged! Simply uncomment to try out this feature.
//#define FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
// The following floating-point functions exist for historical reasons and are deprecated.
// They should be obsoleted and removed as more of the geometry engine moves to fixed-point.
@ -575,7 +561,6 @@ static FORCEINLINE void __mtx4_translate_vec3_float_NEON(float (&__restrict inou
static FORCEINLINE s32 ___s32_saturate_shiftdown_accum64_fixed(s64 inAccum)
{
#ifdef FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
if (inAccum > (s64)0x000007FFFFFFFFFFULL)
{
return (s32)0x7FFFFFFFU;
@ -584,7 +569,6 @@ static FORCEINLINE s32 ___s32_saturate_shiftdown_accum64_fixed(s64 inAccum)
{
return (s32)0x80000000U;
}
#endif // FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
return sfx32_shiftdown(inAccum);
}
@ -596,6 +580,16 @@ static FORCEINLINE s32 __vec4_dotproduct_vec4_fixed(const s32 (&__restrict vecA)
static FORCEINLINE void __vec4_multiply_mtx4_fixed(s32 (&__restrict inoutVec)[4], const s32 (&__restrict inMtx)[16])
{
//saturation logic is most carefully tested by:
//+ spectrobes beyond the portals excavation blower and drill tools: sets very large overflowing +x,+y in the modelview matrix to push things offscreen
//You can see this happening quite clearly: vertices will get translated to extreme values and overflow from a 7FFF-like to an 8000-like
//but if it's done wrongly, you can get bugs in:
//+ kingdom hearts re-coded: first conversation with cast characters will place them oddly with something overflowing to about 0xA???????
//other test cases that cropped up during this development, but are probably not actually related to this after all
//+ SM64: outside castle skybox
//+ NSMB: mario head screen wipe
const CACHE_ALIGN s32 v[4] = {inoutVec[0], inoutVec[1], inoutVec[2], inoutVec[3]};
inoutVec[0] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inMtx[0],v[0]) + fx32_mul(inMtx[4],v[1]) + fx32_mul(inMtx[ 8],v[2]) + fx32_mul(inMtx[12],v[3]) );
@ -675,7 +669,6 @@ static FORCEINLINE void __mtx4_translate_vec3_fixed(s32 (&__restrict inoutMtx)[1
static FORCEINLINE void ___s32_saturate_shiftdown_accum64_fixed_SSE4(__m128i &inoutAccum)
{
#ifdef FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
v128u8 outVecMask;
#if defined(ENABLE_SSE4_2)
@ -697,8 +690,6 @@ static FORCEINLINE void ___s32_saturate_shiftdown_accum64_fixed_SSE4(__m128i &in
inoutAccum = _mm_blendv_epi8(outVecNeg, outVecPos, outVecSignMask);
#endif // ENABLE_SSE4_2
#endif // FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
inoutAccum = _mm_srli_epi64(inoutAccum, 12);
inoutAccum = _mm_shuffle_epi32(inoutAccum, 0xD8);
}
@ -967,7 +958,6 @@ static FORCEINLINE void __mtx4_translate_vec3_fixed_SSE4(s32 (&__restrict inoutM
static FORCEINLINE void ___s32_saturate_shiftdown_accum64_fixed_NEON(int64x2_t &inoutAccum)
{
#ifdef FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
int64x2_t outVecMask;
outVecMask = vcgtq_s64( inoutAccum, vdupq_n_s64((s64)0x000007FFFFFFFFFFULL) );
@ -975,7 +965,6 @@ static FORCEINLINE void ___s32_saturate_shiftdown_accum64_fixed_NEON(int64x2_t &
outVecMask = vcltq_s64( inoutAccum, vdupq_n_s64((s64)0xFFFFF80000000000ULL) );
inoutAccum = vbslq_s64( outVecMask, vdupq_n_s64((s64)0xFFFFF80000000000ULL), inoutAccum );
#endif // FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
inoutAccum = vshrq_n_s64(inoutAccum, 12);
inoutAccum = vreinterpretq_s64_s32( vuzp1q_s32(vreinterpretq_s32_s64(inoutAccum), vdupq_n_s32(0)) );