GFX3D: GEM_TransformVertex() now uses the SIMD-optimized generic matrix functions instead of using its own scalar-only version.
- This change results in a small, yet measurable, performance improvement.
- Note that this change has the side effect of enabling saturation logic for the following functions: MatrixMultVec3x3(), MatrixTranslate(), MatrixScale(). This is a change in their behavior, since these functions did not perform saturation logic before. This will need additional testing.
This commit is contained in:
parent
d757d83b3d
commit
356fe47df7
|
@ -691,35 +691,9 @@ FORCEINLINE s64 GEM_Mul32x32To64(const s32 a, const s32 b)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static s32 GEM_SaturateAndShiftdown36To32(const s64 val)
|
|
||||||
{
|
|
||||||
if(val>(s64)0x000007FFFFFFFFFFULL) return (s32)0x7FFFFFFFU;
|
|
||||||
if(val<(s64)0xFFFFF80000000000ULL) return (s32)0x80000000U;
|
|
||||||
|
|
||||||
return fx32_shiftdown(val);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void GEM_TransformVertex(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4])
|
static void GEM_TransformVertex(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4])
|
||||||
{
|
{
|
||||||
const s32 x = vec[0];
|
MatrixMultVec4x4(mtx, vec);
|
||||||
const s32 y = vec[1];
|
|
||||||
const s32 z = vec[2];
|
|
||||||
const s32 w = vec[3];
|
|
||||||
|
|
||||||
//saturation logic is most carefully tested by:
|
|
||||||
//+ spectrobes beyond the portals excavation blower and drill tools: sets very large overflowing +x,+y in the modelview matrix to push things offscreen
|
|
||||||
//You can see this happening quite clearly: vertices will get translated to extreme values and overflow from a 7FFF-like to an 8000-like
|
|
||||||
//but if it's done wrongly, you can get bugs in:
|
|
||||||
//+ kingdom hearts re-coded: first conversation with cast characters will place them oddly with something overflowing to about 0xA???????
|
|
||||||
|
|
||||||
//other test cases that cropped up during this development, but are probably not actually related to this after all
|
|
||||||
//+ SM64: outside castle skybox
|
|
||||||
//+ NSMB: mario head screen wipe
|
|
||||||
|
|
||||||
vec[0] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtx[0]) + GEM_Mul32x32To64(y,mtx[4]) + GEM_Mul32x32To64(z,mtx[ 8]) + GEM_Mul32x32To64(w,mtx[12]) );
|
|
||||||
vec[1] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtx[1]) + GEM_Mul32x32To64(y,mtx[5]) + GEM_Mul32x32To64(z,mtx[ 9]) + GEM_Mul32x32To64(w,mtx[13]) );
|
|
||||||
vec[2] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtx[2]) + GEM_Mul32x32To64(y,mtx[6]) + GEM_Mul32x32To64(z,mtx[10]) + GEM_Mul32x32To64(w,mtx[14]) );
|
|
||||||
vec[3] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtx[3]) + GEM_Mul32x32To64(y,mtx[7]) + GEM_Mul32x32To64(z,mtx[11]) + GEM_Mul32x32To64(w,mtx[15]) );
|
|
||||||
}
|
}
|
||||||
//---------------
|
//---------------
|
||||||
|
|
||||||
|
|
|
@ -24,20 +24,6 @@
|
||||||
#include "matrix.h"
|
#include "matrix.h"
|
||||||
#include "MMU.h"
|
#include "MMU.h"
|
||||||
|
|
||||||
// NDS matrix math functions uses 20.12 fixed-point for calculations. According to
|
|
||||||
// GEM_TransformVertex(), dot product calculations use accumulation that goes beyond
|
|
||||||
// 32-bits and then saturates. Therefore, all fixed-point math functions (with the
|
|
||||||
// exception of matrix-by-matrix multiplication,) will also support that feature here.
|
|
||||||
//
|
|
||||||
// But for historical reasons, we can't enable this right away. Therefore, the scalar
|
|
||||||
// function GEM_TransformVertex() will continue to be used for SetVertex() while these
|
|
||||||
// fixed-point functions will remain as they are. In order to document the future
|
|
||||||
// intent of the fixed-point functions while retaining the existing functionality, the
|
|
||||||
// saturate code will be hidden by this macro.
|
|
||||||
//
|
|
||||||
// Testing is highly encouraged! Simply uncomment to try out this feature.
|
|
||||||
//#define FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
|
|
||||||
|
|
||||||
|
|
||||||
// The following floating-point functions exist for historical reasons and are deprecated.
|
// The following floating-point functions exist for historical reasons and are deprecated.
|
||||||
// They should be obsoleted and removed as more of the geometry engine moves to fixed-point.
|
// They should be obsoleted and removed as more of the geometry engine moves to fixed-point.
|
||||||
|
@ -575,7 +561,6 @@ static FORCEINLINE void __mtx4_translate_vec3_float_NEON(float (&__restrict inou
|
||||||
|
|
||||||
static FORCEINLINE s32 ___s32_saturate_shiftdown_accum64_fixed(s64 inAccum)
|
static FORCEINLINE s32 ___s32_saturate_shiftdown_accum64_fixed(s64 inAccum)
|
||||||
{
|
{
|
||||||
#ifdef FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
|
|
||||||
if (inAccum > (s64)0x000007FFFFFFFFFFULL)
|
if (inAccum > (s64)0x000007FFFFFFFFFFULL)
|
||||||
{
|
{
|
||||||
return (s32)0x7FFFFFFFU;
|
return (s32)0x7FFFFFFFU;
|
||||||
|
@ -584,7 +569,6 @@ static FORCEINLINE s32 ___s32_saturate_shiftdown_accum64_fixed(s64 inAccum)
|
||||||
{
|
{
|
||||||
return (s32)0x80000000U;
|
return (s32)0x80000000U;
|
||||||
}
|
}
|
||||||
#endif // FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
|
|
||||||
|
|
||||||
return sfx32_shiftdown(inAccum);
|
return sfx32_shiftdown(inAccum);
|
||||||
}
|
}
|
||||||
|
@ -596,6 +580,16 @@ static FORCEINLINE s32 __vec4_dotproduct_vec4_fixed(const s32 (&__restrict vecA)
|
||||||
|
|
||||||
static FORCEINLINE void __vec4_multiply_mtx4_fixed(s32 (&__restrict inoutVec)[4], const s32 (&__restrict inMtx)[16])
|
static FORCEINLINE void __vec4_multiply_mtx4_fixed(s32 (&__restrict inoutVec)[4], const s32 (&__restrict inMtx)[16])
|
||||||
{
|
{
|
||||||
|
//saturation logic is most carefully tested by:
|
||||||
|
//+ spectrobes beyond the portals excavation blower and drill tools: sets very large overflowing +x,+y in the modelview matrix to push things offscreen
|
||||||
|
//You can see this happening quite clearly: vertices will get translated to extreme values and overflow from a 7FFF-like to an 8000-like
|
||||||
|
//but if it's done wrongly, you can get bugs in:
|
||||||
|
//+ kingdom hearts re-coded: first conversation with cast characters will place them oddly with something overflowing to about 0xA???????
|
||||||
|
|
||||||
|
//other test cases that cropped up during this development, but are probably not actually related to this after all
|
||||||
|
//+ SM64: outside castle skybox
|
||||||
|
//+ NSMB: mario head screen wipe
|
||||||
|
|
||||||
const CACHE_ALIGN s32 v[4] = {inoutVec[0], inoutVec[1], inoutVec[2], inoutVec[3]};
|
const CACHE_ALIGN s32 v[4] = {inoutVec[0], inoutVec[1], inoutVec[2], inoutVec[3]};
|
||||||
|
|
||||||
inoutVec[0] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inMtx[0],v[0]) + fx32_mul(inMtx[4],v[1]) + fx32_mul(inMtx[ 8],v[2]) + fx32_mul(inMtx[12],v[3]) );
|
inoutVec[0] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inMtx[0],v[0]) + fx32_mul(inMtx[4],v[1]) + fx32_mul(inMtx[ 8],v[2]) + fx32_mul(inMtx[12],v[3]) );
|
||||||
|
@ -675,7 +669,6 @@ static FORCEINLINE void __mtx4_translate_vec3_fixed(s32 (&__restrict inoutMtx)[1
|
||||||
|
|
||||||
static FORCEINLINE void ___s32_saturate_shiftdown_accum64_fixed_SSE4(__m128i &inoutAccum)
|
static FORCEINLINE void ___s32_saturate_shiftdown_accum64_fixed_SSE4(__m128i &inoutAccum)
|
||||||
{
|
{
|
||||||
#ifdef FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
|
|
||||||
v128u8 outVecMask;
|
v128u8 outVecMask;
|
||||||
|
|
||||||
#if defined(ENABLE_SSE4_2)
|
#if defined(ENABLE_SSE4_2)
|
||||||
|
@ -697,8 +690,6 @@ static FORCEINLINE void ___s32_saturate_shiftdown_accum64_fixed_SSE4(__m128i &in
|
||||||
inoutAccum = _mm_blendv_epi8(outVecNeg, outVecPos, outVecSignMask);
|
inoutAccum = _mm_blendv_epi8(outVecNeg, outVecPos, outVecSignMask);
|
||||||
#endif // ENABLE_SSE4_2
|
#endif // ENABLE_SSE4_2
|
||||||
|
|
||||||
#endif // FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
|
|
||||||
|
|
||||||
inoutAccum = _mm_srli_epi64(inoutAccum, 12);
|
inoutAccum = _mm_srli_epi64(inoutAccum, 12);
|
||||||
inoutAccum = _mm_shuffle_epi32(inoutAccum, 0xD8);
|
inoutAccum = _mm_shuffle_epi32(inoutAccum, 0xD8);
|
||||||
}
|
}
|
||||||
|
@ -967,7 +958,6 @@ static FORCEINLINE void __mtx4_translate_vec3_fixed_SSE4(s32 (&__restrict inoutM
|
||||||
|
|
||||||
static FORCEINLINE void ___s32_saturate_shiftdown_accum64_fixed_NEON(int64x2_t &inoutAccum)
|
static FORCEINLINE void ___s32_saturate_shiftdown_accum64_fixed_NEON(int64x2_t &inoutAccum)
|
||||||
{
|
{
|
||||||
#ifdef FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
|
|
||||||
int64x2_t outVecMask;
|
int64x2_t outVecMask;
|
||||||
|
|
||||||
outVecMask = vcgtq_s64( inoutAccum, vdupq_n_s64((s64)0x000007FFFFFFFFFFULL) );
|
outVecMask = vcgtq_s64( inoutAccum, vdupq_n_s64((s64)0x000007FFFFFFFFFFULL) );
|
||||||
|
@ -975,7 +965,6 @@ static FORCEINLINE void ___s32_saturate_shiftdown_accum64_fixed_NEON(int64x2_t &
|
||||||
|
|
||||||
outVecMask = vcltq_s64( inoutAccum, vdupq_n_s64((s64)0xFFFFF80000000000ULL) );
|
outVecMask = vcltq_s64( inoutAccum, vdupq_n_s64((s64)0xFFFFF80000000000ULL) );
|
||||||
inoutAccum = vbslq_s64( outVecMask, vdupq_n_s64((s64)0xFFFFF80000000000ULL), inoutAccum );
|
inoutAccum = vbslq_s64( outVecMask, vdupq_n_s64((s64)0xFFFFF80000000000ULL), inoutAccum );
|
||||||
#endif // FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
|
|
||||||
|
|
||||||
inoutAccum = vshrq_n_s64(inoutAccum, 12);
|
inoutAccum = vshrq_n_s64(inoutAccum, 12);
|
||||||
inoutAccum = vreinterpretq_s64_s32( vuzp1q_s32(vreinterpretq_s32_s64(inoutAccum), vdupq_n_s32(0)) );
|
inoutAccum = vreinterpretq_s64_s32( vuzp1q_s32(vreinterpretq_s32_s64(inoutAccum), vdupq_n_s32(0)) );
|
||||||
|
|
Loading…
Reference in New Issue