diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp index 5a6e669bb..c15bc6154 100644 --- a/desmume/src/gfx3d.cpp +++ b/desmume/src/gfx3d.cpp @@ -621,11 +621,11 @@ inline float vec3dot(float* a, float* b) { FORCEINLINE s32 mul_fixed32(s32 a, s32 b) { - return fx32_shiftdown(fx32_mul(a,b)); + return sfx32_shiftdown(fx32_mul(a,b)); } FORCEINLINE s32 vec3dot_fixed32(const s32* a, const s32* b) { - return fx32_shiftdown(fx32_mul(a[0],b[0]) + fx32_mul(a[1],b[1]) + fx32_mul(a[2],b[2])); + return sfx32_shiftdown(fx32_mul(a[0],b[0]) + fx32_mul(a[1],b[1]) + fx32_mul(a[2],b[2])); } #define SUBMITVERTEX(ii, nn) polylist->list[polylist->count].vertIndexes[ii] = tempVertInfo.map[nn]; diff --git a/desmume/src/matrix.cpp b/desmume/src/matrix.cpp index 666029bbf..330747c69 100644 --- a/desmume/src/matrix.cpp +++ b/desmume/src/matrix.cpp @@ -44,10 +44,10 @@ void MatrixMultVec4x4 (const s32 *matrix, s32 *vecPtr) const s32 z = vecPtr[2]; const s32 w = vecPtr[3]; - vecPtr[0] = fx32_shiftdown(fx32_mul(x,matrix[0]) + fx32_mul(y,matrix[4]) + fx32_mul(z,matrix [8]) + fx32_mul(w,matrix[12])); - vecPtr[1] = fx32_shiftdown(fx32_mul(x,matrix[1]) + fx32_mul(y,matrix[5]) + fx32_mul(z,matrix[ 9]) + fx32_mul(w,matrix[13])); - vecPtr[2] = fx32_shiftdown(fx32_mul(x,matrix[2]) + fx32_mul(y,matrix[6]) + fx32_mul(z,matrix[10]) + fx32_mul(w,matrix[14])); - vecPtr[3] = fx32_shiftdown(fx32_mul(x,matrix[3]) + fx32_mul(y,matrix[7]) + fx32_mul(z,matrix[11]) + fx32_mul(w,matrix[15])); + vecPtr[0] = sfx32_shiftdown(fx32_mul(x,matrix[0]) + fx32_mul(y,matrix[4]) + fx32_mul(z,matrix [8]) + fx32_mul(w,matrix[12])); + vecPtr[1] = sfx32_shiftdown(fx32_mul(x,matrix[1]) + fx32_mul(y,matrix[5]) + fx32_mul(z,matrix[ 9]) + fx32_mul(w,matrix[13])); + vecPtr[2] = sfx32_shiftdown(fx32_mul(x,matrix[2]) + fx32_mul(y,matrix[6]) + fx32_mul(z,matrix[10]) + fx32_mul(w,matrix[14])); + vecPtr[3] = sfx32_shiftdown(fx32_mul(x,matrix[3]) + fx32_mul(y,matrix[7]) + fx32_mul(z,matrix[11]) + fx32_mul(w,matrix[15])); } void MatrixMultVec3x3_fixed(const s32 *matrix, s32 *vecPtr) @@ -56,9 +56,9 @@ void MatrixMultVec3x3_fixed(const s32 *matrix, s32 *vecPtr) const s32 y = vecPtr[1]; const s32 z = vecPtr[2]; - vecPtr[0] = fx32_shiftdown(fx32_mul(x,matrix[0]) + fx32_mul(y,matrix[4]) + fx32_mul(z,matrix[8])); - vecPtr[1] = fx32_shiftdown(fx32_mul(x,matrix[1]) + fx32_mul(y,matrix[5]) + fx32_mul(z,matrix[9])); - vecPtr[2] = fx32_shiftdown(fx32_mul(x,matrix[2]) + fx32_mul(y,matrix[6]) + fx32_mul(z,matrix[10])); + vecPtr[0] = sfx32_shiftdown(fx32_mul(x,matrix[0]) + fx32_mul(y,matrix[4]) + fx32_mul(z,matrix[8])); + vecPtr[1] = sfx32_shiftdown(fx32_mul(x,matrix[1]) + fx32_mul(y,matrix[5]) + fx32_mul(z,matrix[9])); + vecPtr[2] = sfx32_shiftdown(fx32_mul(x,matrix[2]) + fx32_mul(y,matrix[6]) + fx32_mul(z,matrix[10])); } //------------------------- @@ -384,25 +384,25 @@ void MatrixMultiply (s32 *matrix, const s32 *rightMatrix) { s32 tmpMatrix[16]; - tmpMatrix[0] = fx32_shiftdown(fx32_mul(matrix[0],rightMatrix[0])+fx32_mul(matrix[4],rightMatrix[1])+fx32_mul(matrix[8],rightMatrix[2])+fx32_mul(matrix[12],rightMatrix[3])); - tmpMatrix[1] = fx32_shiftdown(fx32_mul(matrix[1],rightMatrix[0])+fx32_mul(matrix[5],rightMatrix[1])+fx32_mul(matrix[9],rightMatrix[2])+fx32_mul(matrix[13],rightMatrix[3])); - tmpMatrix[2] = fx32_shiftdown(fx32_mul(matrix[2],rightMatrix[0])+fx32_mul(matrix[6],rightMatrix[1])+fx32_mul(matrix[10],rightMatrix[2])+fx32_mul(matrix[14],rightMatrix[3])); - tmpMatrix[3] = fx32_shiftdown(fx32_mul(matrix[3],rightMatrix[0])+fx32_mul(matrix[7],rightMatrix[1])+fx32_mul(matrix[11],rightMatrix[2])+fx32_mul(matrix[15],rightMatrix[3])); + tmpMatrix[0] = sfx32_shiftdown(fx32_mul(matrix[0],rightMatrix[0])+fx32_mul(matrix[4],rightMatrix[1])+fx32_mul(matrix[8],rightMatrix[2])+fx32_mul(matrix[12],rightMatrix[3])); + tmpMatrix[1] = sfx32_shiftdown(fx32_mul(matrix[1],rightMatrix[0])+fx32_mul(matrix[5],rightMatrix[1])+fx32_mul(matrix[9],rightMatrix[2])+fx32_mul(matrix[13],rightMatrix[3])); + tmpMatrix[2] = sfx32_shiftdown(fx32_mul(matrix[2],rightMatrix[0])+fx32_mul(matrix[6],rightMatrix[1])+fx32_mul(matrix[10],rightMatrix[2])+fx32_mul(matrix[14],rightMatrix[3])); + tmpMatrix[3] = sfx32_shiftdown(fx32_mul(matrix[3],rightMatrix[0])+fx32_mul(matrix[7],rightMatrix[1])+fx32_mul(matrix[11],rightMatrix[2])+fx32_mul(matrix[15],rightMatrix[3])); - tmpMatrix[4] = fx32_shiftdown(fx32_mul(matrix[0],rightMatrix[4])+fx32_mul(matrix[4],rightMatrix[5])+fx32_mul(matrix[8],rightMatrix[6])+fx32_mul(matrix[12],rightMatrix[7])); - tmpMatrix[5] = fx32_shiftdown(fx32_mul(matrix[1],rightMatrix[4])+fx32_mul(matrix[5],rightMatrix[5])+fx32_mul(matrix[9],rightMatrix[6])+fx32_mul(matrix[13],rightMatrix[7])); - tmpMatrix[6] = fx32_shiftdown(fx32_mul(matrix[2],rightMatrix[4])+fx32_mul(matrix[6],rightMatrix[5])+fx32_mul(matrix[10],rightMatrix[6])+fx32_mul(matrix[14],rightMatrix[7])); - tmpMatrix[7] = fx32_shiftdown(fx32_mul(matrix[3],rightMatrix[4])+fx32_mul(matrix[7],rightMatrix[5])+fx32_mul(matrix[11],rightMatrix[6])+fx32_mul(matrix[15],rightMatrix[7])); + tmpMatrix[4] = sfx32_shiftdown(fx32_mul(matrix[0],rightMatrix[4])+fx32_mul(matrix[4],rightMatrix[5])+fx32_mul(matrix[8],rightMatrix[6])+fx32_mul(matrix[12],rightMatrix[7])); + tmpMatrix[5] = sfx32_shiftdown(fx32_mul(matrix[1],rightMatrix[4])+fx32_mul(matrix[5],rightMatrix[5])+fx32_mul(matrix[9],rightMatrix[6])+fx32_mul(matrix[13],rightMatrix[7])); + tmpMatrix[6] = sfx32_shiftdown(fx32_mul(matrix[2],rightMatrix[4])+fx32_mul(matrix[6],rightMatrix[5])+fx32_mul(matrix[10],rightMatrix[6])+fx32_mul(matrix[14],rightMatrix[7])); + tmpMatrix[7] = sfx32_shiftdown(fx32_mul(matrix[3],rightMatrix[4])+fx32_mul(matrix[7],rightMatrix[5])+fx32_mul(matrix[11],rightMatrix[6])+fx32_mul(matrix[15],rightMatrix[7])); - tmpMatrix[8] = fx32_shiftdown(fx32_mul(matrix[0],rightMatrix[8])+fx32_mul(matrix[4],rightMatrix[9])+fx32_mul(matrix[8],rightMatrix[10])+fx32_mul(matrix[12],rightMatrix[11])); - tmpMatrix[9] = fx32_shiftdown(fx32_mul(matrix[1],rightMatrix[8])+fx32_mul(matrix[5],rightMatrix[9])+fx32_mul(matrix[9],rightMatrix[10])+fx32_mul(matrix[13],rightMatrix[11])); - tmpMatrix[10] = fx32_shiftdown(fx32_mul(matrix[2],rightMatrix[8])+fx32_mul(matrix[6],rightMatrix[9])+fx32_mul(matrix[10],rightMatrix[10])+fx32_mul(matrix[14],rightMatrix[11])); - tmpMatrix[11] = fx32_shiftdown(fx32_mul(matrix[3],rightMatrix[8])+fx32_mul(matrix[7],rightMatrix[9])+fx32_mul(matrix[11],rightMatrix[10])+fx32_mul(matrix[15],rightMatrix[11])); + tmpMatrix[8] = sfx32_shiftdown(fx32_mul(matrix[0],rightMatrix[8])+fx32_mul(matrix[4],rightMatrix[9])+fx32_mul(matrix[8],rightMatrix[10])+fx32_mul(matrix[12],rightMatrix[11])); + tmpMatrix[9] = sfx32_shiftdown(fx32_mul(matrix[1],rightMatrix[8])+fx32_mul(matrix[5],rightMatrix[9])+fx32_mul(matrix[9],rightMatrix[10])+fx32_mul(matrix[13],rightMatrix[11])); + tmpMatrix[10] = sfx32_shiftdown(fx32_mul(matrix[2],rightMatrix[8])+fx32_mul(matrix[6],rightMatrix[9])+fx32_mul(matrix[10],rightMatrix[10])+fx32_mul(matrix[14],rightMatrix[11])); + tmpMatrix[11] = sfx32_shiftdown(fx32_mul(matrix[3],rightMatrix[8])+fx32_mul(matrix[7],rightMatrix[9])+fx32_mul(matrix[11],rightMatrix[10])+fx32_mul(matrix[15],rightMatrix[11])); - tmpMatrix[12] = fx32_shiftdown(fx32_mul(matrix[0],rightMatrix[12])+fx32_mul(matrix[4],rightMatrix[13])+fx32_mul(matrix[8],rightMatrix[14])+fx32_mul(matrix[12],rightMatrix[15])); - tmpMatrix[13] = fx32_shiftdown(fx32_mul(matrix[1],rightMatrix[12])+fx32_mul(matrix[5],rightMatrix[13])+fx32_mul(matrix[9],rightMatrix[14])+fx32_mul(matrix[13],rightMatrix[15])); - tmpMatrix[14] = fx32_shiftdown(fx32_mul(matrix[2],rightMatrix[12])+fx32_mul(matrix[6],rightMatrix[13])+fx32_mul(matrix[10],rightMatrix[14])+fx32_mul(matrix[14],rightMatrix[15])); - tmpMatrix[15] = fx32_shiftdown(fx32_mul(matrix[3],rightMatrix[12])+fx32_mul(matrix[7],rightMatrix[13])+fx32_mul(matrix[11],rightMatrix[14])+fx32_mul(matrix[15],rightMatrix[15])); + tmpMatrix[12] = sfx32_shiftdown(fx32_mul(matrix[0],rightMatrix[12])+fx32_mul(matrix[4],rightMatrix[13])+fx32_mul(matrix[8],rightMatrix[14])+fx32_mul(matrix[12],rightMatrix[15])); + tmpMatrix[13] = sfx32_shiftdown(fx32_mul(matrix[1],rightMatrix[12])+fx32_mul(matrix[5],rightMatrix[13])+fx32_mul(matrix[9],rightMatrix[14])+fx32_mul(matrix[13],rightMatrix[15])); + tmpMatrix[14] = sfx32_shiftdown(fx32_mul(matrix[2],rightMatrix[12])+fx32_mul(matrix[6],rightMatrix[13])+fx32_mul(matrix[10],rightMatrix[14])+fx32_mul(matrix[14],rightMatrix[15])); + tmpMatrix[15] = sfx32_shiftdown(fx32_mul(matrix[3],rightMatrix[12])+fx32_mul(matrix[7],rightMatrix[13])+fx32_mul(matrix[11],rightMatrix[14])+fx32_mul(matrix[15],rightMatrix[15])); memcpy(matrix,tmpMatrix,sizeof(s32)*16); } @@ -411,7 +411,7 @@ void MatrixScale(s32 *matrix, const s32 *ptr) { //zero 21-sep-2010 - verified unrolling seems faster on my cpu MACRODO_N(12, - matrix[X] = fx32_shiftdown(fx32_mul(matrix[X],ptr[X>>2])) + matrix[X] = sfx32_shiftdown(fx32_mul(matrix[X],ptr[X>>2])) ); } @@ -423,7 +423,7 @@ void MatrixTranslate(s32 *matrix, const s32 *ptr) temp += fx32_mul(matrix[X+0],ptr[0]); temp += fx32_mul(matrix[X+4],ptr[1]); temp += fx32_mul(matrix[X+8],ptr[2]); - matrix[X+12] = fx32_shiftdown(temp); + matrix[X+12] = sfx32_shiftdown(temp); }); } diff --git a/desmume/src/types.h b/desmume/src/types.h index 7d46faa0c..1cfd8a95e 100644 --- a/desmume/src/types.h +++ b/desmume/src/types.h @@ -21,6 +21,7 @@ #include #include +#include //analyze microsoft compilers #ifdef _MSC_VER @@ -458,47 +459,30 @@ template inline void reconstruct(T* t) { new(t) T(); } -//-------------fixed point speedup macros +/* fixed point speedup macros */ -#ifdef _MSC_VER -#include -#endif - -FORCEINLINE s64 fx32_mul(const s32 a, const s32 b) +FORCEINLINE s32 sfx32_shiftdown(const s64 a) { -#ifdef _MSC_VER - return __emul(a,b); -#else - return ((s64)a)*((s64)b); -#endif -} + s64 shifted = fx32_shiftdown(a); -FORCEINLINE s32 fx32_shiftdown(const s64 a) -{ - s64 shifted; -#ifdef _MSC_VER - shifted = __ll_rshift(a,12); -#else - shifted = (a>>12); -#endif - //either matrix math is happening at higher precision (an extra bit would suffice, I think), or the sums sent to this are saturated. - //tested by: spectrobes beyond the portals excavation blower - //(it sets very large +x,+y in the modelview matrix to push things offscreen, but the +y will overflow and become negative if we're not careful) - //I didnt think very hard about what would be fastest here on 32bit systems - //NOTE: this was intended for use in MatrixMultVec4x4_M2; it may not be appropriate for other uses of fx32_shiftdown. - //if this causes problems we should refactor the math routines a bit to take care of saturating in another function - if(shifted>(s32)0x7FFFFFFF) return 0x7FFFFFFF; - else if(shifted<=(s32)0x80000000) return 0x80000000; - else return shifted; -} - -FORCEINLINE s64 fx32_shiftup(const s32 a) -{ -#ifdef _MSC_VER - return __ll_lshift(a,12); -#else - return ((s64)a)<<12; -#endif + /*either matrix math is happening at higher precision (an extra bit would suffice, + * I think), or the sums sent to this are saturated. + * + *tested by: spectrobes beyond the portals excavation blower + *(it sets very large +x,+y in the modelview matrix to push things offscreen, + *but the +y will overflow and become negative if we're not careful) + * + *I didnt think very hard about what would be fastest here on 32bit systems + *NOTE: this was intended for use in MatrixMultVec4x4_M2; it may not be appropriate for + * other uses of fx32_shiftdown. + *if this causes problems we should refactor the math routines a bit to take care of + * saturating in another function + */ + if(shifted>(s32)0x7FFFFFFF) + return 0x7FFFFFFF; + if(shifted<=(s32)0x80000000) + return 0x80000000; + return shifted; } #endif