diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp index d89e0ef9c..d5d138508 100644 --- a/desmume/src/gfx3d.cpp +++ b/desmume/src/gfx3d.cpp @@ -1199,8 +1199,6 @@ static BOOL gfx3d_glLoadMatrix4x4(s32 v) GFX_DELAY(19); - //vector_fix2float<4>(mtxCurrent[mode], 4096.f); - if (mode == MATRIXMODE_POSITION_VECTOR) MatrixCopy(mtxCurrent[MATRIXMODE_POSITION], mtxCurrent[MATRIXMODE_POSITION_VECTOR]); @@ -1217,8 +1215,6 @@ static BOOL gfx3d_glLoadMatrix4x3(s32 v) if(ML4x3ind<16) return FALSE; ML4x3ind = 0; - //vector_fix2float<4>(mtxCurrent[mode], 4096.f); - //fill in the unusued matrix values mtxCurrent[mode][3] = mtxCurrent[mode][7] = mtxCurrent[mode][11] = 0; mtxCurrent[mode][15] = (1<<12); @@ -1241,8 +1237,6 @@ static BOOL gfx3d_glMultMatrix4x4(s32 v) GFX_DELAY(35); - //vector_fix2float<4>(mtxTemporal, 4096.f); - MatrixMultiply(mtxCurrent[mode], mtxTemporal); if (mode == MATRIXMODE_POSITION_VECTOR) @@ -1273,8 +1267,6 @@ static BOOL gfx3d_glMultMatrix4x3(s32 v) GFX_DELAY(31); - //vector_fix2float<4>(mtxTemporal, 4096.f); - //fill in the unusued matrix values mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0; mtxTemporal[15] = 1 << 12; @@ -1310,8 +1302,6 @@ static BOOL gfx3d_glMultMatrix3x3(s32 v) GFX_DELAY(28); - //vector_fix2float<3>(mtxTemporal, 4096.f); - //fill in the unusued matrix values mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0; mtxTemporal[15] = 1<<12; @@ -1796,8 +1786,6 @@ static BOOL gfx3d_glBoxTest(u32 v) //DS_ALIGN(16) VERT_POS4f vert = { verts[i].x, verts[i].y, verts[i].z, verts[i].w }; - //_MatrixMultVec4x4_NoSIMD(mtxCurrent[MATRIXMODE_POSITION], verts[i].coord); - //_MatrixMultVec4x4_NoSIMD(mtxCurrent[MATRIXMODE_PROJECTION], verts[i].coord); MatrixMultVec4x4(mtxCurrent[MATRIXMODE_POSITION], verts[i].coord); MatrixMultVec4x4(mtxCurrent[MATRIXMODE_PROJECTION], verts[i].coord); } diff --git a/desmume/src/matrix.cpp b/desmume/src/matrix.cpp index d8f806327..1930c4987 100644 --- a/desmume/src/matrix.cpp +++ b/desmume/src/matrix.cpp @@ -1,6 +1,6 @@ /* Copyright (C) 2006-2007 shash - Copyright (C) 2007-2018 DeSmuME team + Copyright (C) 2007-2022 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -24,6 +24,849 @@ #include "matrix.h" #include "MMU.h" +// NDS matrix math functions uses 20.12 fixed-point for calculations. According to +// GEM_TransformVertex(), dot product calculations use accumulation that goes beyond +// 32-bits and then saturates. Therefore, all fixed-point math functions will also +// support that feature here. +// +// But for historical reasons, we can't enable this right away. Therefore, the scalar +// function GEM_TransformVertex() will continue to be used for SetVertex() while these +// fixed-point functions will remain as they are. In order to document the future +// intent of the fixed-point functions while retaining the existing functionality, the +// saturate code will be hidden by this macro. +// +// Testing is highly encouraged! Simply uncomment to try out this feature. +//#define FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE + + +// The following floating-point functions exist for historical reasons and are deprecated. +// They should be obsoleted and removed as more of the geometry engine moves to fixed-point. 
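For context, here is a minimal standalone sketch (not part of the patch) of what the saturating 20.12 dot product described above looks like in plain C++. It assumes fx32_mul() expands its two 20.12 operands to a 64-bit product and that the final shift-down is arithmetic, which is how ___s32_saturate_shiftdown_accum64_fixed() and sfx32_shiftdown() are used later in this file:

	// Hypothetical standalone illustration, not part of the patch.
	static s32 fixed20_12_dot4(const s32 (&a)[4], const s32 (&b)[4])
	{
		// Each product of two 20.12 operands is a 40.24 value, so the sum of
		// four products needs a 64-bit accumulator.
		s64 accum = 0;
		for (size_t i = 0; i < 4; i++)
			accum += (s64)a[i] * (s64)b[i];
		
		// The optional saturate that FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
		// would enable: clamp anything that won't fit in 20.12 after the shift.
		if (accum > (s64)0x000007FFFFFFFFFFULL)
			return (s32)0x7FFFFFFFU;
		else if (accum < (s64)0xFFFFF80000000000ULL)
			return (s32)0x80000000U;
		
		// Shift the 40.24 accumulator back down to 20.12 for the 32-bit result.
		return (s32)(accum >> 12);
	}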
+static FORCEINLINE void __mtx4_copy_mtx4_float(float (&__restrict outMtx)[16], const s32 (&__restrict inMtx)[16]) +{ + outMtx[ 0] = (float)inMtx[ 0]; + outMtx[ 1] = (float)inMtx[ 1]; + outMtx[ 2] = (float)inMtx[ 2]; + outMtx[ 3] = (float)inMtx[ 3]; + + outMtx[ 4] = (float)inMtx[ 4]; + outMtx[ 5] = (float)inMtx[ 5]; + outMtx[ 6] = (float)inMtx[ 6]; + outMtx[ 7] = (float)inMtx[ 7]; + + outMtx[ 8] = (float)inMtx[ 8]; + outMtx[ 9] = (float)inMtx[ 9]; + outMtx[10] = (float)inMtx[10]; + outMtx[11] = (float)inMtx[11]; + + outMtx[12] = (float)inMtx[12]; + outMtx[13] = (float)inMtx[13]; + outMtx[14] = (float)inMtx[14]; + outMtx[15] = (float)inMtx[15]; +} + +static FORCEINLINE void __mtx4_copynormalize_mtx4_float(float (&__restrict outMtx)[16], const s32 (&__restrict inMtx)[16]) +{ + outMtx[ 0] = (float)inMtx[ 0] / 4096.0f; + outMtx[ 1] = (float)inMtx[ 1] / 4096.0f; + outMtx[ 2] = (float)inMtx[ 2] / 4096.0f; + outMtx[ 3] = (float)inMtx[ 3] / 4096.0f; + + outMtx[ 4] = (float)inMtx[ 4] / 4096.0f; + outMtx[ 5] = (float)inMtx[ 5] / 4096.0f; + outMtx[ 6] = (float)inMtx[ 6] / 4096.0f; + outMtx[ 7] = (float)inMtx[ 7] / 4096.0f; + + outMtx[ 8] = (float)inMtx[ 8] / 4096.0f; + outMtx[ 9] = (float)inMtx[ 9] / 4096.0f; + outMtx[10] = (float)inMtx[10] / 4096.0f; + outMtx[11] = (float)inMtx[11] / 4096.0f; + + outMtx[12] = (float)inMtx[12] / 4096.0f; + outMtx[13] = (float)inMtx[13] / 4096.0f; + outMtx[14] = (float)inMtx[14] / 4096.0f; + outMtx[15] = (float)inMtx[15] / 4096.0f; +} + +static FORCEINLINE float __vec4_dotproduct_vec4_float(const float (&__restrict vecA)[4], const float (&__restrict vecB)[4]) +{ + return (vecA[0] * vecB[0]) + (vecA[1] * vecB[1]) + (vecA[2] * vecB[2]) + (vecA[3] * vecB[3]); +} + +static FORCEINLINE void __vec4_multiply_mtx4_float(float (&__restrict inoutVec)[4], const s32 (&__restrict inMtx)[16]) +{ + const CACHE_ALIGN float v[4] = {inoutVec[0], inoutVec[1], inoutVec[2], inoutVec[3]}; + + CACHE_ALIGN float m[16]; + __mtx4_copynormalize_mtx4_float(m, inMtx); + + inoutVec[0] = (m[0] * v[0]) + (m[4] * v[1]) + (m[ 8] * v[2]) + (m[12] * v[3]); + inoutVec[1] = (m[1] * v[0]) + (m[5] * v[1]) + (m[ 9] * v[2]) + (m[13] * v[3]); + inoutVec[2] = (m[2] * v[0]) + (m[6] * v[1]) + (m[10] * v[2]) + (m[14] * v[3]); + inoutVec[3] = (m[3] * v[0]) + (m[7] * v[1]) + (m[11] * v[2]) + (m[15] * v[3]); +} + +static FORCEINLINE void __vec3_multiply_mtx3_float(float (&__restrict inoutVec)[4], const s32 (&__restrict inMtx)[16]) +{ + const CACHE_ALIGN float v[4] = {inoutVec[0], inoutVec[1], inoutVec[2], inoutVec[3]}; + + CACHE_ALIGN float m[16]; + __mtx4_copynormalize_mtx4_float(m, inMtx); + + inoutVec[0] = (m[0] * v[0]) + (m[4] * v[1]) + (m[ 8] * v[2]); + inoutVec[1] = (m[1] * v[0]) + (m[5] * v[1]) + (m[ 9] * v[2]); + inoutVec[2] = (m[2] * v[0]) + (m[6] * v[1]) + (m[10] * v[2]); + inoutVec[3] = v[3]; +} + +static FORCEINLINE void __mtx4_multiply_mtx4_float(float (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16]) +{ + CACHE_ALIGN float a[16]; + CACHE_ALIGN float b[16]; + + MatrixCopy(a, mtxA); + + // Can't call normal MatrixCopy() because the types would cause mtxB to become normalized. + // So instead, we need to call __mtx4_copy_mtx4_float() directly to copy the unmodified + // matrix values. 
+ __mtx4_copy_mtx4_float(b, mtxB); + + mtxA[ 0] = (a[ 0] * b[ 0]) + (a[ 4] * b[ 1]) + (a[ 8] * b[ 2]) + (a[12] * b[ 3]); + mtxA[ 1] = (a[ 1] * b[ 0]) + (a[ 5] * b[ 1]) + (a[ 9] * b[ 2]) + (a[13] * b[ 3]); + mtxA[ 2] = (a[ 2] * b[ 0]) + (a[ 6] * b[ 1]) + (a[10] * b[ 2]) + (a[14] * b[ 3]); + mtxA[ 3] = (a[ 3] * b[ 0]) + (a[ 7] * b[ 1]) + (a[11] * b[ 2]) + (a[15] * b[ 3]); + + mtxA[ 4] = (a[ 0] * b[ 4]) + (a[ 4] * b[ 5]) + (a[ 8] * b[ 6]) + (a[12] * b[ 7]); + mtxA[ 5] = (a[ 1] * b[ 4]) + (a[ 5] * b[ 5]) + (a[ 9] * b[ 6]) + (a[13] * b[ 7]); + mtxA[ 6] = (a[ 2] * b[ 4]) + (a[ 6] * b[ 5]) + (a[10] * b[ 6]) + (a[14] * b[ 7]); + mtxA[ 7] = (a[ 3] * b[ 4]) + (a[ 7] * b[ 5]) + (a[11] * b[ 6]) + (a[15] * b[ 7]); + + mtxA[ 8] = (a[ 0] * b[ 8]) + (a[ 4] * b[ 9]) + (a[ 8] * b[10]) + (a[12] * b[11]); + mtxA[ 9] = (a[ 1] * b[ 8]) + (a[ 5] * b[ 9]) + (a[ 9] * b[10]) + (a[13] * b[11]); + mtxA[10] = (a[ 2] * b[ 8]) + (a[ 6] * b[ 9]) + (a[10] * b[10]) + (a[14] * b[11]); + mtxA[11] = (a[ 3] * b[ 8]) + (a[ 7] * b[ 9]) + (a[11] * b[10]) + (a[15] * b[11]); + + mtxA[12] = (a[ 0] * b[12]) + (a[ 4] * b[13]) + (a[ 8] * b[14]) + (a[12] * b[15]); + mtxA[13] = (a[ 1] * b[12]) + (a[ 5] * b[13]) + (a[ 9] * b[14]) + (a[13] * b[15]); + mtxA[14] = (a[ 2] * b[12]) + (a[ 6] * b[13]) + (a[10] * b[14]) + (a[14] * b[15]); + mtxA[15] = (a[ 3] * b[12]) + (a[ 7] * b[13]) + (a[11] * b[14]) + (a[15] * b[15]); +} + +static FORCEINLINE void __mtx4_scale_vec3_float(float (&__restrict inoutMtx)[16], const float (&__restrict inVec)[4]) +{ + inoutMtx[ 0] *= inVec[0]; + inoutMtx[ 1] *= inVec[0]; + inoutMtx[ 2] *= inVec[0]; + inoutMtx[ 3] *= inVec[0]; + + inoutMtx[ 4] *= inVec[1]; + inoutMtx[ 5] *= inVec[1]; + inoutMtx[ 6] *= inVec[1]; + inoutMtx[ 7] *= inVec[1]; + + inoutMtx[ 8] *= inVec[2]; + inoutMtx[ 9] *= inVec[2]; + inoutMtx[10] *= inVec[2]; + inoutMtx[11] *= inVec[2]; +} + +static FORCEINLINE void __mtx4_translate_vec3_float(float (&__restrict inoutMtx)[16], const float (&__restrict inVec)[4]) +{ + inoutMtx[12] = (inoutMtx[0] * inVec[0]) + (inoutMtx[4] * inVec[1]) + (inoutMtx[ 8] * inVec[2]); + inoutMtx[13] = (inoutMtx[1] * inVec[0]) + (inoutMtx[5] * inVec[1]) + (inoutMtx[ 9] * inVec[2]); + inoutMtx[14] = (inoutMtx[2] * inVec[0]) + (inoutMtx[6] * inVec[1]) + (inoutMtx[10] * inVec[2]); + inoutMtx[15] = (inoutMtx[3] * inVec[0]) + (inoutMtx[7] * inVec[1]) + (inoutMtx[11] * inVec[2]); +} + +// These SIMD functions may look fancy, but they still operate using floating-point, and therefore +// need to be obsoleted and removed. They exist for historical reasons, one of which is that they +// run on very old CPUs through plain ol' SSE. However, future geometry engine work will only be +// moving towards using the native NDS 20.12 fixed-point math, and so the fixed-point equivalent +// functions shall take precendence over these. 
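As a small aside, a hedged sketch of the 20.12-to-float conversion convention that all of these float helpers rely on (the helper names here are hypothetical, not part of the patch):

	// Illustration only: a 20.12 value maps to float by dividing by 4096 (1 << 12),
	// which is why MatrixIdentity() for s32 writes (1 << 12) on the diagonal where
	// the float version writes 1.0f.
	static inline float fx32_to_float(const s32 fx) { return (float)fx / 4096.0f; }
	static inline s32 float_to_fx32(const float f)  { return (s32)(f * 4096.0f); } // truncates toward zero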
+ +#ifdef ENABLE_SSE + +#ifdef ENABLE_SSE2 +static FORCEINLINE void __mtx4_copy_mtx4_float_SSE2(float (&__restrict outMtx)[16], const s32 (&__restrict inMtx)[16]) +{ + _mm_store_ps( outMtx + 0, _mm_cvtepi32_ps( _mm_load_si128((v128s32 *)inMtx+0) ) ); + _mm_store_ps( outMtx + 4, _mm_cvtepi32_ps( _mm_load_si128((v128s32 *)inMtx+1) ) ); + _mm_store_ps( outMtx + 8, _mm_cvtepi32_ps( _mm_load_si128((v128s32 *)inMtx+2) ) ); + _mm_store_ps( outMtx +12, _mm_cvtepi32_ps( _mm_load_si128((v128s32 *)inMtx+3) ) ); +} +#endif // ENABLE_SSE2 + +static FORCEINLINE void __mtx4_copynormalize_mtx4_float_SSE(float (&__restrict outMtx)[16], const s32 (&__restrict inMtx)[16]) +{ +#ifdef ENABLE_SSE2 + __m128 row[4] = { + _mm_cvtepi32_ps( _mm_load_si128((v128s32 *)inMtx+0) ), + _mm_cvtepi32_ps( _mm_load_si128((v128s32 *)inMtx+1) ), + _mm_cvtepi32_ps( _mm_load_si128((v128s32 *)inMtx+2) ), + _mm_cvtepi32_ps( _mm_load_si128((v128s32 *)inMtx+3) ) + }; +#else + __m128 row[4] = { + _mm_setr_ps((float)inMtx[ 0], (float)inMtx[ 1], (float)inMtx[ 2], (float)inMtx[ 3]), + _mm_setr_ps((float)inMtx[ 4], (float)inMtx[ 5], (float)inMtx[ 6], (float)inMtx[ 7]), + _mm_setr_ps((float)inMtx[ 8], (float)inMtx[ 9], (float)inMtx[10], (float)inMtx[11]), + _mm_setr_ps((float)inMtx[12], (float)inMtx[13], (float)inMtx[14], (float)inMtx[15]) + }; +#endif // ENABLE_SSE2 + + const __m128 normalize = _mm_set1_ps(1.0f/4096.0f); + + row[0] = _mm_mul_ps(row[0], normalize); + _mm_store_ps(outMtx + 0, row[0]); + row[1] = _mm_mul_ps(row[1], normalize); + _mm_store_ps(outMtx + 4, row[1]); + row[2] = _mm_mul_ps(row[2], normalize); + _mm_store_ps(outMtx + 8, row[2]); + row[3] = _mm_mul_ps(row[3], normalize); + _mm_store_ps(outMtx +12, row[3]); +} + +#ifdef ENABLE_SSE4_1 +static FORCEINLINE float __vec4_dotproduct_vec4_float_SSE4(const float (&__restrict vecA)[4], const float (&__restrict vecB)[4]) +{ + const __m128 a = _mm_load_ps(vecA); + const __m128 b = _mm_load_ps(vecB); + const __m128 sum = _mm_dp_ps(a, b, 0xF1); + + return _mm_cvtss_f32(sum); +} +#endif // ENABLE_SSE4_1 + +static FORCEINLINE void __vec4_multiply_mtx4_float_SSE(float (&__restrict inoutVec)[4], const s32 (&__restrict inMtx)[16]) +{ +#ifdef ENABLE_SSE2 + __m128 row[4] = { + _mm_cvtepi32_ps( _mm_load_si128((v128s32 *)inMtx+0) ), + _mm_cvtepi32_ps( _mm_load_si128((v128s32 *)inMtx+1) ), + _mm_cvtepi32_ps( _mm_load_si128((v128s32 *)inMtx+2) ), + _mm_cvtepi32_ps( _mm_load_si128((v128s32 *)inMtx+3) ) + }; +#else + __m128 row[4] = { + _mm_setr_ps((float)inMtx[ 0], (float)inMtx[ 1], (float)inMtx[ 2], (float)inMtx[ 3]), + _mm_setr_ps((float)inMtx[ 4], (float)inMtx[ 5], (float)inMtx[ 6], (float)inMtx[ 7]), + _mm_setr_ps((float)inMtx[ 8], (float)inMtx[ 9], (float)inMtx[10], (float)inMtx[11]), + _mm_setr_ps((float)inMtx[12], (float)inMtx[13], (float)inMtx[14], (float)inMtx[15]) + }; +#endif // ENABLE_SSE2 + + const __m128 normalize = _mm_set1_ps(1.0f/4096.0f); + row[0] = _mm_mul_ps(row[0], normalize); + row[1] = _mm_mul_ps(row[1], normalize); + row[2] = _mm_mul_ps(row[2], normalize); + row[3] = _mm_mul_ps(row[3], normalize); + + const __m128 inVec = _mm_load_ps(inoutVec); + const __m128 v[4] = { + _mm_shuffle_ps(inVec, inVec, 0x00), + _mm_shuffle_ps(inVec, inVec, 0x55), + _mm_shuffle_ps(inVec, inVec, 0xAA), + _mm_shuffle_ps(inVec, inVec, 0xFF) + }; + + __m128 outVec; + outVec = _mm_mul_ps(row[0], v[0]); + outVec = _mm_add_ps( outVec, _mm_mul_ps(row[1], v[1]) ); + outVec = _mm_add_ps( outVec, _mm_mul_ps(row[2], v[2]) ); + outVec = _mm_add_ps( outVec, _mm_mul_ps(row[3], v[3]) ); + + 
_mm_store_ps(inoutVec, outVec); +} + +static FORCEINLINE void __vec3_multiply_mtx3_float_SSE(float (&__restrict inoutVec)[4], const s32 (&__restrict inMtx)[16]) +{ +#ifdef ENABLE_SSE2 + __m128 row[3] = { + _mm_cvtepi32_ps( _mm_load_si128((v128s32 *)inMtx+0) ), + _mm_cvtepi32_ps( _mm_load_si128((v128s32 *)inMtx+1) ), + _mm_cvtepi32_ps( _mm_load_si128((v128s32 *)inMtx+2) ) + }; +#else + __m128 row[3] = { + _mm_setr_ps((float)inMtx[ 0], (float)inMtx[ 1], (float)inMtx[ 2], (float)inMtx[ 3]), + _mm_setr_ps((float)inMtx[ 4], (float)inMtx[ 5], (float)inMtx[ 6], (float)inMtx[ 7]), + _mm_setr_ps((float)inMtx[ 8], (float)inMtx[ 9], (float)inMtx[10], (float)inMtx[11]) + }; +#endif // ENABLE_SSE2 + + const __m128 normalize = _mm_set1_ps(1.0f/4096.0f); + row[0] = _mm_mul_ps(row[0], normalize); + row[1] = _mm_mul_ps(row[1], normalize); + row[2] = _mm_mul_ps(row[2], normalize); + + const __m128 inVec = _mm_load_ps(inoutVec); + const __m128 v[3] = { + _mm_shuffle_ps(inVec, inVec, 0x00), + _mm_shuffle_ps(inVec, inVec, 0x55), + _mm_shuffle_ps(inVec, inVec, 0xAA) + }; + + __m128 outVec; + outVec = _mm_mul_ps(row[0], v[0]); + outVec = _mm_add_ps( outVec, _mm_mul_ps(row[1], v[1]) ); + outVec = _mm_add_ps( outVec, _mm_mul_ps(row[2], v[2]) ); + + _mm_store_ps(inoutVec, outVec); +} + +static FORCEINLINE void __mtx4_multiply_mtx4_float_SSE(float (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16]) +{ +#ifdef ENABLE_SSE2 + __m128 rowB[4] = { + _mm_cvtepi32_ps( _mm_load_si128((v128s32 *)mtxB + 0) ), + _mm_cvtepi32_ps( _mm_load_si128((v128s32 *)mtxB + 1) ), + _mm_cvtepi32_ps( _mm_load_si128((v128s32 *)mtxB + 2) ), + _mm_cvtepi32_ps( _mm_load_si128((v128s32 *)mtxB + 3) ) + }; +#else + __m128 rowB[4] = { + _mm_setr_ps((float)mtxB[ 0], (float)mtxB[ 1], (float)mtxB[ 2], (float)mtxB[ 3]), + _mm_setr_ps((float)mtxB[ 4], (float)mtxB[ 5], (float)mtxB[ 6], (float)mtxB[ 7]), + _mm_setr_ps((float)mtxB[ 8], (float)mtxB[ 9], (float)mtxB[10], (float)mtxB[11]), + _mm_setr_ps((float)mtxB[12], (float)mtxB[13], (float)mtxB[14], (float)mtxB[15]) + }; +#endif // ENABLE_SSE2 + + const __m128 normalize = _mm_set1_ps(1.0f/4096.0f); + rowB[0] = _mm_mul_ps(rowB[0], normalize); + rowB[1] = _mm_mul_ps(rowB[1], normalize); + rowB[2] = _mm_mul_ps(rowB[2], normalize); + rowB[3] = _mm_mul_ps(rowB[3], normalize); + + const __m128 rowA[4] = { + _mm_load_ps(mtxA + 0), + _mm_load_ps(mtxA + 4), + _mm_load_ps(mtxA + 8), + _mm_load_ps(mtxA +12) + }; + + __m128 vecB[4]; + __m128 outRow; + + vecB[0] = _mm_shuffle_ps(rowB[0], rowB[0], 0x00); + vecB[1] = _mm_shuffle_ps(rowB[0], rowB[0], 0x55); + vecB[2] = _mm_shuffle_ps(rowB[0], rowB[0], 0xAA); + vecB[3] = _mm_shuffle_ps(rowB[0], rowB[0], 0xFF); + outRow = _mm_mul_ps(rowA[0], vecB[0]); + outRow = _mm_add_ps( outRow, _mm_mul_ps(rowA[1], vecB[1]) ); + outRow = _mm_add_ps( outRow, _mm_mul_ps(rowA[2], vecB[2]) ); + outRow = _mm_add_ps( outRow, _mm_mul_ps(rowA[3], vecB[3]) ); + _mm_store_ps(mtxA + 0, outRow); + + vecB[0] = _mm_shuffle_ps(rowB[1], rowB[1], 0x00); + vecB[1] = _mm_shuffle_ps(rowB[1], rowB[1], 0x55); + vecB[2] = _mm_shuffle_ps(rowB[1], rowB[1], 0xAA); + vecB[3] = _mm_shuffle_ps(rowB[1], rowB[1], 0xFF); + outRow = _mm_mul_ps(rowA[0], vecB[0]); + outRow = _mm_add_ps( outRow, _mm_mul_ps(rowA[1], vecB[1]) ); + outRow = _mm_add_ps( outRow, _mm_mul_ps(rowA[2], vecB[2]) ); + outRow = _mm_add_ps( outRow, _mm_mul_ps(rowA[3], vecB[3]) ); + _mm_store_ps(mtxA + 4, outRow); + + vecB[0] = _mm_shuffle_ps(rowB[2], rowB[2], 0x00); + vecB[1] = _mm_shuffle_ps(rowB[2], rowB[2], 0x55); + vecB[2] = 
_mm_shuffle_ps(rowB[2], rowB[2], 0xAA);
+	vecB[3] = _mm_shuffle_ps(rowB[2], rowB[2], 0xFF);
+	outRow = _mm_mul_ps(rowA[0], vecB[0]);
+	outRow = _mm_add_ps( outRow, _mm_mul_ps(rowA[1], vecB[1]) );
+	outRow = _mm_add_ps( outRow, _mm_mul_ps(rowA[2], vecB[2]) );
+	outRow = _mm_add_ps( outRow, _mm_mul_ps(rowA[3], vecB[3]) );
+	_mm_store_ps(mtxA + 8, outRow);
+	
+	vecB[0] = _mm_shuffle_ps(rowB[3], rowB[3], 0x00);
+	vecB[1] = _mm_shuffle_ps(rowB[3], rowB[3], 0x55);
+	vecB[2] = _mm_shuffle_ps(rowB[3], rowB[3], 0xAA);
+	vecB[3] = _mm_shuffle_ps(rowB[3], rowB[3], 0xFF);
+	outRow = _mm_mul_ps(rowA[0], vecB[0]);
+	outRow = _mm_add_ps( outRow, _mm_mul_ps(rowA[1], vecB[1]) );
+	outRow = _mm_add_ps( outRow, _mm_mul_ps(rowA[2], vecB[2]) );
+	outRow = _mm_add_ps( outRow, _mm_mul_ps(rowA[3], vecB[3]) );
+	_mm_store_ps(mtxA + 12, outRow);
+}
+
+static FORCEINLINE void __mtx4_scale_vec3_float_SSE(float (&__restrict inoutMtx)[16], const float (&__restrict inVec)[4])
+{
+	const __m128 inVec_m128 = _mm_load_ps(inVec);
+	const __m128 v[3] = {
+		_mm_shuffle_ps(inVec_m128, inVec_m128, 0x00),
+		_mm_shuffle_ps(inVec_m128, inVec_m128, 0x55),
+		_mm_shuffle_ps(inVec_m128, inVec_m128, 0xAA)
+	};
+	
+	_mm_store_ps( inoutMtx+0, _mm_mul_ps( _mm_load_ps(inoutMtx+0), v[0] ) );
+	_mm_store_ps( inoutMtx+4, _mm_mul_ps( _mm_load_ps(inoutMtx+4), v[1] ) );
+	_mm_store_ps( inoutMtx+8, _mm_mul_ps( _mm_load_ps(inoutMtx+8), v[2] ) );
+}
+
+static FORCEINLINE void __mtx4_translate_vec3_float_SSE(float (&__restrict inoutMtx)[16], const float (&__restrict inVec)[4])
+{
+	const __m128 inVec_m128 = _mm_load_ps(inVec);
+	const __m128 v[3] = {
+		_mm_shuffle_ps(inVec_m128, inVec_m128, 0x00),
+		_mm_shuffle_ps(inVec_m128, inVec_m128, 0x55),
+		_mm_shuffle_ps(inVec_m128, inVec_m128, 0xAA)
+	};
+	
+	const __m128 row[3] = {
+		_mm_load_ps(inoutMtx + 0),
+		_mm_load_ps(inoutMtx + 4),
+		_mm_load_ps(inoutMtx + 8)
+	};
+	
+	__m128 outVec;
+	outVec = _mm_mul_ps(row[0], v[0]);
+	outVec = _mm_add_ps( outVec, _mm_mul_ps(row[1], v[1]) );
+	outVec = _mm_add_ps( outVec, _mm_mul_ps(row[2], v[2]) );
+	
+	_mm_store_ps(inoutMtx+12, outVec);
+}
+
+#endif // ENABLE_SSE
+
+static FORCEINLINE s32 ___s32_saturate_shiftdown_accum64_fixed(s64 inAccum)
+{
+#ifdef FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
+	if (inAccum > (s64)0x000007FFFFFFFFFFULL)
+	{
+		return (s32)0x7FFFFFFFU;
+	}
+	else if (inAccum < (s64)0xFFFFF80000000000ULL)
+	{
+		return (s32)0x80000000U;
+	}
+#endif // FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE
+	
+	return sfx32_shiftdown(inAccum);
+}
+
+static FORCEINLINE s32 __vec4_dotproduct_vec4_fixed(const s32 (&__restrict vecA)[4], const s32 (&__restrict vecB)[4])
+{
+	return ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(vecA[0],vecB[0]) + fx32_mul(vecA[1],vecB[1]) + fx32_mul(vecA[2],vecB[2]) + fx32_mul(vecA[3],vecB[3]) );
+}
+
+static FORCEINLINE void __vec4_multiply_mtx4_fixed(s32 (&__restrict inoutVec)[4], const s32 (&__restrict inMtx)[16])
+{
+	const CACHE_ALIGN s32 v[4] = {inoutVec[0], inoutVec[1], inoutVec[2], inoutVec[3]};
+	
+	inoutVec[0] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inMtx[0],v[0]) + fx32_mul(inMtx[4],v[1]) + fx32_mul(inMtx[ 8],v[2]) + fx32_mul(inMtx[12],v[3]) );
+	inoutVec[1] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inMtx[1],v[0]) + fx32_mul(inMtx[5],v[1]) + fx32_mul(inMtx[ 9],v[2]) + fx32_mul(inMtx[13],v[3]) );
+	inoutVec[2] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inMtx[2],v[0]) + fx32_mul(inMtx[6],v[1]) + fx32_mul(inMtx[10],v[2]) + fx32_mul(inMtx[14],v[3]) );
+	inoutVec[3] = 
___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inMtx[3],v[0]) + fx32_mul(inMtx[7],v[1]) + fx32_mul(inMtx[11],v[2]) + fx32_mul(inMtx[15],v[3]) ); +} + +static FORCEINLINE void __vec3_multiply_mtx3_fixed(s32 (&__restrict inoutVec)[4], const s32 (&__restrict inMtx)[16]) +{ + const CACHE_ALIGN s32 v[4] = {inoutVec[0], inoutVec[1], inoutVec[2], inoutVec[3]}; + + inoutVec[0] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inMtx[0],v[0]) + fx32_mul(inMtx[4],v[1]) + fx32_mul(inMtx[ 8],v[2]) ); + inoutVec[1] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inMtx[1],v[0]) + fx32_mul(inMtx[5],v[1]) + fx32_mul(inMtx[ 9],v[2]) ); + inoutVec[2] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inMtx[2],v[0]) + fx32_mul(inMtx[6],v[1]) + fx32_mul(inMtx[10],v[2]) ); + inoutVec[3] = v[3]; +} + +static FORCEINLINE void __mtx4_multiply_mtx4_fixed(s32 (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16]) +{ + CACHE_ALIGN s32 a[16]; + MatrixCopy(a, mtxA); + + mtxA[ 0] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(a[ 0],mtxB[ 0]) + fx32_mul(a[ 4],mtxB[ 1]) + fx32_mul(a[ 8],mtxB[ 2]) + fx32_mul(a[12],mtxB[ 3]) ); + mtxA[ 1] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(a[ 1],mtxB[ 0]) + fx32_mul(a[ 5],mtxB[ 1]) + fx32_mul(a[ 9],mtxB[ 2]) + fx32_mul(a[13],mtxB[ 3]) ); + mtxA[ 2] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(a[ 2],mtxB[ 0]) + fx32_mul(a[ 6],mtxB[ 1]) + fx32_mul(a[10],mtxB[ 2]) + fx32_mul(a[14],mtxB[ 3]) ); + mtxA[ 3] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(a[ 3],mtxB[ 0]) + fx32_mul(a[ 7],mtxB[ 1]) + fx32_mul(a[11],mtxB[ 2]) + fx32_mul(a[15],mtxB[ 3]) ); + + mtxA[ 4] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(a[ 0],mtxB[ 4]) + fx32_mul(a[ 4],mtxB[ 5]) + fx32_mul(a[ 8],mtxB[ 6]) + fx32_mul(a[12],mtxB[ 7]) ); + mtxA[ 5] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(a[ 1],mtxB[ 4]) + fx32_mul(a[ 5],mtxB[ 5]) + fx32_mul(a[ 9],mtxB[ 6]) + fx32_mul(a[13],mtxB[ 7]) ); + mtxA[ 6] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(a[ 2],mtxB[ 4]) + fx32_mul(a[ 6],mtxB[ 5]) + fx32_mul(a[10],mtxB[ 6]) + fx32_mul(a[14],mtxB[ 7]) ); + mtxA[ 7] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(a[ 3],mtxB[ 4]) + fx32_mul(a[ 7],mtxB[ 5]) + fx32_mul(a[11],mtxB[ 6]) + fx32_mul(a[15],mtxB[ 7]) ); + + mtxA[ 8] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(a[ 0],mtxB[ 8]) + fx32_mul(a[ 4],mtxB[ 9]) + fx32_mul(a[ 8],mtxB[10]) + fx32_mul(a[12],mtxB[11]) ); + mtxA[ 9] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(a[ 1],mtxB[ 8]) + fx32_mul(a[ 5],mtxB[ 9]) + fx32_mul(a[ 9],mtxB[10]) + fx32_mul(a[13],mtxB[11]) ); + mtxA[10] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(a[ 2],mtxB[ 8]) + fx32_mul(a[ 6],mtxB[ 9]) + fx32_mul(a[10],mtxB[10]) + fx32_mul(a[14],mtxB[11]) ); + mtxA[11] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(a[ 3],mtxB[ 8]) + fx32_mul(a[ 7],mtxB[ 9]) + fx32_mul(a[11],mtxB[10]) + fx32_mul(a[15],mtxB[11]) ); + + mtxA[12] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(a[ 0],mtxB[12]) + fx32_mul(a[ 4],mtxB[13]) + fx32_mul(a[ 8],mtxB[14]) + fx32_mul(a[12],mtxB[15]) ); + mtxA[13] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(a[ 1],mtxB[12]) + fx32_mul(a[ 5],mtxB[13]) + fx32_mul(a[ 9],mtxB[14]) + fx32_mul(a[13],mtxB[15]) ); + mtxA[14] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(a[ 2],mtxB[12]) + fx32_mul(a[ 6],mtxB[13]) + fx32_mul(a[10],mtxB[14]) + fx32_mul(a[14],mtxB[15]) ); + mtxA[15] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(a[ 3],mtxB[12]) + fx32_mul(a[ 7],mtxB[13]) + fx32_mul(a[11],mtxB[14]) + 
fx32_mul(a[15],mtxB[15]) ); +} + +static FORCEINLINE void __mtx4_scale_vec3_fixed(s32 (&__restrict inoutMtx)[16], const s32 (&__restrict inVec)[4]) +{ + inoutMtx[ 0] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inoutMtx[ 0], inVec[0]) ); + inoutMtx[ 1] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inoutMtx[ 1], inVec[0]) ); + inoutMtx[ 2] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inoutMtx[ 2], inVec[0]) ); + inoutMtx[ 3] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inoutMtx[ 3], inVec[0]) ); + + inoutMtx[ 4] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inoutMtx[ 4], inVec[1]) ); + inoutMtx[ 5] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inoutMtx[ 5], inVec[1]) ); + inoutMtx[ 6] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inoutMtx[ 6], inVec[1]) ); + inoutMtx[ 7] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inoutMtx[ 7], inVec[1]) ); + + inoutMtx[ 8] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inoutMtx[ 8], inVec[2]) ); + inoutMtx[ 9] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inoutMtx[ 9], inVec[2]) ); + inoutMtx[10] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inoutMtx[10], inVec[2]) ); + inoutMtx[11] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inoutMtx[11], inVec[2]) ); +} + +static FORCEINLINE void __mtx4_translate_vec3_fixed(s32 (&__restrict inoutMtx)[16], const s32 (&__restrict inVec)[4]) +{ + inoutMtx[12] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inoutMtx[0], inVec[0]) + fx32_mul(inoutMtx[4], inVec[1]) + fx32_mul(inoutMtx[ 8], inVec[2]) + fx32_shiftup(inoutMtx[12]) ); + inoutMtx[13] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inoutMtx[1], inVec[0]) + fx32_mul(inoutMtx[5], inVec[1]) + fx32_mul(inoutMtx[ 9], inVec[2]) + fx32_shiftup(inoutMtx[13]) ); + inoutMtx[14] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inoutMtx[2], inVec[0]) + fx32_mul(inoutMtx[6], inVec[1]) + fx32_mul(inoutMtx[10], inVec[2]) + fx32_shiftup(inoutMtx[14]) ); + inoutMtx[15] = ___s32_saturate_shiftdown_accum64_fixed( fx32_mul(inoutMtx[3], inVec[0]) + fx32_mul(inoutMtx[7], inVec[1]) + fx32_mul(inoutMtx[11], inVec[2]) + fx32_shiftup(inoutMtx[15]) ); +} + +#ifdef ENABLE_SSE4_1 + +static FORCEINLINE void ___s32_saturate_shiftdown_accum64_fixed_SSE4(__m128i &inoutAccum) +{ +#ifdef FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE + v128u8 outVecMask; + +#if defined(ENABLE_SSE4_2) + outVecMask = _mm_cmpgt_epi64( inoutAccum, _mm_set1_epi64x((s64)0x000007FFFFFFFFFFULL) ); + inoutAccum = _mm_blendv_epi8( inoutAccum, _mm_set1_epi64x((s64)0x000007FFFFFFFFFFULL), outVecMask ); + + outVecMask = _mm_cmpgt_epi64( _mm_set1_epi64x((s64)0xFFFFF80000000000ULL), inoutAccum ); + inoutAccum = _mm_blendv_epi8( inoutAccum, _mm_set1_epi64x((s64)0xFFFFF80000000000ULL), outVecMask ); +#else + const v128u8 outVecSignMask = _mm_cmpeq_epi64( _mm_and_si128(inoutAccum, _mm_set1_epi64x((s64)0x8000000000000000ULL)), _mm_setzero_si128() ); + + outVecMask = _mm_cmpeq_epi64( _mm_and_si128(inoutAccum, _mm_set1_epi64x((s64)0x7FFFF80000000000ULL)), _mm_setzero_si128() ); + const v128u32 outVecPos = _mm_blendv_epi8( _mm_set1_epi64x((s64)0x000007FFFFFFFFFFULL), inoutAccum, outVecMask ); + + const v128u32 outVecFlipped = _mm_xor_si128(inoutAccum, _mm_set1_epi8(0xFF)); + outVecMask = _mm_cmpeq_epi64( _mm_and_si128(outVecFlipped, _mm_set1_epi64x((s64)0x7FFFF80000000000ULL)), _mm_setzero_si128() ); + const v128u32 outVecNeg = _mm_blendv_epi8( _mm_set1_epi64x((s64)0xFFFFF80000000000ULL), inoutAccum, outVecMask ); + + inoutAccum = 
_mm_blendv_epi8(outVecNeg, outVecPos, outVecSignMask); +#endif // ENABLE_SSE4_2 + +#endif // FIXED_POINT_MATH_FUNCTIONS_USE_ACCUMULATOR_SATURATE + + inoutAccum = _mm_srli_epi64(inoutAccum, 12); + inoutAccum = _mm_shuffle_epi32(inoutAccum, 0xD8); +} + +static FORCEINLINE s32 __vec4_dotproduct_vec4_fixed_SSE4(const s32 (&__restrict vecA)[4], const s32 (&__restrict vecB)[4]) +{ + // Due to SSE4.1's limitations, this function is actually slower than its scalar counterpart, + // and so we're just going to use that here. The SSE4.1 code is being included for reference + // as inspiration for porting to other ISAs that could see more benefit. + return __vec4_dotproduct_vec4_fixed(vecA, vecB); + + /* + const v128s32 inA = _mm_load_si128((v128s32 *)vecA); + const v128s32 inB = _mm_load_si128((v128s32 *)vecB); + + const v128s32 lo = _mm_mul_epi32( _mm_shuffle_epi32(inA, 0x50), _mm_shuffle_epi32(inB, 0x50) ); + const v128s32 hi = _mm_mul_epi32( _mm_shuffle_epi32(inA, 0xFA), _mm_shuffle_epi32(inB, 0xFA) ); + + s64 accum[4]; + _mm_store_si128((v128s32 *)&accum[0], lo); + _mm_store_si128((v128s32 *)&accum[2], hi); + + return ___s32_saturate_shiftdown_accum64_fixed( accum[0] + accum[1] + accum[2] + accum[3] ); + */ +} + +static FORCEINLINE void __vec4_multiply_mtx4_fixed_SSE4(s32 (&__restrict inoutVec)[4], const s32 (&__restrict inMtx)[16]) +{ + const v128s32 inVec = _mm_load_si128((v128s32 *)inoutVec); + + const v128s32 v[4] = { + _mm_shuffle_epi32(inVec, 0x00), + _mm_shuffle_epi32(inVec, 0x55), + _mm_shuffle_epi32(inVec, 0xAA), + _mm_shuffle_epi32(inVec, 0xFF) + }; + + const v128s32 row[4] = { + _mm_load_si128((v128s32 *)inMtx + 0), + _mm_load_si128((v128s32 *)inMtx + 1), + _mm_load_si128((v128s32 *)inMtx + 2), + _mm_load_si128((v128s32 *)inMtx + 3) + }; + + const v128s32 rowLo[4] = { + _mm_shuffle_epi32(row[0], 0x50), + _mm_shuffle_epi32(row[1], 0x50), + _mm_shuffle_epi32(row[2], 0x50), + _mm_shuffle_epi32(row[3], 0x50) + }; + + const v128s32 rowHi[4] = { + _mm_shuffle_epi32(row[0], 0xFA), + _mm_shuffle_epi32(row[1], 0xFA), + _mm_shuffle_epi32(row[2], 0xFA), + _mm_shuffle_epi32(row[3], 0xFA) + }; + + v128s32 outVecLo = _mm_mul_epi32(rowLo[0], v[0]); + outVecLo = _mm_add_epi64( outVecLo, _mm_mul_epi32(rowLo[1], v[1]) ); + outVecLo = _mm_add_epi64( outVecLo, _mm_mul_epi32(rowLo[2], v[2]) ); + outVecLo = _mm_add_epi64( outVecLo, _mm_mul_epi32(rowLo[3], v[3]) ); + ___s32_saturate_shiftdown_accum64_fixed_SSE4(outVecLo); + + v128s32 outVecHi = _mm_mul_epi32(rowHi[0], v[0]); + outVecHi = _mm_add_epi64( outVecHi, _mm_mul_epi32(rowHi[1], v[1]) ); + outVecHi = _mm_add_epi64( outVecHi, _mm_mul_epi32(rowHi[2], v[2]) ); + outVecHi = _mm_add_epi64( outVecHi, _mm_mul_epi32(rowHi[3], v[3]) ); + ___s32_saturate_shiftdown_accum64_fixed_SSE4(outVecHi); + + _mm_store_si128( (v128s32 *)inoutVec, _mm_unpacklo_epi64(outVecLo, outVecHi) ); +} + +static FORCEINLINE void __vec3_multiply_mtx3_fixed_SSE4(s32 (&__restrict inoutVec)[4], const s32 (&__restrict inMtx)[16]) +{ + const v128s32 inVec = _mm_load_si128((v128s32 *)inoutVec); + + const v128s32 v[3] = { + _mm_shuffle_epi32(inVec, 0x00), + _mm_shuffle_epi32(inVec, 0x55), + _mm_shuffle_epi32(inVec, 0xAA) + }; + + const v128s32 row[3] = { + _mm_load_si128((v128s32 *)inMtx + 0), + _mm_load_si128((v128s32 *)inMtx + 1), + _mm_load_si128((v128s32 *)inMtx + 2) + }; + + const v128s32 rowLo[4] = { + _mm_shuffle_epi32(row[0], 0x50), + _mm_shuffle_epi32(row[1], 0x50), + _mm_shuffle_epi32(row[2], 0x50) + }; + + const v128s32 rowHi[4] = { + _mm_shuffle_epi32(row[0], 0xFA), + 
_mm_shuffle_epi32(row[1], 0xFA), + _mm_shuffle_epi32(row[2], 0xFA) + }; + + v128s32 outVecLo = _mm_mul_epi32(rowLo[0], v[0]); + outVecLo = _mm_add_epi64( outVecLo, _mm_mul_epi32(rowLo[1], v[1]) ); + outVecLo = _mm_add_epi64( outVecLo, _mm_mul_epi32(rowLo[2], v[2]) ); + ___s32_saturate_shiftdown_accum64_fixed_SSE4(outVecLo); + + v128s32 outVecHi = _mm_mul_epi32(rowHi[0], v[0]); + outVecHi = _mm_add_epi64( outVecHi, _mm_mul_epi32(rowHi[1], v[1]) ); + outVecHi = _mm_add_epi64( outVecHi, _mm_mul_epi32(rowHi[2], v[2]) ); + ___s32_saturate_shiftdown_accum64_fixed_SSE4(outVecHi); + + v128s32 outVec = _mm_unpacklo_epi64(outVecLo, outVecHi); + outVec = _mm_blend_epi16(outVec, inVec, 0xC0); + + _mm_store_si128((v128s32 *)inoutVec, outVec); +} + +static FORCEINLINE void __mtx4_multiply_mtx4_fixed_SSE4(s32 (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16]) +{ + const v128s32 rowA[4] = { + _mm_load_si128((v128s32 *)(mtxA + 0)), + _mm_load_si128((v128s32 *)(mtxA + 4)), + _mm_load_si128((v128s32 *)(mtxA + 8)), + _mm_load_si128((v128s32 *)(mtxA +12)) + }; + + const v128s32 rowB[4] = { + _mm_load_si128((v128s32 *)(mtxB + 0)), + _mm_load_si128((v128s32 *)(mtxB + 4)), + _mm_load_si128((v128s32 *)(mtxB + 8)), + _mm_load_si128((v128s32 *)(mtxB +12)) + }; + + const v128s32 rowLo[4] = { + _mm_shuffle_epi32(rowA[0], 0x50), + _mm_shuffle_epi32(rowA[1], 0x50), + _mm_shuffle_epi32(rowA[2], 0x50), + _mm_shuffle_epi32(rowA[3], 0x50) + }; + + const v128s32 rowHi[4] = { + _mm_shuffle_epi32(rowA[0], 0xFA), + _mm_shuffle_epi32(rowA[1], 0xFA), + _mm_shuffle_epi32(rowA[2], 0xFA), + _mm_shuffle_epi32(rowA[3], 0xFA) + }; + + v128s32 outVecLo; + v128s32 outVecHi; + v128s32 v[4]; + + v[0] = _mm_shuffle_epi32(rowB[0], 0x00); + v[1] = _mm_shuffle_epi32(rowB[0], 0x55); + v[2] = _mm_shuffle_epi32(rowB[0], 0xAA); + v[3] = _mm_shuffle_epi32(rowB[0], 0xFF); + outVecLo = _mm_mul_epi32(rowLo[0], v[0]); + outVecLo = _mm_add_epi64( outVecLo, _mm_mul_epi32(rowLo[1], v[1]) ); + outVecLo = _mm_add_epi64( outVecLo, _mm_mul_epi32(rowLo[2], v[2]) ); + outVecLo = _mm_add_epi64( outVecLo, _mm_mul_epi32(rowLo[3], v[3]) ); + ___s32_saturate_shiftdown_accum64_fixed_SSE4(outVecLo); + outVecHi = _mm_mul_epi32(rowHi[0], v[0]); + outVecHi = _mm_add_epi64( outVecHi, _mm_mul_epi32(rowHi[1], v[1]) ); + outVecHi = _mm_add_epi64( outVecHi, _mm_mul_epi32(rowHi[2], v[2]) ); + outVecHi = _mm_add_epi64( outVecHi, _mm_mul_epi32(rowHi[3], v[3]) ); + ___s32_saturate_shiftdown_accum64_fixed_SSE4(outVecHi); + _mm_store_si128( (v128s32 *)(mtxA + 0), _mm_unpacklo_epi64(outVecLo, outVecHi) ); + + v[0] = _mm_shuffle_epi32(rowB[1], 0x00); + v[1] = _mm_shuffle_epi32(rowB[1], 0x55); + v[2] = _mm_shuffle_epi32(rowB[1], 0xAA); + v[3] = _mm_shuffle_epi32(rowB[1], 0xFF); + outVecLo = _mm_mul_epi32(rowLo[0], v[0]); + outVecLo = _mm_add_epi64( outVecLo, _mm_mul_epi32(rowLo[1], v[1]) ); + outVecLo = _mm_add_epi64( outVecLo, _mm_mul_epi32(rowLo[2], v[2]) ); + outVecLo = _mm_add_epi64( outVecLo, _mm_mul_epi32(rowLo[3], v[3]) ); + ___s32_saturate_shiftdown_accum64_fixed_SSE4(outVecLo); + outVecHi = _mm_mul_epi32(rowHi[0], v[0]); + outVecHi = _mm_add_epi64( outVecHi, _mm_mul_epi32(rowHi[1], v[1]) ); + outVecHi = _mm_add_epi64( outVecHi, _mm_mul_epi32(rowHi[2], v[2]) ); + outVecHi = _mm_add_epi64( outVecHi, _mm_mul_epi32(rowHi[3], v[3]) ); + ___s32_saturate_shiftdown_accum64_fixed_SSE4(outVecHi); + _mm_store_si128( (v128s32 *)(mtxA + 4), _mm_unpacklo_epi64(outVecLo, outVecHi) ); + + v[0] = _mm_shuffle_epi32(rowB[2], 0x00); + v[1] = _mm_shuffle_epi32(rowB[2], 0x55); + v[2] = 
_mm_shuffle_epi32(rowB[2], 0xAA); + v[3] = _mm_shuffle_epi32(rowB[2], 0xFF); + outVecLo = _mm_mul_epi32(rowLo[0], v[0]); + outVecLo = _mm_add_epi64( outVecLo, _mm_mul_epi32(rowLo[1], v[1]) ); + outVecLo = _mm_add_epi64( outVecLo, _mm_mul_epi32(rowLo[2], v[2]) ); + outVecLo = _mm_add_epi64( outVecLo, _mm_mul_epi32(rowLo[3], v[3]) ); + ___s32_saturate_shiftdown_accum64_fixed_SSE4(outVecLo); + outVecHi = _mm_mul_epi32(rowHi[0], v[0]); + outVecHi = _mm_add_epi64( outVecHi, _mm_mul_epi32(rowHi[1], v[1]) ); + outVecHi = _mm_add_epi64( outVecHi, _mm_mul_epi32(rowHi[2], v[2]) ); + outVecHi = _mm_add_epi64( outVecHi, _mm_mul_epi32(rowHi[3], v[3]) ); + ___s32_saturate_shiftdown_accum64_fixed_SSE4(outVecHi); + _mm_store_si128( (v128s32 *)(mtxA + 8), _mm_unpacklo_epi64(outVecLo, outVecHi) ); + + v[0] = _mm_shuffle_epi32(rowB[3], 0x00); + v[1] = _mm_shuffle_epi32(rowB[3], 0x55); + v[2] = _mm_shuffle_epi32(rowB[3], 0xAA); + v[3] = _mm_shuffle_epi32(rowB[3], 0xFF); + outVecLo = _mm_mul_epi32(rowLo[0], v[0]); + outVecLo = _mm_add_epi64( outVecLo, _mm_mul_epi32(rowLo[1], v[1]) ); + outVecLo = _mm_add_epi64( outVecLo, _mm_mul_epi32(rowLo[2], v[2]) ); + outVecLo = _mm_add_epi64( outVecLo, _mm_mul_epi32(rowLo[3], v[3]) ); + ___s32_saturate_shiftdown_accum64_fixed_SSE4(outVecLo); + outVecHi = _mm_mul_epi32(rowHi[0], v[0]); + outVecHi = _mm_add_epi64( outVecHi, _mm_mul_epi32(rowHi[1], v[1]) ); + outVecHi = _mm_add_epi64( outVecHi, _mm_mul_epi32(rowHi[2], v[2]) ); + outVecHi = _mm_add_epi64( outVecHi, _mm_mul_epi32(rowHi[3], v[3]) ); + ___s32_saturate_shiftdown_accum64_fixed_SSE4(outVecHi); + _mm_store_si128( (v128s32 *)(mtxA +12), _mm_unpacklo_epi64(outVecLo, outVecHi) ); +} + +static FORCEINLINE void __mtx4_scale_vec3_fixed_SSE4(s32 (&__restrict inoutMtx)[16], const s32 (&__restrict inVec)[4]) +{ + const v128s32 inVec_v128 = _mm_load_si128((v128s32 *)inVec); + const v128s32 v[3] = { + _mm_shuffle_epi32(inVec_v128, 0x00), + _mm_shuffle_epi32(inVec_v128, 0x55), + _mm_shuffle_epi32(inVec_v128, 0xAA) + }; + + v128s32 row[3] = { + _mm_load_si128((v128s32 *)inoutMtx + 0), + _mm_load_si128((v128s32 *)inoutMtx + 1), + _mm_load_si128((v128s32 *)inoutMtx + 2) + }; + + v128s32 rowLo; + v128s32 rowHi; + + rowLo = _mm_shuffle_epi32(row[0], 0x50); + rowLo = _mm_mul_epi32(rowLo, v[0]); + ___s32_saturate_shiftdown_accum64_fixed_SSE4(rowLo); + + rowHi = _mm_shuffle_epi32(row[0], 0xFA); + rowHi = _mm_mul_epi32(rowHi, v[0]); + ___s32_saturate_shiftdown_accum64_fixed_SSE4(rowHi); + _mm_store_si128( (v128s32 *)inoutMtx + 0, _mm_unpacklo_epi64(rowLo, rowHi) ); + + rowLo = _mm_shuffle_epi32(row[1], 0x50); + rowLo = _mm_mul_epi32(rowLo, v[1]); + ___s32_saturate_shiftdown_accum64_fixed_SSE4(rowLo); + + rowHi = _mm_shuffle_epi32(row[1], 0xFA); + rowHi = _mm_mul_epi32(rowHi, v[1]); + ___s32_saturate_shiftdown_accum64_fixed_SSE4(rowHi); + _mm_store_si128( (v128s32 *)inoutMtx + 1, _mm_unpacklo_epi64(rowLo, rowHi) ); + + rowLo = _mm_shuffle_epi32(row[2], 0x50); + rowLo = _mm_mul_epi32(rowLo, v[2]); + ___s32_saturate_shiftdown_accum64_fixed_SSE4(rowLo); + + rowHi = _mm_shuffle_epi32(row[2], 0xFA); + rowHi = _mm_mul_epi32(rowHi, v[2]); + ___s32_saturate_shiftdown_accum64_fixed_SSE4(rowHi); + _mm_store_si128( (v128s32 *)inoutMtx + 2, _mm_unpacklo_epi64(rowLo, rowHi) ); +} + +static FORCEINLINE void __mtx4_translate_vec3_fixed_SSE4(s32 (&__restrict inoutMtx)[16], const s32 (&__restrict inVec)[4]) +{ + const v128s32 tempVec = _mm_load_si128((v128s32 *)inVec); + + const v128s32 v[3] = { + _mm_shuffle_epi32(tempVec, 0x00), + 
_mm_shuffle_epi32(tempVec, 0x55), + _mm_shuffle_epi32(tempVec, 0xAA) + }; + + const v128s32 row[4] = { + _mm_load_si128((v128s32 *)(inoutMtx + 0)), + _mm_load_si128((v128s32 *)(inoutMtx + 4)), + _mm_load_si128((v128s32 *)(inoutMtx + 8)), + _mm_load_si128((v128s32 *)(inoutMtx +12)) + }; + + // Notice how we use pmovsxdq for the 4th row instead of pshufd. This is + // because the dot product calculation for the 4th row involves adding a + // 12-bit shift up (psllq) instead of adding a pmuldq. When using SSE + // vectors as 64x2, pmuldq ignores the high 32 bits, while psllq needs + // those high bits in case of a negative number. pmovsxdq does preserve + // the sign bits, while pshufd does not. + + const v128s32 rowLo[4] = { + _mm_shuffle_epi32(row[0], 0x50), + _mm_shuffle_epi32(row[1], 0x50), + _mm_shuffle_epi32(row[2], 0x50), + _mm_cvtepi32_epi64(row[3]) + }; + + const v128s32 rowHi[4] = { + _mm_shuffle_epi32(row[0], 0xFA), + _mm_shuffle_epi32(row[1], 0xFA), + _mm_shuffle_epi32(row[2], 0xFA), + _mm_cvtepi32_epi64( _mm_srli_si128(row[3],8) ) + }; + + v128s32 outVecLo; + v128s32 outVecHi; + + outVecLo = _mm_mul_epi32(rowLo[0], v[0]); + outVecLo = _mm_add_epi64( outVecLo, _mm_mul_epi32(rowLo[1], v[1]) ); + outVecLo = _mm_add_epi64( outVecLo, _mm_mul_epi32(rowLo[2], v[2]) ); + outVecLo = _mm_add_epi64( outVecLo, _mm_slli_epi64(rowLo[3], 12) ); + ___s32_saturate_shiftdown_accum64_fixed_SSE4(outVecLo); + + outVecHi = _mm_mul_epi32(rowHi[0], v[0]); + outVecHi = _mm_add_epi64( outVecHi, _mm_mul_epi32(rowHi[1], v[1]) ); + outVecHi = _mm_add_epi64( outVecHi, _mm_mul_epi32(rowHi[2], v[2]) ); + outVecHi = _mm_add_epi64( outVecHi, _mm_slli_epi64(rowHi[3], 12) ); + ___s32_saturate_shiftdown_accum64_fixed_SSE4(outVecHi); + + _mm_store_si128( (v128s32 *)(inoutMtx + 12), _mm_unpacklo_epi64(outVecLo, outVecHi) ); +} + +#endif // ENABLE_SSE4_1 void MatrixInit(s32 (&mtx)[16]) { @@ -44,7 +887,7 @@ void MatrixIdentity(s32 (&mtx)[16]) 0, 0, 0, (1 << 12) }; - memcpy(mtx, mtxIdentity, sizeof(s32)*16); + MatrixCopy(mtx, mtxIdentity); } void MatrixIdentity(float (&mtx)[16]) @@ -56,7 +899,7 @@ void MatrixIdentity(float (&mtx)[16]) 0.0f, 0.0f, 0.0f, 1.0f }; - memcpy(mtx, mtxIdentity, sizeof(float)*16); + MatrixCopy(mtx, mtxIdentity); } void MatrixSet(s32 (&mtx)[16], const size_t x, const size_t y, const s32 value) @@ -76,35 +919,23 @@ void MatrixSet(float (&mtx)[16], const size_t x, const size_t y, const s32 value void MatrixCopy(s32 (&__restrict mtxDst)[16], const s32 (&__restrict mtxSrc)[16]) { - memcpy(mtxDst, mtxSrc, sizeof(s32)*16); + buffer_copy_fast((s32 *)mtxDst, (s32 *)mtxSrc); } void MatrixCopy(float (&__restrict mtxDst)[16], const float (&__restrict mtxSrc)[16]) { + // Can't use buffer_copy_fast() here because it assumes the copying of integers, + // so just use regular memcpy() for copying the floats. 
memcpy(mtxDst, mtxSrc, sizeof(float)*16); } void MatrixCopy(float (&__restrict mtxDst)[16], const s32 (&__restrict mtxSrc)[16]) { - mtxDst[ 0] = (float)mtxSrc[ 0] / 4096.0f; - mtxDst[ 1] = (float)mtxSrc[ 1] / 4096.0f; - mtxDst[ 2] = (float)mtxSrc[ 2] / 4096.0f; - mtxDst[ 3] = (float)mtxSrc[ 3] / 4096.0f; - - mtxDst[ 4] = (float)mtxSrc[ 4] / 4096.0f; - mtxDst[ 5] = (float)mtxSrc[ 5] / 4096.0f; - mtxDst[ 6] = (float)mtxSrc[ 6] / 4096.0f; - mtxDst[ 7] = (float)mtxSrc[ 7] / 4096.0f; - - mtxDst[ 8] = (float)mtxSrc[ 8] / 4096.0f; - mtxDst[ 9] = (float)mtxSrc[ 9] / 4096.0f; - mtxDst[10] = (float)mtxSrc[10] / 4096.0f; - mtxDst[11] = (float)mtxSrc[11] / 4096.0f; - - mtxDst[12] = (float)mtxSrc[12] / 4096.0f; - mtxDst[13] = (float)mtxSrc[13] / 4096.0f; - mtxDst[14] = (float)mtxSrc[14] / 4096.0f; - mtxDst[15] = (float)mtxSrc[15] / 4096.0f; +#if defined(ENABLE_SSE) + __mtx4_copynormalize_mtx4_float_SSE(mtxDst, mtxSrc); +#else + __mtx4_copynormalize_mtx4_float(mtxDst, mtxSrc); +#endif } int MatrixCompare(const s32 (&__restrict mtxDst)[16], const s32 (&__restrict mtxSrc)[16]) @@ -121,25 +952,37 @@ s32 MatrixGetMultipliedIndex(const u32 index, const s32 (&__restrict mtxA)[16], { assert(index < 16); - const size_t iMod = index % 4; - const size_t iDiv = (index >> 2) << 2; + const u32 col = index & 0x00000003; + const u32 row = index & 0x0000000C; - const s32 temp = sfx32_shiftdown( fx32_mul(mtxA[iMod ], mtxB[iDiv ]) + fx32_mul(mtxA[iMod+ 4], mtxB[iDiv+1]) + fx32_mul(mtxA[iMod+8], mtxB[iDiv+2]) + fx32_mul(mtxA[iMod+12], mtxB[iDiv+3]) ); - return temp; + const s32 vecA[4] = { mtxA[col+0], mtxA[col+4], mtxA[col+8], mtxA[col+12] }; + const s32 vecB[4] = { mtxB[row+0], mtxB[row+1], mtxB[row+2], mtxB[row+3] }; + +#if defined(ENABLE_SSE4_1) + return __vec4_dotproduct_vec4_fixed_SSE4(vecA, vecB); +#else + return __vec4_dotproduct_vec4_fixed(vecA, vecB); +#endif } float MatrixGetMultipliedIndex(const u32 index, const float (&__restrict mtxA)[16], const float (&__restrict mtxB)[16]) { assert(index < 16); - const size_t iMod = index % 4; - const size_t iDiv = (index >> 2) << 2; + const u32 col = index & 0x00000003; + const u32 row = index & 0x0000000C; - const float temp = (mtxA[iMod ] * mtxB[iDiv ]) + (mtxA[iMod+ 4] * mtxB[iDiv+1]) + (mtxA[iMod+8] * mtxB[iDiv+2]) + (mtxA[iMod+12] * mtxB[iDiv+3]); - return temp; + const float vecA[4] = { mtxA[col+0], mtxA[col+4], mtxA[col+8], mtxA[col+12] }; + const float vecB[4] = { mtxB[row+0], mtxB[row+1], mtxB[row+2], mtxB[row+3] }; + +#if defined(ENABLE_SSE4_1) + return __vec4_dotproduct_vec4_float_SSE4(vecA, vecB); +#else + return __vec4_dotproduct_vec4_float(vecA, vecB); +#endif } -template +template void MatrixStackInit(MatrixStack *stack) { for (size_t i = 0; i < MatrixStack::size; i++) @@ -150,7 +993,7 @@ void MatrixStackInit(MatrixStack *stack) stack->position = 0; } -template +template s32* MatrixStackGet(MatrixStack *stack) { return stack->matrix[stack->position]; @@ -166,6 +1009,7 @@ template s32* MatrixStackGet(MatrixStack *stack); template s32* MatrixStackGet(MatrixStack *stack); template s32* MatrixStackGet(MatrixStack *stack); +// TODO: All of these float-based vector functions are obsolete and should be deleted. 
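To make the refactored MatrixGetMultipliedIndex() above concrete, a small hypothetical usage sketch (assuming 20.12 inputs, as elsewhere in this file):

	// Illustration only (not part of the patch): index 6 gathers elements
	// {2, 6, 10, 14} of mtxA (stride 4 from 6 & 3) and elements {4, 5, 6, 7}
	// of mtxB (contiguous from 6 & 0x0C) -- the same terms the old iMod/iDiv
	// arithmetic produced, just collected into two 4-element vectors first.
	static s32 example_multiplied_index_6(void)
	{
		CACHE_ALIGN s32 mtxA[16];
		CACHE_ALIGN s32 mtxB[16];
		MatrixIdentity(mtxA);
		MatrixIdentity(mtxB);
		
		// For two identity matrices this off-diagonal element is 0; index 5
		// would instead return 1 << 12 (fixed-point 1.0).
		return MatrixGetMultipliedIndex(6, mtxA, mtxB);
	}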
void Vector2Copy(float *dst, const float *src) { dst[0] = src[0]; @@ -207,7 +1051,6 @@ void Vector3Cross(float* dst, const float *a, const float *b) dst[2] = a[0]*b[1] - a[1]*b[0]; } - float Vector3Length(const float *a) { float lengthSquared = Vector3Dot(a,a); @@ -257,769 +1100,92 @@ void Vector4Copy(float *dst, const float *src) dst[3] = src[3]; } -void _MatrixMultVec4x4_NoSIMD(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]) -{ - const CACHE_ALIGN float mtxFloat[16] = { - mtx[ 0] / 4096.0f, - mtx[ 1] / 4096.0f, - mtx[ 2] / 4096.0f, - mtx[ 3] / 4096.0f, - - mtx[ 4] / 4096.0f, - mtx[ 5] / 4096.0f, - mtx[ 6] / 4096.0f, - mtx[ 7] / 4096.0f, - - mtx[ 8] / 4096.0f, - mtx[ 9] / 4096.0f, - mtx[10] / 4096.0f, - mtx[11] / 4096.0f, - - mtx[12] / 4096.0f, - mtx[13] / 4096.0f, - mtx[14] / 4096.0f, - mtx[15] / 4096.0f - }; - - const float x = vec[0]; - const float y = vec[1]; - const float z = vec[2]; - const float w = vec[3]; - - vec[0] = (x * mtxFloat[0]) + (y * mtxFloat[4]) + (z * mtxFloat[ 8]) + (w * mtxFloat[12]); - vec[1] = (x * mtxFloat[1]) + (y * mtxFloat[5]) + (z * mtxFloat[ 9]) + (w * mtxFloat[13]); - vec[2] = (x * mtxFloat[2]) + (y * mtxFloat[6]) + (z * mtxFloat[10]) + (w * mtxFloat[14]); - vec[3] = (x * mtxFloat[3]) + (y * mtxFloat[7]) + (z * mtxFloat[11]) + (w * mtxFloat[15]); -} - -#ifdef ENABLE_SSE - -void MatrixMultVec4x4(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]) -{ - const __m128 loadedVec = _mm_load_ps(vec); - const __m128 convertScalar = _mm_set1_ps(1.0f/4096.0f); - -#ifdef ENABLE_SSE2 - __m128 row[4] = { - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtx + 0)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtx + 4)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtx + 8)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtx + 12)) ) - }; -#else - const CACHE_ALIGN float mtxFloat[16] = { - (float)mtx[0], - (float)mtx[1], - (float)mtx[2], - (float)mtx[3], - - (float)mtx[4], - (float)mtx[5], - (float)mtx[6], - (float)mtx[7], - - (float)mtx[8], - (float)mtx[9], - (float)mtx[10], - (float)mtx[11], - - (float)mtx[12], - (float)mtx[13], - (float)mtx[14], - (float)mtx[15] - }; - - __m128 row[4] = { - _mm_load_ps(mtxFloat + 0), - _mm_load_ps(mtxFloat + 4), - _mm_load_ps(mtxFloat + 8), - _mm_load_ps(mtxFloat + 12) - }; -#endif - - row[0] = _mm_mul_ps(row[0], convertScalar); - row[1] = _mm_mul_ps(row[1], convertScalar); - row[2] = _mm_mul_ps(row[2], convertScalar); - row[3] = _mm_mul_ps(row[3], convertScalar); - - const __m128 scalar[4] = { - _mm_shuffle_ps(loadedVec, loadedVec, 0x00), - _mm_shuffle_ps(loadedVec, loadedVec, 0x55), - _mm_shuffle_ps(loadedVec, loadedVec, 0xAA), - _mm_shuffle_ps(loadedVec, loadedVec, 0xFF) - }; - - const __m128 calcVec = _mm_add_ps( _mm_mul_ps(row[0], scalar[0]), _mm_add_ps(_mm_mul_ps(row[1], scalar[1]), _mm_add_ps(_mm_mul_ps(row[2], scalar[2]), _mm_mul_ps(row[3], scalar[3]))) ); - _mm_store_ps(vec, calcVec); -} - -void MatrixMultVec3x3(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]) -{ - const __m128 loadedVec = _mm_load_ps(vec); - const __m128 convertScalar = _mm_set1_ps(1.0f/4096.0f); - -#ifdef ENABLE_SSE2 - __m128 row[3] = { - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtx + 0)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtx + 4)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtx + 8)) ) - }; -#else - const CACHE_ALIGN float mtxFloat[16] = { - (float)mtx[0], - (float)mtx[1], - (float)mtx[2], - (float)mtx[3], - - (float)mtx[4], - (float)mtx[5], - (float)mtx[6], - (float)mtx[7], - - (float)mtx[8], - 
(float)mtx[9], - (float)mtx[10], - (float)mtx[11], - - (float)mtx[12], - (float)mtx[13], - (float)mtx[14], - (float)mtx[15] - }; - - __m128 row[3] = { - _mm_load_ps(mtxFloat + 0), - _mm_load_ps(mtxFloat + 4), - _mm_load_ps(mtxFloat + 8) - }; -#endif - - row[0] = _mm_mul_ps(row[0], convertScalar); - row[1] = _mm_mul_ps(row[1], convertScalar); - row[2] = _mm_mul_ps(row[2], convertScalar); - - const __m128 scalar[3] = { - _mm_shuffle_ps(loadedVec, loadedVec, 0x00), - _mm_shuffle_ps(loadedVec, loadedVec, 0x55), - _mm_shuffle_ps(loadedVec, loadedVec, 0xAA) - }; - - const __m128 calcVec = _mm_add_ps( _mm_mul_ps(row[0], scalar[0]), _mm_add_ps(_mm_mul_ps(row[1], scalar[1]), _mm_mul_ps(row[2], scalar[2])) ); - _mm_store_ps(vec, calcVec); -} - -void MatrixTranslate(float (&__restrict mtx)[16], const float (&__restrict vec)[4]) -{ - __m128 xmm4 = _mm_load_ps(vec); - __m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101)); - __m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010)); - xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000)); - - xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(mtx)); - xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(mtx+4)); - xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(mtx+8)); - xmm4 = _mm_add_ps(xmm4,xmm5); - xmm4 = _mm_add_ps(xmm4,xmm6); - xmm4 = _mm_add_ps(xmm4,_mm_load_ps(mtx+12)); - _mm_store_ps(mtx+12,xmm4); -} - -void MatrixScale(float (&__restrict mtx)[16], const float (&__restrict vec)[4]) -{ - __m128 xmm4 = _mm_load_ps(vec); - __m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101)); - __m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010)); - xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000)); - - xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(mtx)); - xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(mtx+4)); - xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(mtx+8)); - _mm_store_ps(mtx,xmm4); - _mm_store_ps(mtx+4,xmm5); - _mm_store_ps(mtx+8,xmm6); -} - -void MatrixMultiply(float (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16]) -{ - const __m128 convertScale = _mm_set1_ps(1.0f/4096.0f); - -#ifdef ENABLE_SSE2 - __m128 rowB[4] = { - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxB + 0)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxB + 4)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxB + 8)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxB + 12)) ) - }; -#else - const CACHE_ALIGN float mtxFloatB[16] = { - (float)mtxB[ 0], - (float)mtxB[ 1], - (float)mtxB[ 2], - (float)mtxB[ 3], - - (float)mtxB[ 4], - (float)mtxB[ 5], - (float)mtxB[ 6], - (float)mtxB[ 7], - - (float)mtxB[ 8], - (float)mtxB[ 9], - (float)mtxB[10], - (float)mtxB[11], - - (float)mtxB[12], - (float)mtxB[13], - (float)mtxB[14], - (float)mtxB[15] - }; - - __m128 rowB[4] = { - _mm_load_ps(mtxFloatB + 0), - _mm_load_ps(mtxFloatB + 4), - _mm_load_ps(mtxFloatB + 8), - _mm_load_ps(mtxFloatB + 12) - }; -#endif - - rowB[0] = _mm_mul_ps(rowB[0], convertScale); - rowB[1] = _mm_mul_ps(rowB[1], convertScale); - rowB[2] = _mm_mul_ps(rowB[2], convertScale); - rowB[3] = _mm_mul_ps(rowB[3], convertScale); - - __m128 rowA[4] = { - _mm_load_ps(mtxA + 0), - _mm_load_ps(mtxA + 4), - _mm_load_ps(mtxA + 8), - _mm_load_ps(mtxA + 12) - }; - - __m128 vecB[4]; - __m128 calcRow; - - vecB[0] = _mm_shuffle_ps(rowB[0], rowB[0], 0x00); - vecB[1] = _mm_shuffle_ps(rowB[0], rowB[0], 0x55); - vecB[2] = _mm_shuffle_ps(rowB[0], rowB[0], 0xAA); - vecB[3] = _mm_shuffle_ps(rowB[0], rowB[0], 0xFF); - calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) ); - _mm_store_ps(mtxA + 0, calcRow); - - 
vecB[0] = _mm_shuffle_ps(rowB[1], rowB[1], 0x00); - vecB[1] = _mm_shuffle_ps(rowB[1], rowB[1], 0x55); - vecB[2] = _mm_shuffle_ps(rowB[1], rowB[1], 0xAA); - vecB[3] = _mm_shuffle_ps(rowB[1], rowB[1], 0xFF); - calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) ); - _mm_store_ps(mtxA + 4, calcRow); - - vecB[0] = _mm_shuffle_ps(rowB[2], rowB[2], 0x00); - vecB[1] = _mm_shuffle_ps(rowB[2], rowB[2], 0x55); - vecB[2] = _mm_shuffle_ps(rowB[2], rowB[2], 0xAA); - vecB[3] = _mm_shuffle_ps(rowB[2], rowB[2], 0xFF); - calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) ); - _mm_store_ps(mtxA + 8, calcRow); - - vecB[0] = _mm_shuffle_ps(rowB[3], rowB[3], 0x00); - vecB[1] = _mm_shuffle_ps(rowB[3], rowB[3], 0x55); - vecB[2] = _mm_shuffle_ps(rowB[3], rowB[3], 0xAA); - vecB[3] = _mm_shuffle_ps(rowB[3], rowB[3], 0xFF); - calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) ); - _mm_store_ps(mtxA + 12, calcRow); -} - -template -FORCEINLINE void vector_fix2float(float (&mtx)[16], const float divisor) -{ - const __m128 divisor_v128 = _mm_set1_ps(divisor); - - for (size_t i = 0; i < NUM_ROWS * 4; i+=4) - { - _mm_store_ps( mtx + i, _mm_div_ps(_mm_load_ps(mtx + i), divisor_v128) ); - } -} - -#else - -void MatrixMultVec4x4(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]) -{ - _MatrixMultVec4x4_NoSIMD(mtx, vec); -} - -void MatrixMultVec3x3(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]) -{ - const CACHE_ALIGN float mtxFloat[16] = { - mtx[ 0] / 4096.0f, - mtx[ 1] / 4096.0f, - mtx[ 2] / 4096.0f, - mtx[ 3] / 4096.0f, - - mtx[ 4] / 4096.0f, - mtx[ 5] / 4096.0f, - mtx[ 6] / 4096.0f, - mtx[ 7] / 4096.0f, - - mtx[ 8] / 4096.0f, - mtx[ 9] / 4096.0f, - mtx[10] / 4096.0f, - mtx[11] / 4096.0f, - - mtx[12] / 4096.0f, - mtx[13] / 4096.0f, - mtx[14] / 4096.0f, - mtx[15] / 4096.0f - }; - - const float x = vec[0]; - const float y = vec[1]; - const float z = vec[2]; - - vec[0] = (x * mtxFloat[0]) + (y * mtxFloat[4]) + (z * mtxFloat[ 8]); - vec[1] = (x * mtxFloat[1]) + (y * mtxFloat[5]) + (z * mtxFloat[ 9]); - vec[2] = (x * mtxFloat[2]) + (y * mtxFloat[6]) + (z * mtxFloat[10]); -} - -void MatrixTranslate(float (&__restrict mtx)[16], const float (&__restrict vec)[4]) -{ - mtx[12] += (mtx[0] * vec[0]) + (mtx[4] * vec[1]) + (mtx[ 8] * vec[2]); - mtx[13] += (mtx[1] * vec[0]) + (mtx[5] * vec[1]) + (mtx[ 9] * vec[2]); - mtx[14] += (mtx[2] * vec[0]) + (mtx[6] * vec[1]) + (mtx[10] * vec[2]); - mtx[15] += (mtx[3] * vec[0]) + (mtx[7] * vec[1]) + (mtx[11] * vec[2]); -} - -void MatrixScale(float (&__restrict mtx)[16], const float (&__restrict vec)[4]) -{ - mtx[ 0] *= vec[0]; - mtx[ 1] *= vec[0]; - mtx[ 2] *= vec[0]; - mtx[ 3] *= vec[0]; - - mtx[ 4] *= vec[1]; - mtx[ 5] *= vec[1]; - mtx[ 6] *= vec[1]; - mtx[ 7] *= vec[1]; - - mtx[ 8] *= vec[2]; - mtx[ 9] *= vec[2]; - mtx[10] *= vec[2]; - mtx[11] *= vec[2]; -} - -void MatrixMultiply(float (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16]) -{ - const CACHE_ALIGN float mtxFloatB[16] = { - (float)mtxB[ 0], - (float)mtxB[ 1], - (float)mtxB[ 2], - (float)mtxB[ 3], - - (float)mtxB[ 4], - (float)mtxB[ 5], - (float)mtxB[ 6], - (float)mtxB[ 7], - - (float)mtxB[ 8], - (float)mtxB[ 9], - (float)mtxB[10], - (float)mtxB[11], - - (float)mtxB[12], - 
(float)mtxB[13], - (float)mtxB[14], - (float)mtxB[15] - }; - - CACHE_ALIGN float tmpMatrix[16]; - - tmpMatrix[0] = (mtxA[ 0] * mtxFloatB[ 0]) + (mtxA[ 4] * mtxFloatB[ 1]) + (mtxA[ 8] * mtxFloatB[ 2]) + (mtxA[12] * mtxFloatB[ 3]); - tmpMatrix[1] = (mtxA[ 1] * mtxFloatB[ 0]) + (mtxA[ 5] * mtxFloatB[ 1]) + (mtxA[ 9] * mtxFloatB[ 2]) + (mtxA[13] * mtxFloatB[ 3]); - tmpMatrix[2] = (mtxA[ 2] * mtxFloatB[ 0]) + (mtxA[ 6] * mtxFloatB[ 1]) + (mtxA[10] * mtxFloatB[ 2]) + (mtxA[14] * mtxFloatB[ 3]); - tmpMatrix[3] = (mtxA[ 3] * mtxFloatB[ 0]) + (mtxA[ 7] * mtxFloatB[ 1]) + (mtxA[11] * mtxFloatB[ 2]) + (mtxA[15] * mtxFloatB[ 3]); - - tmpMatrix[4] = (mtxA[ 0] * mtxFloatB[ 4]) + (mtxA[ 4] * mtxFloatB[ 5]) + (mtxA[ 8] * mtxFloatB[ 6]) + (mtxA[12] * mtxFloatB[ 7]); - tmpMatrix[5] = (mtxA[ 1] * mtxFloatB[ 4]) + (mtxA[ 5] * mtxFloatB[ 5]) + (mtxA[ 9] * mtxFloatB[ 6]) + (mtxA[13] * mtxFloatB[ 7]); - tmpMatrix[6] = (mtxA[ 2] * mtxFloatB[ 4]) + (mtxA[ 6] * mtxFloatB[ 5]) + (mtxA[10] * mtxFloatB[ 6]) + (mtxA[14] * mtxFloatB[ 7]); - tmpMatrix[7] = (mtxA[ 3] * mtxFloatB[ 4]) + (mtxA[ 7] * mtxFloatB[ 5]) + (mtxA[11] * mtxFloatB[ 6]) + (mtxA[15] * mtxFloatB[ 7]); - - tmpMatrix[8] = (mtxA[ 0] * mtxFloatB[ 8]) + (mtxA[ 4] * mtxFloatB[ 9]) + (mtxA[ 8] * mtxFloatB[10]) + (mtxA[12] * mtxFloatB[11]); - tmpMatrix[9] = (mtxA[ 1] * mtxFloatB[ 8]) + (mtxA[ 5] * mtxFloatB[ 9]) + (mtxA[ 9] * mtxFloatB[10]) + (mtxA[13] * mtxFloatB[11]); - tmpMatrix[10] = (mtxA[ 2] * mtxFloatB[ 8]) + (mtxA[ 6] * mtxFloatB[ 9]) + (mtxA[10] * mtxFloatB[10]) + (mtxA[14] * mtxFloatB[11]); - tmpMatrix[11] = (mtxA[ 3] * mtxFloatB[ 8]) + (mtxA[ 7] * mtxFloatB[ 9]) + (mtxA[11] * mtxFloatB[10]) + (mtxA[15] * mtxFloatB[11]); - - tmpMatrix[12] = (mtxA[ 0] * mtxFloatB[12]) + (mtxA[ 4] * mtxFloatB[13]) + (mtxA[ 8] * mtxFloatB[14]) + (mtxA[12] * mtxFloatB[15]); - tmpMatrix[13] = (mtxA[ 1] * mtxFloatB[12]) + (mtxA[ 5] * mtxFloatB[13]) + (mtxA[ 9] * mtxFloatB[14]) + (mtxA[13] * mtxFloatB[15]); - tmpMatrix[14] = (mtxA[ 2] * mtxFloatB[12]) + (mtxA[ 6] * mtxFloatB[13]) + (mtxA[10] * mtxFloatB[14]) + (mtxA[14] * mtxFloatB[15]); - tmpMatrix[15] = (mtxA[ 3] * mtxFloatB[12]) + (mtxA[ 7] * mtxFloatB[13]) + (mtxA[11] * mtxFloatB[14]) + (mtxA[15] * mtxFloatB[15]); - - memcpy(mtxA, tmpMatrix, sizeof(float)*16); -} - -template -FORCEINLINE void vector_fix2float(float (&mtx)[16], const float divisor) -{ - for (size_t i = 0; i < NUM_ROWS * 4; i+=4) - { - mtx[i+0] /= divisor; - mtx[i+1] /= divisor; - mtx[i+2] /= divisor; - mtx[i+3] /= divisor; - } -} - -#endif - -#ifdef ENABLE_SSE4_1 - -FORCEINLINE void _Vec4_MultiplyByMatrix(__m128i &outVec, - const __m128i &c0, const __m128i &c1, const __m128i &c2, const __m128i &c3, - const __m128i &rowLo0, const __m128i &rowLo1, const __m128i &rowLo2, const __m128i &rowLo3, - const __m128i &rowHi0, const __m128i &rowHi1, const __m128i &rowHi2, const __m128i &rowHi3) -{ - __m128i outVecLo = _mm_add_epi64( _mm_add_epi64(_mm_mul_epi32(rowLo0, c0), _mm_mul_epi32(rowLo1, c1)), _mm_add_epi64(_mm_mul_epi32(rowLo2, c2), _mm_mul_epi32(rowLo3, c3)) ); - outVecLo = _mm_srli_epi64(outVecLo, 12); - outVecLo = _mm_shuffle_epi32(outVecLo, 0xD8); - - __m128i outVecHi = _mm_add_epi64( _mm_add_epi64(_mm_mul_epi32(rowHi0, c0), _mm_mul_epi32(rowHi1, c1)), _mm_add_epi64(_mm_mul_epi32(rowHi2, c2), _mm_mul_epi32(rowHi3, c3)) ); - outVecHi = _mm_srli_epi64(outVecHi, 12); - outVecHi = _mm_shuffle_epi32(outVecHi, 0x8D); - - outVec = _mm_blendv_epi8(outVecLo, outVecHi, _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)); -} - -FORCEINLINE void 
-		const __m128i &c0, const __m128i &c1, const __m128i &c2,
-		const __m128i &rowLo0, const __m128i &rowLo1, const __m128i &rowLo2,
-		const __m128i &rowHi0, const __m128i &rowHi1, const __m128i &rowHi2)
-{
-	__m128i outVecLo = _mm_add_epi64( _mm_mul_epi32(rowLo0, c0), _mm_add_epi64(_mm_mul_epi32(rowLo1, c1), _mm_mul_epi32(rowLo2, c2)) );
-	outVecLo = _mm_srli_epi64(outVecLo, 12);
-	outVecLo = _mm_shuffle_epi32(outVecLo, 0xD8);
-
-	__m128i outVecHi = _mm_add_epi64( _mm_mul_epi32(rowHi0, c0), _mm_add_epi64(_mm_mul_epi32(rowHi1, c1), _mm_mul_epi32(rowHi2, c2)) );
-	outVecHi = _mm_srli_epi64(outVecHi, 12);
-	outVecHi = _mm_shuffle_epi32(outVecHi, 0x8D);
-
-	outVec = _mm_blendv_epi8(outVecLo, outVecHi, _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0));
-}
-
-FORCEINLINE void _Vec4_Translate(__m128i &outVec,
-		const __m128i &c0, const __m128i &c1, const __m128i &c2,
-		const __m128i &rowLo0, const __m128i &rowLo1, const __m128i &rowLo2, const __m128i &rowLo3,
-		const __m128i &rowHi0, const __m128i &rowHi1, const __m128i &rowHi2, const __m128i &rowHi3)
-{
-	__m128i outVecLo = _mm_add_epi64( _mm_add_epi64(_mm_mul_epi32(rowLo0, c0), _mm_mul_epi32(rowLo1, c1)), _mm_add_epi64(_mm_mul_epi32(rowLo2, c2), _mm_slli_epi64(rowLo3, 12)) );
-	outVecLo = _mm_srli_epi64(outVecLo, 12);
-	outVecLo = _mm_shuffle_epi32(outVecLo, 0xD8);
-
-	__m128i outVecHi = _mm_add_epi64( _mm_add_epi64(_mm_mul_epi32(rowHi0, c0), _mm_mul_epi32(rowHi1, c1)), _mm_add_epi64(_mm_mul_epi32(rowHi2, c2), _mm_slli_epi64(rowHi3, 12)) );
-	outVecHi = _mm_srli_epi64(outVecHi, 12);
-	outVecHi = _mm_shuffle_epi32(outVecHi, 0x8D);
-
-	outVec = _mm_blendv_epi8(outVecLo, outVecHi, _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0));
-}
-
-FORCEINLINE void _Vec4_Scale(__m128i &inoutVec, const __m128i &scalar)
-{
-	__m128i outVecLo = _mm_cvtepu32_epi64(inoutVec);
-	__m128i outVecHi = _mm_cvtepu32_epi64( _mm_srli_si128(inoutVec, 8) );
-
-	outVecLo = _mm_mul_epi32(outVecLo, scalar);
-	outVecLo = _mm_srli_epi64(outVecLo, 12);
-	outVecLo = _mm_shuffle_epi32(outVecLo, 0xD8);
-
-	outVecHi = _mm_mul_epi32(outVecHi, scalar);
-	outVecHi = _mm_srli_epi64(outVecHi, 12);
-	outVecHi = _mm_shuffle_epi32(outVecHi, 0x8D);
-
-	inoutVec = _mm_blendv_epi8(outVecLo, outVecHi, _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0));
-}
-
 void MatrixMultVec4x4(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4])
 {
-	const __m128i inVec = _mm_load_si128((__m128i *)vec);
-
-	const __m128i scalar[4] = {
-		_mm_shuffle_epi32(inVec, 0x00),
-		_mm_shuffle_epi32(inVec, 0x55),
-		_mm_shuffle_epi32(inVec, 0xAA),
-		_mm_shuffle_epi32(inVec, 0xFF)
-	};
-
-	const __m128i row[4] = {
-		_mm_load_si128((__m128i *)(mtx + 0)),
-		_mm_load_si128((__m128i *)(mtx + 4)),
-		_mm_load_si128((__m128i *)(mtx + 8)),
-		_mm_load_si128((__m128i *)(mtx + 12))
-	};
-
-	const __m128i rowLo[4] = {
-		_mm_cvtepu32_epi64(row[0]),
-		_mm_cvtepu32_epi64(row[1]),
-		_mm_cvtepu32_epi64(row[2]),
-		_mm_cvtepu32_epi64(row[3])
-	};
-
-	const __m128i rowHi[4] = {
-		_mm_cvtepu32_epi64( _mm_srli_si128(row[0], 8)),
-		_mm_cvtepu32_epi64( _mm_srli_si128(row[1], 8)),
-		_mm_cvtepu32_epi64( _mm_srli_si128(row[2], 8)),
-		_mm_cvtepu32_epi64( _mm_srli_si128(row[3], 8))
-	};
-
-	__m128i outVec;
-	_Vec4_MultiplyByMatrix(outVec,
-		scalar[0], scalar[1], scalar[2], scalar[3],
-		rowLo[0], rowLo[1], rowLo[2], rowLo[3],
-		rowHi[0], rowHi[1], rowHi[2], rowHi[3]);
-
-	_mm_store_si128((__m128i *)vec, outVec);
+#if defined(ENABLE_SSE4_1)
+	__vec4_multiply_mtx4_fixed_SSE4(vec, mtx);
+#else
+	__vec4_multiply_mtx4_fixed(vec, mtx);
+#endif
+}
+
+void MatrixMultVec4x4(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4])
+{
+#if defined(ENABLE_SSE)
+	__vec4_multiply_mtx4_float_SSE(vec, mtx);
+#else
+	__vec4_multiply_mtx4_float(vec, mtx);
+#endif
 }
 
 void MatrixMultVec3x3(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4])
 {
-	const __m128i inVec = _mm_load_si128((__m128i *)vec);
-
-	const __m128i scalar[3] = {
-		_mm_shuffle_epi32(inVec, 0x00),
-		_mm_shuffle_epi32(inVec, 0x55),
-		_mm_shuffle_epi32(inVec, 0xAA)
-	};
-
-	const __m128i row[3] = {
-		_mm_load_si128((__m128i *)(mtx + 0)),
-		_mm_load_si128((__m128i *)(mtx + 4)),
-		_mm_load_si128((__m128i *)(mtx + 8))
-	};
-
-	const __m128i rowLo[3] = {
-		_mm_cvtepu32_epi64(row[0]),
-		_mm_cvtepu32_epi64(row[1]),
-		_mm_cvtepu32_epi64(row[2])
-	};
-
-	const __m128i rowHi[3] = {
-		_mm_cvtepu32_epi64( _mm_srli_si128(row[0], 8)),
-		_mm_cvtepu32_epi64( _mm_srli_si128(row[1], 8)),
-		_mm_cvtepu32_epi64( _mm_srli_si128(row[2], 8))
-	};
-
-	__m128i outVec;
-	_Vec3_MultiplyByMatrix(outVec,
-		scalar[0], scalar[1], scalar[2],
-		rowLo[0], rowLo[1], rowLo[2],
-		rowHi[0], rowHi[1], rowHi[2]);
-
-	outVec = _mm_blend_epi16(outVec, inVec, 0xC0);
-	_mm_store_si128((__m128i *)vec, outVec);
-}
-
-void MatrixTranslate(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4])
-{
-	const __m128i inVec = _mm_load_si128((__m128i *)vec);
-
-	const __m128i scalar[3] = {
-		_mm_shuffle_epi32(inVec, 0x00),
-		_mm_shuffle_epi32(inVec, 0x55),
-		_mm_shuffle_epi32(inVec, 0xAA)
-	};
-
-	const __m128i row[4] = {
-		_mm_load_si128((__m128i *)(mtx + 0)),
-		_mm_load_si128((__m128i *)(mtx + 4)),
-		_mm_load_si128((__m128i *)(mtx + 8)),
-		_mm_load_si128((__m128i *)(mtx + 12))
-	};
-
-	const __m128i rowLo[4] = {
-		_mm_cvtepu32_epi64(row[0]),
-		_mm_cvtepu32_epi64(row[1]),
-		_mm_cvtepu32_epi64(row[2]),
-		_mm_cvtepu32_epi64(row[3])
-	};
-
-	const __m128i rowHi[4] = {
-		_mm_cvtepu32_epi64( _mm_srli_si128(row[0], 8)),
-		_mm_cvtepu32_epi64( _mm_srli_si128(row[1], 8)),
-		_mm_cvtepu32_epi64( _mm_srli_si128(row[2], 8)),
-		_mm_cvtepu32_epi64( _mm_srli_si128(row[3], 8))
-	};
-
-	__m128i outVec;
-	_Vec4_Translate(outVec,
-		scalar[0], scalar[1], scalar[2],
-		rowLo[0], rowLo[1], rowLo[2], rowLo[3],
-		rowHi[0], rowHi[1], rowHi[2], rowHi[3]);
-
-	_mm_store_si128((__m128i *)(mtx + 12), outVec);
-}
-
-void MatrixScale(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4])
-{
-	const __m128i inVec = _mm_load_si128((__m128i *)vec);
-	const __m128i scalar[3] = {
-		_mm_shuffle_epi32(inVec, 0x00),
-		_mm_shuffle_epi32(inVec, 0x55),
-		_mm_shuffle_epi32(inVec, 0xAA)
-	};
-
-	__m128i row[3] = {
-		_mm_load_si128((__m128i *)(mtx + 0)),
-		_mm_load_si128((__m128i *)(mtx + 4)),
-		_mm_load_si128((__m128i *)(mtx + 8))
-	};
-
-	_Vec4_Scale(row[0], scalar[0]);
-	_mm_store_si128((__m128i *)(mtx + 0), row[0]);
-
-	_Vec4_Scale(row[1], scalar[1]);
-	_mm_store_si128((__m128i *)(mtx + 4), row[1]);
-
-	_Vec4_Scale(row[2], scalar[2]);
-	_mm_store_si128((__m128i *)(mtx + 8), row[2]);
-}
-
-void MatrixMultiply(s32 (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16])
-{
-	const __m128i rowA[4] = {
-		_mm_load_si128((__m128i *)(mtxA + 0)),
-		_mm_load_si128((__m128i *)(mtxA + 4)),
-		_mm_load_si128((__m128i *)(mtxA + 8)),
-		_mm_load_si128((__m128i *)(mtxA + 12))
-	};
-
-	const __m128i rowB[4] = {
-		_mm_load_si128((__m128i *)(mtxB + 0)),
-		_mm_load_si128((__m128i *)(mtxB + 4)),
-		_mm_load_si128((__m128i *)(mtxB + 8)),
-		_mm_load_si128((__m128i *)(mtxB + 12))
-	};
-
-	const __m128i rowLo[4] = {
-		_mm_cvtepu32_epi64(rowA[0]),
-		_mm_cvtepu32_epi64(rowA[1]),
-		_mm_cvtepu32_epi64(rowA[2]),
-		_mm_cvtepu32_epi64(rowA[3])
-	};
-
-	const __m128i rowHi[4] = {
-		_mm_cvtepu32_epi64( _mm_srli_si128(rowA[0], 8)),
-		_mm_cvtepu32_epi64( _mm_srli_si128(rowA[1], 8)),
-		_mm_cvtepu32_epi64( _mm_srli_si128(rowA[2], 8)),
-		_mm_cvtepu32_epi64( _mm_srli_si128(rowA[3], 8))
-	};
-
-	__m128i outVec;
-	__m128i scalar[4];
-
-	scalar[0] = _mm_shuffle_epi32(rowB[0], 0x00);
-	scalar[1] = _mm_shuffle_epi32(rowB[0], 0x55);
-	scalar[2] = _mm_shuffle_epi32(rowB[0], 0xAA);
-	scalar[3] = _mm_shuffle_epi32(rowB[0], 0xFF);
-	_Vec4_MultiplyByMatrix(outVec,
-		scalar[0], scalar[1], scalar[2], scalar[3],
-		rowLo[0], rowLo[1], rowLo[2], rowLo[3],
-		rowHi[0], rowHi[1], rowHi[2], rowHi[3]);
-	_mm_store_si128((__m128i *)(mtxA + 0), outVec);
-
-	scalar[0] = _mm_shuffle_epi32(rowB[1], 0x00);
-	scalar[1] = _mm_shuffle_epi32(rowB[1], 0x55);
-	scalar[2] = _mm_shuffle_epi32(rowB[1], 0xAA);
-	scalar[3] = _mm_shuffle_epi32(rowB[1], 0xFF);
-	_Vec4_MultiplyByMatrix(outVec,
-		scalar[0], scalar[1], scalar[2], scalar[3],
-		rowLo[0], rowLo[1], rowLo[2], rowLo[3],
-		rowHi[0], rowHi[1], rowHi[2], rowHi[3]);
-	_mm_store_si128((__m128i *)(mtxA + 4), outVec);
-
-	scalar[0] = _mm_shuffle_epi32(rowB[2], 0x00);
-	scalar[1] = _mm_shuffle_epi32(rowB[2], 0x55);
-	scalar[2] = _mm_shuffle_epi32(rowB[2], 0xAA);
-	scalar[3] = _mm_shuffle_epi32(rowB[2], 0xFF);
-	_Vec4_MultiplyByMatrix(outVec,
-		scalar[0], scalar[1], scalar[2], scalar[3],
-		rowLo[0], rowLo[1], rowLo[2], rowLo[3],
-		rowHi[0], rowHi[1], rowHi[2], rowHi[3]);
-	_mm_store_si128((__m128i *)(mtxA + 8), outVec);
-
-	scalar[0] = _mm_shuffle_epi32(rowB[3], 0x00);
-	scalar[1] = _mm_shuffle_epi32(rowB[3], 0x55);
-	scalar[2] = _mm_shuffle_epi32(rowB[3], 0xAA);
-	scalar[3] = _mm_shuffle_epi32(rowB[3], 0xFF);
-	_Vec4_MultiplyByMatrix(outVec,
-		scalar[0], scalar[1], scalar[2], scalar[3],
-		rowLo[0], rowLo[1], rowLo[2], rowLo[3],
-		rowHi[0], rowHi[1], rowHi[2], rowHi[3]);
-	_mm_store_si128((__m128i *)(mtxA + 12), outVec);
-}
-
+#if defined(ENABLE_SSE4_1)
+	__vec3_multiply_mtx3_fixed_SSE4(vec, mtx);
 #else
-
-FORCEINLINE void _Vec4_MultiplyByMatrix(s32 (&__restrict outVec)[4], const s32 (&__restrict inVec)[4], const s32 (&__restrict mtx)[16])
-{
-	outVec[0] = sfx32_shiftdown( fx32_mul(mtx[0],inVec[0]) + fx32_mul(mtx[4],inVec[1]) + fx32_mul(mtx[ 8],inVec[2]) + fx32_mul(mtx[12],inVec[3]) );
-	outVec[1] = sfx32_shiftdown( fx32_mul(mtx[1],inVec[0]) + fx32_mul(mtx[5],inVec[1]) + fx32_mul(mtx[ 9],inVec[2]) + fx32_mul(mtx[13],inVec[3]) );
-	outVec[2] = sfx32_shiftdown( fx32_mul(mtx[2],inVec[0]) + fx32_mul(mtx[6],inVec[1]) + fx32_mul(mtx[10],inVec[2]) + fx32_mul(mtx[14],inVec[3]) );
-	outVec[3] = sfx32_shiftdown( fx32_mul(mtx[3],inVec[0]) + fx32_mul(mtx[7],inVec[1]) + fx32_mul(mtx[11],inVec[2]) + fx32_mul(mtx[15],inVec[3]) );
+	__vec3_multiply_mtx3_fixed(vec, mtx);
+#endif
 }
 
-FORCEINLINE void _Vec3_MultiplyByMatrix(s32 (&__restrict outVec)[4], const s32 (&__restrict inVec)[3], const s32 (&__restrict mtx)[16])
+void MatrixMultVec3x3(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4])
 {
-	outVec[0] = sfx32_shiftdown( fx32_mul(mtx[0],inVec[0]) + fx32_mul(mtx[4],inVec[1]) + fx32_mul(mtx[ 8],inVec[2]) );
-	outVec[1] = sfx32_shiftdown( fx32_mul(mtx[1],inVec[0]) + fx32_mul(mtx[5],inVec[1]) + fx32_mul(mtx[ 9],inVec[2]) );
-	outVec[2] = sfx32_shiftdown( fx32_mul(mtx[2],inVec[0]) + fx32_mul(mtx[6],inVec[1]) + fx32_mul(mtx[10],inVec[2]) );
-}
-
-FORCEINLINE void _Vec4_Scale(s32 (&inoutVec)[4], const s32 scalar)
-{
-	inoutVec[0] = sfx32_shiftdown( fx32_mul(inoutVec[0], scalar) );
-	inoutVec[1] = sfx32_shiftdown( fx32_mul(inoutVec[1], scalar) );
-	inoutVec[2] = sfx32_shiftdown( fx32_mul(inoutVec[2], scalar) );
-	inoutVec[3] = sfx32_shiftdown( fx32_mul(inoutVec[3], scalar) );
-}
-
-void MatrixMultVec4x4(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4])
-{
-	const CACHE_ALIGN s32 tmpVec[4] = {
-		vec[0], vec[1], vec[2], vec[3]
-	};
-
-	_Vec4_MultiplyByMatrix(vec, tmpVec, mtx);
-}
-
-void MatrixMultVec3x3(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4])
-{
-	const CACHE_ALIGN s32 tmpVec[3] = {
-		vec[0], vec[1], vec[2]
-	};
-
-	_Vec3_MultiplyByMatrix(vec, tmpVec, mtx);
+#if defined(ENABLE_SSE)
+	__vec3_multiply_mtx3_float_SSE(vec, mtx);
+#else
+	__vec3_multiply_mtx3_float(vec, mtx);
+#endif
 }
 
 void MatrixTranslate(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4])
 {
-	mtx[12] = sfx32_shiftdown( fx32_mul(mtx[0], vec[0]) + fx32_mul(mtx[4], vec[1]) + fx32_mul(mtx[ 8], vec[2]) + fx32_shiftup(mtx[12]) );
-	mtx[13] = sfx32_shiftdown( fx32_mul(mtx[1], vec[0]) + fx32_mul(mtx[5], vec[1]) + fx32_mul(mtx[ 9], vec[2]) + fx32_shiftup(mtx[13]) );
-	mtx[14] = sfx32_shiftdown( fx32_mul(mtx[2], vec[0]) + fx32_mul(mtx[6], vec[1]) + fx32_mul(mtx[10], vec[2]) + fx32_shiftup(mtx[14]) );
-	mtx[15] = sfx32_shiftdown( fx32_mul(mtx[3], vec[0]) + fx32_mul(mtx[7], vec[1]) + fx32_mul(mtx[11], vec[2]) + fx32_shiftup(mtx[15]) );
+#if defined(ENABLE_SSE4_1)
+	__mtx4_translate_vec3_fixed_SSE4(mtx, vec);
+#else
+	__mtx4_translate_vec3_fixed(mtx, vec);
+#endif
+}
+
+void MatrixTranslate(float (&__restrict mtx)[16], const float (&__restrict vec)[4])
+{
+#if defined(ENABLE_SSE)
+	__mtx4_translate_vec3_float_SSE(mtx, vec);
+#else
+	__mtx4_translate_vec3_float(mtx, vec);
+#endif
 }
 
 void MatrixScale(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4])
 {
-	_Vec4_Scale((s32 (&__restrict)[4])mtx[0], vec[0]);
-	_Vec4_Scale((s32 (&__restrict)[4])mtx[4], vec[1]);
-	_Vec4_Scale((s32 (&__restrict)[4])mtx[8], vec[2]);
+#if defined(ENABLE_SSE4_1)
+	__mtx4_scale_vec3_fixed_SSE4(mtx, vec);
+#else
+	__mtx4_scale_vec3_fixed(mtx, vec);
+#endif
+}
+
+void MatrixScale(float (&__restrict mtx)[16], const float (&__restrict vec)[4])
+{
+#if defined(ENABLE_SSE)
+	__mtx4_scale_vec3_float_SSE(mtx, vec);
+#else
+	__mtx4_scale_vec3_float(mtx, vec);
+#endif
 }
 
 void MatrixMultiply(s32 (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16])
 {
-	const CACHE_ALIGN s32 tmpMtxA[16] = {
-		mtxA[ 0], mtxA[ 1], mtxA[ 2], mtxA[ 3],
-		mtxA[ 4], mtxA[ 5], mtxA[ 6], mtxA[ 7],
-		mtxA[ 8], mtxA[ 9], mtxA[10], mtxA[11],
-		mtxA[12], mtxA[13], mtxA[14], mtxA[15]
-	};
-
-	_Vec4_MultiplyByMatrix((s32 (&__restrict)[4])mtxA[ 0], (s32 (&__restrict)[4])mtxB[ 0], tmpMtxA);
-	_Vec4_MultiplyByMatrix((s32 (&__restrict)[4])mtxA[ 4], (s32 (&__restrict)[4])mtxB[ 4], tmpMtxA);
-	_Vec4_MultiplyByMatrix((s32 (&__restrict)[4])mtxA[ 8], (s32 (&__restrict)[4])mtxB[ 8], tmpMtxA);
-	_Vec4_MultiplyByMatrix((s32 (&__restrict)[4])mtxA[12], (s32 (&__restrict)[4])mtxB[12], tmpMtxA);
+#if defined(ENABLE_SSE4_1)
+	__mtx4_multiply_mtx4_fixed_SSE4(mtxA, mtxB);
+#else
+	__mtx4_multiply_mtx4_fixed(mtxA, mtxB);
+#endif
 }
 
+void MatrixMultiply(float (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16])
+{
+#if defined(ENABLE_SSE)
+	__mtx4_multiply_mtx4_float_SSE(mtxA, mtxB);
+#else
+	__mtx4_multiply_mtx4_float(mtxA, mtxB);
 #endif
+}
diff --git a/desmume/src/matrix.h b/desmume/src/matrix.h
index 2ee9cb82e..0475b3bb9 100644
--- a/desmume/src/matrix.h
+++ b/desmume/src/matrix.h
@@ -95,17 +95,12 @@ void Vector3Normalize(float *dst);
 void Vector4Copy(float *dst, const float *src);
-
-void _MatrixMultVec4x4_NoSIMD(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]);
-
 void MatrixMultVec4x4(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]);
 void MatrixMultVec3x3(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]);
 void MatrixTranslate(float (&__restrict mtx)[16], const float (&__restrict vec)[4]);
 void MatrixScale(float (&__restrict mtx)[16], const float (&__restrict vec)[4]);
 void MatrixMultiply(float (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16]);
 
-template <size_t NUM_ROWS> FORCEINLINE void vector_fix2float(float (&mtx)[16], const float divisor);
-
 void MatrixMultVec4x4(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4]);
 void MatrixMultVec3x3(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4]);
 void MatrixTranslate(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4]);