diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp
index 1eccf9657..515a909ac 100644
--- a/desmume/src/gfx3d.cpp
+++ b/desmume/src/gfx3d.cpp
@@ -609,11 +609,11 @@ void gfx3d_reset()
 	memset(colorRGB, 0, sizeof(colorRGB));
 	memset(&tempVertInfo, 0, sizeof(tempVertInfo));
 
-	MatrixInit (mtxCurrent[0]);
-	MatrixInit (mtxCurrent[1]);
-	MatrixInit (mtxCurrent[2]);
-	MatrixInit (mtxCurrent[3]);
-	MatrixInit (mtxTemporal);
+	MatrixInit(mtxCurrent[MATRIXMODE_PROJECTION]);
+	MatrixInit(mtxCurrent[MATRIXMODE_POSITION]);
+	MatrixInit(mtxCurrent[MATRIXMODE_POSITION_VECTOR]);
+	MatrixInit(mtxCurrent[MATRIXMODE_TEXTURE]);
+	MatrixInit(mtxTemporal);
 
 	MatrixStackInit(&mtxStack[0]);
 	MatrixStackInit(&mtxStack[1]);
@@ -727,13 +727,13 @@ static void SetVertex()
 	if (texCoordTransformMode == TextureTransformationMode_VertexSource)
 	{
 		//Tested by: Eledees The Adventures of Kai and Zero (E) [title screen and frontend menus]
-		last_s = (s32)(((s64)s16coord[0] * mtxCurrent[3][0] +
-						(s64)s16coord[1] * mtxCurrent[3][4] +
-						(s64)s16coord[2] * mtxCurrent[3][8] +
+		last_s = (s32)(((s64)s16coord[0] * mtxCurrent[MATRIXMODE_TEXTURE][0] +
+						(s64)s16coord[1] * mtxCurrent[MATRIXMODE_TEXTURE][4] +
+						(s64)s16coord[2] * mtxCurrent[MATRIXMODE_TEXTURE][8] +
 						(((s64)(_s))<<24))>>24);
-		last_t = (s32)(((s64)s16coord[0] * mtxCurrent[3][1] +
-						(s64)s16coord[1] * mtxCurrent[3][5] +
-						(s64)s16coord[2] * mtxCurrent[3][9] +
+		last_t = (s32)(((s64)s16coord[0] * mtxCurrent[MATRIXMODE_TEXTURE][1] +
+						(s64)s16coord[1] * mtxCurrent[MATRIXMODE_TEXTURE][5] +
+						(s64)s16coord[2] * mtxCurrent[MATRIXMODE_TEXTURE][9] +
 						(((s64)(_t))<<24))>>24);
 	}
 
@@ -744,8 +744,8 @@ static void SetVertex()
 	if(polylist->count >= POLYLIST_SIZE)
 		return;
 
-	GEM_TransformVertex(mtxCurrent[1],coordTransformed); //modelview
-	GEM_TransformVertex(mtxCurrent[0],coordTransformed); //projection
+	GEM_TransformVertex(mtxCurrent[MATRIXMODE_POSITION], coordTransformed); //modelview
+	GEM_TransformVertex(mtxCurrent[MATRIXMODE_PROJECTION], coordTransformed); //projection
 
 	//TODO - culling should be done here.
 	//TODO - viewport transform?
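Note (illustration, not part of the patch): the hunks above replace the hard-coded indices 0-3 used with mtxCurrent[] by named MATRIXMODE_* constants. The enum itself is declared elsewhere (gfx3d.h) and is not shown in this diff; a minimal sketch of the mapping implied by the one-for-one substitutions in gfx3d_reset(), assuming the enum is named MatrixMode, would be:

	enum MatrixMode
	{
		MATRIXMODE_PROJECTION      = 0,	// mtxCurrent[0]
		MATRIXMODE_POSITION        = 1,	// mtxCurrent[1], the "modelview" matrix
		MATRIXMODE_POSITION_VECTOR = 2,	// mtxCurrent[2], the directional/vector matrix
		MATRIXMODE_TEXTURE         = 3	// mtxCurrent[3]
	};

These values follow the DS MTX_MODE encoding (0 = projection, 1 = position, 2 = position & vector, 3 = texture), which is why the numeric indices can be swapped for the enum without changing behavior.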
@@ -930,7 +930,7 @@ static void gfx3d_glLightDirection_cache(const size_t index)
 	cacheLightDirection[index][3] = 0;
 
 	//Multiply the vector by the directional matrix
-	MatrixMultVec3x3_fixed(mtxCurrent[2], cacheLightDirection[index]);
+	MatrixMultVec3x3(mtxCurrent[MATRIXMODE_POSITION_VECTOR], cacheLightDirection[index]);
 
 	//Calculate the half angle vector
 	s32 lineOfSight[4] = {0, 0, (-1)<<12, 0};
@@ -1092,7 +1092,7 @@ static void gfx3d_glLoadIdentity()
 	GFX_DELAY(19);
 
 	if (mode == MATRIXMODE_POSITION_VECTOR)
-		MatrixIdentity(mtxCurrent[1]);
+		MatrixIdentity(mtxCurrent[MATRIXMODE_POSITION]);
 
 	//printf("identity: %d to: \n",mode); MatrixPrint(mtxCurrent[1]);
 }
@@ -1110,7 +1110,7 @@ static BOOL gfx3d_glLoadMatrix4x4(s32 v)
 	//vector_fix2float<4>(mtxCurrent[mode], 4096.f);
 
 	if (mode == MATRIXMODE_POSITION_VECTOR)
-		MatrixCopy(mtxCurrent[1], mtxCurrent[2]);
+		MatrixCopy(mtxCurrent[MATRIXMODE_POSITION], mtxCurrent[MATRIXMODE_POSITION_VECTOR]);
 
 	//printf("load4x4: matrix %d to: \n",mode); MatrixPrint(mtxCurrent[1]);
 	return TRUE;
@@ -1134,7 +1134,7 @@ static BOOL gfx3d_glLoadMatrix4x3(s32 v)
 	GFX_DELAY(30);
 
 	if (mode == MATRIXMODE_POSITION_VECTOR)
-		MatrixCopy(mtxCurrent[1], mtxCurrent[2]);
+		MatrixCopy(mtxCurrent[MATRIXMODE_POSITION], mtxCurrent[MATRIXMODE_POSITION_VECTOR]);
 
 	//printf("load4x3: matrix %d to: \n",mode); MatrixPrint(mtxCurrent[1]);
 	return TRUE;
 }
@@ -1155,7 +1155,7 @@ static BOOL gfx3d_glMultMatrix4x4(s32 v)
 
 	if (mode == MATRIXMODE_POSITION_VECTOR)
 	{
-		MatrixMultiply(mtxCurrent[1], mtxTemporal);
+		MatrixMultiply(mtxCurrent[MATRIXMODE_POSITION], mtxTemporal);
 		GFX_DELAY_M2(30);
 	}
@@ -1186,7 +1186,7 @@ static BOOL gfx3d_glMultMatrix4x3(s32 v)
 
 	if (mode == MATRIXMODE_POSITION_VECTOR)
 	{
-		MatrixMultiply (mtxCurrent[1], mtxTemporal);
+		MatrixMultiply (mtxCurrent[MATRIXMODE_POSITION], mtxTemporal);
 		GFX_DELAY_M2(30);
 	}
@@ -1219,7 +1219,7 @@ static BOOL gfx3d_glMultMatrix3x3(s32 v)
 
 	if (mode == MATRIXMODE_POSITION_VECTOR)
 	{
-		MatrixMultiply(mtxCurrent[1], mtxTemporal);
+		MatrixMultiply(mtxCurrent[MATRIXMODE_POSITION], mtxTemporal);
 		GFX_DELAY_M2(30);
 	}
@@ -1248,8 +1248,8 @@ static BOOL gfx3d_glScale(s32 v)
 	//note: pos-vector mode should not cause both matrices to scale.
 	//the whole purpose is to keep the vector matrix orthogonal
 	//so, I am leaving this commented out as an example of what not to do.
-	//if (mode == 2)
-	//	MatrixScale (mtxCurrent[1], scale);
+	//if (mode == MATRIXMODE_POSITION_VECTOR)
+	//	MatrixScale (mtxCurrent[MATRIXMODE_POSITION], scale);
 
 	return TRUE;
 }
@@ -1268,7 +1268,7 @@ static BOOL gfx3d_glTranslate(s32 v)
 
 	if (mode == MATRIXMODE_POSITION_VECTOR)
 	{
-		MatrixTranslate(mtxCurrent[1], trans);
+		MatrixTranslate(mtxCurrent[MATRIXMODE_POSITION], trans);
 		GFX_DELAY_M2(30);
 	}
@@ -1297,11 +1297,11 @@ static void gfx3d_glNormal(s32 v)
 	{
 		//SM64 highlight rendered star in main menu tests this
 		//also smackdown 2010 player textures tested this (needed cast on _s and _t)
-		last_s = (s32)(((s64)normal[0] * mtxCurrent[3][0] + (s64)normal[1] * mtxCurrent[3][4] + (s64)normal[2] * mtxCurrent[3][8] + (((s64)_s)<<24))>>24);
-		last_t = (s32)(((s64)normal[0] * mtxCurrent[3][1] + (s64)normal[1] * mtxCurrent[3][5] + (s64)normal[2] * mtxCurrent[3][9] + (((s64)_t)<<24))>>24);
+		last_s = (s32)(((s64)normal[0] * mtxCurrent[MATRIXMODE_TEXTURE][0] + (s64)normal[1] * mtxCurrent[MATRIXMODE_TEXTURE][4] + (s64)normal[2] * mtxCurrent[MATRIXMODE_TEXTURE][8] + (((s64)_s)<<24))>>24);
+		last_t = (s32)(((s64)normal[0] * mtxCurrent[MATRIXMODE_TEXTURE][1] + (s64)normal[1] * mtxCurrent[MATRIXMODE_TEXTURE][5] + (s64)normal[2] * mtxCurrent[MATRIXMODE_TEXTURE][9] + (((s64)_t)<<24))>>24);
 	}
 
-	MatrixMultVec3x3_fixed(mtxCurrent[2],normal);
+	MatrixMultVec3x3(mtxCurrent[MATRIXMODE_POSITION_VECTOR], normal);
 
 	//apply lighting model
 	u8 diffuse[3] = {
@@ -1395,8 +1395,8 @@ static void gfx3d_glTexCoord(s32 val)
 	if (texCoordTransformMode == TextureTransformationMode_TexCoordSource)
 	{
 		//dragon quest 4 overworld will test this
-		last_s = (s32) (( (s64)_s * mtxCurrent[3][0] + (s64)_t * mtxCurrent[3][4] + (s64)mtxCurrent[3][8] + (s64)mtxCurrent[3][12])>>12);
-		last_t = (s32) (( (s64)_s * mtxCurrent[3][1] + (s64)_t * mtxCurrent[3][5] + (s64)mtxCurrent[3][9] + (s64)mtxCurrent[3][13])>>12);
+		last_s = (s32) (( (s64)_s * mtxCurrent[MATRIXMODE_TEXTURE][0] + (s64)_t * mtxCurrent[MATRIXMODE_TEXTURE][4] + (s64)mtxCurrent[MATRIXMODE_TEXTURE][8] + (s64)mtxCurrent[MATRIXMODE_TEXTURE][12])>>12);
+		last_t = (s32) (( (s64)_s * mtxCurrent[MATRIXMODE_TEXTURE][1] + (s64)_t * mtxCurrent[MATRIXMODE_TEXTURE][5] + (s64)mtxCurrent[MATRIXMODE_TEXTURE][9] + (s64)mtxCurrent[MATRIXMODE_TEXTURE][13])>>12);
 	}
 	else if (texCoordTransformMode == TextureTransformationMode_None)
 	{
@@ -1684,13 +1684,13 @@ static BOOL gfx3d_glBoxTest(u32 v)
 		//MatrixMultVec4x4_M2(mtxCurrent[0], verts[i].coord);
 
 		//but change it all to floating point and do it that way instead
-		CACHE_ALIGN float temp1[16] = {mtxCurrent[1][0]/4096.0f,mtxCurrent[1][1]/4096.0f,mtxCurrent[1][2]/4096.0f,mtxCurrent[1][3]/4096.0f,mtxCurrent[1][4]/4096.0f,mtxCurrent[1][5]/4096.0f,mtxCurrent[1][6]/4096.0f,mtxCurrent[1][7]/4096.0f,mtxCurrent[1][8]/4096.0f,mtxCurrent[1][9]/4096.0f,mtxCurrent[1][10]/4096.0f,mtxCurrent[1][11]/4096.0f,mtxCurrent[1][12]/4096.0f,mtxCurrent[1][13]/4096.0f,mtxCurrent[1][14]/4096.0f,mtxCurrent[1][15]/4096.0f};
-		CACHE_ALIGN float temp0[16] = {mtxCurrent[0][0]/4096.0f,mtxCurrent[0][1]/4096.0f,mtxCurrent[0][2]/4096.0f,mtxCurrent[0][3]/4096.0f,mtxCurrent[0][4]/4096.0f,mtxCurrent[0][5]/4096.0f,mtxCurrent[0][6]/4096.0f,mtxCurrent[0][7]/4096.0f,mtxCurrent[0][8]/4096.0f,mtxCurrent[0][9]/4096.0f,mtxCurrent[0][10]/4096.0f,mtxCurrent[0][11]/4096.0f,mtxCurrent[0][12]/4096.0f,mtxCurrent[0][13]/4096.0f,mtxCurrent[0][14]/4096.0f,mtxCurrent[0][15]/4096.0f};
 
 		//DS_ALIGN(16) VERT_POS4f vert = { verts[i].x, verts[i].y, verts[i].z, verts[i].w };
-
-		_NOSSE_MatrixMultVec4x4(temp1,verts[i].coord);
-		_NOSSE_MatrixMultVec4x4(temp0,verts[i].coord);
+
+		//_MatrixMultVec4x4_NoSIMD(mtxCurrent[MATRIXMODE_POSITION], verts[i].coord);
+		//_MatrixMultVec4x4_NoSIMD(mtxCurrent[MATRIXMODE_PROJECTION], verts[i].coord);
+		MatrixMultVec4x4(mtxCurrent[MATRIXMODE_POSITION], verts[i].coord);
+		MatrixMultVec4x4(mtxCurrent[MATRIXMODE_PROJECTION], verts[i].coord);
 	}
 
 	//clip each poly
@@ -1742,12 +1742,9 @@ static BOOL gfx3d_glPosTest(u32 v)
 	PTind = 0;
 
 	PTcoords[3] = 1.0f;
-
-	CACHE_ALIGN float temp1[16] = {mtxCurrent[1][0]/4096.0f,mtxCurrent[1][1]/4096.0f,mtxCurrent[1][2]/4096.0f,mtxCurrent[1][3]/4096.0f,mtxCurrent[1][4]/4096.0f,mtxCurrent[1][5]/4096.0f,mtxCurrent[1][6]/4096.0f,mtxCurrent[1][7]/4096.0f,mtxCurrent[1][8]/4096.0f,mtxCurrent[1][9]/4096.0f,mtxCurrent[1][10]/4096.0f,mtxCurrent[1][11]/4096.0f,mtxCurrent[1][12]/4096.0f,mtxCurrent[1][13]/4096.0f,mtxCurrent[1][14]/4096.0f,mtxCurrent[1][15]/4096.0f};
-	CACHE_ALIGN float temp0[16] = {mtxCurrent[0][0]/4096.0f,mtxCurrent[0][1]/4096.0f,mtxCurrent[0][2]/4096.0f,mtxCurrent[0][3]/4096.0f,mtxCurrent[0][4]/4096.0f,mtxCurrent[0][5]/4096.0f,mtxCurrent[0][6]/4096.0f,mtxCurrent[0][7]/4096.0f,mtxCurrent[0][8]/4096.0f,mtxCurrent[0][9]/4096.0f,mtxCurrent[0][10]/4096.0f,mtxCurrent[0][11]/4096.0f,mtxCurrent[0][12]/4096.0f,mtxCurrent[0][13]/4096.0f,mtxCurrent[0][14]/4096.0f,mtxCurrent[0][15]/4096.0f};
-
-	MatrixMultVec4x4(temp1, PTcoords);
-	MatrixMultVec4x4(temp0, PTcoords);
+
+	MatrixMultVec4x4(mtxCurrent[MATRIXMODE_POSITION], PTcoords);
+	MatrixMultVec4x4(mtxCurrent[MATRIXMODE_PROJECTION], PTcoords);
 
 	MMU_new.gxstat.tb = 0;
@@ -1765,13 +1762,14 @@ static void gfx3d_glVecTest(u32 v)
 	//i am not sure exactly what it is doing, maybe it is testing to ensure
 	//that the normal vector for the point of interest is camera-facing.
 
-	CACHE_ALIGN float normal[4] = {	normalTable[v&1023],
-									normalTable[(v>>10)&1023],
-									normalTable[(v>>20)&1023],
-									0};
-
-	CACHE_ALIGN float temp[16] = {mtxCurrent[2][0]/4096.0f,mtxCurrent[2][1]/4096.0f,mtxCurrent[2][2]/4096.0f,mtxCurrent[2][3]/4096.0f,mtxCurrent[2][4]/4096.0f,mtxCurrent[2][5]/4096.0f,mtxCurrent[2][6]/4096.0f,mtxCurrent[2][7]/4096.0f,mtxCurrent[2][8]/4096.0f,mtxCurrent[2][9]/4096.0f,mtxCurrent[2][10]/4096.0f,mtxCurrent[2][11]/4096.0f,mtxCurrent[2][12]/4096.0f,mtxCurrent[2][13]/4096.0f,mtxCurrent[2][14]/4096.0f,mtxCurrent[2][15]/4096.0f};
-	MatrixMultVec4x4(temp, normal);
+	CACHE_ALIGN float normal[4] = {
+		normalTable[v&1023],
+		normalTable[(v>>10)&1023],
+		normalTable[(v>>20)&1023],
+		0
+	};
+
+	MatrixMultVec4x4(mtxCurrent[MATRIXMODE_POSITION_VECTOR], normal);
 
 	s16 x = (s16)(normal[0]*4096);
 	s16 y = (s16)(normal[1]*4096);
@@ -1853,7 +1851,7 @@ void gfx3d_UpdateToonTable(u8 offset, u32 val)
 s32 gfx3d_GetClipMatrix(const u32 index)
 {
 	//printf("reading clip matrix: %d\n",index);
-	return (s32)MatrixGetMultipliedIndex(index, mtxCurrent[0], mtxCurrent[1]);
+	return (s32)MatrixGetMultipliedIndex(index, mtxCurrent[MATRIXMODE_PROJECTION], mtxCurrent[MATRIXMODE_POSITION]);
 }
 
 s32 gfx3d_GetDirectionalMatrix(const u32 index)
@@ -1861,7 +1859,7 @@ s32 gfx3d_GetDirectionalMatrix(const u32 index)
 	const size_t _index = (((index / 3) * 4) + (index % 3));
 
 	//return (s32)(mtxCurrent[2][_index]*(1<<12));
-	return mtxCurrent[2][_index];
+	return mtxCurrent[MATRIXMODE_POSITION_VECTOR][_index];
 }
 
 void gfx3d_glAlphaFunc(u32 v)
diff --git a/desmume/src/matrix.cpp b/desmume/src/matrix.cpp
index b3c175aae..14f805207 100644
--- a/desmume/src/matrix.cpp
+++ b/desmume/src/matrix.cpp
@@ -1,6 +1,6 @@
 /*
 	Copyright (C) 2006-2007 shash
-	Copyright (C) 2007-2017 DeSmuME team
+	Copyright (C) 2007-2018 DeSmuME team
 
 	This file is free software: you can redistribute it and/or modify
 	it under the terms of the GNU General Public License as published by
@@ -24,118 +24,6 @@
 #include "matrix.h"
 #include "MMU.h"
 
-void _NOSSE_MatrixMultVec4x4 (const float *matrix, float *vecPtr)
-{
-	float x = vecPtr[0];
-	float y = vecPtr[1];
-	float z = vecPtr[2];
-	float w = vecPtr[3];
-
-	vecPtr[0] = x * matrix[0] + y * matrix[4] + z * matrix[ 8] + w * matrix[12];
-	vecPtr[1] = x * matrix[1] + y * matrix[5] + z * matrix[ 9] + w * matrix[13];
-	vecPtr[2] = x * matrix[2] + y * matrix[6] + z * matrix[10] + w * matrix[14];
-	vecPtr[3] = x * matrix[3] + y * matrix[7] + z * matrix[11] + w * matrix[15];
-}
-
-void MatrixMultVec4x4 (const s32 *matrix, s32 *vecPtr)
-{
-	const s32 x = vecPtr[0];
-	const s32 y = vecPtr[1];
-	const s32 z = vecPtr[2];
-	const s32 w = vecPtr[3];
-
-	vecPtr[0] = sfx32_shiftdown(fx32_mul(x,matrix[0]) + fx32_mul(y,matrix[4]) + fx32_mul(z,matrix [8]) + fx32_mul(w,matrix[12]));
-	vecPtr[1] = sfx32_shiftdown(fx32_mul(x,matrix[1]) + fx32_mul(y,matrix[5]) + fx32_mul(z,matrix[ 9]) + fx32_mul(w,matrix[13]));
-	vecPtr[2] = sfx32_shiftdown(fx32_mul(x,matrix[2]) + fx32_mul(y,matrix[6]) + fx32_mul(z,matrix[10]) + fx32_mul(w,matrix[14]));
-	vecPtr[3] = sfx32_shiftdown(fx32_mul(x,matrix[3]) + fx32_mul(y,matrix[7]) + fx32_mul(z,matrix[11]) + fx32_mul(w,matrix[15]));
-}
-
-void MatrixMultVec3x3_fixed(const s32 *matrix, s32 *vecPtr)
-{
-	const s32 x = vecPtr[0];
-	const s32 y = vecPtr[1];
-	const s32 z = vecPtr[2];
-
-	vecPtr[0] = sfx32_shiftdown(fx32_mul(x,matrix[0]) + fx32_mul(y,matrix[4]) + fx32_mul(z,matrix[8]));
-	vecPtr[1] = sfx32_shiftdown(fx32_mul(x,matrix[1]) + fx32_mul(y,matrix[5]) + fx32_mul(z,matrix[9]));
-	vecPtr[2] = sfx32_shiftdown(fx32_mul(x,matrix[2]) + fx32_mul(y,matrix[6]) + fx32_mul(z,matrix[10]));
-}
-
-//-------------------------
-//switched SSE functions: implementations for no SSE
-#ifndef ENABLE_SSE
-void MatrixMultVec4x4 (const float *matrix, float *vecPtr)
-{
-	_NOSSE_MatrixMultVec4x4(matrix, vecPtr);
-}
-
-
-void MatrixMultVec3x3 (const float *matrix, float *vecPtr)
-{
-	float x = vecPtr[0];
-	float y = vecPtr[1];
-	float z = vecPtr[2];
-
-	vecPtr[0] = x * matrix[0] + y * matrix[4] + z * matrix[ 8];
-	vecPtr[1] = x * matrix[1] + y * matrix[5] + z * matrix[ 9];
-	vecPtr[2] = x * matrix[2] + y * matrix[6] + z * matrix[10];
-}
-
-void MatrixMultiply (float *matrix, const float *rightMatrix)
-{
-	float tmpMatrix[16];
-
-	tmpMatrix[0] = (matrix[0]*rightMatrix[0])+(matrix[4]*rightMatrix[1])+(matrix[8]*rightMatrix[2])+(matrix[12]*rightMatrix[3]);
-	tmpMatrix[1] = (matrix[1]*rightMatrix[0])+(matrix[5]*rightMatrix[1])+(matrix[9]*rightMatrix[2])+(matrix[13]*rightMatrix[3]);
-	tmpMatrix[2] = (matrix[2]*rightMatrix[0])+(matrix[6]*rightMatrix[1])+(matrix[10]*rightMatrix[2])+(matrix[14]*rightMatrix[3]);
-	tmpMatrix[3] = (matrix[3]*rightMatrix[0])+(matrix[7]*rightMatrix[1])+(matrix[11]*rightMatrix[2])+(matrix[15]*rightMatrix[3]);
-
-	tmpMatrix[4] = (matrix[0]*rightMatrix[4])+(matrix[4]*rightMatrix[5])+(matrix[8]*rightMatrix[6])+(matrix[12]*rightMatrix[7]);
-	tmpMatrix[5] = (matrix[1]*rightMatrix[4])+(matrix[5]*rightMatrix[5])+(matrix[9]*rightMatrix[6])+(matrix[13]*rightMatrix[7]);
-	tmpMatrix[6] = (matrix[2]*rightMatrix[4])+(matrix[6]*rightMatrix[5])+(matrix[10]*rightMatrix[6])+(matrix[14]*rightMatrix[7]);
-	tmpMatrix[7] = (matrix[3]*rightMatrix[4])+(matrix[7]*rightMatrix[5])+(matrix[11]*rightMatrix[6])+(matrix[15]*rightMatrix[7]);
-
-	tmpMatrix[8] = (matrix[0]*rightMatrix[8])+(matrix[4]*rightMatrix[9])+(matrix[8]*rightMatrix[10])+(matrix[12]*rightMatrix[11]);
-	tmpMatrix[9] = (matrix[1]*rightMatrix[8])+(matrix[5]*rightMatrix[9])+(matrix[9]*rightMatrix[10])+(matrix[13]*rightMatrix[11]);
-	tmpMatrix[10] = (matrix[2]*rightMatrix[8])+(matrix[6]*rightMatrix[9])+(matrix[10]*rightMatrix[10])+(matrix[14]*rightMatrix[11]);
-	tmpMatrix[11] = (matrix[3]*rightMatrix[8])+(matrix[7]*rightMatrix[9])+(matrix[11]*rightMatrix[10])+(matrix[15]*rightMatrix[11]);
-
-	tmpMatrix[12] = (matrix[0]*rightMatrix[12])+(matrix[4]*rightMatrix[13])+(matrix[8]*rightMatrix[14])+(matrix[12]*rightMatrix[15]);
-	tmpMatrix[13] = (matrix[1]*rightMatrix[12])+(matrix[5]*rightMatrix[13])+(matrix[9]*rightMatrix[14])+(matrix[13]*rightMatrix[15]);
-	tmpMatrix[14] = (matrix[2]*rightMatrix[12])+(matrix[6]*rightMatrix[13])+(matrix[10]*rightMatrix[14])+(matrix[14]*rightMatrix[15]);
-	tmpMatrix[15] = (matrix[3]*rightMatrix[12])+(matrix[7]*rightMatrix[13])+(matrix[11]*rightMatrix[14])+(matrix[15]*rightMatrix[15]);
-
-	memcpy (matrix, tmpMatrix, sizeof(float)*16);
-}
-
-void MatrixTranslate (float *matrix, const float *ptr)
-{
-	matrix[12] += (matrix[0]*ptr[0])+(matrix[4]*ptr[1])+(matrix[ 8]*ptr[2]);
-	matrix[13] += (matrix[1]*ptr[0])+(matrix[5]*ptr[1])+(matrix[ 9]*ptr[2]);
-	matrix[14] += (matrix[2]*ptr[0])+(matrix[6]*ptr[1])+(matrix[10]*ptr[2]);
-	matrix[15] += (matrix[3]*ptr[0])+(matrix[7]*ptr[1])+(matrix[11]*ptr[2]);
-}
-
-void MatrixScale (float *matrix, const float *ptr)
-{
-	matrix[0] *= ptr[0];
-	matrix[1] *= ptr[0];
-	matrix[2] *= ptr[0];
-	matrix[3] *= ptr[0];
-
-	matrix[4] *= ptr[1];
-	matrix[5] *= ptr[1];
-	matrix[6] *= ptr[1];
-	matrix[7] *= ptr[1];
-
-	matrix[8] *= ptr[2];
-	matrix[9] *= ptr[2];
-	matrix[10] *= ptr[2];
-	matrix[11] *= ptr[2];
-}
-
-#endif //switched c/asm functions
-//-----------------------------------------
 
 void MatrixInit (s32 *matrix)
 {
@@ -345,51 +233,487 @@ void Vector4Copy(float *dst, const float *src)
 	dst[3] = src[3];
 }
 
+void _MatrixMultVec4x4_NoSIMD(const s32 *__restrict mtxPtr, float *__restrict vecPtr)
+{
+	const CACHE_ALIGN float mtxFloat[16] = {
+		mtxPtr[ 0] / 4096.0f,
+		mtxPtr[ 1] / 4096.0f,
+		mtxPtr[ 2] / 4096.0f,
+		mtxPtr[ 3] / 4096.0f,
+
+		mtxPtr[ 4] / 4096.0f,
+		mtxPtr[ 5] / 4096.0f,
+		mtxPtr[ 6] / 4096.0f,
+		mtxPtr[ 7] / 4096.0f,
+
+		mtxPtr[ 8] / 4096.0f,
+		mtxPtr[ 9] / 4096.0f,
+		mtxPtr[10] / 4096.0f,
+		mtxPtr[11] / 4096.0f,
+
+		mtxPtr[12] / 4096.0f,
+		mtxPtr[13] / 4096.0f,
+		mtxPtr[14] / 4096.0f,
+		mtxPtr[15] / 4096.0f
+	};
+
+	const float x = vecPtr[0];
+	const float y = vecPtr[1];
+	const float z = vecPtr[2];
+	const float w = vecPtr[3];
+
+	vecPtr[0] = (x * mtxFloat[0]) + (y * mtxFloat[4]) + (z * mtxFloat[ 8]) + (w * mtxFloat[12]);
+	vecPtr[1] = (x * mtxFloat[1]) + (y * mtxFloat[5]) + (z * mtxFloat[ 9]) + (w * mtxFloat[13]);
+	vecPtr[2] = (x * mtxFloat[2]) + (y * mtxFloat[6]) + (z * mtxFloat[10]) + (w * mtxFloat[14]);
+	vecPtr[3] = (x * mtxFloat[3]) + (y * mtxFloat[7]) + (z * mtxFloat[11]) + (w * mtxFloat[15]);
+}
 
-void MatrixMultiply (s32 *matrix, const s32 *rightMatrix)
+#ifdef ENABLE_SSE
+
+void MatrixMultVec4x4(const s32 *__restrict mtxPtr, float *__restrict vecPtr)
+{
+	const __m128 loadedVecPtr = _mm_load_ps(vecPtr);
+	const __m128 convertScalar = _mm_set1_ps(1.0f/4096.0f);
+
+#ifdef ENABLE_SSE2
+	__m128 row[4] = {
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 0)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 4)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 8)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 12)) )
+	};
+#else
+	const CACHE_ALIGN float mtxFloat[16] = {
+		(float)mtxPtr[0],
+		(float)mtxPtr[1],
+		(float)mtxPtr[2],
+		(float)mtxPtr[3],
+
+		(float)mtxPtr[4],
+		(float)mtxPtr[5],
+		(float)mtxPtr[6],
+		(float)mtxPtr[7],
+
+		(float)mtxPtr[8],
+		(float)mtxPtr[9],
+		(float)mtxPtr[10],
+		(float)mtxPtr[11],
+
+		(float)mtxPtr[12],
+		(float)mtxPtr[13],
+		(float)mtxPtr[14],
+		(float)mtxPtr[15]
+	};
+
+	__m128 row[4] = {
+		_mm_load_ps(mtxFloat + 0),
+		_mm_load_ps(mtxFloat + 4),
+		_mm_load_ps(mtxFloat + 8),
+		_mm_load_ps(mtxFloat + 12)
+	};
+#endif
+
+	row[0] = _mm_mul_ps(row[0], convertScalar);
+	row[1] = _mm_mul_ps(row[1], convertScalar);
+	row[2] = _mm_mul_ps(row[2], convertScalar);
+	row[3] = _mm_mul_ps(row[3], convertScalar);
+
+	const __m128 vec[4] = {
+		_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x00),
+		_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x55),
+		_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0xAA),
+		_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0xFF)
+	};
+
+	const __m128 calcVec = _mm_add_ps( _mm_mul_ps(row[0], vec[0]), _mm_add_ps(_mm_mul_ps(row[1], vec[1]), _mm_add_ps(_mm_mul_ps(row[2], vec[2]), _mm_mul_ps(row[3], vec[3]))) );
+	_mm_store_ps(vecPtr, calcVec);
+}
+
+void MatrixMultVec3x3(const s32 *__restrict mtxPtr, float *__restrict vecPtr)
+{
+	const __m128 loadedVecPtr = _mm_load_ps(vecPtr);
+	const __m128 convertScalar = _mm_set1_ps(1.0f/4096.0f);
+
+#ifdef ENABLE_SSE2
+	__m128 row[3] = {
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 0)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 4)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 8)) )
+	};
+#else
+	const CACHE_ALIGN float mtxFloat[16] = {
+		(float)mtxPtr[0],
+		(float)mtxPtr[1],
+		(float)mtxPtr[2],
+		(float)mtxPtr[3],
+
+		(float)mtxPtr[4],
+		(float)mtxPtr[5],
+		(float)mtxPtr[6],
+		(float)mtxPtr[7],
+
+		(float)mtxPtr[8],
+		(float)mtxPtr[9],
+		(float)mtxPtr[10],
+		(float)mtxPtr[11],
+
+		(float)mtxPtr[12],
+		(float)mtxPtr[13],
+		(float)mtxPtr[14],
+		(float)mtxPtr[15]
+	};
+
+	__m128 row[3] = {
+		_mm_load_ps(mtxFloat + 0),
+		_mm_load_ps(mtxFloat + 4),
+		_mm_load_ps(mtxFloat + 8)
+	};
+#endif
+
+	row[0] = _mm_mul_ps(row[0], convertScalar);
+	row[1] = _mm_mul_ps(row[1], convertScalar);
+	row[2] = _mm_mul_ps(row[2], convertScalar);
+
+	const __m128 vec[3] = {
+		_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x00),
+		_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x55),
+		_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0xAA)
+	};
+
+	const __m128 calcVec = _mm_add_ps( _mm_mul_ps(row[0], vec[0]), _mm_add_ps(_mm_mul_ps(row[1], vec[1]), _mm_mul_ps(row[2], vec[2])) );
+	_mm_store_ps(vecPtr, calcVec);
+}
+
+void MatrixTranslate(float *__restrict mtxPtr, const float *__restrict vecPtr)
+{
+	__m128 xmm4 = _mm_load_ps(vecPtr);
+	__m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101));
+	__m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010));
+	xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000));
+
+	xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(mtxPtr));
+	xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(mtxPtr+4));
+	xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(mtxPtr+8));
+	xmm4 = _mm_add_ps(xmm4,xmm5);
+	xmm4 = _mm_add_ps(xmm4,xmm6);
+	xmm4 = _mm_add_ps(xmm4,_mm_load_ps(mtxPtr+12));
+	_mm_store_ps(mtxPtr+12,xmm4);
+}
+
+void MatrixScale(float *__restrict mtxPtr, const float *__restrict vecPtr)
+{
+	__m128 xmm4 = _mm_load_ps(vecPtr);
+	__m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101));
+	__m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010));
+	xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000));
+
+	xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(mtxPtr));
+	xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(mtxPtr+4));
+	xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(mtxPtr+8));
+	_mm_store_ps(mtxPtr,xmm4);
+	_mm_store_ps(mtxPtr+4,xmm5);
+	_mm_store_ps(mtxPtr+8,xmm6);
+}
+
+void MatrixMultiply(float *__restrict mtxPtrA, const s32 *__restrict mtxPtrB)
+{
+	const __m128 convertScale = _mm_set1_ps(1.0f/4096.0f);
+
+#ifdef ENABLE_SSE2
+	__m128 rowB[4] = {
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 0)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 4)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 8)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 12)) )
+	};
+#else
+	const CACHE_ALIGN float mtxFloatB[16] = {
+		(float)mtxPtrB[0],
+		(float)mtxPtrB[1],
+		(float)mtxPtrB[2],
+		(float)mtxPtrB[3],
+
+		(float)mtxPtrB[4],
+		(float)mtxPtrB[5],
+		(float)mtxPtrB[6],
+		(float)mtxPtrB[7],
+
+		(float)mtxPtrB[8],
+		(float)mtxPtrB[9],
+		(float)mtxPtrB[10],
+		(float)mtxPtrB[11],
+
+		(float)mtxPtrB[12],
+		(float)mtxPtrB[13],
+		(float)mtxPtrB[14],
+		(float)mtxPtrB[15]
+	};
+
+	__m128 rowB[4] = {
+		_mm_load_ps(mtxFloatB + 0),
+		_mm_load_ps(mtxFloatB + 4),
+		_mm_load_ps(mtxFloatB + 8),
+		_mm_load_ps(mtxFloatB + 12)
+	};
+#endif
+
+	rowB[0] = _mm_mul_ps(rowB[0], convertScale);
+	rowB[1] = _mm_mul_ps(rowB[1], convertScale);
+	rowB[2] = _mm_mul_ps(rowB[2], convertScale);
+	rowB[3] = _mm_mul_ps(rowB[3], convertScale);
+
+	__m128 rowA[4] = {
+		_mm_load_ps(mtxPtrA + 0),
+		_mm_load_ps(mtxPtrA + 4),
+		_mm_load_ps(mtxPtrA + 8),
+		_mm_load_ps(mtxPtrA + 12)
+	};
+
+	__m128 vecB[4];
+	__m128 calcRow;
+
+	vecB[0] = _mm_shuffle_ps(rowB[0], rowB[0], 0x00);
+	vecB[1] = _mm_shuffle_ps(rowB[0], rowB[0], 0x55);
+	vecB[2] = _mm_shuffle_ps(rowB[0], rowB[0], 0xAA);
+	vecB[3] = _mm_shuffle_ps(rowB[0], rowB[0], 0xFF);
+	calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) );
+	_mm_store_ps(mtxPtrA + 0, calcRow);
+
+	vecB[0] = _mm_shuffle_ps(rowB[1], rowB[1], 0x00);
+	vecB[1] = _mm_shuffle_ps(rowB[1], rowB[1], 0x55);
+	vecB[2] = _mm_shuffle_ps(rowB[1], rowB[1], 0xAA);
+	vecB[3] = _mm_shuffle_ps(rowB[1], rowB[1], 0xFF);
+	calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) );
+	_mm_store_ps(mtxPtrA + 4, calcRow);
+
+	vecB[0] = _mm_shuffle_ps(rowB[2], rowB[2], 0x00);
+	vecB[1] = _mm_shuffle_ps(rowB[2], rowB[2], 0x55);
+	vecB[2] = _mm_shuffle_ps(rowB[2], rowB[2], 0xAA);
+	vecB[3] = _mm_shuffle_ps(rowB[2], rowB[2], 0xFF);
+	calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) );
+	_mm_store_ps(mtxPtrA + 8, calcRow);
+
+	vecB[0] = _mm_shuffle_ps(rowB[3], rowB[3], 0x00);
+	vecB[1] = _mm_shuffle_ps(rowB[3], rowB[3], 0x55);
+	vecB[2] = _mm_shuffle_ps(rowB[3], rowB[3], 0xAA);
+	vecB[3] = _mm_shuffle_ps(rowB[3], rowB[3], 0xFF);
+	calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) );
+	_mm_store_ps(mtxPtrA + 12, calcRow);
+}
+
+template <size_t NUM_ROWS>
+FORCEINLINE void vector_fix2float(float *mtxPtr, const float divisor)
+{
+	const __m128 divisor_v128 = _mm_set1_ps(divisor);
+
+	for (size_t i = 0; i < NUM_ROWS * 4; i+=4)
+	{
+		_mm_store_ps( mtxPtr + i, _mm_div_ps(_mm_load_ps(mtxPtr + i), divisor_v128) );
+	}
+}
+
+#else
+
+void MatrixMultVec4x4(const s32 *__restrict mtxPtr, float *__restrict vecPtr)
+{
+	_MatrixMultVec4x4_NoSIMD(mtxPtr, vecPtr);
+}
+
+void MatrixMultVec3x3(const s32 *__restrict mtxPtr, float *__restrict vecPtr)
+{
+	const CACHE_ALIGN float mtxFloat[16] = {
+		mtxPtr[ 0] / 4096.0f,
+		mtxPtr[ 1] / 4096.0f,
+		mtxPtr[ 2] / 4096.0f,
+		mtxPtr[ 3] / 4096.0f,
+
+		mtxPtr[ 4] / 4096.0f,
+		mtxPtr[ 5] / 4096.0f,
+		mtxPtr[ 6] / 4096.0f,
+		mtxPtr[ 7] / 4096.0f,
+
+		mtxPtr[ 8] / 4096.0f,
+		mtxPtr[ 9] / 4096.0f,
+		mtxPtr[10] / 4096.0f,
+		mtxPtr[11] / 4096.0f,
+
+		mtxPtr[12] / 4096.0f,
+		mtxPtr[13] / 4096.0f,
+		mtxPtr[14] / 4096.0f,
+		mtxPtr[15] / 4096.0f
+	};
+
+	const float x = vecPtr[0];
+	const float y = vecPtr[1];
+	const float z = vecPtr[2];
+
+	vecPtr[0] = (x * mtxFloat[0]) + (y * mtxFloat[4]) + (z * mtxFloat[ 8]);
+	vecPtr[1] = (x * mtxFloat[1]) + (y * mtxFloat[5]) + (z * mtxFloat[ 9]);
+	vecPtr[2] = (x * mtxFloat[2]) + (y * mtxFloat[6]) + (z * mtxFloat[10]);
+}
+
+void MatrixTranslate(float *__restrict mtxPtr, const float *__restrict vecPtr)
+{
+	mtxPtr[12] += (mtxPtr[0] * vecPtr[0]) + (mtxPtr[4] * vecPtr[1]) + (mtxPtr[ 8] * vecPtr[2]);
+	mtxPtr[13] += (mtxPtr[1] * vecPtr[0]) + (mtxPtr[5] * vecPtr[1]) + (mtxPtr[ 9] * vecPtr[2]);
+	mtxPtr[14] += (mtxPtr[2] * vecPtr[0]) + (mtxPtr[6] * vecPtr[1]) + (mtxPtr[10] * vecPtr[2]);
+	mtxPtr[15] += (mtxPtr[3] * vecPtr[0]) + (mtxPtr[7] * vecPtr[1]) + (mtxPtr[11] * vecPtr[2]);
+}
+
+void MatrixScale(float *__restrict mtxPtr, const float *__restrict vecPtr)
+{
+	mtxPtr[ 0] *= vecPtr[0];
+	mtxPtr[ 1] *= vecPtr[0];
+	mtxPtr[ 2] *= vecPtr[0];
+	mtxPtr[ 3] *= vecPtr[0];
+
+	mtxPtr[ 4] *= vecPtr[1];
+	mtxPtr[ 5] *= vecPtr[1];
+	mtxPtr[ 6] *= vecPtr[1];
+	mtxPtr[ 7] *= vecPtr[1];
+
+	mtxPtr[ 8] *= vecPtr[2];
+	mtxPtr[ 9] *= vecPtr[2];
+	mtxPtr[10] *= vecPtr[2];
+	mtxPtr[11] *= vecPtr[2];
+}
+
+void MatrixMultiply(float *__restrict mtxPtrA, const s32 *__restrict mtxPtrB)
+{
+	const CACHE_ALIGN float mtxFloatB[16] = {
+		(float)mtxPtrB[ 0],
+		(float)mtxPtrB[ 1],
+		(float)mtxPtrB[ 2],
+		(float)mtxPtrB[ 3],
+
+		(float)mtxPtrB[ 4],
+		(float)mtxPtrB[ 5],
+		(float)mtxPtrB[ 6],
+		(float)mtxPtrB[ 7],
+
+		(float)mtxPtrB[ 8],
+		(float)mtxPtrB[ 9],
+		(float)mtxPtrB[10],
+		(float)mtxPtrB[11],
+
+		(float)mtxPtrB[12],
+		(float)mtxPtrB[13],
+		(float)mtxPtrB[14],
+		(float)mtxPtrB[15]
+	};
+
+	float tmpMatrix[16];
+
+	tmpMatrix[0] = (mtxPtrA[ 0] * mtxFloatB[ 0]) + (mtxPtrA[ 4] * mtxFloatB[ 1]) + (mtxPtrA[ 8] * mtxFloatB[ 2]) + (mtxPtrA[12] * mtxFloatB[ 3]);
+	tmpMatrix[1] = (mtxPtrA[ 1] * mtxFloatB[ 0]) + (mtxPtrA[ 5] * mtxFloatB[ 1]) + (mtxPtrA[ 9] * mtxFloatB[ 2]) + (mtxPtrA[13] * mtxFloatB[ 3]);
+	tmpMatrix[2] = (mtxPtrA[ 2] * mtxFloatB[ 0]) + (mtxPtrA[ 6] * mtxFloatB[ 1]) + (mtxPtrA[10] * mtxFloatB[ 2]) + (mtxPtrA[14] * mtxFloatB[ 3]);
+	tmpMatrix[3] = (mtxPtrA[ 3] * mtxFloatB[ 0]) + (mtxPtrA[ 7] * mtxFloatB[ 1]) + (mtxPtrA[11] * mtxFloatB[ 2]) + (mtxPtrA[15] * mtxFloatB[ 3]);
+
+	tmpMatrix[4] = (mtxPtrA[ 0] * mtxFloatB[ 4]) + (mtxPtrA[ 4] * mtxFloatB[ 5]) + (mtxPtrA[ 8] * mtxFloatB[ 6]) + (mtxPtrA[12] * mtxFloatB[ 7]);
+	tmpMatrix[5] = (mtxPtrA[ 1] * mtxFloatB[ 4]) + (mtxPtrA[ 5] * mtxFloatB[ 5]) + (mtxPtrA[ 9] * mtxFloatB[ 6]) + (mtxPtrA[13] * mtxFloatB[ 7]);
+	tmpMatrix[6] = (mtxPtrA[ 2] * mtxFloatB[ 4]) + (mtxPtrA[ 6] * mtxFloatB[ 5]) + (mtxPtrA[10] * mtxFloatB[ 6]) + (mtxPtrA[14] * mtxFloatB[ 7]);
+	tmpMatrix[7] = (mtxPtrA[ 3] * mtxFloatB[ 4]) + (mtxPtrA[ 7] * mtxFloatB[ 5]) + (mtxPtrA[11] * mtxFloatB[ 6]) + (mtxPtrA[15] * mtxFloatB[ 7]);
+
+	tmpMatrix[8] = (mtxPtrA[ 0] * mtxFloatB[ 8]) + (mtxPtrA[ 4] * mtxFloatB[ 9]) + (mtxPtrA[ 8] * mtxFloatB[10]) + (mtxPtrA[12] * mtxFloatB[11]);
+	tmpMatrix[9] = (mtxPtrA[ 1] * mtxFloatB[ 8]) + (mtxPtrA[ 5] * mtxFloatB[ 9]) + (mtxPtrA[ 9] * mtxFloatB[10]) + (mtxPtrA[13] * mtxFloatB[11]);
+	tmpMatrix[10] = (mtxPtrA[ 2] * mtxFloatB[ 8]) + (mtxPtrA[ 6] * mtxFloatB[ 9]) + (mtxPtrA[10] * mtxFloatB[10]) + (mtxPtrA[14] * mtxFloatB[11]);
+	tmpMatrix[11] = (mtxPtrA[ 3] * mtxFloatB[ 8]) + (mtxPtrA[ 7] * mtxFloatB[ 9]) + (mtxPtrA[11] * mtxFloatB[10]) + (mtxPtrA[15] * mtxFloatB[11]);
+
+	tmpMatrix[12] = (mtxPtrA[ 0] * mtxFloatB[12]) + (mtxPtrA[ 4] * mtxFloatB[13]) + (mtxPtrA[ 8] * mtxFloatB[14]) + (mtxPtrA[12] * mtxFloatB[15]);
+	tmpMatrix[13] = (mtxPtrA[ 1] * mtxFloatB[12]) + (mtxPtrA[ 5] * mtxFloatB[13]) + (mtxPtrA[ 9] * mtxFloatB[14]) + (mtxPtrA[13] * mtxFloatB[15]);
+	tmpMatrix[14] = (mtxPtrA[ 2] * mtxFloatB[12]) + (mtxPtrA[ 6] * mtxFloatB[13]) + (mtxPtrA[10] * mtxFloatB[14]) + (mtxPtrA[14] * mtxFloatB[15]);
+	tmpMatrix[15] = (mtxPtrA[ 3] * mtxFloatB[12]) + (mtxPtrA[ 7] * mtxFloatB[13]) + (mtxPtrA[11] * mtxFloatB[14]) + (mtxPtrA[15] * mtxFloatB[15]);
+
+	memcpy(mtxPtrA, tmpMatrix, sizeof(float)*16);
+}
+
+template <size_t NUM_ROWS>
+FORCEINLINE void vector_fix2float(float *mtxPtr, const float divisor)
+{
+	for (size_t i = 0; i < NUM_ROWS * 4; i+=4)
+	{
+		mtxPtr[i+0] /= divisor;
+		mtxPtr[i+1] /= divisor;
+		mtxPtr[i+2] /= divisor;
+		mtxPtr[i+3] /= divisor;
+	}
+}
+
+#endif
+
+void MatrixMultVec4x4(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr)
+{
+	const s32 x = vecPtr[0];
+	const s32 y = vecPtr[1];
+	const s32 z = vecPtr[2];
+	const s32 w = vecPtr[3];
+
+	vecPtr[0] = sfx32_shiftdown( fx32_mul(x,mtxPtr[0]) + fx32_mul(y,mtxPtr[4]) + fx32_mul(z,mtxPtr[ 8]) + fx32_mul(w,mtxPtr[12]) );
+	vecPtr[1] = sfx32_shiftdown( fx32_mul(x,mtxPtr[1]) + fx32_mul(y,mtxPtr[5]) + fx32_mul(z,mtxPtr[ 9]) + fx32_mul(w,mtxPtr[13]) );
+	vecPtr[2] = sfx32_shiftdown( fx32_mul(x,mtxPtr[2]) + fx32_mul(y,mtxPtr[6]) + fx32_mul(z,mtxPtr[10]) + fx32_mul(w,mtxPtr[14]) );
+	vecPtr[3] = sfx32_shiftdown( fx32_mul(x,mtxPtr[3]) + fx32_mul(y,mtxPtr[7]) + fx32_mul(z,mtxPtr[11]) + fx32_mul(w,mtxPtr[15]) );
+}
+
+void MatrixMultVec3x3(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr)
+{
+	const s32 x = vecPtr[0];
+	const s32 y = vecPtr[1];
+	const s32 z = vecPtr[2];
+
+	vecPtr[0] = sfx32_shiftdown( fx32_mul(x,mtxPtr[0]) + fx32_mul(y,mtxPtr[4]) + fx32_mul(z,mtxPtr[ 8]) );
+	vecPtr[1] = sfx32_shiftdown( fx32_mul(x,mtxPtr[1]) + fx32_mul(y,mtxPtr[5]) + fx32_mul(z,mtxPtr[ 9]) );
+	vecPtr[2] = sfx32_shiftdown( fx32_mul(x,mtxPtr[2]) + fx32_mul(y,mtxPtr[6]) + fx32_mul(z,mtxPtr[10]) );
+}
+
+void MatrixTranslate(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr)
+{
+	mtxPtr[12] = sfx32_shiftdown( fx32_mul(mtxPtr[0], vecPtr[0]) + fx32_mul(mtxPtr[4], vecPtr[1]) + fx32_mul(mtxPtr[ 8], vecPtr[2]) + fx32_shiftup(mtxPtr[12]) );
+	mtxPtr[13] = sfx32_shiftdown( fx32_mul(mtxPtr[1], vecPtr[0]) + fx32_mul(mtxPtr[5], vecPtr[1]) + fx32_mul(mtxPtr[ 9], vecPtr[2]) + fx32_shiftup(mtxPtr[13]) );
+	mtxPtr[14] = sfx32_shiftdown( fx32_mul(mtxPtr[2], vecPtr[0]) + fx32_mul(mtxPtr[6], vecPtr[1]) + fx32_mul(mtxPtr[10], vecPtr[2]) + fx32_shiftup(mtxPtr[14]) );
+	mtxPtr[15] = sfx32_shiftdown( fx32_mul(mtxPtr[3], vecPtr[0]) + fx32_mul(mtxPtr[7], vecPtr[1]) + fx32_mul(mtxPtr[11], vecPtr[2]) + fx32_shiftup(mtxPtr[15]) );
+}
+
+void MatrixScale(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr)
+{
+	mtxPtr[ 0] = sfx32_shiftdown( fx32_mul(mtxPtr[ 0], vecPtr[0]) );
+	mtxPtr[ 1] = sfx32_shiftdown( fx32_mul(mtxPtr[ 1], vecPtr[0]) );
+	mtxPtr[ 2] = sfx32_shiftdown( fx32_mul(mtxPtr[ 2], vecPtr[0]) );
+	mtxPtr[ 3] = sfx32_shiftdown( fx32_mul(mtxPtr[ 3], vecPtr[0]) );
+
+	mtxPtr[ 4] = sfx32_shiftdown( fx32_mul(mtxPtr[ 4], vecPtr[1]) );
+	mtxPtr[ 5] = sfx32_shiftdown( fx32_mul(mtxPtr[ 5], vecPtr[1]) );
+	mtxPtr[ 6] = sfx32_shiftdown( fx32_mul(mtxPtr[ 6], vecPtr[1]) );
+	mtxPtr[ 7] = sfx32_shiftdown( fx32_mul(mtxPtr[ 7], vecPtr[1]) );
+
+	mtxPtr[ 8] = sfx32_shiftdown( fx32_mul(mtxPtr[ 8], vecPtr[2]) );
+	mtxPtr[ 9] = sfx32_shiftdown( fx32_mul(mtxPtr[ 9], vecPtr[2]) );
+	mtxPtr[10] = sfx32_shiftdown( fx32_mul(mtxPtr[10], vecPtr[2]) );
+	mtxPtr[11] = sfx32_shiftdown( fx32_mul(mtxPtr[11], vecPtr[2]) );
+}
+
+void MatrixMultiply(s32 *__restrict mtxPtrA, const s32 *__restrict mtxPtrB)
 {
 	s32 tmpMatrix[16];
-
-	tmpMatrix[0] = sfx32_shiftdown(fx32_mul(matrix[0],rightMatrix[0])+fx32_mul(matrix[4],rightMatrix[1])+fx32_mul(matrix[8],rightMatrix[2])+fx32_mul(matrix[12],rightMatrix[3]));
-	tmpMatrix[1] = sfx32_shiftdown(fx32_mul(matrix[1],rightMatrix[0])+fx32_mul(matrix[5],rightMatrix[1])+fx32_mul(matrix[9],rightMatrix[2])+fx32_mul(matrix[13],rightMatrix[3]));
-	tmpMatrix[2] = sfx32_shiftdown(fx32_mul(matrix[2],rightMatrix[0])+fx32_mul(matrix[6],rightMatrix[1])+fx32_mul(matrix[10],rightMatrix[2])+fx32_mul(matrix[14],rightMatrix[3]));
-	tmpMatrix[3] = sfx32_shiftdown(fx32_mul(matrix[3],rightMatrix[0])+fx32_mul(matrix[7],rightMatrix[1])+fx32_mul(matrix[11],rightMatrix[2])+fx32_mul(matrix[15],rightMatrix[3]));
-
-	tmpMatrix[4] = sfx32_shiftdown(fx32_mul(matrix[0],rightMatrix[4])+fx32_mul(matrix[4],rightMatrix[5])+fx32_mul(matrix[8],rightMatrix[6])+fx32_mul(matrix[12],rightMatrix[7]));
-	tmpMatrix[5] = sfx32_shiftdown(fx32_mul(matrix[1],rightMatrix[4])+fx32_mul(matrix[5],rightMatrix[5])+fx32_mul(matrix[9],rightMatrix[6])+fx32_mul(matrix[13],rightMatrix[7]));
-	tmpMatrix[6] = sfx32_shiftdown(fx32_mul(matrix[2],rightMatrix[4])+fx32_mul(matrix[6],rightMatrix[5])+fx32_mul(matrix[10],rightMatrix[6])+fx32_mul(matrix[14],rightMatrix[7]));
-	tmpMatrix[7] = sfx32_shiftdown(fx32_mul(matrix[3],rightMatrix[4])+fx32_mul(matrix[7],rightMatrix[5])+fx32_mul(matrix[11],rightMatrix[6])+fx32_mul(matrix[15],rightMatrix[7]));
-
-	tmpMatrix[8] = sfx32_shiftdown(fx32_mul(matrix[0],rightMatrix[8])+fx32_mul(matrix[4],rightMatrix[9])+fx32_mul(matrix[8],rightMatrix[10])+fx32_mul(matrix[12],rightMatrix[11]));
-	tmpMatrix[9] = sfx32_shiftdown(fx32_mul(matrix[1],rightMatrix[8])+fx32_mul(matrix[5],rightMatrix[9])+fx32_mul(matrix[9],rightMatrix[10])+fx32_mul(matrix[13],rightMatrix[11]));
-	tmpMatrix[10] = sfx32_shiftdown(fx32_mul(matrix[2],rightMatrix[8])+fx32_mul(matrix[6],rightMatrix[9])+fx32_mul(matrix[10],rightMatrix[10])+fx32_mul(matrix[14],rightMatrix[11]));
-	tmpMatrix[11] = sfx32_shiftdown(fx32_mul(matrix[3],rightMatrix[8])+fx32_mul(matrix[7],rightMatrix[9])+fx32_mul(matrix[11],rightMatrix[10])+fx32_mul(matrix[15],rightMatrix[11]));
-
-	tmpMatrix[12] = sfx32_shiftdown(fx32_mul(matrix[0],rightMatrix[12])+fx32_mul(matrix[4],rightMatrix[13])+fx32_mul(matrix[8],rightMatrix[14])+fx32_mul(matrix[12],rightMatrix[15]));
-	tmpMatrix[13] = sfx32_shiftdown(fx32_mul(matrix[1],rightMatrix[12])+fx32_mul(matrix[5],rightMatrix[13])+fx32_mul(matrix[9],rightMatrix[14])+fx32_mul(matrix[13],rightMatrix[15]));
-	tmpMatrix[14] = sfx32_shiftdown(fx32_mul(matrix[2],rightMatrix[12])+fx32_mul(matrix[6],rightMatrix[13])+fx32_mul(matrix[10],rightMatrix[14])+fx32_mul(matrix[14],rightMatrix[15]));
-	tmpMatrix[15] = sfx32_shiftdown(fx32_mul(matrix[3],rightMatrix[12])+fx32_mul(matrix[7],rightMatrix[13])+fx32_mul(matrix[11],rightMatrix[14])+fx32_mul(matrix[15],rightMatrix[15]));
-
-	memcpy(matrix,tmpMatrix,sizeof(s32)*16);
+
+	tmpMatrix[ 0] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[ 0])+fx32_mul(mtxPtrA[4],mtxPtrB[ 1])+fx32_mul(mtxPtrA[ 8],mtxPtrB[ 2])+fx32_mul(mtxPtrA[12],mtxPtrB[ 3]) );
+	tmpMatrix[ 1] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[ 0])+fx32_mul(mtxPtrA[5],mtxPtrB[ 1])+fx32_mul(mtxPtrA[ 9],mtxPtrB[ 2])+fx32_mul(mtxPtrA[13],mtxPtrB[ 3]) );
+	tmpMatrix[ 2] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[ 0])+fx32_mul(mtxPtrA[6],mtxPtrB[ 1])+fx32_mul(mtxPtrA[10],mtxPtrB[ 2])+fx32_mul(mtxPtrA[14],mtxPtrB[ 3]) );
+	tmpMatrix[ 3] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[ 0])+fx32_mul(mtxPtrA[7],mtxPtrB[ 1])+fx32_mul(mtxPtrA[11],mtxPtrB[ 2])+fx32_mul(mtxPtrA[15],mtxPtrB[ 3]) );
+
+	tmpMatrix[ 4] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[ 4])+fx32_mul(mtxPtrA[4],mtxPtrB[ 5])+fx32_mul(mtxPtrA[ 8],mtxPtrB[ 6])+fx32_mul(mtxPtrA[12],mtxPtrB[ 7]) );
+	tmpMatrix[ 5] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[ 4])+fx32_mul(mtxPtrA[5],mtxPtrB[ 5])+fx32_mul(mtxPtrA[ 9],mtxPtrB[ 6])+fx32_mul(mtxPtrA[13],mtxPtrB[ 7]) );
+	tmpMatrix[ 6] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[ 4])+fx32_mul(mtxPtrA[6],mtxPtrB[ 5])+fx32_mul(mtxPtrA[10],mtxPtrB[ 6])+fx32_mul(mtxPtrA[14],mtxPtrB[ 7]) );
+	tmpMatrix[ 7] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[ 4])+fx32_mul(mtxPtrA[7],mtxPtrB[ 5])+fx32_mul(mtxPtrA[11],mtxPtrB[ 6])+fx32_mul(mtxPtrA[15],mtxPtrB[ 7]) );
+
+	tmpMatrix[ 8] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[ 8])+fx32_mul(mtxPtrA[4],mtxPtrB[ 9])+fx32_mul(mtxPtrA[ 8],mtxPtrB[10])+fx32_mul(mtxPtrA[12],mtxPtrB[11]) );
+	tmpMatrix[ 9] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[ 8])+fx32_mul(mtxPtrA[5],mtxPtrB[ 9])+fx32_mul(mtxPtrA[ 9],mtxPtrB[10])+fx32_mul(mtxPtrA[13],mtxPtrB[11]) );
+	tmpMatrix[10] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[ 8])+fx32_mul(mtxPtrA[6],mtxPtrB[ 9])+fx32_mul(mtxPtrA[10],mtxPtrB[10])+fx32_mul(mtxPtrA[14],mtxPtrB[11]) );
+	tmpMatrix[11] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[ 8])+fx32_mul(mtxPtrA[7],mtxPtrB[ 9])+fx32_mul(mtxPtrA[11],mtxPtrB[10])+fx32_mul(mtxPtrA[15],mtxPtrB[11]) );
+
+	tmpMatrix[12] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[12])+fx32_mul(mtxPtrA[4],mtxPtrB[13])+fx32_mul(mtxPtrA[ 8],mtxPtrB[14])+fx32_mul(mtxPtrA[12],mtxPtrB[15]) );
+	tmpMatrix[13] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[12])+fx32_mul(mtxPtrA[5],mtxPtrB[13])+fx32_mul(mtxPtrA[ 9],mtxPtrB[14])+fx32_mul(mtxPtrA[13],mtxPtrB[15]) );
+	tmpMatrix[14] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[12])+fx32_mul(mtxPtrA[6],mtxPtrB[13])+fx32_mul(mtxPtrA[10],mtxPtrB[14])+fx32_mul(mtxPtrA[14],mtxPtrB[15]) );
+	tmpMatrix[15] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[12])+fx32_mul(mtxPtrA[7],mtxPtrB[13])+fx32_mul(mtxPtrA[11],mtxPtrB[14])+fx32_mul(mtxPtrA[15],mtxPtrB[15]) );
+
+	memcpy(mtxPtrA, tmpMatrix, sizeof(s32)*16);
 }
-
-void MatrixScale(s32 *matrix, const s32 *ptr)
-{
-	//zero 21-sep-2010 - verified unrolling seems faster on my cpu
-	MACRODO_N(12,
-		matrix[X] = sfx32_shiftdown(fx32_mul(matrix[X],ptr[X>>2]))
-	);
-}
-
-void MatrixTranslate(s32 *matrix, const s32 *ptr)
-{
-	MACRODO_N(4,
-	{
-		s64 temp = fx32_shiftup(matrix[X+12]);
-		temp += fx32_mul(matrix[X+0],ptr[0]);
-		temp += fx32_mul(matrix[X+4],ptr[1]);
-		temp += fx32_mul(matrix[X+8],ptr[2]);
-		matrix[X+12] = sfx32_shiftdown(temp);
-	});
-}
-
diff --git a/desmume/src/matrix.h b/desmume/src/matrix.h
index 1d0b667df..442e4ee87 100644
--- a/desmume/src/matrix.h
+++ b/desmume/src/matrix.h
@@ -1,6 +1,6 @@
 /*
 	Copyright (C) 2006-2007 shash
-	Copyright (C) 2007-2017 DeSmuME team
+	Copyright (C) 2007-2018 DeSmuME team
 
 	This file is free software: you can redistribute it and/or modify
 	it under the terms of the GNU General Public License as published by
@@ -77,7 +77,24 @@ void Vector3Scale(float *dst, const float scale);
 void Vector3Copy(float *dst, const float *src);
 void Vector3Normalize(float *dst);
 
-void Vector4Copy(float *dst, const float *src);
+void Vector4Copy(float *dst, const float *src);
+
+
+void _MatrixMultVec4x4_NoSIMD(const s32 *__restrict mtxPtr, float *__restrict vecPtr);
+
+void MatrixMultVec4x4(const s32 *__restrict mtxPtr, float *__restrict vecPtr);
+void MatrixMultVec3x3(const s32 *__restrict mtxPtr, float *__restrict vecPtr);
+void MatrixTranslate(float *__restrict mtxPtr, const float *__restrict vecPtr);
+void MatrixScale(float *__restrict mtxPtr, const float *__restrict vecPtr);
+void MatrixMultiply(float *__restrict mtxPtrA, const s32 *__restrict mtxPtrB);
+
+template <size_t NUM_ROWS> FORCEINLINE void vector_fix2float(float *mtxPtr, const float divisor);
+
+void MatrixMultVec4x4(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr);
+void MatrixMultVec3x3(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr);
+void MatrixTranslate(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr);
+void MatrixScale(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr);
+void MatrixMultiply(s32 *__restrict mtxPtrA, const s32 *__restrict mtxPtrB);
 
 //these functions are an unreliable, inaccurate floor.
 //it should only be used for positive numbers
@@ -296,151 +313,4 @@ static void memset_u32_fast(void *dst, const u32 val)
 
 #endif // SIMD Functions
 
-// NOSSE version always used in gfx3d.cpp
-void _NOSSE_MatrixMultVec4x4 (const float *matrix, float *vecPtr);
-void MatrixMultVec3x3_fixed(const s32 *matrix, s32 *vecPtr);
-
-//---------------------------
-//switched SSE functions
-#ifdef ENABLE_SSE
-
-struct SSE_MATRIX
-{
-	SSE_MATRIX(const float *matrix)
-		: row0(_mm_load_ps(matrix))
-		, row1(_mm_load_ps(matrix+4))
-		, row2(_mm_load_ps(matrix+8))
-		, row3(_mm_load_ps(matrix+12))
-	{}
-
-	union {
-		__m128 rows[4];
-		struct { __m128 row0; __m128 row1; __m128 row2; __m128 row3; };
-	};
-
-};
-
-FORCEINLINE __m128 _util_MatrixMultVec4x4_(const SSE_MATRIX &mat, __m128 vec)
-{
-	__m128 xmm5 = _mm_shuffle_ps(vec, vec, B8(01010101));
-	__m128 xmm6 = _mm_shuffle_ps(vec, vec, B8(10101010));
-	__m128 xmm7 = _mm_shuffle_ps(vec, vec, B8(11111111));
-	__m128 xmm4 = _mm_shuffle_ps(vec, vec, B8(00000000));
-
-	xmm4 = _mm_mul_ps(xmm4,mat.row0);
-	xmm5 = _mm_mul_ps(xmm5,mat.row1);
-	xmm6 = _mm_mul_ps(xmm6,mat.row2);
-	xmm7 = _mm_mul_ps(xmm7,mat.row3);
-	xmm4 = _mm_add_ps(xmm4,xmm5);
-	xmm4 = _mm_add_ps(xmm4,xmm6);
-	xmm4 = _mm_add_ps(xmm4,xmm7);
-	return xmm4;
-}
-
-FORCEINLINE void MatrixMultiply(float * matrix, const float * rightMatrix)
-{
-	//this seems to generate larger code, including many movaps, but maybe it is less harsh on the registers than the
-	//more hand-tailored approach
-	__m128 row0 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix));
-	__m128 row1 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix+4));
-	__m128 row2 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix+8));
-	__m128 row3 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix+12));
-	_mm_store_ps(matrix,row0);
-	_mm_store_ps(matrix+4,row1);
-	_mm_store_ps(matrix+8,row2);
-	_mm_store_ps(matrix+12,row3);
-}
-
-FORCEINLINE void MatrixMultVec4x4(const float *matrix, float *vecPtr)
-{
-	_mm_store_ps(vecPtr,_util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(vecPtr)));
-}
-
-FORCEINLINE void MatrixMultVec3x3(const float * matrix, float * vecPtr)
-{
-	const __m128 vec = _mm_load_ps(vecPtr);
-
-	__m128 xmm5 = _mm_shuffle_ps(vec, vec, B8(01010101));
-	__m128 xmm6 = _mm_shuffle_ps(vec, vec, B8(10101010));
-	__m128 xmm4 = _mm_shuffle_ps(vec, vec, B8(00000000));
-
-	const SSE_MATRIX mat(matrix);
-
-	xmm4 = _mm_mul_ps(xmm4,mat.row0);
-	xmm5 = _mm_mul_ps(xmm5,mat.row1);
-	xmm6 = _mm_mul_ps(xmm6,mat.row2);
-	xmm4 = _mm_add_ps(xmm4,xmm5);
-	xmm4 = _mm_add_ps(xmm4,xmm6);
-
-	_mm_store_ps(vecPtr,xmm4);
-}
-
-FORCEINLINE void MatrixTranslate(float *matrix, const float *ptr)
-{
-	__m128 xmm4 = _mm_load_ps(ptr);
-	__m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101));
-	__m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010));
-	xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000));
-
-	xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(matrix));
-	xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(matrix+4));
-	xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(matrix+8));
-	xmm4 = _mm_add_ps(xmm4,xmm5);
-	xmm4 = _mm_add_ps(xmm4,xmm6);
-	xmm4 = _mm_add_ps(xmm4,_mm_load_ps(matrix+12));
-	_mm_store_ps(matrix+12,xmm4);
-}
-
-FORCEINLINE void MatrixScale(float *matrix, const float *ptr)
-{
-	__m128 xmm4 = _mm_load_ps(ptr);
-	__m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101));
-	__m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010));
-	xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000));
-
-	xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(matrix));
-	xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(matrix+4));
-	xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(matrix+8));
-	_mm_store_ps(matrix,xmm4);
-	_mm_store_ps(matrix+4,xmm5);
-	_mm_store_ps(matrix+8,xmm6);
-}
-
-template<int NUM_ROWS>
-FORCEINLINE void vector_fix2float(float* matrix, const float divisor)
-{
-	CTASSERT(NUM_ROWS==3 || NUM_ROWS==4);
-
-	const __m128 val = _mm_set_ps1(divisor);
-
-	_mm_store_ps(matrix,_mm_div_ps(_mm_load_ps(matrix),val));
-	_mm_store_ps(matrix+4,_mm_div_ps(_mm_load_ps(matrix+4),val));
-	_mm_store_ps(matrix+8,_mm_div_ps(_mm_load_ps(matrix+8),val));
-	if(NUM_ROWS==4)
-		_mm_store_ps(matrix+12,_mm_div_ps(_mm_load_ps(matrix+12),val));
-}
-
-#else //no sse
-
-void MatrixMultVec4x4 (const float *matrix, float *vecPtr);
-void MatrixMultVec3x3(const float * matrix, float * vecPtr);
-void MatrixMultiply(float * matrix, const float * rightMatrix);
-void MatrixTranslate(float *matrix, const float *ptr);
-void MatrixScale(float * matrix, const float * ptr);
-
-template<int NUM_ROWS>
-FORCEINLINE void vector_fix2float(float* matrix, const float divisor)
-{
-	for(int i=0;i