From c7bb41e4b10a4c86ffff3823c76ad5e4537e86f5 Mon Sep 17 00:00:00 2001 From: rogerman Date: Mon, 19 Feb 2018 11:43:55 -0800 Subject: [PATCH] matrix.cpp: Rework all matrix function parameters for explicit array sizing in order to aid compiler optimization and (hopefully) aid in code readability. Also add SSE4.1 versions for the main matrix functions. --- desmume/src/MMU.cpp | 9 +- desmume/src/frontend/windows/matrixView.cpp | 8 +- desmume/src/gfx3d.cpp | 257 ++++-- desmume/src/gfx3d.h | 23 +- desmume/src/matrix.cpp | 940 +++++++++++++------- desmume/src/matrix.h | 86 +- 6 files changed, 853 insertions(+), 470 deletions(-) diff --git a/desmume/src/MMU.cpp b/desmume/src/MMU.cpp index e26a84404..5deece4a2 100644 --- a/desmume/src/MMU.cpp +++ b/desmume/src/MMU.cpp @@ -1,7 +1,7 @@ /* Copyright (C) 2006 yopyop Copyright (C) 2007 shash - Copyright (C) 2007-2017 DeSmuME team + Copyright (C) 2007-2018 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -1936,7 +1936,6 @@ static INLINE void write_timer(int proc, int timerIndex, u16 val) NDS_RescheduleTimers(); } -extern CACHE_ALIGN MatrixStack mtxStack[4]; u32 TGXSTAT::read32() { u32 ret = 0; @@ -1945,8 +1944,8 @@ u32 TGXSTAT::read32() // stack position always equal zero. possible timings is wrong // using in "The Wild West" - int proj_level = mtxStack[MATRIXMODE_PROJECTION].position & 1; - int mv_level = mtxStack[MATRIXMODE_POSITION].position & 31; + int proj_level = mtxStackProjection.position & 1; + int mv_level = mtxStackPosition.position & 31; ret |= ((proj_level << 13) | (mv_level << 8)); ret |= sb<<14; //stack busy @@ -1981,7 +1980,7 @@ void TGXSTAT::write32(const u32 val) // Writing "1" to Bit15 does reset the Error Flag (Bit15), // and additionally resets the Projection Stack Pointer (Bit13) // (and probably (?) also the Texture Stack Pointer)?? 
- mtxStack[0].position = 0; + mtxStackProjection.position = 0; se = 0; //clear stack error flag } //printf("gxstat write: %08X while gxfifo.size=%d\n",val,gxFIFO.size); diff --git a/desmume/src/frontend/windows/matrixView.cpp b/desmume/src/frontend/windows/matrixView.cpp index 060a9b343..de3d248e5 100644 --- a/desmume/src/frontend/windows/matrixView.cpp +++ b/desmume/src/frontend/windows/matrixView.cpp @@ -64,7 +64,7 @@ void MatrixView_OnPaintPositionMatrix(HWND hwnd) stackIndex = SendMessage(hStackCombo, CB_GETCURSEL, 0, 0) - 1; - gfx3d_glGetMatrix(MATRIXMODE_POSITION, stackIndex, matrix); + gfx3d_glGetMatrix(stackIndex, matrix); MatrixView_SetMatrix(hwnd, idcGroup, matrix); } @@ -87,7 +87,7 @@ void MatrixView_OnPaintDirectionMatrix(HWND hwnd) stackIndex = SendMessage(hStackCombo, CB_GETCURSEL, 0, 0) - 1; - gfx3d_glGetMatrix(MATRIXMODE_POSITION_VECTOR, stackIndex, matrix); + gfx3d_glGetMatrix(stackIndex, matrix); MatrixView_SetMatrix(hwnd, idcGroup, matrix); } @@ -106,7 +106,7 @@ void MatrixView_OnPaintProjectionMatrix(HWND hwnd) float mat[16]; - gfx3d_glGetMatrix(MATRIXMODE_PROJECTION, -1, mat); + gfx3d_glGetMatrix(-1, mat); MatrixView_SetMatrix(hwnd, idcGroup, mat); } @@ -125,7 +125,7 @@ void MatrixView_OnPaintTextureMatrix(HWND hwnd) float mat[16]; - gfx3d_glGetMatrix(MATRIXMODE_TEXTURE, -1, mat); + gfx3d_glGetMatrix(-1, mat); MatrixView_SetMatrix(hwnd, idcGroup, mat); } diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp index f7c52e56c..2bb267a53 100644 --- a/desmume/src/gfx3d.cpp +++ b/desmume/src/gfx3d.cpp @@ -1,6 +1,6 @@ /* Copyright (C) 2006 yopyop - Copyright (C) 2008-2017 DeSmuME team + Copyright (C) 2008-2018 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -288,12 +288,10 @@ static u16 *_gfx3d_color16 = NULL; // Matrix stack handling //TODO: decouple stack pointers from matrix stack type -CACHE_ALIGN MatrixStack mtxStack[4] = { - MatrixStack(1, 0), // Projection stack - MatrixStack(32, 1), // Coordinate stack - MatrixStack(32, 2), // Directional stack - MatrixStack(1, 3), // Texture stack -}; +CACHE_ALIGN MatrixStack mtxStackProjection; +CACHE_ALIGN MatrixStack mtxStackPosition; +CACHE_ALIGN MatrixStack mtxStackPositionVector; +CACHE_ALIGN MatrixStack mtxStackTexture; static CACHE_ALIGN s32 mtxCurrent[4][16]; static CACHE_ALIGN s32 mtxTemporal[16]; @@ -615,9 +613,10 @@ void gfx3d_reset() MatrixInit(mtxCurrent[MATRIXMODE_TEXTURE]); MatrixInit(mtxTemporal); - MatrixStackInit(&mtxStack[0]); - MatrixStackInit(&mtxStack[1]); - MatrixStackInit(&mtxStack[2]); + MatrixStackInit(&mtxStackProjection); + MatrixStackInit(&mtxStackPosition); + MatrixStackInit(&mtxStackPositionVector); + MatrixStackInit(&mtxStackTexture); clCmd = 0; clInd = 0; @@ -692,12 +691,12 @@ static s32 GEM_SaturateAndShiftdown36To32(const s64 val) return fx32_shiftdown(val); } -static void GEM_TransformVertex(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr) +static void GEM_TransformVertex(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4]) { - const s32 x = vecPtr[0]; - const s32 y = vecPtr[1]; - const s32 z = vecPtr[2]; - const s32 w = vecPtr[3]; + const s32 x = vec[0]; + const s32 y = vec[1]; + const s32 z = vec[2]; + const s32 w = vec[3]; //saturation logic is most carefully tested by: //+ spectrobes beyond the portals excavation blower and drill tools: sets very large overflowing +x,+y in the modelview matrix to push things offscreen @@ -709,10 +708,10 @@ static void GEM_TransformVertex(const 
s32 *__restrict mtxPtr, s32 *__restrict ve //+ SM64: outside castle skybox //+ NSMB: mario head screen wipe - vecPtr[0] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtxPtr[0]) + GEM_Mul32x32To64(y,mtxPtr[4]) + GEM_Mul32x32To64(z,mtxPtr[ 8]) + GEM_Mul32x32To64(w,mtxPtr[12]) ); - vecPtr[1] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtxPtr[1]) + GEM_Mul32x32To64(y,mtxPtr[5]) + GEM_Mul32x32To64(z,mtxPtr[ 9]) + GEM_Mul32x32To64(w,mtxPtr[13]) ); - vecPtr[2] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtxPtr[2]) + GEM_Mul32x32To64(y,mtxPtr[6]) + GEM_Mul32x32To64(z,mtxPtr[10]) + GEM_Mul32x32To64(w,mtxPtr[14]) ); - vecPtr[3] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtxPtr[3]) + GEM_Mul32x32To64(y,mtxPtr[7]) + GEM_Mul32x32To64(z,mtxPtr[11]) + GEM_Mul32x32To64(w,mtxPtr[15]) ); + vec[0] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtx[0]) + GEM_Mul32x32To64(y,mtx[4]) + GEM_Mul32x32To64(z,mtx[ 8]) + GEM_Mul32x32To64(w,mtx[12]) ); + vec[1] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtx[1]) + GEM_Mul32x32To64(y,mtx[5]) + GEM_Mul32x32To64(z,mtx[ 9]) + GEM_Mul32x32To64(w,mtx[13]) ); + vec[2] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtx[2]) + GEM_Mul32x32To64(y,mtx[6]) + GEM_Mul32x32To64(z,mtx[10]) + GEM_Mul32x32To64(w,mtx[14]) ); + vec[3] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtx[3]) + GEM_Mul32x32To64(y,mtx[7]) + GEM_Mul32x32To64(z,mtx[11]) + GEM_Mul32x32To64(w,mtx[15]) ); } //--------------- @@ -975,25 +974,37 @@ static void gfx3d_glPushMatrix() //printf("%d %d %d %d -> ",mtxStack[0].position,mtxStack[1].position,mtxStack[2].position,mtxStack[3].position); //printf("PUSH mode: %d -> ",mode,mtxStack[mode].position); - if(mode == MATRIXMODE_PROJECTION || mode == MATRIXMODE_TEXTURE) + if (mode == MATRIXMODE_PROJECTION || mode == MATRIXMODE_TEXTURE) { - MatrixCopy(MatrixStackGetPos(&mtxStack[mode], 0), mtxCurrent[mode]); - - u32& index = mtxStack[mode].position; - if(index == 1) MMU_new.gxstat.se = 1; //unknown if this applies to the texture matrix - index += 1; - index &= 1; + if (mode == MATRIXMODE_PROJECTION) + { + MatrixCopy(mtxStackProjection.matrix[0], mtxCurrent[mode]); + + u32 &index = mtxStackProjection.position; + if (index == 1) MMU_new.gxstat.se = 1; + index += 1; + index &= 1; + } + else + { + MatrixCopy(mtxStackTexture.matrix[0], mtxCurrent[mode]); + + u32 &index = mtxStackTexture.position; + if (index == 1) MMU_new.gxstat.se = 1; //unknown if this applies to the texture matrix + index += 1; + index &= 1; + } } else { - u32& index = mtxStack[MATRIXMODE_POSITION].position; + u32 &index = mtxStackPosition.position; + + MatrixCopy(mtxStackPosition.matrix[index & 31], mtxCurrent[MATRIXMODE_POSITION]); + MatrixCopy(mtxStackPositionVector.matrix[index & 31], mtxCurrent[MATRIXMODE_POSITION_VECTOR]); - MatrixCopy(MatrixStackGetPos(&mtxStack[MATRIXMODE_POSITION], index&31), mtxCurrent[MATRIXMODE_POSITION]); - MatrixCopy(MatrixStackGetPos(&mtxStack[MATRIXMODE_POSITION_VECTOR], index&31), mtxCurrent[MATRIXMODE_POSITION_VECTOR]); - index += 1; index &= 63; - if(index >= 32) MMU_new.gxstat.se = 1; //(not sure, this might be off by 1) + if (index >= 32) MMU_new.gxstat.se = 1; //(not sure, this might be off by 1) } //printf("%d %d %d %d\n",mtxStack[0].position,mtxStack[1].position,mtxStack[2].position,mtxStack[3].position); @@ -1010,25 +1021,35 @@ static void gfx3d_glPopMatrix(u32 v) //printf("%d %d %d %d -> ",mtxStack[0].position,mtxStack[1].position,mtxStack[2].position,mtxStack[3].position); //printf("POP (%d): mode: %d -> 
",v,mode,mtxStack[mode].position); - if(mode == MATRIXMODE_PROJECTION || mode == MATRIXMODE_TEXTURE) + if (mode == MATRIXMODE_PROJECTION || mode == MATRIXMODE_TEXTURE) { //parameter is ignored and treated as sensible (always 1) - - u32& index = mtxStack[mode].position; - index ^= 1; - if(index == 1) MMU_new.gxstat.se = 1; //unknown if this applies to the texture matrix - MatrixCopy(mtxCurrent[mode], MatrixStackGetPos(&mtxStack[mode], 0)); + + if (mode == MATRIXMODE_PROJECTION) + { + u32 &index = mtxStackProjection.position; + index ^= 1; + if (index == 1) MMU_new.gxstat.se = 1; + MatrixCopy(mtxCurrent[mode], mtxStackProjection.matrix[0]); + } + else + { + u32 &index = mtxStackTexture.position; + index ^= 1; + if (index == 1) MMU_new.gxstat.se = 1; //unknown if this applies to the texture matrix + MatrixCopy(mtxCurrent[mode], mtxStackTexture.matrix[0]); + } } else { - u32& index = mtxStack[MATRIXMODE_POSITION].position; + u32 &index = mtxStackPosition.position; index -= v & 63; index &= 63; - if(index >= 32) MMU_new.gxstat.se = 1; //(not sure, this might be off by 1) - - MatrixCopy(mtxCurrent[MATRIXMODE_POSITION], MatrixStackGetPos(&mtxStack[MATRIXMODE_POSITION], index&31)); - MatrixCopy(mtxCurrent[MATRIXMODE_POSITION_VECTOR], MatrixStackGetPos(&mtxStack[MATRIXMODE_POSITION_VECTOR], index&31)); + if (index >= 32) MMU_new.gxstat.se = 1; //(not sure, this might be off by 1) + + MatrixCopy(mtxCurrent[MATRIXMODE_POSITION], mtxStackPosition.matrix[index & 31]); + MatrixCopy(mtxCurrent[MATRIXMODE_POSITION_VECTOR], mtxStackPositionVector.matrix[index & 31]); } //printf("%d %d %d %d\n",mtxStack[0].position,mtxStack[1].position,mtxStack[2].position,mtxStack[3].position); @@ -1041,22 +1062,29 @@ static void gfx3d_glStoreMatrix(u32 v) //printf("%d %d %d %d -> ",mtxStack[0].position,mtxStack[1].position,mtxStack[2].position,mtxStack[3].position); //printf("STORE (%d): mode: %d -> ",v,mode,mtxStack[mode].position); - if(mode == MATRIXMODE_PROJECTION || mode == MATRIXMODE_TEXTURE) + if (mode == MATRIXMODE_PROJECTION || mode == MATRIXMODE_TEXTURE) { //parameter ignored and treated as sensible v = 0; - - MatrixStackLoadMatrix(&mtxStack[mode], v, mtxCurrent[mode]); + + if (mode == MATRIXMODE_PROJECTION) + { + MatrixCopy(mtxStackProjection.matrix[0], mtxCurrent[MATRIXMODE_PROJECTION]); + } + else + { + MatrixCopy(mtxStackTexture.matrix[0], mtxCurrent[MATRIXMODE_TEXTURE]); + } } else { v &= 31; //out of bounds function fully properly, but set errors (not sure, this might be off by 1) - if(v >= 31) MMU_new.gxstat.se = 1; - - MatrixStackLoadMatrix(&mtxStack[MATRIXMODE_POSITION], v, mtxCurrent[MATRIXMODE_POSITION]); - MatrixStackLoadMatrix(&mtxStack[MATRIXMODE_POSITION_VECTOR], v, mtxCurrent[MATRIXMODE_POSITION_VECTOR]); + if (v >= 31) MMU_new.gxstat.se = 1; + + MatrixCopy(mtxStackPosition.matrix[v], mtxCurrent[MATRIXMODE_POSITION]); + MatrixCopy(mtxStackPositionVector.matrix[v], mtxCurrent[MATRIXMODE_POSITION_VECTOR]); } //printf("%d %d %d %d\n",mtxStack[0].position,mtxStack[1].position,mtxStack[2].position,mtxStack[3].position); @@ -1066,19 +1094,27 @@ static void gfx3d_glStoreMatrix(u32 v) static void gfx3d_glRestoreMatrix(u32 v) { - if(mode == MATRIXMODE_PROJECTION || mode == MATRIXMODE_TEXTURE) + if (mode == MATRIXMODE_PROJECTION || mode == MATRIXMODE_TEXTURE) { //parameter ignored and treated as sensible v = 0; - MatrixCopy(mtxCurrent[mode], MatrixStackGetPos(&mtxStack[mode], v)); + + if (mode == MATRIXMODE_PROJECTION) + { + MatrixCopy(mtxCurrent[MATRIXMODE_PROJECTION], mtxStackProjection.matrix[0]); + } + 
else + { + MatrixCopy(mtxCurrent[MATRIXMODE_TEXTURE], mtxStackTexture.matrix[0]); + } } else { //out of bounds errors function fully properly, but set errors - MMU_new.gxstat.se = v>=31; //(not sure, this might be off by 1) - - MatrixCopy(mtxCurrent[MATRIXMODE_POSITION], MatrixStackGetPos(&mtxStack[MATRIXMODE_POSITION], v)); - MatrixCopy(mtxCurrent[MATRIXMODE_POSITION_VECTOR], MatrixStackGetPos(&mtxStack[MATRIXMODE_POSITION_VECTOR], v)); + MMU_new.gxstat.se = (v >= 31) ? 1 : 0; //(not sure, this might be off by 1) + + MatrixCopy(mtxCurrent[MATRIXMODE_POSITION], mtxStackPosition.matrix[v]); + MatrixCopy(mtxCurrent[MATRIXMODE_POSITION_VECTOR], mtxStackPositionVector.matrix[v]); } @@ -1853,7 +1889,7 @@ void gfx3d_UpdateToonTable(u8 offset, u32 val) s32 gfx3d_GetClipMatrix(const u32 index) { //printf("reading clip matrix: %d\n",index); - return (s32)MatrixGetMultipliedIndex(index, mtxCurrent[MATRIXMODE_PROJECTION], mtxCurrent[MATRIXMODE_POSITION]); + return MatrixGetMultipliedIndex(index, mtxCurrent[MATRIXMODE_PROJECTION], mtxCurrent[MATRIXMODE_POSITION]); } s32 gfx3d_GetDirectionalMatrix(const u32 index) @@ -2491,20 +2527,37 @@ void gfx3d_sendCommand(u32 cmd, u32 param) //-------------- //other misc stuff -void gfx3d_glGetMatrix(const MatrixMode m_mode, int index, float *dst) +template +void gfx3d_glGetMatrix(const int index, float (&dst)[16]) { - //if(index == -1) - //{ - // MatrixCopy(dest, mtxCurrent[m_mode]); - // return; - //} - - //MatrixCopy(dest, MatrixStackGetPos(&mtxStack[m_mode], index)); - - const s32 *src = (index == -1) ? mtxCurrent[m_mode] : MatrixStackGetPos(&mtxStack[m_mode], index); - - for (size_t i = 0; i < 16; i++) - dst[i] = src[i]/4096.0f; + if (index == -1) + { + MatrixCopy(dst, mtxCurrent[MODE]); + } + else + { + switch (MODE) + { + case MATRIXMODE_PROJECTION: + MatrixCopy(dst, mtxStackProjection.matrix[0]); + break; + + case MATRIXMODE_POSITION: + MatrixCopy(dst, mtxStackPosition.matrix[0]); + break; + + case MATRIXMODE_POSITION_VECTOR: + MatrixCopy(dst, mtxStackPositionVector.matrix[0]); + break; + + case MATRIXMODE_TEXTURE: + MatrixCopy(dst, mtxStackTexture.matrix[0]); + break; + + default: + break; + } + } } void gfx3d_glGetLightDirection(const size_t index, u32 &dst) @@ -2632,12 +2685,35 @@ void gfx3d_savestate(EMUFILE &os) for (size_t i = 0; i < polylist->count; i++) polylist->list[i].save(os); - for (size_t i = 0; i < ARRAY_SIZE(mtxStack); i++) + // Write matrix stack data + os.write_32LE(mtxStackProjection.position); + for (size_t j = 0; j < 16; j++) { - os.write_32LE(mtxStack[i].position); - - for (size_t j = 0; j < mtxStack[i].size*16; j++) - os.write_32LE(mtxStack[i].matrix[j]); + os.write_32LE(mtxStackProjection.matrix[0][j]); + } + + os.write_32LE(mtxStackPosition.position); + for (size_t i = 0; i < MatrixStack::size; i++) + { + for (size_t j = 0; j < 16; j++) + { + os.write_32LE(mtxStackPosition.matrix[i][j]); + } + } + + os.write_32LE(mtxStackPositionVector.position); + for (size_t i = 0; i < MatrixStack::size; i++) + { + for (size_t j = 0; j < 16; j++) + { + os.write_32LE(mtxStackPositionVector.matrix[i][j]); + } + } + + os.write_32LE(mtxStackTexture.position); + for (size_t j = 0; j < 16; j++) + { + os.write_32LE(mtxStackTexture.matrix[0][j]); } gxf_hardware.savestate(os); @@ -2703,12 +2779,35 @@ bool gfx3d_loadstate(EMUFILE &is, int size) if (version >= 2) { - for (size_t i = 0; i < ARRAY_SIZE(mtxStack); i++) + // Read matrix stack data + is.read_32LE(mtxStackProjection.position); + for (size_t j = 0; j < 16; j++) { - 
is.read_32LE(mtxStack[i].position); - - for (size_t j = 0; j < mtxStack[i].size*16; j++) - is.read_32LE(mtxStack[i].matrix[j]); + is.read_32LE(mtxStackProjection.matrix[0][j]); + } + + is.read_32LE(mtxStackPosition.position); + for (size_t i = 0; i < MatrixStack::size; i++) + { + for (size_t j = 0; j < 16; j++) + { + is.read_32LE(mtxStackPosition.matrix[i][j]); + } + } + + is.read_32LE(mtxStackPositionVector.position); + for (size_t i = 0; i < MatrixStack::size; i++) + { + for (size_t j = 0; j < 16; j++) + { + is.read_32LE(mtxStackPositionVector.matrix[i][j]); + } + } + + is.read_32LE(mtxStackTexture.position); + for (size_t j = 0; j < 16; j++) + { + is.read_32LE(mtxStackTexture.matrix[0][j]); } } diff --git a/desmume/src/gfx3d.h b/desmume/src/gfx3d.h index b1b4580bd..ab6cdd577 100644 --- a/desmume/src/gfx3d.h +++ b/desmume/src/gfx3d.h @@ -1,6 +1,6 @@ /* Copyright (C) 2006 yopyop - Copyright (C) 2008-2017 DeSmuME team + Copyright (C) 2008-2018 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -23,7 +23,8 @@ #include #include -#include "types.h" +#include "types.h" +#include "matrix.h" #include "GPU.h" class EMUFILE; @@ -74,16 +75,12 @@ class EMUFILE; // 15-bit to 24-bit depth formula from http://nocash.emubase.de/gbatek.htm#ds3drearplane extern CACHE_ALIGN u32 dsDepthExtend_15bit_to_24bit[32768]; -#define DS_DEPTH15TO24(depth) ( dsDepthExtend_15bit_to_24bit[(depth) & 0x7FFF] ) - -// MATRIX MODES -enum MatrixMode -{ - MATRIXMODE_PROJECTION = 0, - MATRIXMODE_POSITION = 1, - MATRIXMODE_POSITION_VECTOR = 2, - MATRIXMODE_TEXTURE = 3 -}; +#define DS_DEPTH15TO24(depth) ( dsDepthExtend_15bit_to_24bit[(depth) & 0x7FFF] ) + +extern CACHE_ALIGN MatrixStack mtxStackProjection; +extern CACHE_ALIGN MatrixStack mtxStackPosition; +extern CACHE_ALIGN MatrixStack mtxStackPositionVector; +extern CACHE_ALIGN MatrixStack mtxStackTexture; // POLYGON PRIMITIVE TYPES enum PolygonPrimitiveType @@ -633,7 +630,7 @@ void gfx3d_sendCommandToFIFO(u32 val); void gfx3d_sendCommand(u32 cmd, u32 param); //other misc stuff -void gfx3d_glGetMatrix(const MatrixMode mode, int index, float *dst); +template void gfx3d_glGetMatrix(const int index, float (&dst)[16]); void gfx3d_glGetLightDirection(const size_t index, u32 &dst); void gfx3d_glGetLightColor(const size_t index, u32 &dst); diff --git a/desmume/src/matrix.cpp b/desmume/src/matrix.cpp index b7c95a5bb..0586908df 100644 --- a/desmume/src/matrix.cpp +++ b/desmume/src/matrix.cpp @@ -25,17 +25,17 @@ #include "MMU.h" -void MatrixInit(s32 *mtxPtr) +void MatrixInit(s32 (&mtx)[16]) { - MatrixIdentity(mtxPtr); + MatrixIdentity(mtx); } -void MatrixInit(float *mtxPtr) +void MatrixInit(float (&mtx)[16]) { - MatrixIdentity(mtxPtr); + MatrixIdentity(mtx); } -void MatrixIdentity(s32 *mtxPtr) +void MatrixIdentity(s32 (&mtx)[16]) { static const CACHE_ALIGN s32 mtxIdentity[16] = { (1 << 12), 0, 0, 0, @@ -44,10 +44,10 @@ void MatrixIdentity(s32 *mtxPtr) 0, 0, 0, (1 << 12) }; - memcpy(mtxPtr, mtxIdentity, sizeof(s32)*16); + memcpy(mtx, mtxIdentity, sizeof(s32)*16); } -void MatrixIdentity(float *mtxPtr) +void MatrixIdentity(float (&mtx)[16]) { static const CACHE_ALIGN float mtxIdentity[16] = { 1.0f, 0.0f, 0.0f, 0.0f, @@ -56,37 +56,37 @@ void MatrixIdentity(float *mtxPtr) 0.0f, 0.0f, 0.0f, 1.0f }; - memcpy(mtxPtr, mtxIdentity, sizeof(float)*16); + memcpy(mtx, mtxIdentity, sizeof(float)*16); } -void MatrixSet(s32 *mtxPtr, const size_t x, const size_t y, const s32 value) +void MatrixSet(s32 
(&mtx)[16], const size_t x, const size_t y, const s32 value) { - mtxPtr[x+(y<<2)] = value; + mtx[x+(y<<2)] = value; } -void MatrixSet(float *mtxPtr, const size_t x, const size_t y, const float value) +void MatrixSet(float (&mtx)[16], const size_t x, const size_t y, const float value) { - mtxPtr[x+(y<<2)] = value; + mtx[x+(y<<2)] = value; } -void MatrixSet(float *mtxPtr, const size_t x, const size_t y, const s32 value) +void MatrixSet(float (&mtx)[16], const size_t x, const size_t y, const s32 value) { - mtxPtr[x+(y<<2)] = value / 4096.0f; + mtx[x+(y<<2)] = (float)value / 4096.0f; } -void MatrixCopy(s32 *mtxDst, const s32 *mtxSrc) +void MatrixCopy(s32 (&mtxDst)[16], const s32 (&mtxSrc)[16]) { // We're going to assume that the two buffers are not the same. memcpy(mtxDst, mtxSrc, sizeof(s32)*16); } -void MatrixCopy(float *mtxDst, const float *mtxSrc) +void MatrixCopy(float (&mtxDst)[16], const float (&mtxSrc)[16]) { // We're going to assume that the two buffers are not the same. memcpy(mtxDst, mtxSrc, sizeof(float)*16); } -void MatrixCopy(float *mtxDst, const s32 *mtxSrc) +void MatrixCopy(float (&__restrict mtxDst)[16], const s32 (&__restrict mtxSrc)[16]) { mtxDst[ 0] = mtxSrc[ 0] / 4096.0f; mtxDst[ 1] = mtxSrc[ 1] / 4096.0f; @@ -109,76 +109,64 @@ void MatrixCopy(float *mtxDst, const s32 *mtxSrc) mtxDst[15] = mtxSrc[15] / 4096.0f; } -int MatrixCompare(const s32 *mtxDst, const s32 *mtxSrc) +int MatrixCompare(const s32 (&mtxDst)[16], const s32 (&mtxSrc)[16]) { return memcmp(mtxDst, mtxSrc, sizeof(s32)*16); } -int MatrixCompare(const float *mtxDst, const float *mtxSrc) +int MatrixCompare(const float (&mtxDst)[16], const float (&mtxSrc)[16]) { return memcmp(mtxDst, mtxSrc, sizeof(float)*16); } -s32 MatrixGetMultipliedIndex(const u32 index, s32 *matrix, s32 *rightMatrix) +s32 MatrixGetMultipliedIndex(const u32 index, const s32 (&mtxA)[16], const s32 (&mtxB)[16]) { - const size_t iMod = index%4, iDiv = (index>>2)<<2; - - s64 temp = ((s64)matrix[iMod ]*rightMatrix[iDiv ])+((s64)matrix[iMod+ 4]*rightMatrix[iDiv+1])+ - ((s64)matrix[iMod+8]*rightMatrix[iDiv+2])+((s64)matrix[iMod+12]*rightMatrix[iDiv+3]); - - return (s32)(temp>>12); + assert(index < 16); + + const size_t iMod = index % 4; + const size_t iDiv = (index >> 2) << 2; + + const s32 temp = sfx32_shiftdown( fx32_mul(mtxA[iMod ], mtxB[iDiv ]) + fx32_mul(mtxA[iMod+ 4], mtxB[iDiv+1]) + fx32_mul(mtxA[iMod+8], mtxB[iDiv+2]) + fx32_mul(mtxA[iMod+12], mtxB[iDiv+3]) ); + return temp; } -void MatrixStackInit(MatrixStack *stack) +float MatrixGetMultipliedIndex(const u32 index, const float (&mtxA)[16], const float (&mtxB)[16]) { - for (int i = 0; i < stack->size; i++) + assert(index < 16); + + const size_t iMod = index % 4; + const size_t iDiv = (index >> 2) << 2; + + const float temp = (mtxA[iMod ] * mtxB[iDiv ]) + (mtxA[iMod+ 4] * mtxB[iDiv+1]) + (mtxA[iMod+8] * mtxB[iDiv+2]) + (mtxA[iMod+12] * mtxB[iDiv+3]); + return temp; +} + +template +void MatrixStackInit(MatrixStack *stack) +{ + for (size_t i = 0; i < MatrixStack::size; i++) { - MatrixInit(&stack->matrix[i*16]); + MatrixInit(stack->matrix[i]); } + stack->position = 0; } -void MatrixStackSetMaxSize (MatrixStack *stack, int size) +template +s32* MatrixStackGet(MatrixStack *stack) { - int i; - - stack->size = size; - - if (stack->matrix != NULL) { - free (stack->matrix); - } - stack->matrix = new s32[stack->size*16*sizeof(s32)]; - - for (i = 0; i < stack->size; i++) - { - MatrixInit (&stack->matrix[i*16]); - } + return stack->matrix[stack->position]; } +template void MatrixStackInit(MatrixStack 
*stack); +template void MatrixStackInit(MatrixStack *stack); +template void MatrixStackInit(MatrixStack *stack); +template void MatrixStackInit(MatrixStack *stack); -MatrixStack::MatrixStack(int size, int type) -{ - MatrixStackSetMaxSize(this,size); - this->type = type; -} - - -s32* MatrixStackGetPos(MatrixStack *stack, const size_t pos) -{ - assert(pos < stack->size); - return &stack->matrix[pos*16]; -} - -s32* MatrixStackGet (MatrixStack *stack) -{ - return &stack->matrix[stack->position*16]; -} - -void MatrixStackLoadMatrix (MatrixStack *stack, const size_t pos, const s32 *ptr) -{ - assert(pos < stack->size); - MatrixCopy(&stack->matrix[pos*16], ptr); -} +template s32* MatrixStackGet(MatrixStack *stack); +template s32* MatrixStackGet(MatrixStack *stack); +template s32* MatrixStackGet(MatrixStack *stack); +template s32* MatrixStackGet(MatrixStack *stack); void Vector2Copy(float *dst, const float *src) { @@ -271,76 +259,76 @@ void Vector4Copy(float *dst, const float *src) dst[3] = src[3]; } -void _MatrixMultVec4x4_NoSIMD(const s32 *__restrict mtxPtr, float *__restrict vecPtr) +void _MatrixMultVec4x4_NoSIMD(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]) { const CACHE_ALIGN float mtxFloat[16] = { - mtxPtr[ 0] / 4096.0f, - mtxPtr[ 1] / 4096.0f, - mtxPtr[ 2] / 4096.0f, - mtxPtr[ 3] / 4096.0f, + mtx[ 0] / 4096.0f, + mtx[ 1] / 4096.0f, + mtx[ 2] / 4096.0f, + mtx[ 3] / 4096.0f, - mtxPtr[ 4] / 4096.0f, - mtxPtr[ 5] / 4096.0f, - mtxPtr[ 6] / 4096.0f, - mtxPtr[ 7] / 4096.0f, + mtx[ 4] / 4096.0f, + mtx[ 5] / 4096.0f, + mtx[ 6] / 4096.0f, + mtx[ 7] / 4096.0f, - mtxPtr[ 8] / 4096.0f, - mtxPtr[ 9] / 4096.0f, - mtxPtr[10] / 4096.0f, - mtxPtr[11] / 4096.0f, + mtx[ 8] / 4096.0f, + mtx[ 9] / 4096.0f, + mtx[10] / 4096.0f, + mtx[11] / 4096.0f, - mtxPtr[12] / 4096.0f, - mtxPtr[13] / 4096.0f, - mtxPtr[14] / 4096.0f, - mtxPtr[15] / 4096.0f + mtx[12] / 4096.0f, + mtx[13] / 4096.0f, + mtx[14] / 4096.0f, + mtx[15] / 4096.0f }; - const float x = vecPtr[0]; - const float y = vecPtr[1]; - const float z = vecPtr[2]; - const float w = vecPtr[3]; + const float x = vec[0]; + const float y = vec[1]; + const float z = vec[2]; + const float w = vec[3]; - vecPtr[0] = (x * mtxFloat[0]) + (y * mtxFloat[4]) + (z * mtxFloat[ 8]) + (w * mtxFloat[12]); - vecPtr[1] = (x * mtxFloat[1]) + (y * mtxFloat[5]) + (z * mtxFloat[ 9]) + (w * mtxFloat[13]); - vecPtr[2] = (x * mtxFloat[2]) + (y * mtxFloat[6]) + (z * mtxFloat[10]) + (w * mtxFloat[14]); - vecPtr[3] = (x * mtxFloat[3]) + (y * mtxFloat[7]) + (z * mtxFloat[11]) + (w * mtxFloat[15]); + vec[0] = (x * mtxFloat[0]) + (y * mtxFloat[4]) + (z * mtxFloat[ 8]) + (w * mtxFloat[12]); + vec[1] = (x * mtxFloat[1]) + (y * mtxFloat[5]) + (z * mtxFloat[ 9]) + (w * mtxFloat[13]); + vec[2] = (x * mtxFloat[2]) + (y * mtxFloat[6]) + (z * mtxFloat[10]) + (w * mtxFloat[14]); + vec[3] = (x * mtxFloat[3]) + (y * mtxFloat[7]) + (z * mtxFloat[11]) + (w * mtxFloat[15]); } #ifdef ENABLE_SSE -void MatrixMultVec4x4(const s32 *__restrict mtxPtr, float *__restrict vecPtr) +void MatrixMultVec4x4(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]) { - const __m128 loadedVecPtr = _mm_load_ps(vecPtr); + const __m128 loadedVec = _mm_load_ps(vec); const __m128 convertScalar = _mm_set1_ps(1.0f/4096.0f); #ifdef ENABLE_SSE2 __m128 row[4] = { - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 0)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 4)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 8)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 12)) ) + 
_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtx + 0)) ), + _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtx + 4)) ), + _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtx + 8)) ), + _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtx + 12)) ) }; #else const CACHE_ALIGN float mtxFloat[16] = { - (float)mtxPtr[0], - (float)mtxPtr[1], - (float)mtxPtr[2], - (float)mtxPtr[3], + (float)mtx[0], + (float)mtx[1], + (float)mtx[2], + (float)mtx[3], - (float)mtxPtr[4], - (float)mtxPtr[5], - (float)mtxPtr[6], - (float)mtxPtr[7], + (float)mtx[4], + (float)mtx[5], + (float)mtx[6], + (float)mtx[7], - (float)mtxPtr[8], - (float)mtxPtr[9], - (float)mtxPtr[10], - (float)mtxPtr[11], + (float)mtx[8], + (float)mtx[9], + (float)mtx[10], + (float)mtx[11], - (float)mtxPtr[12], - (float)mtxPtr[13], - (float)mtxPtr[14], - (float)mtxPtr[15] + (float)mtx[12], + (float)mtx[13], + (float)mtx[14], + (float)mtx[15] }; __m128 row[4] = { @@ -356,49 +344,49 @@ void MatrixMultVec4x4(const s32 *__restrict mtxPtr, float *__restrict vecPtr) row[2] = _mm_mul_ps(row[2], convertScalar); row[3] = _mm_mul_ps(row[3], convertScalar); - const __m128 vec[4] = { - _mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x00), - _mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x55), - _mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0xAA), - _mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0xFF) + const __m128 scalar[4] = { + _mm_shuffle_ps(loadedVec, loadedVec, 0x00), + _mm_shuffle_ps(loadedVec, loadedVec, 0x55), + _mm_shuffle_ps(loadedVec, loadedVec, 0xAA), + _mm_shuffle_ps(loadedVec, loadedVec, 0xFF) }; - const __m128 calcVec = _mm_add_ps( _mm_mul_ps(row[0], vec[0]), _mm_add_ps(_mm_mul_ps(row[1], vec[1]), _mm_add_ps(_mm_mul_ps(row[2], vec[2]), _mm_mul_ps(row[3], vec[3]))) ); - _mm_store_ps(vecPtr, calcVec); + const __m128 calcVec = _mm_add_ps( _mm_mul_ps(row[0], scalar[0]), _mm_add_ps(_mm_mul_ps(row[1], scalar[1]), _mm_add_ps(_mm_mul_ps(row[2], scalar[2]), _mm_mul_ps(row[3], scalar[3]))) ); + _mm_store_ps(vec, calcVec); } -void MatrixMultVec3x3(const s32 *__restrict mtxPtr, float *__restrict vecPtr) +void MatrixMultVec3x3(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]) { - const __m128 loadedVecPtr = _mm_load_ps(vecPtr); + const __m128 loadedVec = _mm_load_ps(vec); const __m128 convertScalar = _mm_set1_ps(1.0f/4096.0f); #ifdef ENABLE_SSE2 __m128 row[3] = { - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 0)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 4)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 8)) ) + _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtx + 0)) ), + _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtx + 4)) ), + _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtx + 8)) ) }; #else const CACHE_ALIGN float mtxFloat[16] = { - (float)mtxPtr[0], - (float)mtxPtr[1], - (float)mtxPtr[2], - (float)mtxPtr[3], + (float)mtx[0], + (float)mtx[1], + (float)mtx[2], + (float)mtx[3], - (float)mtxPtr[4], - (float)mtxPtr[5], - (float)mtxPtr[6], - (float)mtxPtr[7], + (float)mtx[4], + (float)mtx[5], + (float)mtx[6], + (float)mtx[7], - (float)mtxPtr[8], - (float)mtxPtr[9], - (float)mtxPtr[10], - (float)mtxPtr[11], + (float)mtx[8], + (float)mtx[9], + (float)mtx[10], + (float)mtx[11], - (float)mtxPtr[12], - (float)mtxPtr[13], - (float)mtxPtr[14], - (float)mtxPtr[15] + (float)mtx[12], + (float)mtx[13], + (float)mtx[14], + (float)mtx[15] }; __m128 row[3] = { @@ -412,79 +400,79 @@ void MatrixMultVec3x3(const s32 *__restrict mtxPtr, float *__restrict vecPtr) row[1] = _mm_mul_ps(row[1], convertScalar); row[2] = _mm_mul_ps(row[2], convertScalar); - const 
__m128 vec[3] = { - _mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x00), - _mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x55), - _mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0xAA) + const __m128 scalar[3] = { + _mm_shuffle_ps(loadedVec, loadedVec, 0x00), + _mm_shuffle_ps(loadedVec, loadedVec, 0x55), + _mm_shuffle_ps(loadedVec, loadedVec, 0xAA) }; - const __m128 calcVec = _mm_add_ps( _mm_mul_ps(row[0], vec[0]), _mm_add_ps(_mm_mul_ps(row[1], vec[1]), _mm_mul_ps(row[2], vec[2])) ); - _mm_store_ps(vecPtr, calcVec); + const __m128 calcVec = _mm_add_ps( _mm_mul_ps(row[0], scalar[0]), _mm_add_ps(_mm_mul_ps(row[1], scalar[1]), _mm_mul_ps(row[2], scalar[2])) ); + _mm_store_ps(vec, calcVec); } -void MatrixTranslate(float *__restrict mtxPtr, const float *__restrict vecPtr) +void MatrixTranslate(float (&__restrict mtx)[16], const float (&__restrict vec)[4]) { - __m128 xmm4 = _mm_load_ps(vecPtr); + __m128 xmm4 = _mm_load_ps(vec); __m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101)); __m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010)); xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000)); - xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(mtxPtr)); - xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(mtxPtr+4)); - xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(mtxPtr+8)); + xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(mtx)); + xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(mtx+4)); + xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(mtx+8)); xmm4 = _mm_add_ps(xmm4,xmm5); xmm4 = _mm_add_ps(xmm4,xmm6); - xmm4 = _mm_add_ps(xmm4,_mm_load_ps(mtxPtr+12)); - _mm_store_ps(mtxPtr+12,xmm4); + xmm4 = _mm_add_ps(xmm4,_mm_load_ps(mtx+12)); + _mm_store_ps(mtx+12,xmm4); } -void MatrixScale(float *__restrict mtxPtr, const float *__restrict vecPtr) +void MatrixScale(float (&__restrict mtx)[16], const float (&__restrict vec)[4]) { - __m128 xmm4 = _mm_load_ps(vecPtr); + __m128 xmm4 = _mm_load_ps(vec); __m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101)); __m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010)); xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000)); - xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(mtxPtr)); - xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(mtxPtr+4)); - xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(mtxPtr+8)); - _mm_store_ps(mtxPtr,xmm4); - _mm_store_ps(mtxPtr+4,xmm5); - _mm_store_ps(mtxPtr+8,xmm6); + xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(mtx)); + xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(mtx+4)); + xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(mtx+8)); + _mm_store_ps(mtx,xmm4); + _mm_store_ps(mtx+4,xmm5); + _mm_store_ps(mtx+8,xmm6); } -void MatrixMultiply(float *__restrict mtxPtrA, const s32 *__restrict mtxPtrB) +void MatrixMultiply(float (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16]) { const __m128 convertScale = _mm_set1_ps(1.0f/4096.0f); #ifdef ENABLE_SSE2 __m128 rowB[4] = { - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 0)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 4)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 8)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 12)) ) + _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxB + 0)) ), + _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxB + 4)) ), + _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxB + 8)) ), + _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxB + 12)) ) }; #else const CACHE_ALIGN float mtxFloatB[16] = { - (float)mtxPtrB[0], - (float)mtxPtrB[1], - (float)mtxPtrB[2], - (float)mtxPtrB[3], + (float)mtxB[ 0], + (float)mtxB[ 1], + (float)mtxB[ 2], + (float)mtxB[ 3], - (float)mtxPtrB[4], - (float)mtxPtrB[5], - (float)mtxPtrB[6], - (float)mtxPtrB[7], + (float)mtxB[ 4], + (float)mtxB[ 5], + (float)mtxB[ 6], + (float)mtxB[ 7], 
- (float)mtxPtrB[8], - (float)mtxPtrB[9], - (float)mtxPtrB[10], - (float)mtxPtrB[11], + (float)mtxB[ 8], + (float)mtxB[ 9], + (float)mtxB[10], + (float)mtxB[11], - (float)mtxPtrB[12], - (float)mtxPtrB[13], - (float)mtxPtrB[14], - (float)mtxPtrB[15] + (float)mtxB[12], + (float)mtxB[13], + (float)mtxB[14], + (float)mtxB[15] }; __m128 rowB[4] = { @@ -501,10 +489,10 @@ void MatrixMultiply(float *__restrict mtxPtrA, const s32 *__restrict mtxPtrB) rowB[3] = _mm_mul_ps(rowB[3], convertScale); __m128 rowA[4] = { - _mm_load_ps(mtxPtrA + 0), - _mm_load_ps(mtxPtrA + 4), - _mm_load_ps(mtxPtrA + 8), - _mm_load_ps(mtxPtrA + 12) + _mm_load_ps(mtxA + 0), + _mm_load_ps(mtxA + 4), + _mm_load_ps(mtxA + 8), + _mm_load_ps(mtxA + 12) }; __m128 vecB[4]; @@ -515,243 +503,533 @@ void MatrixMultiply(float *__restrict mtxPtrA, const s32 *__restrict mtxPtrB) vecB[2] = _mm_shuffle_ps(rowB[0], rowB[0], 0xAA); vecB[3] = _mm_shuffle_ps(rowB[0], rowB[0], 0xFF); calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) ); - _mm_store_ps(mtxPtrA + 0, calcRow); + _mm_store_ps(mtxA + 0, calcRow); vecB[0] = _mm_shuffle_ps(rowB[1], rowB[1], 0x00); vecB[1] = _mm_shuffle_ps(rowB[1], rowB[1], 0x55); vecB[2] = _mm_shuffle_ps(rowB[1], rowB[1], 0xAA); vecB[3] = _mm_shuffle_ps(rowB[1], rowB[1], 0xFF); calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) ); - _mm_store_ps(mtxPtrA + 4, calcRow); + _mm_store_ps(mtxA + 4, calcRow); vecB[0] = _mm_shuffle_ps(rowB[2], rowB[2], 0x00); vecB[1] = _mm_shuffle_ps(rowB[2], rowB[2], 0x55); vecB[2] = _mm_shuffle_ps(rowB[2], rowB[2], 0xAA); vecB[3] = _mm_shuffle_ps(rowB[2], rowB[2], 0xFF); calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) ); - _mm_store_ps(mtxPtrA + 8, calcRow); + _mm_store_ps(mtxA + 8, calcRow); vecB[0] = _mm_shuffle_ps(rowB[3], rowB[3], 0x00); vecB[1] = _mm_shuffle_ps(rowB[3], rowB[3], 0x55); vecB[2] = _mm_shuffle_ps(rowB[3], rowB[3], 0xAA); vecB[3] = _mm_shuffle_ps(rowB[3], rowB[3], 0xFF); calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) ); - _mm_store_ps(mtxPtrA + 12, calcRow); + _mm_store_ps(mtxA + 12, calcRow); } template -FORCEINLINE void vector_fix2float(float *mtxPtr, const float divisor) +FORCEINLINE void vector_fix2float(float (&mtx)[16], const float divisor) { const __m128 divisor_v128 = _mm_set1_ps(divisor); for (size_t i = 0; i < NUM_ROWS * 4; i+=4) { - _mm_store_ps( mtxPtr + i, _mm_div_ps(_mm_load_ps(mtxPtr + i), divisor_v128) ); + _mm_store_ps( mtx + i, _mm_div_ps(_mm_load_ps(mtx + i), divisor_v128) ); } } #else -void MatrixMultVec4x4(const s32 *__restrict mtxPtr, float *__restrict vecPtr) +void MatrixMultVec4x4(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]) { - _MatrixMultVec4x4_NoSIMD(mtxPtr, vecPtr); + _MatrixMultVec4x4_NoSIMD(mtx, vec); } -void MatrixMultVec3x3(const s32 *__restrict mtxPtr, float *__restrict vecPtr) +void MatrixMultVec3x3(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]) { const CACHE_ALIGN float mtxFloat[16] = { - mtxPtr[ 0] / 4096.0f, - mtxPtr[ 1] / 4096.0f, - mtxPtr[ 2] / 4096.0f, - mtxPtr[ 3] / 4096.0f, + mtx[ 0] / 4096.0f, + mtx[ 1] / 4096.0f, + mtx[ 2] / 4096.0f, + mtx[ 3] / 
4096.0f, - mtxPtr[ 4] / 4096.0f, - mtxPtr[ 5] / 4096.0f, - mtxPtr[ 6] / 4096.0f, - mtxPtr[ 7] / 4096.0f, + mtx[ 4] / 4096.0f, + mtx[ 5] / 4096.0f, + mtx[ 6] / 4096.0f, + mtx[ 7] / 4096.0f, - mtxPtr[ 8] / 4096.0f, - mtxPtr[ 9] / 4096.0f, - mtxPtr[10] / 4096.0f, - mtxPtr[11] / 4096.0f, + mtx[ 8] / 4096.0f, + mtx[ 9] / 4096.0f, + mtx[10] / 4096.0f, + mtx[11] / 4096.0f, - mtxPtr[12] / 4096.0f, - mtxPtr[13] / 4096.0f, - mtxPtr[14] / 4096.0f, - mtxPtr[15] / 4096.0f + mtx[12] / 4096.0f, + mtx[13] / 4096.0f, + mtx[14] / 4096.0f, + mtx[15] / 4096.0f }; - const float x = vecPtr[0]; - const float y = vecPtr[1]; - const float z = vecPtr[2]; + const float x = vec[0]; + const float y = vec[1]; + const float z = vec[2]; - vecPtr[0] = (x * mtxFloat[0]) + (y * mtxFloat[4]) + (z * mtxFloat[ 8]); - vecPtr[1] = (x * mtxFloat[1]) + (y * mtxFloat[5]) + (z * mtxFloat[ 9]); - vecPtr[2] = (x * mtxFloat[2]) + (y * mtxFloat[6]) + (z * mtxFloat[10]); + vec[0] = (x * mtxFloat[0]) + (y * mtxFloat[4]) + (z * mtxFloat[ 8]); + vec[1] = (x * mtxFloat[1]) + (y * mtxFloat[5]) + (z * mtxFloat[ 9]); + vec[2] = (x * mtxFloat[2]) + (y * mtxFloat[6]) + (z * mtxFloat[10]); } -void MatrixTranslate(float *__restrict mtxPtr, const float *__restrict vecPtr) +void MatrixTranslate(float (&__restrict mtx)[16], const float (&__restrict vec)[4]) { - mtxPtr[12] += (mtxPtr[0] * vecPtr[0]) + (mtxPtr[4] * vecPtr[1]) + (mtxPtr[ 8] * vecPtr[2]); - mtxPtr[13] += (mtxPtr[1] * vecPtr[0]) + (mtxPtr[5] * vecPtr[1]) + (mtxPtr[ 9] * vecPtr[2]); - mtxPtr[14] += (mtxPtr[2] * vecPtr[0]) + (mtxPtr[6] * vecPtr[1]) + (mtxPtr[10] * vecPtr[2]); - mtxPtr[15] += (mtxPtr[3] * vecPtr[0]) + (mtxPtr[7] * vecPtr[1]) + (mtxPtr[11] * vecPtr[2]); + mtx[12] += (mtx[0] * vec[0]) + (mtx[4] * vec[1]) + (mtx[ 8] * vec[2]); + mtx[13] += (mtx[1] * vec[0]) + (mtx[5] * vec[1]) + (mtx[ 9] * vec[2]); + mtx[14] += (mtx[2] * vec[0]) + (mtx[6] * vec[1]) + (mtx[10] * vec[2]); + mtx[15] += (mtx[3] * vec[0]) + (mtx[7] * vec[1]) + (mtx[11] * vec[2]); } -void MatrixScale(float *__restrict mtxPtr, const float *__restrict vecPtr) +void MatrixScale(float (&__restrict mtx)[16], const float (&__restrict vec)[4]) { - mtxPtr[ 0] *= vecPtr[0]; - mtxPtr[ 1] *= vecPtr[0]; - mtxPtr[ 2] *= vecPtr[0]; - mtxPtr[ 3] *= vecPtr[0]; + mtx[ 0] *= vec[0]; + mtx[ 1] *= vec[0]; + mtx[ 2] *= vec[0]; + mtx[ 3] *= vec[0]; - mtxPtr[ 4] *= vecPtr[1]; - mtxPtr[ 5] *= vecPtr[1]; - mtxPtr[ 6] *= vecPtr[1]; - mtxPtr[ 7] *= vecPtr[1]; + mtx[ 4] *= vec[1]; + mtx[ 5] *= vec[1]; + mtx[ 6] *= vec[1]; + mtx[ 7] *= vec[1]; - mtxPtr[ 8] *= vecPtr[2]; - mtxPtr[ 9] *= vecPtr[2]; - mtxPtr[10] *= vecPtr[2]; - mtxPtr[11] *= vecPtr[2]; + mtx[ 8] *= vec[2]; + mtx[ 9] *= vec[2]; + mtx[10] *= vec[2]; + mtx[11] *= vec[2]; } -void MatrixMultiply(float *__restrict mtxPtrA, const s32 *__restrict mtxPtrB) +void MatrixMultiply(float (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16]) { const CACHE_ALIGN float mtxFloatB[16] = { - (float)mtxPtrB[ 0], - (float)mtxPtrB[ 1], - (float)mtxPtrB[ 2], - (float)mtxPtrB[ 3], + (float)mtxB[ 0], + (float)mtxB[ 1], + (float)mtxB[ 2], + (float)mtxB[ 3], - (float)mtxPtrB[ 4], - (float)mtxPtrB[ 5], - (float)mtxPtrB[ 6], - (float)mtxPtrB[ 7], + (float)mtxB[ 4], + (float)mtxB[ 5], + (float)mtxB[ 6], + (float)mtxB[ 7], - (float)mtxPtrB[ 8], - (float)mtxPtrB[ 9], - (float)mtxPtrB[10], - (float)mtxPtrB[11], + (float)mtxB[ 8], + (float)mtxB[ 9], + (float)mtxB[10], + (float)mtxB[11], - (float)mtxPtrB[12], - (float)mtxPtrB[13], - (float)mtxPtrB[14], - (float)mtxPtrB[15] + (float)mtxB[12], + (float)mtxB[13], + 
(float)mtxB[14], + (float)mtxB[15] }; - float tmpMatrix[16]; + CACHE_ALIGN float tmpMatrix[16]; - tmpMatrix[0] = (mtxPtrA[ 0] * mtxFloatB[ 0]) + (mtxPtrA[ 4] * mtxFloatB[ 1]) + (mtxPtrA[ 8] * mtxFloatB[ 2]) + (mtxPtrA[12] * mtxFloatB[ 3]); - tmpMatrix[1] = (mtxPtrA[ 1] * mtxFloatB[ 0]) + (mtxPtrA[ 5] * mtxFloatB[ 1]) + (mtxPtrA[ 9] * mtxFloatB[ 2]) + (mtxPtrA[13] * mtxFloatB[ 3]); - tmpMatrix[2] = (mtxPtrA[ 2] * mtxFloatB[ 0]) + (mtxPtrA[ 6] * mtxFloatB[ 1]) + (mtxPtrA[10] * mtxFloatB[ 2]) + (mtxPtrA[14] * mtxFloatB[ 3]); - tmpMatrix[3] = (mtxPtrA[ 3] * mtxFloatB[ 0]) + (mtxPtrA[ 7] * mtxFloatB[ 1]) + (mtxPtrA[11] * mtxFloatB[ 2]) + (mtxPtrA[15] * mtxFloatB[ 3]); + tmpMatrix[0] = (mtxA[ 0] * mtxFloatB[ 0]) + (mtxA[ 4] * mtxFloatB[ 1]) + (mtxA[ 8] * mtxFloatB[ 2]) + (mtxA[12] * mtxFloatB[ 3]); + tmpMatrix[1] = (mtxA[ 1] * mtxFloatB[ 0]) + (mtxA[ 5] * mtxFloatB[ 1]) + (mtxA[ 9] * mtxFloatB[ 2]) + (mtxA[13] * mtxFloatB[ 3]); + tmpMatrix[2] = (mtxA[ 2] * mtxFloatB[ 0]) + (mtxA[ 6] * mtxFloatB[ 1]) + (mtxA[10] * mtxFloatB[ 2]) + (mtxA[14] * mtxFloatB[ 3]); + tmpMatrix[3] = (mtxA[ 3] * mtxFloatB[ 0]) + (mtxA[ 7] * mtxFloatB[ 1]) + (mtxA[11] * mtxFloatB[ 2]) + (mtxA[15] * mtxFloatB[ 3]); - tmpMatrix[4] = (mtxPtrA[ 0] * mtxFloatB[ 4]) + (mtxPtrA[ 4] * mtxFloatB[ 5]) + (mtxPtrA[ 8] * mtxFloatB[ 6]) + (mtxPtrA[12] * mtxFloatB[ 7]); - tmpMatrix[5] = (mtxPtrA[ 1] * mtxFloatB[ 4]) + (mtxPtrA[ 5] * mtxFloatB[ 5]) + (mtxPtrA[ 9] * mtxFloatB[ 6]) + (mtxPtrA[13] * mtxFloatB[ 7]); - tmpMatrix[6] = (mtxPtrA[ 2] * mtxFloatB[ 4]) + (mtxPtrA[ 6] * mtxFloatB[ 5]) + (mtxPtrA[10] * mtxFloatB[ 6]) + (mtxPtrA[14] * mtxFloatB[ 7]); - tmpMatrix[7] = (mtxPtrA[ 3] * mtxFloatB[ 4]) + (mtxPtrA[ 7] * mtxFloatB[ 5]) + (mtxPtrA[11] * mtxFloatB[ 6]) + (mtxPtrA[15] * mtxFloatB[ 7]); + tmpMatrix[4] = (mtxA[ 0] * mtxFloatB[ 4]) + (mtxA[ 4] * mtxFloatB[ 5]) + (mtxA[ 8] * mtxFloatB[ 6]) + (mtxA[12] * mtxFloatB[ 7]); + tmpMatrix[5] = (mtxA[ 1] * mtxFloatB[ 4]) + (mtxA[ 5] * mtxFloatB[ 5]) + (mtxA[ 9] * mtxFloatB[ 6]) + (mtxA[13] * mtxFloatB[ 7]); + tmpMatrix[6] = (mtxA[ 2] * mtxFloatB[ 4]) + (mtxA[ 6] * mtxFloatB[ 5]) + (mtxA[10] * mtxFloatB[ 6]) + (mtxA[14] * mtxFloatB[ 7]); + tmpMatrix[7] = (mtxA[ 3] * mtxFloatB[ 4]) + (mtxA[ 7] * mtxFloatB[ 5]) + (mtxA[11] * mtxFloatB[ 6]) + (mtxA[15] * mtxFloatB[ 7]); - tmpMatrix[8] = (mtxPtrA[ 0] * mtxFloatB[ 8]) + (mtxPtrA[ 4] * mtxFloatB[ 9]) + (mtxPtrA[ 8] * mtxFloatB[10]) + (mtxPtrA[12] * mtxFloatB[11]); - tmpMatrix[9] = (mtxPtrA[ 1] * mtxFloatB[ 8]) + (mtxPtrA[ 5] * mtxFloatB[ 9]) + (mtxPtrA[ 9] * mtxFloatB[10]) + (mtxPtrA[13] * mtxFloatB[11]); - tmpMatrix[10] = (mtxPtrA[ 2] * mtxFloatB[ 8]) + (mtxPtrA[ 6] * mtxFloatB[ 9]) + (mtxPtrA[10] * mtxFloatB[10]) + (mtxPtrA[14] * mtxFloatB[11]); - tmpMatrix[11] = (mtxPtrA[ 3] * mtxFloatB[ 8]) + (mtxPtrA[ 7] * mtxFloatB[ 9]) + (mtxPtrA[11] * mtxFloatB[10]) + (mtxPtrA[15] * mtxFloatB[11]); + tmpMatrix[8] = (mtxA[ 0] * mtxFloatB[ 8]) + (mtxA[ 4] * mtxFloatB[ 9]) + (mtxA[ 8] * mtxFloatB[10]) + (mtxA[12] * mtxFloatB[11]); + tmpMatrix[9] = (mtxA[ 1] * mtxFloatB[ 8]) + (mtxA[ 5] * mtxFloatB[ 9]) + (mtxA[ 9] * mtxFloatB[10]) + (mtxA[13] * mtxFloatB[11]); + tmpMatrix[10] = (mtxA[ 2] * mtxFloatB[ 8]) + (mtxA[ 6] * mtxFloatB[ 9]) + (mtxA[10] * mtxFloatB[10]) + (mtxA[14] * mtxFloatB[11]); + tmpMatrix[11] = (mtxA[ 3] * mtxFloatB[ 8]) + (mtxA[ 7] * mtxFloatB[ 9]) + (mtxA[11] * mtxFloatB[10]) + (mtxA[15] * mtxFloatB[11]); - tmpMatrix[12] = (mtxPtrA[ 0] * mtxFloatB[12]) + (mtxPtrA[ 4] * mtxFloatB[13]) + (mtxPtrA[ 8] * mtxFloatB[14]) + (mtxPtrA[12] * 
mtxFloatB[15]); - tmpMatrix[13] = (mtxPtrA[ 1] * mtxFloatB[12]) + (mtxPtrA[ 5] * mtxFloatB[13]) + (mtxPtrA[ 9] * mtxFloatB[14]) + (mtxPtrA[13] * mtxFloatB[15]); - tmpMatrix[14] = (mtxPtrA[ 2] * mtxFloatB[12]) + (mtxPtrA[ 6] * mtxFloatB[13]) + (mtxPtrA[10] * mtxFloatB[14]) + (mtxPtrA[14] * mtxFloatB[15]); - tmpMatrix[15] = (mtxPtrA[ 3] * mtxFloatB[12]) + (mtxPtrA[ 7] * mtxFloatB[13]) + (mtxPtrA[11] * mtxFloatB[14]) + (mtxPtrA[15] * mtxFloatB[15]); + tmpMatrix[12] = (mtxA[ 0] * mtxFloatB[12]) + (mtxA[ 4] * mtxFloatB[13]) + (mtxA[ 8] * mtxFloatB[14]) + (mtxA[12] * mtxFloatB[15]); + tmpMatrix[13] = (mtxA[ 1] * mtxFloatB[12]) + (mtxA[ 5] * mtxFloatB[13]) + (mtxA[ 9] * mtxFloatB[14]) + (mtxA[13] * mtxFloatB[15]); + tmpMatrix[14] = (mtxA[ 2] * mtxFloatB[12]) + (mtxA[ 6] * mtxFloatB[13]) + (mtxA[10] * mtxFloatB[14]) + (mtxA[14] * mtxFloatB[15]); + tmpMatrix[15] = (mtxA[ 3] * mtxFloatB[12]) + (mtxA[ 7] * mtxFloatB[13]) + (mtxA[11] * mtxFloatB[14]) + (mtxA[15] * mtxFloatB[15]); - memcpy(mtxPtrA, tmpMatrix, sizeof(float)*16); + memcpy(mtxA, tmpMatrix, sizeof(float)*16); } template -FORCEINLINE void vector_fix2float(float *mtxPtr, const float divisor) +FORCEINLINE void vector_fix2float(float (&mtx)[16], const float divisor) { for (size_t i = 0; i < NUM_ROWS * 4; i+=4) { - mtxPtr[i+0] /= divisor; - mtxPtr[i+1] /= divisor; - mtxPtr[i+2] /= divisor; - mtxPtr[i+3] /= divisor; + mtx[i+0] /= divisor; + mtx[i+1] /= divisor; + mtx[i+2] /= divisor; + mtx[i+3] /= divisor; } } #endif -void MatrixMultVec4x4(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr) +#ifdef ENABLE_SSE4_1 + +FORCEINLINE void _Vec4_MultiplyByMatrix(__m128i &outVec, + const __m128i &c0, const __m128i &c1, const __m128i &c2, const __m128i &c3, + const __m128i &rowLo0, const __m128i &rowLo1, const __m128i &rowLo2, const __m128i &rowLo3, + const __m128i &rowHi0, const __m128i &rowHi1, const __m128i &rowHi2, const __m128i &rowHi3) { - const s32 x = vecPtr[0]; - const s32 y = vecPtr[1]; - const s32 z = vecPtr[2]; - const s32 w = vecPtr[3]; + __m128i outVecLo = _mm_add_epi64( _mm_add_epi64(_mm_mul_epi32(rowLo0, c0), _mm_mul_epi32(rowLo1, c1)), _mm_add_epi64(_mm_mul_epi32(rowLo2, c2), _mm_mul_epi32(rowLo3, c3)) ); + outVecLo = _mm_srli_epi64(outVecLo, 12); + outVecLo = _mm_shuffle_epi32(outVecLo, 0xD8); - vecPtr[0] = sfx32_shiftdown( fx32_mul(x,mtxPtr[0]) + fx32_mul(y,mtxPtr[4]) + fx32_mul(z,mtxPtr[ 8]) + fx32_mul(w,mtxPtr[12]) ); - vecPtr[1] = sfx32_shiftdown( fx32_mul(x,mtxPtr[1]) + fx32_mul(y,mtxPtr[5]) + fx32_mul(z,mtxPtr[ 9]) + fx32_mul(w,mtxPtr[13]) ); - vecPtr[2] = sfx32_shiftdown( fx32_mul(x,mtxPtr[2]) + fx32_mul(y,mtxPtr[6]) + fx32_mul(z,mtxPtr[10]) + fx32_mul(w,mtxPtr[14]) ); - vecPtr[3] = sfx32_shiftdown( fx32_mul(x,mtxPtr[3]) + fx32_mul(y,mtxPtr[7]) + fx32_mul(z,mtxPtr[11]) + fx32_mul(w,mtxPtr[15]) ); + __m128i outVecHi = _mm_add_epi64( _mm_add_epi64(_mm_mul_epi32(rowHi0, c0), _mm_mul_epi32(rowHi1, c1)), _mm_add_epi64(_mm_mul_epi32(rowHi2, c2), _mm_mul_epi32(rowHi3, c3)) ); + outVecHi = _mm_srli_epi64(outVecHi, 12); + outVecHi = _mm_shuffle_epi32(outVecHi, 0x8D); + + outVec = _mm_blendv_epi8(outVecLo, outVecHi, _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)); } -void MatrixMultVec3x3(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr) +static s32 GEM_SaturateAndShiftdown36To32(const s64 val) { - const s32 x = vecPtr[0]; - const s32 y = vecPtr[1]; - const s32 z = vecPtr[2]; + if(val>(s64)0x000007FFFFFFFFFFULL) return (s32)0x7FFFFFFFU; + if(val<(s64)0xFFFFF80000000000ULL) return (s32)0x80000000U; - vecPtr[0] = sfx32_shiftdown( 
fx32_mul(x,mtxPtr[0]) + fx32_mul(y,mtxPtr[4]) + fx32_mul(z,mtxPtr[ 8]) ); - vecPtr[1] = sfx32_shiftdown( fx32_mul(x,mtxPtr[1]) + fx32_mul(y,mtxPtr[5]) + fx32_mul(z,mtxPtr[ 9]) ); - vecPtr[2] = sfx32_shiftdown( fx32_mul(x,mtxPtr[2]) + fx32_mul(y,mtxPtr[6]) + fx32_mul(z,mtxPtr[10]) ); + return fx32_shiftdown(val); } -void MatrixTranslate(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr) +FORCEINLINE void _Vec3_MultiplyByMatrix(__m128i &outVec, + const __m128i &c0, const __m128i &c1, const __m128i &c2, + const __m128i &rowLo0, const __m128i &rowLo1, const __m128i &rowLo2, + const __m128i &rowHi0, const __m128i &rowHi1, const __m128i &rowHi2) { - mtxPtr[12] = sfx32_shiftdown( fx32_mul(mtxPtr[0], vecPtr[0]) + fx32_mul(mtxPtr[4], vecPtr[1]) + fx32_mul(mtxPtr[ 8], vecPtr[2]) + fx32_shiftup(mtxPtr[12]) ); - mtxPtr[13] = sfx32_shiftdown( fx32_mul(mtxPtr[1], vecPtr[0]) + fx32_mul(mtxPtr[5], vecPtr[1]) + fx32_mul(mtxPtr[ 9], vecPtr[2]) + fx32_shiftup(mtxPtr[13]) ); - mtxPtr[14] = sfx32_shiftdown( fx32_mul(mtxPtr[2], vecPtr[0]) + fx32_mul(mtxPtr[6], vecPtr[1]) + fx32_mul(mtxPtr[10], vecPtr[2]) + fx32_shiftup(mtxPtr[14]) ); - mtxPtr[15] = sfx32_shiftdown( fx32_mul(mtxPtr[3], vecPtr[0]) + fx32_mul(mtxPtr[7], vecPtr[1]) + fx32_mul(mtxPtr[11], vecPtr[2]) + fx32_shiftup(mtxPtr[15]) ); + __m128i outVecLo = _mm_add_epi64( _mm_mul_epi32(rowLo0, c0), _mm_add_epi64(_mm_mul_epi32(rowLo1, c1), _mm_mul_epi32(rowLo2, c2)) ); + outVecLo = _mm_srli_epi64(outVecLo, 12); + outVecLo = _mm_shuffle_epi32(outVecLo, 0xD8); + + __m128i outVecHi = _mm_add_epi64( _mm_mul_epi32(rowHi0, c0), _mm_add_epi64(_mm_mul_epi32(rowLo1, c1), _mm_mul_epi32(rowHi2, c2)) ); + outVecHi = _mm_srli_epi64(outVecHi, 12); + outVecHi = _mm_shuffle_epi32(outVecHi, 0x8D); + + outVec = _mm_blendv_epi8(outVecLo, outVecHi, _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)); } -void MatrixScale(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr) +FORCEINLINE void _Vec4_Translate(__m128i &outVec, + const __m128i &c0, const __m128i &c1, const __m128i &c2, + const __m128i &rowLo0, const __m128i &rowLo1, const __m128i &rowLo2, const __m128i &rowLo3, + const __m128i &rowHi0, const __m128i &rowHi1, const __m128i &rowHi2, const __m128i &rowHi3) { - mtxPtr[ 0] = sfx32_shiftdown( fx32_mul(mtxPtr[ 0], vecPtr[0]) ); - mtxPtr[ 1] = sfx32_shiftdown( fx32_mul(mtxPtr[ 1], vecPtr[0]) ); - mtxPtr[ 2] = sfx32_shiftdown( fx32_mul(mtxPtr[ 2], vecPtr[0]) ); - mtxPtr[ 3] = sfx32_shiftdown( fx32_mul(mtxPtr[ 3], vecPtr[0]) ); + __m128i outVecLo = _mm_add_epi64( _mm_add_epi64(_mm_mul_epi32(rowLo0, c0), _mm_mul_epi32(rowLo1, c1)), _mm_add_epi64(_mm_mul_epi32(rowLo2, c2), _mm_slli_epi64(rowLo3, 12)) ); + outVecLo = _mm_srli_epi64(outVecLo, 12); + outVecLo = _mm_shuffle_epi32(outVecLo, 0xD8); - mtxPtr[ 4] = sfx32_shiftdown( fx32_mul(mtxPtr[ 4], vecPtr[1]) ); - mtxPtr[ 5] = sfx32_shiftdown( fx32_mul(mtxPtr[ 5], vecPtr[1]) ); - mtxPtr[ 6] = sfx32_shiftdown( fx32_mul(mtxPtr[ 6], vecPtr[1]) ); - mtxPtr[ 7] = sfx32_shiftdown( fx32_mul(mtxPtr[ 7], vecPtr[1]) ); + __m128i outVecHi = _mm_add_epi64( _mm_add_epi64(_mm_mul_epi32(rowHi0, c0), _mm_mul_epi32(rowHi1, c1)), _mm_add_epi64(_mm_mul_epi32(rowHi2, c2), _mm_slli_epi64(rowHi3, 12)) ); + outVecHi = _mm_srli_epi64(outVecHi, 12); + outVecHi = _mm_shuffle_epi32(outVecHi, 0x8D); - mtxPtr[ 8] = sfx32_shiftdown( fx32_mul(mtxPtr[ 8], vecPtr[2]) ); - mtxPtr[ 9] = sfx32_shiftdown( fx32_mul(mtxPtr[ 9], vecPtr[2]) ); - mtxPtr[10] = sfx32_shiftdown( fx32_mul(mtxPtr[10], vecPtr[2]) ); - mtxPtr[11] = sfx32_shiftdown( fx32_mul(mtxPtr[11], vecPtr[2]) ); 
+ outVec = _mm_blendv_epi8(outVecLo, outVecHi, _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)); } -void MatrixMultiply(s32 *__restrict mtxPtrA, const s32 *__restrict mtxPtrB) +FORCEINLINE void _Vec4_Scale(__m128i &inoutVec, const __m128i &scalar) { - s32 tmpMatrix[16]; + __m128i outVecLo = _mm_cvtepu32_epi64(inoutVec); + __m128i outVecHi = _mm_cvtepu32_epi64( _mm_srli_si128(inoutVec, 8) ); - tmpMatrix[ 0] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[ 0])+fx32_mul(mtxPtrA[4],mtxPtrB[ 1])+fx32_mul(mtxPtrA[ 8],mtxPtrB[ 2])+fx32_mul(mtxPtrA[12],mtxPtrB[ 3]) ); - tmpMatrix[ 1] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[ 0])+fx32_mul(mtxPtrA[5],mtxPtrB[ 1])+fx32_mul(mtxPtrA[ 9],mtxPtrB[ 2])+fx32_mul(mtxPtrA[13],mtxPtrB[ 3]) ); - tmpMatrix[ 2] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[ 0])+fx32_mul(mtxPtrA[6],mtxPtrB[ 1])+fx32_mul(mtxPtrA[10],mtxPtrB[ 2])+fx32_mul(mtxPtrA[14],mtxPtrB[ 3]) ); - tmpMatrix[ 3] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[ 0])+fx32_mul(mtxPtrA[7],mtxPtrB[ 1])+fx32_mul(mtxPtrA[11],mtxPtrB[ 2])+fx32_mul(mtxPtrA[15],mtxPtrB[ 3]) ); + outVecLo = _mm_mul_epi32(outVecLo, scalar); + outVecLo = _mm_srli_epi64(outVecLo, 12); + outVecLo = _mm_shuffle_epi32(outVecLo, 0xD8); - tmpMatrix[ 4] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[ 4])+fx32_mul(mtxPtrA[4],mtxPtrB[ 5])+fx32_mul(mtxPtrA[ 8],mtxPtrB[ 6])+fx32_mul(mtxPtrA[12],mtxPtrB[ 7]) ); - tmpMatrix[ 5] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[ 4])+fx32_mul(mtxPtrA[5],mtxPtrB[ 5])+fx32_mul(mtxPtrA[ 9],mtxPtrB[ 6])+fx32_mul(mtxPtrA[13],mtxPtrB[ 7]) ); - tmpMatrix[ 6] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[ 4])+fx32_mul(mtxPtrA[6],mtxPtrB[ 5])+fx32_mul(mtxPtrA[10],mtxPtrB[ 6])+fx32_mul(mtxPtrA[14],mtxPtrB[ 7]) ); - tmpMatrix[ 7] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[ 4])+fx32_mul(mtxPtrA[7],mtxPtrB[ 5])+fx32_mul(mtxPtrA[11],mtxPtrB[ 6])+fx32_mul(mtxPtrA[15],mtxPtrB[ 7]) ); + outVecHi = _mm_mul_epi32(outVecHi, scalar); + outVecHi = _mm_srli_epi64(outVecHi, 12); + outVecHi = _mm_shuffle_epi32(outVecHi, 0x8D); - tmpMatrix[ 8] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[ 8])+fx32_mul(mtxPtrA[4],mtxPtrB[ 9])+fx32_mul(mtxPtrA[ 8],mtxPtrB[10])+fx32_mul(mtxPtrA[12],mtxPtrB[11]) ); - tmpMatrix[ 9] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[ 8])+fx32_mul(mtxPtrA[5],mtxPtrB[ 9])+fx32_mul(mtxPtrA[ 9],mtxPtrB[10])+fx32_mul(mtxPtrA[13],mtxPtrB[11]) ); - tmpMatrix[10] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[ 8])+fx32_mul(mtxPtrA[6],mtxPtrB[ 9])+fx32_mul(mtxPtrA[10],mtxPtrB[10])+fx32_mul(mtxPtrA[14],mtxPtrB[11]) ); - tmpMatrix[11] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[ 8])+fx32_mul(mtxPtrA[7],mtxPtrB[ 9])+fx32_mul(mtxPtrA[11],mtxPtrB[10])+fx32_mul(mtxPtrA[15],mtxPtrB[11]) ); - - tmpMatrix[12] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[12])+fx32_mul(mtxPtrA[4],mtxPtrB[13])+fx32_mul(mtxPtrA[ 8],mtxPtrB[14])+fx32_mul(mtxPtrA[12],mtxPtrB[15]) ); - tmpMatrix[13] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[12])+fx32_mul(mtxPtrA[5],mtxPtrB[13])+fx32_mul(mtxPtrA[ 9],mtxPtrB[14])+fx32_mul(mtxPtrA[13],mtxPtrB[15]) ); - tmpMatrix[14] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[12])+fx32_mul(mtxPtrA[6],mtxPtrB[13])+fx32_mul(mtxPtrA[10],mtxPtrB[14])+fx32_mul(mtxPtrA[14],mtxPtrB[15]) ); - tmpMatrix[15] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[12])+fx32_mul(mtxPtrA[7],mtxPtrB[13])+fx32_mul(mtxPtrA[11],mtxPtrB[14])+fx32_mul(mtxPtrA[15],mtxPtrB[15]) ); - - memcpy(mtxPtrA, tmpMatrix, sizeof(s32)*16); + inoutVec = _mm_blendv_epi8(outVecLo, outVecHi, _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 
0, 0)); } + +void MatrixMultVec4x4(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4]) +{ + const __m128i inVec = _mm_load_si128((__m128i *)vec); + + const __m128i scalar[4] = { + _mm_shuffle_epi32(inVec, 0x00), + _mm_shuffle_epi32(inVec, 0x55), + _mm_shuffle_epi32(inVec, 0xAA), + _mm_shuffle_epi32(inVec, 0xFF) + }; + + const __m128i row[4] = { + _mm_load_si128((__m128i *)(mtx + 0)), + _mm_load_si128((__m128i *)(mtx + 4)), + _mm_load_si128((__m128i *)(mtx + 8)), + _mm_load_si128((__m128i *)(mtx + 12)) + }; + + const __m128i rowLo[4] = { + _mm_cvtepu32_epi64(row[0]), + _mm_cvtepu32_epi64(row[1]), + _mm_cvtepu32_epi64(row[2]), + _mm_cvtepu32_epi64(row[3]) + }; + + const __m128i rowHi[4] = { + _mm_cvtepu32_epi64( _mm_srli_si128(row[0], 8)), + _mm_cvtepu32_epi64( _mm_srli_si128(row[1], 8)), + _mm_cvtepu32_epi64( _mm_srli_si128(row[2], 8)), + _mm_cvtepu32_epi64( _mm_srli_si128(row[3], 8)) + }; + + __m128i outVec; + _Vec4_MultiplyByMatrix(outVec, + scalar[0], scalar[1], scalar[2], scalar[3], + rowLo[0], rowLo[1], rowLo[2], rowLo[3], + rowHi[0], rowHi[1], rowHi[2], rowHi[3]); + + _mm_store_si128((__m128i *)vec, outVec); +} + +void MatrixMultVec3x3(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4]) +{ + const __m128i inVec = _mm_load_si128((__m128i *)vec); + + const __m128i scalar[3] = { + _mm_shuffle_epi32(inVec, 0x00), + _mm_shuffle_epi32(inVec, 0x55), + _mm_shuffle_epi32(inVec, 0xAA) + }; + + const __m128i row[3] = { + _mm_load_si128((__m128i *)(mtx + 0)), + _mm_load_si128((__m128i *)(mtx + 4)), + _mm_load_si128((__m128i *)(mtx + 8)) + }; + + const __m128i rowLo[3] = { + _mm_cvtepu32_epi64(row[0]), + _mm_cvtepu32_epi64(row[1]), + _mm_cvtepu32_epi64(row[2]) + }; + + const __m128i rowHi[3] = { + _mm_cvtepu32_epi64( _mm_srli_si128(row[0], 8)), + _mm_cvtepu32_epi64( _mm_srli_si128(row[1], 8)), + _mm_cvtepu32_epi64( _mm_srli_si128(row[2], 8)) + }; + + __m128i outVec; + _Vec3_MultiplyByMatrix(outVec, + scalar[0], scalar[1], scalar[2], + rowLo[0], rowLo[1], rowLo[2], + rowHi[0], rowHi[1], rowHi[2]); + + outVec = _mm_blend_epi16(outVec, inVec, 0xC0); + _mm_store_si128((__m128i *)vec, outVec); +} + +void MatrixTranslate(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4]) +{ + const __m128i inVec = _mm_load_si128((__m128i *)vec); + + const __m128i scalar[3] = { + _mm_shuffle_epi32(inVec, 0x00), + _mm_shuffle_epi32(inVec, 0x55), + _mm_shuffle_epi32(inVec, 0xAA) + }; + + const __m128i row[4] = { + _mm_load_si128((__m128i *)(mtx + 0)), + _mm_load_si128((__m128i *)(mtx + 4)), + _mm_load_si128((__m128i *)(mtx + 8)), + _mm_load_si128((__m128i *)(mtx + 12)) + }; + + const __m128i rowLo[4] = { + _mm_cvtepu32_epi64(row[0]), + _mm_cvtepu32_epi64(row[1]), + _mm_cvtepu32_epi64(row[2]), + _mm_cvtepu32_epi64(row[3]) + }; + + const __m128i rowHi[4] = { + _mm_cvtepu32_epi64( _mm_srli_si128(row[0], 8)), + _mm_cvtepu32_epi64( _mm_srli_si128(row[1], 8)), + _mm_cvtepu32_epi64( _mm_srli_si128(row[2], 8)), + _mm_cvtepu32_epi64( _mm_srli_si128(row[3], 8)) + }; + + __m128i outVec; + _Vec4_Translate(outVec, + scalar[0], scalar[1], scalar[2], + rowLo[0], rowLo[1], rowLo[2], rowLo[3], + rowHi[0], rowHi[1], rowHi[2], rowHi[3]); + + _mm_store_si128((__m128i *)(mtx + 12), outVec); +} + +void MatrixScale(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4]) +{ + const __m128i inVec = _mm_load_si128((__m128i *)vec); + const __m128i scalar[3] = { + _mm_shuffle_epi32(inVec, 0x00), + _mm_shuffle_epi32(inVec, 0x55), + _mm_shuffle_epi32(inVec, 0xAA) + }; + + __m128i row[3] = { + _mm_load_si128((__m128i *)(mtx + 
+
+void MatrixScale(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4])
+{
+	const __m128i inVec = _mm_load_si128((__m128i *)vec);
+	const __m128i scalar[3] = {
+		_mm_shuffle_epi32(inVec, 0x00),
+		_mm_shuffle_epi32(inVec, 0x55),
+		_mm_shuffle_epi32(inVec, 0xAA)
+	};
+	
+	__m128i row[3] = {
+		_mm_load_si128((__m128i *)(mtx + 0)),
+		_mm_load_si128((__m128i *)(mtx + 4)),
+		_mm_load_si128((__m128i *)(mtx + 8))
+	};
+	
+	_Vec4_Scale(row[0], scalar[0]);
+	_mm_store_si128((__m128i *)(mtx + 0), row[0]);
+	
+	_Vec4_Scale(row[1], scalar[1]);
+	_mm_store_si128((__m128i *)(mtx + 4), row[1]);
+	
+	_Vec4_Scale(row[2], scalar[2]);
+	_mm_store_si128((__m128i *)(mtx + 8), row[2]);
+}
+
+void MatrixMultiply(s32 (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16])
+{
+	const __m128i rowA[4] = {
+		_mm_load_si128((__m128i *)(mtxA + 0)),
+		_mm_load_si128((__m128i *)(mtxA + 4)),
+		_mm_load_si128((__m128i *)(mtxA + 8)),
+		_mm_load_si128((__m128i *)(mtxA + 12))
+	};
+	
+	const __m128i rowB[4] = {
+		_mm_load_si128((__m128i *)(mtxB + 0)),
+		_mm_load_si128((__m128i *)(mtxB + 4)),
+		_mm_load_si128((__m128i *)(mtxB + 8)),
+		_mm_load_si128((__m128i *)(mtxB + 12))
+	};
+	
+	const __m128i rowLo[4] = {
+		_mm_cvtepu32_epi64(rowA[0]),
+		_mm_cvtepu32_epi64(rowA[1]),
+		_mm_cvtepu32_epi64(rowA[2]),
+		_mm_cvtepu32_epi64(rowA[3])
+	};
+	
+	const __m128i rowHi[4] = {
+		_mm_cvtepu32_epi64( _mm_srli_si128(rowA[0], 8)),
+		_mm_cvtepu32_epi64( _mm_srli_si128(rowA[1], 8)),
+		_mm_cvtepu32_epi64( _mm_srli_si128(rowA[2], 8)),
+		_mm_cvtepu32_epi64( _mm_srli_si128(rowA[3], 8))
+	};
+	
+	__m128i outVec;
+	__m128i scalar[4];
+	
+	scalar[0] = _mm_shuffle_epi32(rowB[0], 0x00);
+	scalar[1] = _mm_shuffle_epi32(rowB[0], 0x55);
+	scalar[2] = _mm_shuffle_epi32(rowB[0], 0xAA);
+	scalar[3] = _mm_shuffle_epi32(rowB[0], 0xFF);
+	_Vec4_MultiplyByMatrix(outVec,
+		scalar[0], scalar[1], scalar[2], scalar[3],
+		rowLo[0], rowLo[1], rowLo[2], rowLo[3],
+		rowHi[0], rowHi[1], rowHi[2], rowHi[3]);
+	_mm_store_si128((__m128i *)(mtxA + 0), outVec);
+	
+	scalar[0] = _mm_shuffle_epi32(rowB[1], 0x00);
+	scalar[1] = _mm_shuffle_epi32(rowB[1], 0x55);
+	scalar[2] = _mm_shuffle_epi32(rowB[1], 0xAA);
+	scalar[3] = _mm_shuffle_epi32(rowB[1], 0xFF);
+	_Vec4_MultiplyByMatrix(outVec,
+		scalar[0], scalar[1], scalar[2], scalar[3],
+		rowLo[0], rowLo[1], rowLo[2], rowLo[3],
+		rowHi[0], rowHi[1], rowHi[2], rowHi[3]);
+	_mm_store_si128((__m128i *)(mtxA + 4), outVec);
+	
+	scalar[0] = _mm_shuffle_epi32(rowB[2], 0x00);
+	scalar[1] = _mm_shuffle_epi32(rowB[2], 0x55);
+	scalar[2] = _mm_shuffle_epi32(rowB[2], 0xAA);
+	scalar[3] = _mm_shuffle_epi32(rowB[2], 0xFF);
+	_Vec4_MultiplyByMatrix(outVec,
+		scalar[0], scalar[1], scalar[2], scalar[3],
+		rowLo[0], rowLo[1], rowLo[2], rowLo[3],
+		rowHi[0], rowHi[1], rowHi[2], rowHi[3]);
+	_mm_store_si128((__m128i *)(mtxA + 8), outVec);
+	
+	scalar[0] = _mm_shuffle_epi32(rowB[3], 0x00);
+	scalar[1] = _mm_shuffle_epi32(rowB[3], 0x55);
+	scalar[2] = _mm_shuffle_epi32(rowB[3], 0xAA);
+	scalar[3] = _mm_shuffle_epi32(rowB[3], 0xFF);
+	_Vec4_MultiplyByMatrix(outVec,
+		scalar[0], scalar[1], scalar[2], scalar[3],
+		rowLo[0], rowLo[1], rowLo[2], rowLo[3],
+		rowHi[0], rowHi[1], rowHi[2], rowHi[3]);
+	_mm_store_si128((__m128i *)(mtxA + 12), outVec);
+}
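MatrixMultiply above reuses the same kernel: each four-element column of mtxB is broadcast and run through _Vec4_MultiplyByMatrix against mtxA, whose widened halves (rowLo/rowHi) stay in registers so the results can be stored straight back over mtxA in place. A scalar sketch of that loop structure, with tmpA standing in for the register-held copy (names are illustrative only, and saturation is omitted):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Sketch only: mtxA = mtxA * mtxB, column by column, 20.12 fixed point.
    static void matrix_multiply_sketch(int32_t (&mtxA)[16], const int32_t (&mtxB)[16])
    {
        int32_t tmpA[16];
        std::memcpy(tmpA, mtxA, sizeof(tmpA)); // copy A before overwriting it in place

        for (size_t col = 0; col < 4; col++)
        {
            const int32_t b0 = mtxB[col*4 + 0], b1 = mtxB[col*4 + 1];
            const int32_t b2 = mtxB[col*4 + 2], b3 = mtxB[col*4 + 3];
            for (size_t i = 0; i < 4; i++)
            {
                const int64_t sum = (int64_t)tmpA[i]      * b0 + (int64_t)tmpA[4 + i]  * b1
                                  + (int64_t)tmpA[8 + i]  * b2 + (int64_t)tmpA[12 + i] * b3;
                mtxA[col*4 + i] = (int32_t)(sum >> 12);
            }
        }
    }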
+
+#else
+
+FORCEINLINE void _Vec4_MultiplyByMatrix(s32 (&__restrict outVec)[4], const s32 (&__restrict inVec)[4], const s32 (&__restrict mtx)[16])
+{
+	outVec[0] = sfx32_shiftdown( fx32_mul(mtx[0],inVec[0]) + fx32_mul(mtx[4],inVec[1]) + fx32_mul(mtx[ 8],inVec[2]) + fx32_mul(mtx[12],inVec[3]) );
+	outVec[1] = sfx32_shiftdown( fx32_mul(mtx[1],inVec[0]) + fx32_mul(mtx[5],inVec[1]) + fx32_mul(mtx[ 9],inVec[2]) + fx32_mul(mtx[13],inVec[3]) );
+	outVec[2] = sfx32_shiftdown( fx32_mul(mtx[2],inVec[0]) + fx32_mul(mtx[6],inVec[1]) + fx32_mul(mtx[10],inVec[2]) + fx32_mul(mtx[14],inVec[3]) );
+	outVec[3] = sfx32_shiftdown( fx32_mul(mtx[3],inVec[0]) + fx32_mul(mtx[7],inVec[1]) + fx32_mul(mtx[11],inVec[2]) + fx32_mul(mtx[15],inVec[3]) );
+}
+
+FORCEINLINE void _Vec3_MultiplyByMatrix(s32 (&__restrict outVec)[4], const s32 (&__restrict inVec)[3], const s32 (&__restrict mtx)[16])
+{
+	outVec[0] = sfx32_shiftdown( fx32_mul(mtx[0],inVec[0]) + fx32_mul(mtx[4],inVec[1]) + fx32_mul(mtx[ 8],inVec[2]) );
+	outVec[1] = sfx32_shiftdown( fx32_mul(mtx[1],inVec[0]) + fx32_mul(mtx[5],inVec[1]) + fx32_mul(mtx[ 9],inVec[2]) );
+	outVec[2] = sfx32_shiftdown( fx32_mul(mtx[2],inVec[0]) + fx32_mul(mtx[6],inVec[1]) + fx32_mul(mtx[10],inVec[2]) );
+}
+
+FORCEINLINE void _Vec4_Scale(s32 (&inoutVec)[4], const s32 scalar)
+{
+	inoutVec[0] = sfx32_shiftdown( fx32_mul(inoutVec[0], scalar) );
+	inoutVec[1] = sfx32_shiftdown( fx32_mul(inoutVec[1], scalar) );
+	inoutVec[2] = sfx32_shiftdown( fx32_mul(inoutVec[2], scalar) );
+	inoutVec[3] = sfx32_shiftdown( fx32_mul(inoutVec[3], scalar) );
+}
+
+void MatrixMultVec4x4(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4])
+{
+	const CACHE_ALIGN s32 __restrict tmpVec[4] = {
+		vec[0], vec[1], vec[2], vec[3]
+	};
+	
+	_Vec4_MultiplyByMatrix(vec, tmpVec, mtx);
+}
+
+void MatrixMultVec3x3(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4])
+{
+	const CACHE_ALIGN s32 __restrict tmpVec[3] = {
+		vec[0], vec[1], vec[2]
+	};
+	
+	_Vec3_MultiplyByMatrix(vec, tmpVec, mtx);
+}
+
+void MatrixTranslate(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4])
+{
+	mtx[12] = sfx32_shiftdown( fx32_mul(mtx[0], vec[0]) + fx32_mul(mtx[4], vec[1]) + fx32_mul(mtx[ 8], vec[2]) + fx32_shiftup(mtx[12]) );
+	mtx[13] = sfx32_shiftdown( fx32_mul(mtx[1], vec[0]) + fx32_mul(mtx[5], vec[1]) + fx32_mul(mtx[ 9], vec[2]) + fx32_shiftup(mtx[13]) );
+	mtx[14] = sfx32_shiftdown( fx32_mul(mtx[2], vec[0]) + fx32_mul(mtx[6], vec[1]) + fx32_mul(mtx[10], vec[2]) + fx32_shiftup(mtx[14]) );
+	mtx[15] = sfx32_shiftdown( fx32_mul(mtx[3], vec[0]) + fx32_mul(mtx[7], vec[1]) + fx32_mul(mtx[11], vec[2]) + fx32_shiftup(mtx[15]) );
+}
+
+void MatrixScale(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4])
+{
+	_Vec4_Scale((s32 (&__restrict)[4])mtx[0], vec[0]);
+	_Vec4_Scale((s32 (&__restrict)[4])mtx[4], vec[1]);
+	_Vec4_Scale((s32 (&__restrict)[4])mtx[8], vec[2]);
+}
+
+void MatrixMultiply(s32 (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16])
+{
+	const CACHE_ALIGN s32 tmpMtxA[16] = {
+		mtxA[ 0], mtxA[ 1], mtxA[ 2], mtxA[ 3],
+		mtxA[ 4], mtxA[ 5], mtxA[ 6], mtxA[ 7],
+		mtxA[ 8], mtxA[ 9], mtxA[10], mtxA[11],
+		mtxA[12], mtxA[13], mtxA[14], mtxA[15]
+	};
+	
+	_Vec4_MultiplyByMatrix((s32 (&__restrict)[4])mtxA[ 0], (s32 (&__restrict)[4])mtxB[ 0], tmpMtxA);
+	_Vec4_MultiplyByMatrix((s32 (&__restrict)[4])mtxA[ 4], (s32 (&__restrict)[4])mtxB[ 4], tmpMtxA);
+	_Vec4_MultiplyByMatrix((s32 (&__restrict)[4])mtxA[ 8], (s32 (&__restrict)[4])mtxB[ 8], tmpMtxA);
+	_Vec4_MultiplyByMatrix((s32 (&__restrict)[4])mtxA[12], (s32 (&__restrict)[4])mtxB[12], tmpMtxA);
+}
+
+#endif
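One detail of the non-SSE MatrixTranslate above: the fx32_mul products carry 24 fraction bits, so the existing translation terms mtx[12..15] (12 fraction bits) are promoted with fx32_shiftup before being added, and the whole sum is shifted back down. Ignoring any saturation that sfx32_shiftdown may apply, one element works out roughly as follows (a sketch with illustrative names and plain integer types, assuming fx32_shiftup is a shift up by 12):

    #include <cstdint>

    // Sketch only: updates one translation element the way the scalar
    // MatrixTranslate above does, with 20.12 fixed-point inputs and no saturation.
    static inline int32_t translate_element(int32_t m_col0, int32_t m_col1, int32_t m_col2,
                                            int32_t m_trans, const int32_t (&vec)[4])
    {
        const int64_t sum = (int64_t)m_col0 * vec[0]
                          + (int64_t)m_col1 * vec[1]
                          + (int64_t)m_col2 * vec[2]
                          + ((int64_t)m_trans << 12); // align the old term to 24 fraction bits
        return (int32_t)(sum >> 12);                  // back down to 12 fraction bits
    }

    // e.g. mtx[12] = translate_element(mtx[0], mtx[4], mtx[8], mtx[12], vec);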
diff --git a/desmume/src/matrix.h b/desmume/src/matrix.h
index 12fe8f740..220638a39 100644
--- a/desmume/src/matrix.h
+++ b/desmume/src/matrix.h
@@ -31,42 +31,52 @@
 
 #ifdef ENABLE_SSE2
 #include <emmintrin.h>
+#endif
+
+#ifdef ENABLE_SSE4_1
+#include "smmintrin.h"
 #endif
- 
+
+enum MatrixMode
+{
+	MATRIXMODE_PROJECTION = 0,
+	MATRIXMODE_POSITION = 1,
+	MATRIXMODE_POSITION_VECTOR = 2,
+	MATRIXMODE_TEXTURE = 3
+};
+
+template <MatrixMode MODE>
 struct MatrixStack
 {
-	MatrixStack(int size, int type);
-	s32 *matrix;
-	u32 position;
-	s32 size;
-	u8 type;
-};
+	static const size_t size = ((MODE == MATRIXMODE_PROJECTION) || (MODE == MATRIXMODE_TEXTURE)) ? 1 : 32;
+	static const MatrixMode type = MODE;
+	
+	s32 matrix[size][16];
+	u32 position;
+};
 
-void MatrixInit(s32 *mtxPtr);
-void MatrixInit(float *mtxPtr);
+void MatrixInit(s32 (&mtx)[16]);
+void MatrixInit(float (&mtx)[16]);
 
-void MatrixIdentity(s32 *mtxPtr);
-void MatrixIdentity(float *mtxPtr);
+void MatrixIdentity(s32 (&mtx)[16]);
+void MatrixIdentity(float (&mtx)[16]);
 
-void MatrixSet(s32 *mtxPtr, const size_t x, const size_t y, const s32 value);
-void MatrixSet(float *mtxPtr, const size_t x, const size_t y, const float value);
-void MatrixSet(float *mtxPtr, const size_t x, const size_t y, const s32 value);
+void MatrixSet(s32 (&mtx)[16], const size_t x, const size_t y, const s32 value);
+void MatrixSet(float (&mtx)[16], const size_t x, const size_t y, const float value);
+void MatrixSet(float (&mtx)[16], const size_t x, const size_t y, const s32 value);
 
-void MatrixCopy(s32 *mtxDst, const s32 *mtxSrc);
-void MatrixCopy(float *mtxDst, const float *mtxSrc);
-void MatrixCopy(float *mtxDst, const s32 *mtxSrc);
+void MatrixCopy(s32 (&mtxDst)[16], const s32 (&mtxSrc)[16]);
+void MatrixCopy(float (&mtxDst)[16], const float (&mtxSrc)[16]);
+void MatrixCopy(float (&__restrict mtxDst)[16], const s32 (&__restrict mtxSrc)[16]);
 
-int MatrixCompare(const s32 *mtxDst, const s32 *mtxSrc);
-int MatrixCompare(const float *mtxDst, const float *mtxSrc);
+int MatrixCompare(const s32 (&mtxDst)[16], const s32 (&mtxSrc)[16]);
+int MatrixCompare(const float (&mtxDst)[16], const float (&mtxSrc)[16]);
 
-s32 MatrixGetMultipliedIndex(const u32 index, s32 *matrix, s32 *rightMatrix);
-float MatrixGetMultipliedIndex(const u32 index, float *matrix, float *rightMatrix);
+s32 MatrixGetMultipliedIndex(const u32 index, const s32 (&mtxA)[16], const s32 (&mtxB)[16]);
+float MatrixGetMultipliedIndex(const u32 index, const float (&mtxA)[16], const float (&mtxB)[16]);
 
-void MatrixStackInit (MatrixStack *stack);
-void MatrixStackSetMaxSize (MatrixStack *stack, int size);
-s32* MatrixStackGetPos (MatrixStack *stack, const size_t pos);
-s32* MatrixStackGet (MatrixStack *stack);
-void MatrixStackLoadMatrix (MatrixStack *stack, const size_t pos, const s32 *ptr);
+template <MatrixMode MODE> void MatrixStackInit(MatrixStack<MODE> *stack);
+template <MatrixMode MODE> s32* MatrixStackGet(MatrixStack<MODE> *stack);
 
 void Vector2Copy(float *dst, const float *src);
 void Vector2Add(float *dst, const float *src);
@@ -86,21 +96,21 @@ void Vector3Normalize(float *dst);
 
 void Vector4Copy(float *dst, const float *src);
 
-void _MatrixMultVec4x4_NoSIMD(const s32 *__restrict mtxPtr, float *__restrict vecPtr);
+void _MatrixMultVec4x4_NoSIMD(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]);
 
-void MatrixMultVec4x4(const s32 *__restrict mtxPtr, float *__restrict vecPtr);
-void MatrixMultVec3x3(const s32 *__restrict mtxPtr, float *__restrict vecPtr);
-void MatrixTranslate(float *__restrict mtxPtr, const float *__restrict vecPtr);
-void MatrixScale(float *__restrict mtxPtr, const float *__restrict vecPtr);
-void MatrixMultiply(float *__restrict mtxPtrA, const s32 *__restrict mtxPtrB);
+void MatrixMultVec4x4(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]);
+void MatrixMultVec3x3(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]);
+void MatrixTranslate(float (&__restrict mtx)[16], const float (&__restrict vec)[4]);
+void MatrixScale(float (&__restrict mtx)[16], const float (&__restrict vec)[4]);
+void MatrixMultiply(float (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16]);
 
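The declaration changes in this header swap raw pointer parameters for references to fixed-size arrays (s32 (&mtx)[16] instead of s32 *mtxPtr), so the 16-element extent is part of the function's type rather than an implicit convention, and mismatched arguments fail at compile time. A small illustration of the difference (the function names below are hypothetical, used only for contrast):

    #include <cstdint>

    static void TakesSizedMatrix(int32_t (&mtx)[16]) { (void)mtx; } // new style: extent is in the type
    static void TakesPointer(int32_t *mtx)           { (void)mtx; } // old style: any pointer compiles

    void Example()
    {
        int32_t full[16] = {};
        int32_t partial[4] = {};

        TakesSizedMatrix(full);        // OK
        // TakesSizedMatrix(partial);  // error: cannot bind int32_t[4] to int32_t (&)[16]
        TakesPointer(partial);         // compiles; a size mismatch only surfaces at runtime
    }

The fixed extent also documents the expected operand size to both the optimizer and the reader.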
-template <size_t NUM_ROWS> FORCEINLINE void vector_fix2float(float *mtxPtr, const float divisor);
+template <size_t NUM_ROWS> FORCEINLINE void vector_fix2float(float (&mtx)[16], const float divisor);
 
-void MatrixMultVec4x4(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr);
-void MatrixMultVec3x3(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr);
-void MatrixTranslate(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr);
-void MatrixScale(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr);
-void MatrixMultiply(s32 *__restrict mtxPtrA, const s32 *__restrict mtxPtrB);
+void MatrixMultVec4x4(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4]);
+void MatrixMultVec3x3(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4]);
+void MatrixTranslate(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4]);
+void MatrixScale(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4]);
+void MatrixMultiply(s32 (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16]);
 
 //these functions are an unreliable, inaccurate floor.
 //it should only be used for positive numbers