matrix.cpp: Do a bunch of code cleanup.
parent c41a006b2a
commit 249afccfca
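Note for reviewers: the substance of the cleanup is twofold: raw indices into mtxCurrent[] are replaced with named matrix-mode constants, and the ad-hoc fixed-point-to-float matrix conversions scattered through gfx3d.cpp are folded into a small overloaded API in matrix.cpp/matrix.h. From the substitutions below, the constants map indices 0/1/2/3 to projection/position/position-vector/texture. A minimal sketch of what the enum plausibly looks like; the actual definition is not part of this diff and may differ:

	// Hypothetical sketch; the real enum lives elsewhere in the emulator.
	enum MatrixMode
	{
		MATRIXMODE_PROJECTION      = 0, // was mtxCurrent[0]
		MATRIXMODE_POSITION        = 1, // was mtxCurrent[1] ("modelview")
		MATRIXMODE_POSITION_VECTOR = 2, // was mtxCurrent[2] (directional/vector matrix)
		MATRIXMODE_TEXTURE         = 3  // was mtxCurrent[3]
	};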
@@ -609,11 +609,11 @@ void gfx3d_reset()
 	memset(colorRGB, 0, sizeof(colorRGB));
 	memset(&tempVertInfo, 0, sizeof(tempVertInfo));
 
-	MatrixInit (mtxCurrent[0]);
-	MatrixInit (mtxCurrent[1]);
-	MatrixInit (mtxCurrent[2]);
-	MatrixInit (mtxCurrent[3]);
-	MatrixInit (mtxTemporal);
+	MatrixInit(mtxCurrent[MATRIXMODE_PROJECTION]);
+	MatrixInit(mtxCurrent[MATRIXMODE_POSITION]);
+	MatrixInit(mtxCurrent[MATRIXMODE_POSITION_VECTOR]);
+	MatrixInit(mtxCurrent[MATRIXMODE_TEXTURE]);
+	MatrixInit(mtxTemporal);
 
 	MatrixStackInit(&mtxStack[0]);
 	MatrixStackInit(&mtxStack[1]);
@@ -727,13 +727,13 @@ static void SetVertex()
 	if (texCoordTransformMode == TextureTransformationMode_VertexSource)
 	{
 		//Tested by: Eledees The Adventures of Kai and Zero (E) [title screen and frontend menus]
-		last_s = (s32)(((s64)s16coord[0] * mtxCurrent[3][0] +
-						(s64)s16coord[1] * mtxCurrent[3][4] +
-						(s64)s16coord[2] * mtxCurrent[3][8] +
+		last_s = (s32)(((s64)s16coord[0] * mtxCurrent[MATRIXMODE_TEXTURE][0] +
+						(s64)s16coord[1] * mtxCurrent[MATRIXMODE_TEXTURE][4] +
+						(s64)s16coord[2] * mtxCurrent[MATRIXMODE_TEXTURE][8] +
 						(((s64)(_s))<<24))>>24);
-		last_t = (s32)(((s64)s16coord[0] * mtxCurrent[3][1] +
-						(s64)s16coord[1] * mtxCurrent[3][5] +
-						(s64)s16coord[2] * mtxCurrent[3][9] +
+		last_t = (s32)(((s64)s16coord[0] * mtxCurrent[MATRIXMODE_TEXTURE][1] +
+						(s64)s16coord[1] * mtxCurrent[MATRIXMODE_TEXTURE][5] +
+						(s64)s16coord[2] * mtxCurrent[MATRIXMODE_TEXTURE][9] +
 						(((s64)(_t))<<24))>>24);
 	}
 
@@ -744,8 +744,8 @@ static void SetVertex()
 	if(polylist->count >= POLYLIST_SIZE)
 		return;
 
-	GEM_TransformVertex(mtxCurrent[1],coordTransformed); //modelview
-	GEM_TransformVertex(mtxCurrent[0],coordTransformed); //projection
+	GEM_TransformVertex(mtxCurrent[MATRIXMODE_POSITION], coordTransformed); //modelview
+	GEM_TransformVertex(mtxCurrent[MATRIXMODE_PROJECTION], coordTransformed); //projection
 
 	//TODO - culling should be done here.
 	//TODO - viewport transform?
@@ -930,7 +930,7 @@ static void gfx3d_glLightDirection_cache(const size_t index)
 	cacheLightDirection[index][3] = 0;
 
 	//Multiply the vector by the directional matrix
-	MatrixMultVec3x3_fixed(mtxCurrent[2], cacheLightDirection[index]);
+	MatrixMultVec3x3(mtxCurrent[MATRIXMODE_POSITION_VECTOR], cacheLightDirection[index]);
 
 	//Calculate the half angle vector
 	s32 lineOfSight[4] = {0, 0, (-1)<<12, 0};
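Reviewer note: the geometry engine works in signed 20.12 fixed point throughout this commit (4096 == 1.0, as the /4096.0f conversions in the later hunks confirm), so `(-1)<<12` is -4096, i.e. exactly -1.0: lineOfSight is the unit vector down the -z axis that the half-angle computation needs.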
@@ -1092,7 +1092,7 @@ static void gfx3d_glLoadIdentity()
 	GFX_DELAY(19);
 
 	if (mode == MATRIXMODE_POSITION_VECTOR)
-		MatrixIdentity(mtxCurrent[1]);
+		MatrixIdentity(mtxCurrent[MATRIXMODE_POSITION]);
 
 	//printf("identity: %d to: \n",mode); MatrixPrint(mtxCurrent[1]);
 }
@@ -1110,7 +1110,7 @@ static BOOL gfx3d_glLoadMatrix4x4(s32 v)
 	//vector_fix2float<4>(mtxCurrent[mode], 4096.f);
 
 	if (mode == MATRIXMODE_POSITION_VECTOR)
-		MatrixCopy(mtxCurrent[1], mtxCurrent[2]);
+		MatrixCopy(mtxCurrent[MATRIXMODE_POSITION], mtxCurrent[MATRIXMODE_POSITION_VECTOR]);
 
 	//printf("load4x4: matrix %d to: \n",mode); MatrixPrint(mtxCurrent[1]);
 	return TRUE;
@@ -1134,7 +1134,7 @@ static BOOL gfx3d_glLoadMatrix4x3(s32 v)
 	GFX_DELAY(30);
 
 	if (mode == MATRIXMODE_POSITION_VECTOR)
-		MatrixCopy(mtxCurrent[1], mtxCurrent[2]);
+		MatrixCopy(mtxCurrent[MATRIXMODE_POSITION], mtxCurrent[MATRIXMODE_POSITION_VECTOR]);
 	//printf("load4x3: matrix %d to: \n",mode); MatrixPrint(mtxCurrent[1]);
 	return TRUE;
 }
@@ -1155,7 +1155,7 @@ static BOOL gfx3d_glMultMatrix4x4(s32 v)
 
 	if (mode == MATRIXMODE_POSITION_VECTOR)
 	{
-		MatrixMultiply(mtxCurrent[1], mtxTemporal);
+		MatrixMultiply(mtxCurrent[MATRIXMODE_POSITION], mtxTemporal);
 		GFX_DELAY_M2(30);
 	}
 
@@ -1186,7 +1186,7 @@ static BOOL gfx3d_glMultMatrix4x3(s32 v)
 
 	if (mode == MATRIXMODE_POSITION_VECTOR)
 	{
-		MatrixMultiply (mtxCurrent[1], mtxTemporal);
+		MatrixMultiply (mtxCurrent[MATRIXMODE_POSITION], mtxTemporal);
 		GFX_DELAY_M2(30);
 	}
 
@@ -1219,7 +1219,7 @@ static BOOL gfx3d_glMultMatrix3x3(s32 v)
 
 	if (mode == MATRIXMODE_POSITION_VECTOR)
 	{
-		MatrixMultiply(mtxCurrent[1], mtxTemporal);
+		MatrixMultiply(mtxCurrent[MATRIXMODE_POSITION], mtxTemporal);
 		GFX_DELAY_M2(30);
 	}
 
@@ -1248,8 +1248,8 @@ static BOOL gfx3d_glScale(s32 v)
 	//note: pos-vector mode should not cause both matrices to scale.
 	//the whole purpose is to keep the vector matrix orthogonal
 	//so, I am leaving this commented out as an example of what not to do.
-	//if (mode == 2)
-	//	MatrixScale (mtxCurrent[1], scale);
+	//if (mode == MATRIXMODE_POSITION_VECTOR)
+	//	MatrixScale (mtxCurrent[MATRIXMODE_POSITION], scale);
 	return TRUE;
 }
 
@@ -1268,7 +1268,7 @@ static BOOL gfx3d_glTranslate(s32 v)
 
 	if (mode == MATRIXMODE_POSITION_VECTOR)
 	{
-		MatrixTranslate(mtxCurrent[1], trans);
+		MatrixTranslate(mtxCurrent[MATRIXMODE_POSITION], trans);
 		GFX_DELAY_M2(30);
 	}
 
@@ -1297,11 +1297,11 @@ static void gfx3d_glNormal(s32 v)
 	{
 		//SM64 highlight rendered star in main menu tests this
 		//also smackdown 2010 player textures tested this (needed cast on _s and _t)
-		last_s = (s32)(((s64)normal[0] * mtxCurrent[3][0] + (s64)normal[1] * mtxCurrent[3][4] + (s64)normal[2] * mtxCurrent[3][8] + (((s64)_s)<<24))>>24);
-		last_t = (s32)(((s64)normal[0] * mtxCurrent[3][1] + (s64)normal[1] * mtxCurrent[3][5] + (s64)normal[2] * mtxCurrent[3][9] + (((s64)_t)<<24))>>24);
+		last_s = (s32)(((s64)normal[0] * mtxCurrent[MATRIXMODE_TEXTURE][0] + (s64)normal[1] * mtxCurrent[MATRIXMODE_TEXTURE][4] + (s64)normal[2] * mtxCurrent[MATRIXMODE_TEXTURE][8] + (((s64)_s)<<24))>>24);
+		last_t = (s32)(((s64)normal[0] * mtxCurrent[MATRIXMODE_TEXTURE][1] + (s64)normal[1] * mtxCurrent[MATRIXMODE_TEXTURE][5] + (s64)normal[2] * mtxCurrent[MATRIXMODE_TEXTURE][9] + (((s64)_t)<<24))>>24);
 	}
 
-	MatrixMultVec3x3_fixed(mtxCurrent[2],normal);
+	MatrixMultVec3x3(mtxCurrent[MATRIXMODE_POSITION_VECTOR], normal);
 
 	//apply lighting model
 	u8 diffuse[3] = {
@@ -1395,8 +1395,8 @@ static void gfx3d_glTexCoord(s32 val)
 	if (texCoordTransformMode == TextureTransformationMode_TexCoordSource)
 	{
 		//dragon quest 4 overworld will test this
-		last_s = (s32) (( (s64)_s * mtxCurrent[3][0] + (s64)_t * mtxCurrent[3][4] + (s64)mtxCurrent[3][8] + (s64)mtxCurrent[3][12])>>12);
-		last_t = (s32) (( (s64)_s * mtxCurrent[3][1] + (s64)_t * mtxCurrent[3][5] + (s64)mtxCurrent[3][9] + (s64)mtxCurrent[3][13])>>12);
+		last_s = (s32) (( (s64)_s * mtxCurrent[MATRIXMODE_TEXTURE][0] + (s64)_t * mtxCurrent[MATRIXMODE_TEXTURE][4] + (s64)mtxCurrent[MATRIXMODE_TEXTURE][8] + (s64)mtxCurrent[MATRIXMODE_TEXTURE][12])>>12);
+		last_t = (s32) (( (s64)_s * mtxCurrent[MATRIXMODE_TEXTURE][1] + (s64)_t * mtxCurrent[MATRIXMODE_TEXTURE][5] + (s64)mtxCurrent[MATRIXMODE_TEXTURE][9] + (s64)mtxCurrent[MATRIXMODE_TEXTURE][13])>>12);
 	}
 	else if (texCoordTransformMode == TextureTransformationMode_None)
 	{
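Reviewer note: both texture-coordinate branches are fixed-point bookkeeping around the same renamed matrix. The matrix entries carry 12 fractional bits, so the trailing `>>12` strips the matrix's scale and leaves last_s/last_t in the incoming coordinates' own format; in the vertex-source hunk further up, the vertex coordinates contribute another 12 fractional bits of their own, hence the wider `<<24 ... >>24` pairing used there.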
@@ -1684,13 +1684,13 @@ static BOOL gfx3d_glBoxTest(u32 v)
 		//MatrixMultVec4x4_M2(mtxCurrent[0], verts[i].coord);
 
 		//but change it all to floating point and do it that way instead
-		CACHE_ALIGN float temp1[16] = {mtxCurrent[1][0]/4096.0f,mtxCurrent[1][1]/4096.0f,mtxCurrent[1][2]/4096.0f,mtxCurrent[1][3]/4096.0f,mtxCurrent[1][4]/4096.0f,mtxCurrent[1][5]/4096.0f,mtxCurrent[1][6]/4096.0f,mtxCurrent[1][7]/4096.0f,mtxCurrent[1][8]/4096.0f,mtxCurrent[1][9]/4096.0f,mtxCurrent[1][10]/4096.0f,mtxCurrent[1][11]/4096.0f,mtxCurrent[1][12]/4096.0f,mtxCurrent[1][13]/4096.0f,mtxCurrent[1][14]/4096.0f,mtxCurrent[1][15]/4096.0f};
-		CACHE_ALIGN float temp0[16] = {mtxCurrent[0][0]/4096.0f,mtxCurrent[0][1]/4096.0f,mtxCurrent[0][2]/4096.0f,mtxCurrent[0][3]/4096.0f,mtxCurrent[0][4]/4096.0f,mtxCurrent[0][5]/4096.0f,mtxCurrent[0][6]/4096.0f,mtxCurrent[0][7]/4096.0f,mtxCurrent[0][8]/4096.0f,mtxCurrent[0][9]/4096.0f,mtxCurrent[0][10]/4096.0f,mtxCurrent[0][11]/4096.0f,mtxCurrent[0][12]/4096.0f,mtxCurrent[0][13]/4096.0f,mtxCurrent[0][14]/4096.0f,mtxCurrent[0][15]/4096.0f};
 
 		//DS_ALIGN(16) VERT_POS4f vert = { verts[i].x, verts[i].y, verts[i].z, verts[i].w };
 
-		_NOSSE_MatrixMultVec4x4(temp1,verts[i].coord);
-		_NOSSE_MatrixMultVec4x4(temp0,verts[i].coord);
+		//_MatrixMultVec4x4_NoSIMD(mtxCurrent[MATRIXMODE_POSITION], verts[i].coord);
+		//_MatrixMultVec4x4_NoSIMD(mtxCurrent[MATRIXMODE_PROJECTION], verts[i].coord);
+		MatrixMultVec4x4(mtxCurrent[MATRIXMODE_POSITION], verts[i].coord);
+		MatrixMultVec4x4(mtxCurrent[MATRIXMODE_PROJECTION], verts[i].coord);
 	}
 
 	//clip each poly
@@ -1742,12 +1742,9 @@ static BOOL gfx3d_glPosTest(u32 v)
 	PTind = 0;
 
 	PTcoords[3] = 1.0f;
 
-	CACHE_ALIGN float temp1[16] = {mtxCurrent[1][0]/4096.0f,mtxCurrent[1][1]/4096.0f,mtxCurrent[1][2]/4096.0f,mtxCurrent[1][3]/4096.0f,mtxCurrent[1][4]/4096.0f,mtxCurrent[1][5]/4096.0f,mtxCurrent[1][6]/4096.0f,mtxCurrent[1][7]/4096.0f,mtxCurrent[1][8]/4096.0f,mtxCurrent[1][9]/4096.0f,mtxCurrent[1][10]/4096.0f,mtxCurrent[1][11]/4096.0f,mtxCurrent[1][12]/4096.0f,mtxCurrent[1][13]/4096.0f,mtxCurrent[1][14]/4096.0f,mtxCurrent[1][15]/4096.0f};
-	CACHE_ALIGN float temp0[16] = {mtxCurrent[0][0]/4096.0f,mtxCurrent[0][1]/4096.0f,mtxCurrent[0][2]/4096.0f,mtxCurrent[0][3]/4096.0f,mtxCurrent[0][4]/4096.0f,mtxCurrent[0][5]/4096.0f,mtxCurrent[0][6]/4096.0f,mtxCurrent[0][7]/4096.0f,mtxCurrent[0][8]/4096.0f,mtxCurrent[0][9]/4096.0f,mtxCurrent[0][10]/4096.0f,mtxCurrent[0][11]/4096.0f,mtxCurrent[0][12]/4096.0f,mtxCurrent[0][13]/4096.0f,mtxCurrent[0][14]/4096.0f,mtxCurrent[0][15]/4096.0f};
-
-	MatrixMultVec4x4(temp1, PTcoords);
-	MatrixMultVec4x4(temp0, PTcoords);
+	MatrixMultVec4x4(mtxCurrent[MATRIXMODE_POSITION], PTcoords);
+	MatrixMultVec4x4(mtxCurrent[MATRIXMODE_PROJECTION], PTcoords);
 
 	MMU_new.gxstat.tb = 0;
 
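Reviewer note: this hunk and the box-test hunk above show the point of the new API. Callers used to expand each fixed-point matrix into a float scratch array by hand before multiplying; the new MatrixMultVec4x4(const s32 *, float *) overload takes the fixed-point matrix directly and performs the 1/4096 conversion internally. A condensed before/after of the call-site pattern, with names taken from this diff:

	// before: convert all 16 entries by hand, then multiply
	CACHE_ALIGN float temp[16] = { mtxCurrent[1][0]/4096.0f, /* ... 15 more entries ... */ };
	MatrixMultVec4x4(temp, PTcoords);

	// after: pass the s32 matrix; the overload converts internally
	MatrixMultVec4x4(mtxCurrent[MATRIXMODE_POSITION], PTcoords);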
@@ -1765,13 +1762,14 @@ static void gfx3d_glVecTest(u32 v)
 	//i am not sure exactly what it is doing, maybe it is testing to ensure
 	//that the normal vector for the point of interest is camera-facing.
 
-	CACHE_ALIGN float normal[4] = { normalTable[v&1023],
-									normalTable[(v>>10)&1023],
-									normalTable[(v>>20)&1023],
-									0};
-
-	CACHE_ALIGN float temp[16] = {mtxCurrent[2][0]/4096.0f,mtxCurrent[2][1]/4096.0f,mtxCurrent[2][2]/4096.0f,mtxCurrent[2][3]/4096.0f,mtxCurrent[2][4]/4096.0f,mtxCurrent[2][5]/4096.0f,mtxCurrent[2][6]/4096.0f,mtxCurrent[2][7]/4096.0f,mtxCurrent[2][8]/4096.0f,mtxCurrent[2][9]/4096.0f,mtxCurrent[2][10]/4096.0f,mtxCurrent[2][11]/4096.0f,mtxCurrent[2][12]/4096.0f,mtxCurrent[2][13]/4096.0f,mtxCurrent[2][14]/4096.0f,mtxCurrent[2][15]/4096.0f};
-	MatrixMultVec4x4(temp, normal);
+	CACHE_ALIGN float normal[4] = {
+		normalTable[v&1023],
+		normalTable[(v>>10)&1023],
+		normalTable[(v>>20)&1023],
+		0
+	};
+
+	MatrixMultVec4x4(mtxCurrent[MATRIXMODE_POSITION_VECTOR], normal);
 
 	s16 x = (s16)(normal[0]*4096);
 	s16 y = (s16)(normal[1]*4096);
@@ -1853,7 +1851,7 @@ void gfx3d_UpdateToonTable(u8 offset, u32 val)
 s32 gfx3d_GetClipMatrix(const u32 index)
 {
 	//printf("reading clip matrix: %d\n",index);
-	return (s32)MatrixGetMultipliedIndex(index, mtxCurrent[0], mtxCurrent[1]);
+	return (s32)MatrixGetMultipliedIndex(index, mtxCurrent[MATRIXMODE_PROJECTION], mtxCurrent[MATRIXMODE_POSITION]);
 }
 
 s32 gfx3d_GetDirectionalMatrix(const u32 index)
@@ -1861,7 +1859,7 @@ s32 gfx3d_GetDirectionalMatrix(const u32 index)
 	const size_t _index = (((index / 3) * 4) + (index % 3));
 
 	//return (s32)(mtxCurrent[2][_index]*(1<<12));
-	return mtxCurrent[2][_index];
+	return mtxCurrent[MATRIXMODE_POSITION_VECTOR][_index];
 }
 
 void gfx3d_glAlphaFunc(u32 v)
@@ -1,6 +1,6 @@
 /*
 	Copyright (C) 2006-2007 shash
-	Copyright (C) 2007-2017 DeSmuME team
+	Copyright (C) 2007-2018 DeSmuME team
 
 	This file is free software: you can redistribute it and/or modify
 	it under the terms of the GNU General Public License as published by
@@ -24,118 +24,6 @@
 #include "matrix.h"
 #include "MMU.h"
 
-void _NOSSE_MatrixMultVec4x4 (const float *matrix, float *vecPtr)
-{
-	float x = vecPtr[0];
-	float y = vecPtr[1];
-	float z = vecPtr[2];
-	float w = vecPtr[3];
-
-	vecPtr[0] = x * matrix[0] + y * matrix[4] + z * matrix[ 8] + w * matrix[12];
-	vecPtr[1] = x * matrix[1] + y * matrix[5] + z * matrix[ 9] + w * matrix[13];
-	vecPtr[2] = x * matrix[2] + y * matrix[6] + z * matrix[10] + w * matrix[14];
-	vecPtr[3] = x * matrix[3] + y * matrix[7] + z * matrix[11] + w * matrix[15];
-}
-
-void MatrixMultVec4x4 (const s32 *matrix, s32 *vecPtr)
-{
-	const s32 x = vecPtr[0];
-	const s32 y = vecPtr[1];
-	const s32 z = vecPtr[2];
-	const s32 w = vecPtr[3];
-
-	vecPtr[0] = sfx32_shiftdown(fx32_mul(x,matrix[0]) + fx32_mul(y,matrix[4]) + fx32_mul(z,matrix[ 8]) + fx32_mul(w,matrix[12]));
-	vecPtr[1] = sfx32_shiftdown(fx32_mul(x,matrix[1]) + fx32_mul(y,matrix[5]) + fx32_mul(z,matrix[ 9]) + fx32_mul(w,matrix[13]));
-	vecPtr[2] = sfx32_shiftdown(fx32_mul(x,matrix[2]) + fx32_mul(y,matrix[6]) + fx32_mul(z,matrix[10]) + fx32_mul(w,matrix[14]));
-	vecPtr[3] = sfx32_shiftdown(fx32_mul(x,matrix[3]) + fx32_mul(y,matrix[7]) + fx32_mul(z,matrix[11]) + fx32_mul(w,matrix[15]));
-}
-
-void MatrixMultVec3x3_fixed(const s32 *matrix, s32 *vecPtr)
-{
-	const s32 x = vecPtr[0];
-	const s32 y = vecPtr[1];
-	const s32 z = vecPtr[2];
-
-	vecPtr[0] = sfx32_shiftdown(fx32_mul(x,matrix[0]) + fx32_mul(y,matrix[4]) + fx32_mul(z,matrix[8]));
-	vecPtr[1] = sfx32_shiftdown(fx32_mul(x,matrix[1]) + fx32_mul(y,matrix[5]) + fx32_mul(z,matrix[9]));
-	vecPtr[2] = sfx32_shiftdown(fx32_mul(x,matrix[2]) + fx32_mul(y,matrix[6]) + fx32_mul(z,matrix[10]));
-}
-
-//-------------------------
-//switched SSE functions: implementations for no SSE
-#ifndef ENABLE_SSE
-void MatrixMultVec4x4 (const float *matrix, float *vecPtr)
-{
-	_NOSSE_MatrixMultVec4x4(matrix, vecPtr);
-}
-
-
-void MatrixMultVec3x3 (const float *matrix, float *vecPtr)
-{
-	float x = vecPtr[0];
-	float y = vecPtr[1];
-	float z = vecPtr[2];
-
-	vecPtr[0] = x * matrix[0] + y * matrix[4] + z * matrix[ 8];
-	vecPtr[1] = x * matrix[1] + y * matrix[5] + z * matrix[ 9];
-	vecPtr[2] = x * matrix[2] + y * matrix[6] + z * matrix[10];
-}
-
-void MatrixMultiply (float *matrix, const float *rightMatrix)
-{
-	float tmpMatrix[16];
-
-	tmpMatrix[0] = (matrix[0]*rightMatrix[0])+(matrix[4]*rightMatrix[1])+(matrix[8]*rightMatrix[2])+(matrix[12]*rightMatrix[3]);
-	tmpMatrix[1] = (matrix[1]*rightMatrix[0])+(matrix[5]*rightMatrix[1])+(matrix[9]*rightMatrix[2])+(matrix[13]*rightMatrix[3]);
-	tmpMatrix[2] = (matrix[2]*rightMatrix[0])+(matrix[6]*rightMatrix[1])+(matrix[10]*rightMatrix[2])+(matrix[14]*rightMatrix[3]);
-	tmpMatrix[3] = (matrix[3]*rightMatrix[0])+(matrix[7]*rightMatrix[1])+(matrix[11]*rightMatrix[2])+(matrix[15]*rightMatrix[3]);
-
-	tmpMatrix[4] = (matrix[0]*rightMatrix[4])+(matrix[4]*rightMatrix[5])+(matrix[8]*rightMatrix[6])+(matrix[12]*rightMatrix[7]);
-	tmpMatrix[5] = (matrix[1]*rightMatrix[4])+(matrix[5]*rightMatrix[5])+(matrix[9]*rightMatrix[6])+(matrix[13]*rightMatrix[7]);
-	tmpMatrix[6] = (matrix[2]*rightMatrix[4])+(matrix[6]*rightMatrix[5])+(matrix[10]*rightMatrix[6])+(matrix[14]*rightMatrix[7]);
-	tmpMatrix[7] = (matrix[3]*rightMatrix[4])+(matrix[7]*rightMatrix[5])+(matrix[11]*rightMatrix[6])+(matrix[15]*rightMatrix[7]);
-
-	tmpMatrix[8] = (matrix[0]*rightMatrix[8])+(matrix[4]*rightMatrix[9])+(matrix[8]*rightMatrix[10])+(matrix[12]*rightMatrix[11]);
-	tmpMatrix[9] = (matrix[1]*rightMatrix[8])+(matrix[5]*rightMatrix[9])+(matrix[9]*rightMatrix[10])+(matrix[13]*rightMatrix[11]);
-	tmpMatrix[10] = (matrix[2]*rightMatrix[8])+(matrix[6]*rightMatrix[9])+(matrix[10]*rightMatrix[10])+(matrix[14]*rightMatrix[11]);
-	tmpMatrix[11] = (matrix[3]*rightMatrix[8])+(matrix[7]*rightMatrix[9])+(matrix[11]*rightMatrix[10])+(matrix[15]*rightMatrix[11]);
-
-	tmpMatrix[12] = (matrix[0]*rightMatrix[12])+(matrix[4]*rightMatrix[13])+(matrix[8]*rightMatrix[14])+(matrix[12]*rightMatrix[15]);
-	tmpMatrix[13] = (matrix[1]*rightMatrix[12])+(matrix[5]*rightMatrix[13])+(matrix[9]*rightMatrix[14])+(matrix[13]*rightMatrix[15]);
-	tmpMatrix[14] = (matrix[2]*rightMatrix[12])+(matrix[6]*rightMatrix[13])+(matrix[10]*rightMatrix[14])+(matrix[14]*rightMatrix[15]);
-	tmpMatrix[15] = (matrix[3]*rightMatrix[12])+(matrix[7]*rightMatrix[13])+(matrix[11]*rightMatrix[14])+(matrix[15]*rightMatrix[15]);
-
-	memcpy (matrix, tmpMatrix, sizeof(float)*16);
-}
-
-void MatrixTranslate (float *matrix, const float *ptr)
-{
-	matrix[12] += (matrix[0]*ptr[0])+(matrix[4]*ptr[1])+(matrix[ 8]*ptr[2]);
-	matrix[13] += (matrix[1]*ptr[0])+(matrix[5]*ptr[1])+(matrix[ 9]*ptr[2]);
-	matrix[14] += (matrix[2]*ptr[0])+(matrix[6]*ptr[1])+(matrix[10]*ptr[2]);
-	matrix[15] += (matrix[3]*ptr[0])+(matrix[7]*ptr[1])+(matrix[11]*ptr[2]);
-}
-
-void MatrixScale (float *matrix, const float *ptr)
-{
-	matrix[0] *= ptr[0];
-	matrix[1] *= ptr[0];
-	matrix[2] *= ptr[0];
-	matrix[3] *= ptr[0];
-
-	matrix[4] *= ptr[1];
-	matrix[5] *= ptr[1];
-	matrix[6] *= ptr[1];
-	matrix[7] *= ptr[1];
-
-	matrix[8] *= ptr[2];
-	matrix[9] *= ptr[2];
-	matrix[10] *= ptr[2];
-	matrix[11] *= ptr[2];
-}
-
-#endif //switched c/asm functions
-//-----------------------------------------
 
 void MatrixInit (s32 *matrix)
 {
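Reviewer note: fx32_mul, sfx32_shiftdown, and fx32_shiftup are used throughout but not defined in this diff. A minimal sketch of plausible definitions, assuming the signed 20.12 format implied by the /4096.0f conversions; the emulator's real helpers may round or saturate differently:

	typedef int32_t s32;
	typedef int64_t s64;

	// 20.12 * 20.12 -> 64-bit product carrying 24 fractional bits
	static inline s64 fx32_mul(const s32 a, const s32 b) { return (s64)a * (s64)b; }

	// drop 12 fractional bits to return to 20.12 (truncating sketch)
	static inline s32 sfx32_shiftdown(const s64 x) { return (s32)(x >> 12); }

	// widen a 20.12 value to the 24-fractional-bit intermediate before summing
	static inline s64 fx32_shiftup(const s32 a) { return (s64)a << 12; }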
@@ -345,51 +233,487 @@ void Vector4Copy(float *dst, const float *src)
 	dst[3] = src[3];
 }
 
+void _MatrixMultVec4x4_NoSIMD(const s32 *__restrict mtxPtr, float *__restrict vecPtr)
+{
+	const CACHE_ALIGN float mtxFloat[16] = {
+		mtxPtr[ 0] / 4096.0f,
+		mtxPtr[ 1] / 4096.0f,
+		mtxPtr[ 2] / 4096.0f,
+		mtxPtr[ 3] / 4096.0f,
+
+		mtxPtr[ 4] / 4096.0f,
+		mtxPtr[ 5] / 4096.0f,
+		mtxPtr[ 6] / 4096.0f,
+		mtxPtr[ 7] / 4096.0f,
+
+		mtxPtr[ 8] / 4096.0f,
+		mtxPtr[ 9] / 4096.0f,
+		mtxPtr[10] / 4096.0f,
+		mtxPtr[11] / 4096.0f,
+
+		mtxPtr[12] / 4096.0f,
+		mtxPtr[13] / 4096.0f,
+		mtxPtr[14] / 4096.0f,
+		mtxPtr[15] / 4096.0f
+	};
+
+	const float x = vecPtr[0];
+	const float y = vecPtr[1];
+	const float z = vecPtr[2];
+	const float w = vecPtr[3];
+
+	vecPtr[0] = (x * mtxFloat[0]) + (y * mtxFloat[4]) + (z * mtxFloat[ 8]) + (w * mtxFloat[12]);
+	vecPtr[1] = (x * mtxFloat[1]) + (y * mtxFloat[5]) + (z * mtxFloat[ 9]) + (w * mtxFloat[13]);
+	vecPtr[2] = (x * mtxFloat[2]) + (y * mtxFloat[6]) + (z * mtxFloat[10]) + (w * mtxFloat[14]);
+	vecPtr[3] = (x * mtxFloat[3]) + (y * mtxFloat[7]) + (z * mtxFloat[11]) + (w * mtxFloat[15]);
+}
+
-void MatrixMultiply (s32 *matrix, const s32 *rightMatrix)
+#ifdef ENABLE_SSE
+
+void MatrixMultVec4x4(const s32 *__restrict mtxPtr, float *__restrict vecPtr)
+{
+	const __m128 loadedVecPtr = _mm_load_ps(vecPtr);
+	const __m128 convertScalar = _mm_set1_ps(1.0f/4096.0f);
+
+#ifdef ENABLE_SSE2
+	__m128 row[4] = {
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 0)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 4)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 8)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 12)) )
+	};
+#else
+	const CACHE_ALIGN float mtxFloat[16] = {
+		(float)mtxPtr[0],
+		(float)mtxPtr[1],
+		(float)mtxPtr[2],
+		(float)mtxPtr[3],
+
+		(float)mtxPtr[4],
+		(float)mtxPtr[5],
+		(float)mtxPtr[6],
+		(float)mtxPtr[7],
+
+		(float)mtxPtr[8],
+		(float)mtxPtr[9],
+		(float)mtxPtr[10],
+		(float)mtxPtr[11],
+
+		(float)mtxPtr[12],
+		(float)mtxPtr[13],
+		(float)mtxPtr[14],
+		(float)mtxPtr[15]
+	};
+
+	__m128 row[4] = {
+		_mm_load_ps(mtxFloat + 0),
+		_mm_load_ps(mtxFloat + 4),
+		_mm_load_ps(mtxFloat + 8),
+		_mm_load_ps(mtxFloat + 12)
+	};
+#endif
+
+	row[0] = _mm_mul_ps(row[0], convertScalar);
+	row[1] = _mm_mul_ps(row[1], convertScalar);
+	row[2] = _mm_mul_ps(row[2], convertScalar);
+	row[3] = _mm_mul_ps(row[3], convertScalar);
+
+	const __m128 vec[4] = {
+		_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x00),
+		_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x55),
+		_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0xAA),
+		_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0xFF)
+	};
+
+	const __m128 calcVec = _mm_add_ps( _mm_mul_ps(row[0], vec[0]), _mm_add_ps(_mm_mul_ps(row[1], vec[1]), _mm_add_ps(_mm_mul_ps(row[2], vec[2]), _mm_mul_ps(row[3], vec[3]))) );
+	_mm_store_ps(vecPtr, calcVec);
+}
+
+void MatrixMultVec3x3(const s32 *__restrict mtxPtr, float *__restrict vecPtr)
+{
+	const __m128 loadedVecPtr = _mm_load_ps(vecPtr);
+	const __m128 convertScalar = _mm_set1_ps(1.0f/4096.0f);
+
+#ifdef ENABLE_SSE2
+	__m128 row[3] = {
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 0)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 4)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 8)) )
+	};
+#else
+	const CACHE_ALIGN float mtxFloat[16] = {
+		(float)mtxPtr[0],
+		(float)mtxPtr[1],
+		(float)mtxPtr[2],
+		(float)mtxPtr[3],
+
+		(float)mtxPtr[4],
+		(float)mtxPtr[5],
+		(float)mtxPtr[6],
+		(float)mtxPtr[7],
+
+		(float)mtxPtr[8],
+		(float)mtxPtr[9],
+		(float)mtxPtr[10],
+		(float)mtxPtr[11],
+
+		(float)mtxPtr[12],
+		(float)mtxPtr[13],
+		(float)mtxPtr[14],
+		(float)mtxPtr[15]
+	};
+
+	__m128 row[3] = {
+		_mm_load_ps(mtxFloat + 0),
+		_mm_load_ps(mtxFloat + 4),
+		_mm_load_ps(mtxFloat + 8)
+	};
+#endif
+
+	row[0] = _mm_mul_ps(row[0], convertScalar);
+	row[1] = _mm_mul_ps(row[1], convertScalar);
+	row[2] = _mm_mul_ps(row[2], convertScalar);
+
+	const __m128 vec[3] = {
+		_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x00),
+		_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x55),
+		_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0xAA)
+	};
+
+	const __m128 calcVec = _mm_add_ps( _mm_mul_ps(row[0], vec[0]), _mm_add_ps(_mm_mul_ps(row[1], vec[1]), _mm_mul_ps(row[2], vec[2])) );
+	_mm_store_ps(vecPtr, calcVec);
+}
+
+void MatrixTranslate(float *__restrict mtxPtr, const float *__restrict vecPtr)
+{
+	__m128 xmm4 = _mm_load_ps(vecPtr);
+	__m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101));
+	__m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010));
+	xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000));
+
+	xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(mtxPtr));
+	xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(mtxPtr+4));
+	xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(mtxPtr+8));
+	xmm4 = _mm_add_ps(xmm4,xmm5);
+	xmm4 = _mm_add_ps(xmm4,xmm6);
+	xmm4 = _mm_add_ps(xmm4,_mm_load_ps(mtxPtr+12));
+	_mm_store_ps(mtxPtr+12,xmm4);
+}
+
+void MatrixScale(float *__restrict mtxPtr, const float *__restrict vecPtr)
+{
+	__m128 xmm4 = _mm_load_ps(vecPtr);
+	__m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101));
+	__m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010));
+	xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000));
+
+	xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(mtxPtr));
+	xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(mtxPtr+4));
+	xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(mtxPtr+8));
+	_mm_store_ps(mtxPtr,xmm4);
+	_mm_store_ps(mtxPtr+4,xmm5);
+	_mm_store_ps(mtxPtr+8,xmm6);
+}
+
+void MatrixMultiply(float *__restrict mtxPtrA, const s32 *__restrict mtxPtrB)
+{
+	const __m128 convertScale = _mm_set1_ps(1.0f/4096.0f);
+
+#ifdef ENABLE_SSE2
+	__m128 rowB[4] = {
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 0)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 4)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 8)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 12)) )
+	};
+#else
+	const CACHE_ALIGN float mtxFloatB[16] = {
+		(float)mtxPtrB[0],
+		(float)mtxPtrB[1],
+		(float)mtxPtrB[2],
+		(float)mtxPtrB[3],
+
+		(float)mtxPtrB[4],
+		(float)mtxPtrB[5],
+		(float)mtxPtrB[6],
+		(float)mtxPtrB[7],
+
+		(float)mtxPtrB[8],
+		(float)mtxPtrB[9],
+		(float)mtxPtrB[10],
+		(float)mtxPtrB[11],
+
+		(float)mtxPtrB[12],
+		(float)mtxPtrB[13],
+		(float)mtxPtrB[14],
+		(float)mtxPtrB[15]
+	};
+
+	__m128 rowB[4] = {
+		_mm_load_ps(mtxFloatB + 0),
+		_mm_load_ps(mtxFloatB + 4),
+		_mm_load_ps(mtxFloatB + 8),
+		_mm_load_ps(mtxFloatB + 12)
+	};
+#endif
+
+	rowB[0] = _mm_mul_ps(rowB[0], convertScale);
+	rowB[1] = _mm_mul_ps(rowB[1], convertScale);
+	rowB[2] = _mm_mul_ps(rowB[2], convertScale);
+	rowB[3] = _mm_mul_ps(rowB[3], convertScale);
+
+	__m128 rowA[4] = {
+		_mm_load_ps(mtxPtrA + 0),
+		_mm_load_ps(mtxPtrA + 4),
+		_mm_load_ps(mtxPtrA + 8),
+		_mm_load_ps(mtxPtrA + 12)
+	};
+
+	__m128 vecB[4];
+	__m128 calcRow;
+
+	vecB[0] = _mm_shuffle_ps(rowB[0], rowB[0], 0x00);
+	vecB[1] = _mm_shuffle_ps(rowB[0], rowB[0], 0x55);
+	vecB[2] = _mm_shuffle_ps(rowB[0], rowB[0], 0xAA);
+	vecB[3] = _mm_shuffle_ps(rowB[0], rowB[0], 0xFF);
+	calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) );
+	_mm_store_ps(mtxPtrA + 0, calcRow);
+
+	vecB[0] = _mm_shuffle_ps(rowB[1], rowB[1], 0x00);
+	vecB[1] = _mm_shuffle_ps(rowB[1], rowB[1], 0x55);
+	vecB[2] = _mm_shuffle_ps(rowB[1], rowB[1], 0xAA);
+	vecB[3] = _mm_shuffle_ps(rowB[1], rowB[1], 0xFF);
+	calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) );
+	_mm_store_ps(mtxPtrA + 4, calcRow);
+
+	vecB[0] = _mm_shuffle_ps(rowB[2], rowB[2], 0x00);
+	vecB[1] = _mm_shuffle_ps(rowB[2], rowB[2], 0x55);
+	vecB[2] = _mm_shuffle_ps(rowB[2], rowB[2], 0xAA);
+	vecB[3] = _mm_shuffle_ps(rowB[2], rowB[2], 0xFF);
+	calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) );
+	_mm_store_ps(mtxPtrA + 8, calcRow);
+
+	vecB[0] = _mm_shuffle_ps(rowB[3], rowB[3], 0x00);
+	vecB[1] = _mm_shuffle_ps(rowB[3], rowB[3], 0x55);
+	vecB[2] = _mm_shuffle_ps(rowB[3], rowB[3], 0xAA);
+	vecB[3] = _mm_shuffle_ps(rowB[3], rowB[3], 0xFF);
+	calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) );
+	_mm_store_ps(mtxPtrA + 12, calcRow);
+}
+
+template<size_t NUM_ROWS>
+FORCEINLINE void vector_fix2float(float *mtxPtr, const float divisor)
+{
+	const __m128 divisor_v128 = _mm_set1_ps(divisor);
+
+	for (size_t i = 0; i < NUM_ROWS * 4; i+=4)
+	{
+		_mm_store_ps( mtxPtr + i, _mm_div_ps(_mm_load_ps(mtxPtr + i), divisor_v128) );
+	}
+}
+
+#else
+
+void MatrixMultVec4x4(const s32 *__restrict mtxPtr, float *__restrict vecPtr)
+{
+	_MatrixMultVec4x4_NoSIMD(mtxPtr, vecPtr);
+}
+
+void MatrixMultVec3x3(const s32 *__restrict mtxPtr, float *__restrict vecPtr)
+{
+	const CACHE_ALIGN float mtxFloat[16] = {
+		mtxPtr[ 0] / 4096.0f,
+		mtxPtr[ 1] / 4096.0f,
+		mtxPtr[ 2] / 4096.0f,
+		mtxPtr[ 3] / 4096.0f,
+
+		mtxPtr[ 4] / 4096.0f,
+		mtxPtr[ 5] / 4096.0f,
+		mtxPtr[ 6] / 4096.0f,
+		mtxPtr[ 7] / 4096.0f,
+
+		mtxPtr[ 8] / 4096.0f,
+		mtxPtr[ 9] / 4096.0f,
+		mtxPtr[10] / 4096.0f,
+		mtxPtr[11] / 4096.0f,
+
+		mtxPtr[12] / 4096.0f,
+		mtxPtr[13] / 4096.0f,
+		mtxPtr[14] / 4096.0f,
+		mtxPtr[15] / 4096.0f
+	};
+
+	const float x = vecPtr[0];
+	const float y = vecPtr[1];
+	const float z = vecPtr[2];
+
+	vecPtr[0] = (x * mtxFloat[0]) + (y * mtxFloat[4]) + (z * mtxFloat[ 8]);
+	vecPtr[1] = (x * mtxFloat[1]) + (y * mtxFloat[5]) + (z * mtxFloat[ 9]);
+	vecPtr[2] = (x * mtxFloat[2]) + (y * mtxFloat[6]) + (z * mtxFloat[10]);
+}
+
+void MatrixTranslate(float *__restrict mtxPtr, const float *__restrict vecPtr)
+{
+	mtxPtr[12] += (mtxPtr[0] * vecPtr[0]) + (mtxPtr[4] * vecPtr[1]) + (mtxPtr[ 8] * vecPtr[2]);
+	mtxPtr[13] += (mtxPtr[1] * vecPtr[0]) + (mtxPtr[5] * vecPtr[1]) + (mtxPtr[ 9] * vecPtr[2]);
+	mtxPtr[14] += (mtxPtr[2] * vecPtr[0]) + (mtxPtr[6] * vecPtr[1]) + (mtxPtr[10] * vecPtr[2]);
+	mtxPtr[15] += (mtxPtr[3] * vecPtr[0]) + (mtxPtr[7] * vecPtr[1]) + (mtxPtr[11] * vecPtr[2]);
+}
+
+void MatrixScale(float *__restrict mtxPtr, const float *__restrict vecPtr)
+{
+	mtxPtr[ 0] *= vecPtr[0];
+	mtxPtr[ 1] *= vecPtr[0];
+	mtxPtr[ 2] *= vecPtr[0];
+	mtxPtr[ 3] *= vecPtr[0];
+
+	mtxPtr[ 4] *= vecPtr[1];
+	mtxPtr[ 5] *= vecPtr[1];
+	mtxPtr[ 6] *= vecPtr[1];
+	mtxPtr[ 7] *= vecPtr[1];
+
+	mtxPtr[ 8] *= vecPtr[2];
+	mtxPtr[ 9] *= vecPtr[2];
+	mtxPtr[10] *= vecPtr[2];
+	mtxPtr[11] *= vecPtr[2];
+}
+
+void MatrixMultiply(float *__restrict mtxPtrA, const s32 *__restrict mtxPtrB)
+{
+	const CACHE_ALIGN float mtxFloatB[16] = {
+		(float)mtxPtrB[ 0],
+		(float)mtxPtrB[ 1],
+		(float)mtxPtrB[ 2],
+		(float)mtxPtrB[ 3],
+
+		(float)mtxPtrB[ 4],
+		(float)mtxPtrB[ 5],
+		(float)mtxPtrB[ 6],
+		(float)mtxPtrB[ 7],
+
+		(float)mtxPtrB[ 8],
+		(float)mtxPtrB[ 9],
+		(float)mtxPtrB[10],
+		(float)mtxPtrB[11],
+
+		(float)mtxPtrB[12],
+		(float)mtxPtrB[13],
+		(float)mtxPtrB[14],
+		(float)mtxPtrB[15]
+	};
+
+	float tmpMatrix[16];
+
+	tmpMatrix[0] = (mtxPtrA[ 0] * mtxFloatB[ 0]) + (mtxPtrA[ 4] * mtxFloatB[ 1]) + (mtxPtrA[ 8] * mtxFloatB[ 2]) + (mtxPtrA[12] * mtxFloatB[ 3]);
+	tmpMatrix[1] = (mtxPtrA[ 1] * mtxFloatB[ 0]) + (mtxPtrA[ 5] * mtxFloatB[ 1]) + (mtxPtrA[ 9] * mtxFloatB[ 2]) + (mtxPtrA[13] * mtxFloatB[ 3]);
+	tmpMatrix[2] = (mtxPtrA[ 2] * mtxFloatB[ 0]) + (mtxPtrA[ 6] * mtxFloatB[ 1]) + (mtxPtrA[10] * mtxFloatB[ 2]) + (mtxPtrA[14] * mtxFloatB[ 3]);
+	tmpMatrix[3] = (mtxPtrA[ 3] * mtxFloatB[ 0]) + (mtxPtrA[ 7] * mtxFloatB[ 1]) + (mtxPtrA[11] * mtxFloatB[ 2]) + (mtxPtrA[15] * mtxFloatB[ 3]);
+
+	tmpMatrix[4] = (mtxPtrA[ 0] * mtxFloatB[ 4]) + (mtxPtrA[ 4] * mtxFloatB[ 5]) + (mtxPtrA[ 8] * mtxFloatB[ 6]) + (mtxPtrA[12] * mtxFloatB[ 7]);
+	tmpMatrix[5] = (mtxPtrA[ 1] * mtxFloatB[ 4]) + (mtxPtrA[ 5] * mtxFloatB[ 5]) + (mtxPtrA[ 9] * mtxFloatB[ 6]) + (mtxPtrA[13] * mtxFloatB[ 7]);
+	tmpMatrix[6] = (mtxPtrA[ 2] * mtxFloatB[ 4]) + (mtxPtrA[ 6] * mtxFloatB[ 5]) + (mtxPtrA[10] * mtxFloatB[ 6]) + (mtxPtrA[14] * mtxFloatB[ 7]);
+	tmpMatrix[7] = (mtxPtrA[ 3] * mtxFloatB[ 4]) + (mtxPtrA[ 7] * mtxFloatB[ 5]) + (mtxPtrA[11] * mtxFloatB[ 6]) + (mtxPtrA[15] * mtxFloatB[ 7]);
+
+	tmpMatrix[8] = (mtxPtrA[ 0] * mtxFloatB[ 8]) + (mtxPtrA[ 4] * mtxFloatB[ 9]) + (mtxPtrA[ 8] * mtxFloatB[10]) + (mtxPtrA[12] * mtxFloatB[11]);
+	tmpMatrix[9] = (mtxPtrA[ 1] * mtxFloatB[ 8]) + (mtxPtrA[ 5] * mtxFloatB[ 9]) + (mtxPtrA[ 9] * mtxFloatB[10]) + (mtxPtrA[13] * mtxFloatB[11]);
+	tmpMatrix[10] = (mtxPtrA[ 2] * mtxFloatB[ 8]) + (mtxPtrA[ 6] * mtxFloatB[ 9]) + (mtxPtrA[10] * mtxFloatB[10]) + (mtxPtrA[14] * mtxFloatB[11]);
+	tmpMatrix[11] = (mtxPtrA[ 3] * mtxFloatB[ 8]) + (mtxPtrA[ 7] * mtxFloatB[ 9]) + (mtxPtrA[11] * mtxFloatB[10]) + (mtxPtrA[15] * mtxFloatB[11]);
+
+	tmpMatrix[12] = (mtxPtrA[ 0] * mtxFloatB[12]) + (mtxPtrA[ 4] * mtxFloatB[13]) + (mtxPtrA[ 8] * mtxFloatB[14]) + (mtxPtrA[12] * mtxFloatB[15]);
+	tmpMatrix[13] = (mtxPtrA[ 1] * mtxFloatB[12]) + (mtxPtrA[ 5] * mtxFloatB[13]) + (mtxPtrA[ 9] * mtxFloatB[14]) + (mtxPtrA[13] * mtxFloatB[15]);
+	tmpMatrix[14] = (mtxPtrA[ 2] * mtxFloatB[12]) + (mtxPtrA[ 6] * mtxFloatB[13]) + (mtxPtrA[10] * mtxFloatB[14]) + (mtxPtrA[14] * mtxFloatB[15]);
+	tmpMatrix[15] = (mtxPtrA[ 3] * mtxFloatB[12]) + (mtxPtrA[ 7] * mtxFloatB[13]) + (mtxPtrA[11] * mtxFloatB[14]) + (mtxPtrA[15] * mtxFloatB[15]);
+
+	memcpy(mtxPtrA, tmpMatrix, sizeof(float)*16);
+}
+
+template<size_t NUM_ROWS>
+FORCEINLINE void vector_fix2float(float *mtxPtr, const float divisor)
+{
+	for (size_t i = 0; i < NUM_ROWS * 4; i+=4)
+	{
+		mtxPtr[i+0] /= divisor;
+		mtxPtr[i+1] /= divisor;
+		mtxPtr[i+2] /= divisor;
+		mtxPtr[i+3] /= divisor;
+	}
+}
+
+#endif
+
+void MatrixMultVec4x4(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr)
+{
+	const s32 x = vecPtr[0];
+	const s32 y = vecPtr[1];
+	const s32 z = vecPtr[2];
+	const s32 w = vecPtr[3];
+
+	vecPtr[0] = sfx32_shiftdown( fx32_mul(x,mtxPtr[0]) + fx32_mul(y,mtxPtr[4]) + fx32_mul(z,mtxPtr[ 8]) + fx32_mul(w,mtxPtr[12]) );
+	vecPtr[1] = sfx32_shiftdown( fx32_mul(x,mtxPtr[1]) + fx32_mul(y,mtxPtr[5]) + fx32_mul(z,mtxPtr[ 9]) + fx32_mul(w,mtxPtr[13]) );
+	vecPtr[2] = sfx32_shiftdown( fx32_mul(x,mtxPtr[2]) + fx32_mul(y,mtxPtr[6]) + fx32_mul(z,mtxPtr[10]) + fx32_mul(w,mtxPtr[14]) );
+	vecPtr[3] = sfx32_shiftdown( fx32_mul(x,mtxPtr[3]) + fx32_mul(y,mtxPtr[7]) + fx32_mul(z,mtxPtr[11]) + fx32_mul(w,mtxPtr[15]) );
+}
+
+void MatrixMultVec3x3(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr)
+{
+	const s32 x = vecPtr[0];
+	const s32 y = vecPtr[1];
+	const s32 z = vecPtr[2];
+
+	vecPtr[0] = sfx32_shiftdown( fx32_mul(x,mtxPtr[0]) + fx32_mul(y,mtxPtr[4]) + fx32_mul(z,mtxPtr[ 8]) );
+	vecPtr[1] = sfx32_shiftdown( fx32_mul(x,mtxPtr[1]) + fx32_mul(y,mtxPtr[5]) + fx32_mul(z,mtxPtr[ 9]) );
+	vecPtr[2] = sfx32_shiftdown( fx32_mul(x,mtxPtr[2]) + fx32_mul(y,mtxPtr[6]) + fx32_mul(z,mtxPtr[10]) );
+}
+
+void MatrixTranslate(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr)
+{
+	mtxPtr[12] = sfx32_shiftdown( fx32_mul(mtxPtr[0], vecPtr[0]) + fx32_mul(mtxPtr[4], vecPtr[1]) + fx32_mul(mtxPtr[ 8], vecPtr[2]) + fx32_shiftup(mtxPtr[12]) );
+	mtxPtr[13] = sfx32_shiftdown( fx32_mul(mtxPtr[1], vecPtr[0]) + fx32_mul(mtxPtr[5], vecPtr[1]) + fx32_mul(mtxPtr[ 9], vecPtr[2]) + fx32_shiftup(mtxPtr[13]) );
+	mtxPtr[14] = sfx32_shiftdown( fx32_mul(mtxPtr[2], vecPtr[0]) + fx32_mul(mtxPtr[6], vecPtr[1]) + fx32_mul(mtxPtr[10], vecPtr[2]) + fx32_shiftup(mtxPtr[14]) );
+	mtxPtr[15] = sfx32_shiftdown( fx32_mul(mtxPtr[3], vecPtr[0]) + fx32_mul(mtxPtr[7], vecPtr[1]) + fx32_mul(mtxPtr[11], vecPtr[2]) + fx32_shiftup(mtxPtr[15]) );
+}
+
+void MatrixScale(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr)
+{
+	mtxPtr[ 0] = sfx32_shiftdown( fx32_mul(mtxPtr[ 0], vecPtr[0]) );
+	mtxPtr[ 1] = sfx32_shiftdown( fx32_mul(mtxPtr[ 1], vecPtr[0]) );
+	mtxPtr[ 2] = sfx32_shiftdown( fx32_mul(mtxPtr[ 2], vecPtr[0]) );
+	mtxPtr[ 3] = sfx32_shiftdown( fx32_mul(mtxPtr[ 3], vecPtr[0]) );
+
+	mtxPtr[ 4] = sfx32_shiftdown( fx32_mul(mtxPtr[ 4], vecPtr[1]) );
+	mtxPtr[ 5] = sfx32_shiftdown( fx32_mul(mtxPtr[ 5], vecPtr[1]) );
+	mtxPtr[ 6] = sfx32_shiftdown( fx32_mul(mtxPtr[ 6], vecPtr[1]) );
+	mtxPtr[ 7] = sfx32_shiftdown( fx32_mul(mtxPtr[ 7], vecPtr[1]) );
+
+	mtxPtr[ 8] = sfx32_shiftdown( fx32_mul(mtxPtr[ 8], vecPtr[2]) );
+	mtxPtr[ 9] = sfx32_shiftdown( fx32_mul(mtxPtr[ 9], vecPtr[2]) );
+	mtxPtr[10] = sfx32_shiftdown( fx32_mul(mtxPtr[10], vecPtr[2]) );
+	mtxPtr[11] = sfx32_shiftdown( fx32_mul(mtxPtr[11], vecPtr[2]) );
+}
+
+void MatrixMultiply(s32 *__restrict mtxPtrA, const s32 *__restrict mtxPtrB)
 {
 	s32 tmpMatrix[16];
 
-	tmpMatrix[0] = sfx32_shiftdown(fx32_mul(matrix[0],rightMatrix[0])+fx32_mul(matrix[4],rightMatrix[1])+fx32_mul(matrix[8],rightMatrix[2])+fx32_mul(matrix[12],rightMatrix[3]));
-	tmpMatrix[1] = sfx32_shiftdown(fx32_mul(matrix[1],rightMatrix[0])+fx32_mul(matrix[5],rightMatrix[1])+fx32_mul(matrix[9],rightMatrix[2])+fx32_mul(matrix[13],rightMatrix[3]));
-	tmpMatrix[2] = sfx32_shiftdown(fx32_mul(matrix[2],rightMatrix[0])+fx32_mul(matrix[6],rightMatrix[1])+fx32_mul(matrix[10],rightMatrix[2])+fx32_mul(matrix[14],rightMatrix[3]));
-	tmpMatrix[3] = sfx32_shiftdown(fx32_mul(matrix[3],rightMatrix[0])+fx32_mul(matrix[7],rightMatrix[1])+fx32_mul(matrix[11],rightMatrix[2])+fx32_mul(matrix[15],rightMatrix[3]));
-
-	tmpMatrix[4] = sfx32_shiftdown(fx32_mul(matrix[0],rightMatrix[4])+fx32_mul(matrix[4],rightMatrix[5])+fx32_mul(matrix[8],rightMatrix[6])+fx32_mul(matrix[12],rightMatrix[7]));
-	tmpMatrix[5] = sfx32_shiftdown(fx32_mul(matrix[1],rightMatrix[4])+fx32_mul(matrix[5],rightMatrix[5])+fx32_mul(matrix[9],rightMatrix[6])+fx32_mul(matrix[13],rightMatrix[7]));
-	tmpMatrix[6] = sfx32_shiftdown(fx32_mul(matrix[2],rightMatrix[4])+fx32_mul(matrix[6],rightMatrix[5])+fx32_mul(matrix[10],rightMatrix[6])+fx32_mul(matrix[14],rightMatrix[7]));
-	tmpMatrix[7] = sfx32_shiftdown(fx32_mul(matrix[3],rightMatrix[4])+fx32_mul(matrix[7],rightMatrix[5])+fx32_mul(matrix[11],rightMatrix[6])+fx32_mul(matrix[15],rightMatrix[7]));
-
-	tmpMatrix[8] = sfx32_shiftdown(fx32_mul(matrix[0],rightMatrix[8])+fx32_mul(matrix[4],rightMatrix[9])+fx32_mul(matrix[8],rightMatrix[10])+fx32_mul(matrix[12],rightMatrix[11]));
-	tmpMatrix[9] = sfx32_shiftdown(fx32_mul(matrix[1],rightMatrix[8])+fx32_mul(matrix[5],rightMatrix[9])+fx32_mul(matrix[9],rightMatrix[10])+fx32_mul(matrix[13],rightMatrix[11]));
-	tmpMatrix[10] = sfx32_shiftdown(fx32_mul(matrix[2],rightMatrix[8])+fx32_mul(matrix[6],rightMatrix[9])+fx32_mul(matrix[10],rightMatrix[10])+fx32_mul(matrix[14],rightMatrix[11]));
-	tmpMatrix[11] = sfx32_shiftdown(fx32_mul(matrix[3],rightMatrix[8])+fx32_mul(matrix[7],rightMatrix[9])+fx32_mul(matrix[11],rightMatrix[10])+fx32_mul(matrix[15],rightMatrix[11]));
-
-	tmpMatrix[12] = sfx32_shiftdown(fx32_mul(matrix[0],rightMatrix[12])+fx32_mul(matrix[4],rightMatrix[13])+fx32_mul(matrix[8],rightMatrix[14])+fx32_mul(matrix[12],rightMatrix[15]));
-	tmpMatrix[13] = sfx32_shiftdown(fx32_mul(matrix[1],rightMatrix[12])+fx32_mul(matrix[5],rightMatrix[13])+fx32_mul(matrix[9],rightMatrix[14])+fx32_mul(matrix[13],rightMatrix[15]));
-	tmpMatrix[14] = sfx32_shiftdown(fx32_mul(matrix[2],rightMatrix[12])+fx32_mul(matrix[6],rightMatrix[13])+fx32_mul(matrix[10],rightMatrix[14])+fx32_mul(matrix[14],rightMatrix[15]));
-	tmpMatrix[15] = sfx32_shiftdown(fx32_mul(matrix[3],rightMatrix[12])+fx32_mul(matrix[7],rightMatrix[13])+fx32_mul(matrix[11],rightMatrix[14])+fx32_mul(matrix[15],rightMatrix[15]));
-
-	memcpy(matrix,tmpMatrix,sizeof(s32)*16);
-
+	tmpMatrix[ 0] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[ 0])+fx32_mul(mtxPtrA[4],mtxPtrB[ 1])+fx32_mul(mtxPtrA[ 8],mtxPtrB[ 2])+fx32_mul(mtxPtrA[12],mtxPtrB[ 3]) );
+	tmpMatrix[ 1] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[ 0])+fx32_mul(mtxPtrA[5],mtxPtrB[ 1])+fx32_mul(mtxPtrA[ 9],mtxPtrB[ 2])+fx32_mul(mtxPtrA[13],mtxPtrB[ 3]) );
+	tmpMatrix[ 2] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[ 0])+fx32_mul(mtxPtrA[6],mtxPtrB[ 1])+fx32_mul(mtxPtrA[10],mtxPtrB[ 2])+fx32_mul(mtxPtrA[14],mtxPtrB[ 3]) );
+	tmpMatrix[ 3] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[ 0])+fx32_mul(mtxPtrA[7],mtxPtrB[ 1])+fx32_mul(mtxPtrA[11],mtxPtrB[ 2])+fx32_mul(mtxPtrA[15],mtxPtrB[ 3]) );
+
+	tmpMatrix[ 4] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[ 4])+fx32_mul(mtxPtrA[4],mtxPtrB[ 5])+fx32_mul(mtxPtrA[ 8],mtxPtrB[ 6])+fx32_mul(mtxPtrA[12],mtxPtrB[ 7]) );
+	tmpMatrix[ 5] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[ 4])+fx32_mul(mtxPtrA[5],mtxPtrB[ 5])+fx32_mul(mtxPtrA[ 9],mtxPtrB[ 6])+fx32_mul(mtxPtrA[13],mtxPtrB[ 7]) );
+	tmpMatrix[ 6] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[ 4])+fx32_mul(mtxPtrA[6],mtxPtrB[ 5])+fx32_mul(mtxPtrA[10],mtxPtrB[ 6])+fx32_mul(mtxPtrA[14],mtxPtrB[ 7]) );
+	tmpMatrix[ 7] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[ 4])+fx32_mul(mtxPtrA[7],mtxPtrB[ 5])+fx32_mul(mtxPtrA[11],mtxPtrB[ 6])+fx32_mul(mtxPtrA[15],mtxPtrB[ 7]) );
+
+	tmpMatrix[ 8] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[ 8])+fx32_mul(mtxPtrA[4],mtxPtrB[ 9])+fx32_mul(mtxPtrA[ 8],mtxPtrB[10])+fx32_mul(mtxPtrA[12],mtxPtrB[11]) );
+	tmpMatrix[ 9] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[ 8])+fx32_mul(mtxPtrA[5],mtxPtrB[ 9])+fx32_mul(mtxPtrA[ 9],mtxPtrB[10])+fx32_mul(mtxPtrA[13],mtxPtrB[11]) );
+	tmpMatrix[10] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[ 8])+fx32_mul(mtxPtrA[6],mtxPtrB[ 9])+fx32_mul(mtxPtrA[10],mtxPtrB[10])+fx32_mul(mtxPtrA[14],mtxPtrB[11]) );
+	tmpMatrix[11] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[ 8])+fx32_mul(mtxPtrA[7],mtxPtrB[ 9])+fx32_mul(mtxPtrA[11],mtxPtrB[10])+fx32_mul(mtxPtrA[15],mtxPtrB[11]) );
+
+	tmpMatrix[12] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[12])+fx32_mul(mtxPtrA[4],mtxPtrB[13])+fx32_mul(mtxPtrA[ 8],mtxPtrB[14])+fx32_mul(mtxPtrA[12],mtxPtrB[15]) );
+	tmpMatrix[13] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[12])+fx32_mul(mtxPtrA[5],mtxPtrB[13])+fx32_mul(mtxPtrA[ 9],mtxPtrB[14])+fx32_mul(mtxPtrA[13],mtxPtrB[15]) );
+	tmpMatrix[14] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[12])+fx32_mul(mtxPtrA[6],mtxPtrB[13])+fx32_mul(mtxPtrA[10],mtxPtrB[14])+fx32_mul(mtxPtrA[14],mtxPtrB[15]) );
+	tmpMatrix[15] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[12])+fx32_mul(mtxPtrA[7],mtxPtrB[13])+fx32_mul(mtxPtrA[11],mtxPtrB[14])+fx32_mul(mtxPtrA[15],mtxPtrB[15]) );
+
+	memcpy(mtxPtrA, tmpMatrix, sizeof(s32)*16);
 }
 
-void MatrixScale(s32 *matrix, const s32 *ptr)
-{
-	//zero 21-sep-2010 - verified unrolling seems faster on my cpu
-	MACRODO_N(12,
-		matrix[X] = sfx32_shiftdown(fx32_mul(matrix[X],ptr[X>>2]))
-	);
-}
-
-void MatrixTranslate(s32 *matrix, const s32 *ptr)
-{
-	MACRODO_N(4,
-	{
-		s64 temp = fx32_shiftup(matrix[X+12]);
-		temp += fx32_mul(matrix[X+0],ptr[0]);
-		temp += fx32_mul(matrix[X+4],ptr[1]);
-		temp += fx32_mul(matrix[X+8],ptr[2]);
-		matrix[X+12] = sfx32_shiftdown(temp);
-	});
-}
 
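Reviewer note: every SSE path above follows one pattern: load four matrix entries at a time (each _mm_load_ps picks up one column of the column-major matrix), rescale by 1/4096, splat each vector component with _mm_shuffle_ps (immediates 0x00/0x55/0xAA/0xFF select lanes 0-3), multiply, and sum. A self-contained illustration of that pattern for a plain float matrix; this is illustrative only, not code from the commit, and m/v must be 16-byte aligned:

	#include <xmmintrin.h>

	// v' = M * v for a column-major 4x4 float matrix m[16]
	static inline __m128 matvec4(const float *m, __m128 v)
	{
		const __m128 x = _mm_shuffle_ps(v, v, 0x00); // splat v.x
		const __m128 y = _mm_shuffle_ps(v, v, 0x55); // splat v.y
		const __m128 z = _mm_shuffle_ps(v, v, 0xAA); // splat v.z
		const __m128 w = _mm_shuffle_ps(v, v, 0xFF); // splat v.w
		return _mm_add_ps(
			_mm_add_ps(_mm_mul_ps(_mm_load_ps(m + 0), x),
			           _mm_mul_ps(_mm_load_ps(m + 4), y)),
			_mm_add_ps(_mm_mul_ps(_mm_load_ps(m + 8), z),
			           _mm_mul_ps(_mm_load_ps(m + 12), w)));
	}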
@@ -1,6 +1,6 @@
 /*
 	Copyright (C) 2006-2007 shash
-	Copyright (C) 2007-2017 DeSmuME team
+	Copyright (C) 2007-2018 DeSmuME team
 
 	This file is free software: you can redistribute it and/or modify
 	it under the terms of the GNU General Public License as published by
@@ -77,7 +77,24 @@ void Vector3Scale(float *dst, const float scale);
 void Vector3Copy(float *dst, const float *src);
 void Vector3Normalize(float *dst);
 
-void Vector4Copy(float *dst, const float *src);
+void Vector4Copy(float *dst, const float *src);
+
+
+void _MatrixMultVec4x4_NoSIMD(const s32 *__restrict mtxPtr, float *__restrict vecPtr);
+
+void MatrixMultVec4x4(const s32 *__restrict mtxPtr, float *__restrict vecPtr);
+void MatrixMultVec3x3(const s32 *__restrict mtxPtr, float *__restrict vecPtr);
+void MatrixTranslate(float *__restrict mtxPtr, const float *__restrict vecPtr);
+void MatrixScale(float *__restrict mtxPtr, const float *__restrict vecPtr);
+void MatrixMultiply(float *__restrict mtxPtrA, const s32 *__restrict mtxPtrB);
+
+template<size_t NUM_ROWS> FORCEINLINE void vector_fix2float(float *mtxPtr, const float divisor);
+
+void MatrixMultVec4x4(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr);
+void MatrixMultVec3x3(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr);
+void MatrixTranslate(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr);
+void MatrixScale(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr);
+void MatrixMultiply(s32 *__restrict mtxPtrA, const s32 *__restrict mtxPtrB);
 
 //these functions are an unreliable, inaccurate floor.
 //it should only be used for positive numbers
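Reviewer note: the replacement API leans on C++ overloading: the same names are declared for float vectors and for s32 vectors, so the argument type selects the float or fixed-point path. A hypothetical call site showing the dispatch, assuming the 20.12 convention (4096 == 1.0):

	CACHE_ALIGN s32   vFixed[4] = { 4096, 0, 0, 4096 };          // (1, 0, 0, 1) in 20.12
	CACHE_ALIGN float vFloat[4] = { 1.0f, 0.0f, 0.0f, 1.0f };

	MatrixMultVec4x4(mtxCurrent[MATRIXMODE_POSITION], vFixed);   // s32 overload
	MatrixMultVec4x4(mtxCurrent[MATRIXMODE_POSITION], vFloat);   // float overload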
@@ -296,151 +313,4 @@ static void memset_u32_fast(void *dst, const u32 val)
 
 #endif // SIMD Functions
 
-// NOSSE version always used in gfx3d.cpp
-void _NOSSE_MatrixMultVec4x4 (const float *matrix, float *vecPtr);
-void MatrixMultVec3x3_fixed(const s32 *matrix, s32 *vecPtr);
-
-//---------------------------
-//switched SSE functions
-#ifdef ENABLE_SSE
-
-struct SSE_MATRIX
-{
-	SSE_MATRIX(const float *matrix)
-		: row0(_mm_load_ps(matrix))
-		, row1(_mm_load_ps(matrix+4))
-		, row2(_mm_load_ps(matrix+8))
-		, row3(_mm_load_ps(matrix+12))
-	{}
-
-	union {
-		__m128 rows[4];
-		struct { __m128 row0; __m128 row1; __m128 row2; __m128 row3; };
-	};
-};
-
-FORCEINLINE __m128 _util_MatrixMultVec4x4_(const SSE_MATRIX &mat, __m128 vec)
-{
-	__m128 xmm5 = _mm_shuffle_ps(vec, vec, B8(01010101));
-	__m128 xmm6 = _mm_shuffle_ps(vec, vec, B8(10101010));
-	__m128 xmm7 = _mm_shuffle_ps(vec, vec, B8(11111111));
-	__m128 xmm4 = _mm_shuffle_ps(vec, vec, B8(00000000));
-
-	xmm4 = _mm_mul_ps(xmm4,mat.row0);
-	xmm5 = _mm_mul_ps(xmm5,mat.row1);
-	xmm6 = _mm_mul_ps(xmm6,mat.row2);
-	xmm7 = _mm_mul_ps(xmm7,mat.row3);
-	xmm4 = _mm_add_ps(xmm4,xmm5);
-	xmm4 = _mm_add_ps(xmm4,xmm6);
-	xmm4 = _mm_add_ps(xmm4,xmm7);
-	return xmm4;
-}
-
-FORCEINLINE void MatrixMultiply(float * matrix, const float * rightMatrix)
-{
-	//this seems to generate larger code, including many movaps, but maybe it is less harsh on the registers than the
-	//more hand-tailored approach
-	__m128 row0 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix));
-	__m128 row1 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix+4));
-	__m128 row2 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix+8));
-	__m128 row3 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix+12));
-	_mm_store_ps(matrix,row0);
-	_mm_store_ps(matrix+4,row1);
-	_mm_store_ps(matrix+8,row2);
-	_mm_store_ps(matrix+12,row3);
-}
-
-FORCEINLINE void MatrixMultVec4x4(const float *matrix, float *vecPtr)
-{
-	_mm_store_ps(vecPtr,_util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(vecPtr)));
-}
-
-FORCEINLINE void MatrixMultVec3x3(const float * matrix, float * vecPtr)
-{
-	const __m128 vec = _mm_load_ps(vecPtr);
-
-	__m128 xmm5 = _mm_shuffle_ps(vec, vec, B8(01010101));
-	__m128 xmm6 = _mm_shuffle_ps(vec, vec, B8(10101010));
-	__m128 xmm4 = _mm_shuffle_ps(vec, vec, B8(00000000));
-
-	const SSE_MATRIX mat(matrix);
-
-	xmm4 = _mm_mul_ps(xmm4,mat.row0);
-	xmm5 = _mm_mul_ps(xmm5,mat.row1);
-	xmm6 = _mm_mul_ps(xmm6,mat.row2);
-	xmm4 = _mm_add_ps(xmm4,xmm5);
-	xmm4 = _mm_add_ps(xmm4,xmm6);
-
-	_mm_store_ps(vecPtr,xmm4);
-}
-
-FORCEINLINE void MatrixTranslate(float *matrix, const float *ptr)
-{
-	__m128 xmm4 = _mm_load_ps(ptr);
-	__m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101));
-	__m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010));
-	xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000));
-
-	xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(matrix));
-	xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(matrix+4));
-	xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(matrix+8));
-	xmm4 = _mm_add_ps(xmm4,xmm5);
-	xmm4 = _mm_add_ps(xmm4,xmm6);
-	xmm4 = _mm_add_ps(xmm4,_mm_load_ps(matrix+12));
-	_mm_store_ps(matrix+12,xmm4);
-}
-
-FORCEINLINE void MatrixScale(float *matrix, const float *ptr)
-{
-	__m128 xmm4 = _mm_load_ps(ptr);
-	__m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101));
-	__m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010));
-	xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000));
-
-	xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(matrix));
-	xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(matrix+4));
-	xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(matrix+8));
-	_mm_store_ps(matrix,xmm4);
-	_mm_store_ps(matrix+4,xmm5);
-	_mm_store_ps(matrix+8,xmm6);
-}
-
-template<int NUM_ROWS>
-FORCEINLINE void vector_fix2float(float* matrix, const float divisor)
-{
-	CTASSERT(NUM_ROWS==3 || NUM_ROWS==4);
-
-	const __m128 val = _mm_set_ps1(divisor);
-
-	_mm_store_ps(matrix,_mm_div_ps(_mm_load_ps(matrix),val));
-	_mm_store_ps(matrix+4,_mm_div_ps(_mm_load_ps(matrix+4),val));
-	_mm_store_ps(matrix+8,_mm_div_ps(_mm_load_ps(matrix+8),val));
-	if(NUM_ROWS==4)
-		_mm_store_ps(matrix+12,_mm_div_ps(_mm_load_ps(matrix+12),val));
-}
-
-#else //no sse
-
-void MatrixMultVec4x4 (const float *matrix, float *vecPtr);
-void MatrixMultVec3x3(const float * matrix, float * vecPtr);
-void MatrixMultiply(float * matrix, const float * rightMatrix);
-void MatrixTranslate(float *matrix, const float *ptr);
-void MatrixScale(float * matrix, const float * ptr);
-
-template<int NUM_ROWS>
-FORCEINLINE void vector_fix2float(float* matrix, const float divisor)
-{
-	for(int i=0;i<NUM_ROWS*4;i++)
-		matrix[i] /= divisor;
-}
-
-#endif //switched SSE functions
-
-void MatrixMultVec4x4 (const s32 *matrix, s32 *vecPtr);
-
-void MatrixMultiply(s32* matrix, const s32* rightMatrix);
-void MatrixScale(s32 *matrix, const s32 *ptr);
-void MatrixTranslate(s32 *matrix, const s32 *ptr);
-
 #endif // MATRIX_H