matrix.cpp: Do a bunch of code cleanup.

rogerman 2018-02-16 11:59:19 -08:00
parent c41a006b2a
commit 249afccfca
3 changed files with 546 additions and 354 deletions

View File

@@ -609,11 +609,11 @@ void gfx3d_reset()
memset(colorRGB, 0, sizeof(colorRGB));
memset(&tempVertInfo, 0, sizeof(tempVertInfo));
MatrixInit (mtxCurrent[0]);
MatrixInit (mtxCurrent[1]);
MatrixInit (mtxCurrent[2]);
MatrixInit (mtxCurrent[3]);
MatrixInit (mtxTemporal);
MatrixInit(mtxCurrent[MATRIXMODE_PROJECTION]);
MatrixInit(mtxCurrent[MATRIXMODE_POSITION]);
MatrixInit(mtxCurrent[MATRIXMODE_POSITION_VECTOR]);
MatrixInit(mtxCurrent[MATRIXMODE_TEXTURE]);
MatrixInit(mtxTemporal);
MatrixStackInit(&mtxStack[0]);
MatrixStackInit(&mtxStack[1]);
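The heart of this cleanup is replacing the bare indices 0-3 into mtxCurrent[] with named MATRIXMODE_* constants. A minimal sketch of the mapping implied by the one-to-one replacements in this diff (the actual enum is declared elsewhere in the codebase):

//Mapping implied by this diff's index -> name replacements; illustration only.
enum MatrixMode
{
	MATRIXMODE_PROJECTION      = 0, //was mtxCurrent[0]
	MATRIXMODE_POSITION        = 1, //was mtxCurrent[1] (modelview)
	MATRIXMODE_POSITION_VECTOR = 2, //was mtxCurrent[2] (directional/vector)
	MATRIXMODE_TEXTURE         = 3  //was mtxCurrent[3]
};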
@@ -727,13 +727,13 @@ static void SetVertex()
if (texCoordTransformMode == TextureTransformationMode_VertexSource)
{
//Tested by: Eledees The Adventures of Kai and Zero (E) [title screen and frontend menus]
last_s = (s32)(((s64)s16coord[0] * mtxCurrent[3][0] +
(s64)s16coord[1] * mtxCurrent[3][4] +
(s64)s16coord[2] * mtxCurrent[3][8] +
last_s = (s32)(((s64)s16coord[0] * mtxCurrent[MATRIXMODE_TEXTURE][0] +
(s64)s16coord[1] * mtxCurrent[MATRIXMODE_TEXTURE][4] +
(s64)s16coord[2] * mtxCurrent[MATRIXMODE_TEXTURE][8] +
(((s64)(_s))<<24))>>24);
last_t = (s32)(((s64)s16coord[0] * mtxCurrent[3][1] +
(s64)s16coord[1] * mtxCurrent[3][5] +
(s64)s16coord[2] * mtxCurrent[3][9] +
last_t = (s32)(((s64)s16coord[0] * mtxCurrent[MATRIXMODE_TEXTURE][1] +
(s64)s16coord[1] * mtxCurrent[MATRIXMODE_TEXTURE][5] +
(s64)s16coord[2] * mtxCurrent[MATRIXMODE_TEXTURE][9] +
(((s64)(_t))<<24))>>24);
}
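A hedged sketch of the fixed-point pattern in this hunk, with hypothetical names (mtxCol stands for one gathered column of the texture matrix, e.g. entries 0/4/8): vertex coordinates and matrix entries both carry 12 fractional bits, so each product carries 24; the raw texcoord is promoted to the same scale before the 64-bit sum is shifted back down.

//Illustration only, not DeSmuME API.
static s32 texgen_axis(const s16 coord[3], const s32 mtxCol[3], const s32 texcoord)
{
	s64 sum = ((s64)texcoord) << 24;      //promote texcoord to .24
	for (int i = 0; i < 3; i++)
		sum += (s64)coord[i] * mtxCol[i]; //.12 * .12 = .24 products
	return (s32)(sum >> 24);              //back to the texcoord's scale
}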
@@ -744,8 +744,8 @@ static void SetVertex()
if(polylist->count >= POLYLIST_SIZE)
return;
GEM_TransformVertex(mtxCurrent[1],coordTransformed); //modelview
GEM_TransformVertex(mtxCurrent[0],coordTransformed); //projection
GEM_TransformVertex(mtxCurrent[MATRIXMODE_POSITION], coordTransformed); //modelview
GEM_TransformVertex(mtxCurrent[MATRIXMODE_PROJECTION], coordTransformed); //projection
//TODO - culling should be done here.
//TODO - viewport transform?
@@ -930,7 +930,7 @@ static void gfx3d_glLightDirection_cache(const size_t index)
cacheLightDirection[index][3] = 0;
//Multiply the vector by the directional matrix
MatrixMultVec3x3_fixed(mtxCurrent[2], cacheLightDirection[index]);
MatrixMultVec3x3(mtxCurrent[MATRIXMODE_POSITION_VECTOR], cacheLightDirection[index]);
//Calculate the half angle vector
s32 lineOfSight[4] = {0, 0, (-1)<<12, 0};
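For context, the textbook half-angle vector is h = (L + V) / |L + V|, with V the line of sight set up above as (0, 0, -1) in 20.12 fixed point. A hedged float sketch of that construction (the emulator's actual fixed-point computation follows in code outside this hunk and may approximate the normalization):

#include <cmath>

//Hypothetical illustration of the Blinn half-angle construction.
static void halfAngleVector(const float L[3], float h[3])
{
	const float V[3] = { 0.0f, 0.0f, -1.0f }; //matches (-1)<<12 above
	const float s[3] = { L[0]+V[0], L[1]+V[1], L[2]+V[2] };
	const float len = std::sqrt(s[0]*s[0] + s[1]*s[1] + s[2]*s[2]);
	for (int i = 0; i < 3; i++)
		h[i] = (len > 0.0f) ? s[i]/len : 0.0f;
}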
@@ -1092,7 +1092,7 @@ static void gfx3d_glLoadIdentity()
GFX_DELAY(19);
if (mode == MATRIXMODE_POSITION_VECTOR)
MatrixIdentity(mtxCurrent[1]);
MatrixIdentity(mtxCurrent[MATRIXMODE_POSITION]);
//printf("identity: %d to: \n",mode); MatrixPrint(mtxCurrent[1]);
}
@@ -1110,7 +1110,7 @@ static BOOL gfx3d_glLoadMatrix4x4(s32 v)
//vector_fix2float<4>(mtxCurrent[mode], 4096.f);
if (mode == MATRIXMODE_POSITION_VECTOR)
MatrixCopy(mtxCurrent[1], mtxCurrent[2]);
MatrixCopy(mtxCurrent[MATRIXMODE_POSITION], mtxCurrent[MATRIXMODE_POSITION_VECTOR]);
//printf("load4x4: matrix %d to: \n",mode); MatrixPrint(mtxCurrent[1]);
return TRUE;
@@ -1134,7 +1134,7 @@ static BOOL gfx3d_glLoadMatrix4x3(s32 v)
GFX_DELAY(30);
if (mode == MATRIXMODE_POSITION_VECTOR)
MatrixCopy(mtxCurrent[1], mtxCurrent[2]);
MatrixCopy(mtxCurrent[MATRIXMODE_POSITION], mtxCurrent[MATRIXMODE_POSITION_VECTOR]);
//printf("load4x3: matrix %d to: \n",mode); MatrixPrint(mtxCurrent[1]);
return TRUE;
}
@@ -1155,7 +1155,7 @@ static BOOL gfx3d_glMultMatrix4x4(s32 v)
if (mode == MATRIXMODE_POSITION_VECTOR)
{
MatrixMultiply(mtxCurrent[1], mtxTemporal);
MatrixMultiply(mtxCurrent[MATRIXMODE_POSITION], mtxTemporal);
GFX_DELAY_M2(30);
}
@@ -1186,7 +1186,7 @@ static BOOL gfx3d_glMultMatrix4x3(s32 v)
if (mode == MATRIXMODE_POSITION_VECTOR)
{
MatrixMultiply (mtxCurrent[1], mtxTemporal);
MatrixMultiply (mtxCurrent[MATRIXMODE_POSITION], mtxTemporal);
GFX_DELAY_M2(30);
}
@@ -1219,7 +1219,7 @@ static BOOL gfx3d_glMultMatrix3x3(s32 v)
if (mode == MATRIXMODE_POSITION_VECTOR)
{
MatrixMultiply(mtxCurrent[1], mtxTemporal);
MatrixMultiply(mtxCurrent[MATRIXMODE_POSITION], mtxTemporal);
GFX_DELAY_M2(30);
}
@@ -1248,8 +1248,8 @@ static BOOL gfx3d_glScale(s32 v)
//note: pos-vector mode should not cause both matrices to scale.
//the whole purpose is to keep the vector matrix orthogonal
//so, I am leaving this commented out as an example of what not to do.
//if (mode == 2)
// MatrixScale (mtxCurrent[1], scale);
//if (mode == MATRIXMODE_POSITION_VECTOR)
// MatrixScale (mtxCurrent[MATRIXMODE_POSITION], scale);
return TRUE;
}
@@ -1268,7 +1268,7 @@ static BOOL gfx3d_glTranslate(s32 v)
if (mode == MATRIXMODE_POSITION_VECTOR)
{
MatrixTranslate(mtxCurrent[1], trans);
MatrixTranslate(mtxCurrent[MATRIXMODE_POSITION], trans);
GFX_DELAY_M2(30);
}
@@ -1297,11 +1297,11 @@ static void gfx3d_glNormal(s32 v)
{
//SM64 highlight rendered star in main menu tests this
//also smackdown 2010 player textures tested this (needed cast on _s and _t)
last_s = (s32)(((s64)normal[0] * mtxCurrent[3][0] + (s64)normal[1] * mtxCurrent[3][4] + (s64)normal[2] * mtxCurrent[3][8] + (((s64)_s)<<24))>>24);
last_t = (s32)(((s64)normal[0] * mtxCurrent[3][1] + (s64)normal[1] * mtxCurrent[3][5] + (s64)normal[2] * mtxCurrent[3][9] + (((s64)_t)<<24))>>24);
last_s = (s32)(((s64)normal[0] * mtxCurrent[MATRIXMODE_TEXTURE][0] + (s64)normal[1] * mtxCurrent[MATRIXMODE_TEXTURE][4] + (s64)normal[2] * mtxCurrent[MATRIXMODE_TEXTURE][8] + (((s64)_s)<<24))>>24);
last_t = (s32)(((s64)normal[0] * mtxCurrent[MATRIXMODE_TEXTURE][1] + (s64)normal[1] * mtxCurrent[MATRIXMODE_TEXTURE][5] + (s64)normal[2] * mtxCurrent[MATRIXMODE_TEXTURE][9] + (((s64)_t)<<24))>>24);
}
MatrixMultVec3x3_fixed(mtxCurrent[2],normal);
MatrixMultVec3x3(mtxCurrent[MATRIXMODE_POSITION_VECTOR], normal);
//apply lighting model
u8 diffuse[3] = {
@@ -1395,8 +1395,8 @@ static void gfx3d_glTexCoord(s32 val)
if (texCoordTransformMode == TextureTransformationMode_TexCoordSource)
{
//dragon quest 4 overworld will test this
last_s = (s32) (( (s64)_s * mtxCurrent[3][0] + (s64)_t * mtxCurrent[3][4] + (s64)mtxCurrent[3][8] + (s64)mtxCurrent[3][12])>>12);
last_t = (s32) (( (s64)_s * mtxCurrent[3][1] + (s64)_t * mtxCurrent[3][5] + (s64)mtxCurrent[3][9] + (s64)mtxCurrent[3][13])>>12);
last_s = (s32) (( (s64)_s * mtxCurrent[MATRIXMODE_TEXTURE][0] + (s64)_t * mtxCurrent[MATRIXMODE_TEXTURE][4] + (s64)mtxCurrent[MATRIXMODE_TEXTURE][8] + (s64)mtxCurrent[MATRIXMODE_TEXTURE][12])>>12);
last_t = (s32) (( (s64)_s * mtxCurrent[MATRIXMODE_TEXTURE][1] + (s64)_t * mtxCurrent[MATRIXMODE_TEXTURE][5] + (s64)mtxCurrent[MATRIXMODE_TEXTURE][9] + (s64)mtxCurrent[MATRIXMODE_TEXTURE][13])>>12);
}
else if (texCoordTransformMode == TextureTransformationMode_None)
{
@@ -1684,13 +1684,13 @@ static BOOL gfx3d_glBoxTest(u32 v)
//MatrixMultVec4x4_M2(mtxCurrent[0], verts[i].coord);
//but change it all to floating point and do it that way instead
CACHE_ALIGN float temp1[16] = {mtxCurrent[1][0]/4096.0f,mtxCurrent[1][1]/4096.0f,mtxCurrent[1][2]/4096.0f,mtxCurrent[1][3]/4096.0f,mtxCurrent[1][4]/4096.0f,mtxCurrent[1][5]/4096.0f,mtxCurrent[1][6]/4096.0f,mtxCurrent[1][7]/4096.0f,mtxCurrent[1][8]/4096.0f,mtxCurrent[1][9]/4096.0f,mtxCurrent[1][10]/4096.0f,mtxCurrent[1][11]/4096.0f,mtxCurrent[1][12]/4096.0f,mtxCurrent[1][13]/4096.0f,mtxCurrent[1][14]/4096.0f,mtxCurrent[1][15]/4096.0f};
CACHE_ALIGN float temp0[16] = {mtxCurrent[0][0]/4096.0f,mtxCurrent[0][1]/4096.0f,mtxCurrent[0][2]/4096.0f,mtxCurrent[0][3]/4096.0f,mtxCurrent[0][4]/4096.0f,mtxCurrent[0][5]/4096.0f,mtxCurrent[0][6]/4096.0f,mtxCurrent[0][7]/4096.0f,mtxCurrent[0][8]/4096.0f,mtxCurrent[0][9]/4096.0f,mtxCurrent[0][10]/4096.0f,mtxCurrent[0][11]/4096.0f,mtxCurrent[0][12]/4096.0f,mtxCurrent[0][13]/4096.0f,mtxCurrent[0][14]/4096.0f,mtxCurrent[0][15]/4096.0f};
//DS_ALIGN(16) VERT_POS4f vert = { verts[i].x, verts[i].y, verts[i].z, verts[i].w };
_NOSSE_MatrixMultVec4x4(temp1,verts[i].coord);
_NOSSE_MatrixMultVec4x4(temp0,verts[i].coord);
//_MatrixMultVec4x4_NoSIMD(mtxCurrent[MATRIXMODE_POSITION], verts[i].coord);
//_MatrixMultVec4x4_NoSIMD(mtxCurrent[MATRIXMODE_PROJECTION], verts[i].coord);
MatrixMultVec4x4(mtxCurrent[MATRIXMODE_POSITION], verts[i].coord);
MatrixMultVec4x4(mtxCurrent[MATRIXMODE_PROJECTION], verts[i].coord);
}
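The deleted temp0/temp1 arrays above were hand-converting every 20.12 matrix entry to float; the cleanup moves that scaling inside the new MatrixMultVec4x4 overload, which takes the s32 matrix directly. What each deleted element computed, as a one-liner:

//The per-element conversion the deleted temp arrays performed.
static inline float fx12_to_float(const s32 fx) { return (float)fx / 4096.0f; } //4096 = 2^12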
//clip each poly
@@ -1742,12 +1742,9 @@ static BOOL gfx3d_glPosTest(u32 v)
PTind = 0;
PTcoords[3] = 1.0f;
CACHE_ALIGN float temp1[16] = {mtxCurrent[1][0]/4096.0f,mtxCurrent[1][1]/4096.0f,mtxCurrent[1][2]/4096.0f,mtxCurrent[1][3]/4096.0f,mtxCurrent[1][4]/4096.0f,mtxCurrent[1][5]/4096.0f,mtxCurrent[1][6]/4096.0f,mtxCurrent[1][7]/4096.0f,mtxCurrent[1][8]/4096.0f,mtxCurrent[1][9]/4096.0f,mtxCurrent[1][10]/4096.0f,mtxCurrent[1][11]/4096.0f,mtxCurrent[1][12]/4096.0f,mtxCurrent[1][13]/4096.0f,mtxCurrent[1][14]/4096.0f,mtxCurrent[1][15]/4096.0f};
CACHE_ALIGN float temp0[16] = {mtxCurrent[0][0]/4096.0f,mtxCurrent[0][1]/4096.0f,mtxCurrent[0][2]/4096.0f,mtxCurrent[0][3]/4096.0f,mtxCurrent[0][4]/4096.0f,mtxCurrent[0][5]/4096.0f,mtxCurrent[0][6]/4096.0f,mtxCurrent[0][7]/4096.0f,mtxCurrent[0][8]/4096.0f,mtxCurrent[0][9]/4096.0f,mtxCurrent[0][10]/4096.0f,mtxCurrent[0][11]/4096.0f,mtxCurrent[0][12]/4096.0f,mtxCurrent[0][13]/4096.0f,mtxCurrent[0][14]/4096.0f,mtxCurrent[0][15]/4096.0f};
MatrixMultVec4x4(temp1, PTcoords);
MatrixMultVec4x4(temp0, PTcoords);
MatrixMultVec4x4(mtxCurrent[MATRIXMODE_POSITION], PTcoords);
MatrixMultVec4x4(mtxCurrent[MATRIXMODE_PROJECTION], PTcoords);
MMU_new.gxstat.tb = 0;
@@ -1765,13 +1762,14 @@ static void gfx3d_glVecTest(u32 v)
//i am not sure exactly what it is doing, maybe it is testing to ensure
//that the normal vector for the point of interest is camera-facing.
CACHE_ALIGN float normal[4] = { normalTable[v&1023],
normalTable[(v>>10)&1023],
normalTable[(v>>20)&1023],
0};
CACHE_ALIGN float temp[16] = {mtxCurrent[2][0]/4096.0f,mtxCurrent[2][1]/4096.0f,mtxCurrent[2][2]/4096.0f,mtxCurrent[2][3]/4096.0f,mtxCurrent[2][4]/4096.0f,mtxCurrent[2][5]/4096.0f,mtxCurrent[2][6]/4096.0f,mtxCurrent[2][7]/4096.0f,mtxCurrent[2][8]/4096.0f,mtxCurrent[2][9]/4096.0f,mtxCurrent[2][10]/4096.0f,mtxCurrent[2][11]/4096.0f,mtxCurrent[2][12]/4096.0f,mtxCurrent[2][13]/4096.0f,mtxCurrent[2][14]/4096.0f,mtxCurrent[2][15]/4096.0f};
MatrixMultVec4x4(temp, normal);
CACHE_ALIGN float normal[4] = {
normalTable[v&1023],
normalTable[(v>>10)&1023],
normalTable[(v>>20)&1023],
0
};
MatrixMultVec4x4(mtxCurrent[MATRIXMODE_POSITION_VECTOR], normal);
s16 x = (s16)(normal[0]*4096);
s16 y = (s16)(normal[1]*4096);
@@ -1853,7 +1851,7 @@ void gfx3d_UpdateToonTable(u8 offset, u32 val)
s32 gfx3d_GetClipMatrix(const u32 index)
{
//printf("reading clip matrix: %d\n",index);
return (s32)MatrixGetMultipliedIndex(index, mtxCurrent[0], mtxCurrent[1]);
return (s32)MatrixGetMultipliedIndex(index, mtxCurrent[MATRIXMODE_PROJECTION], mtxCurrent[MATRIXMODE_POSITION]);
}
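MatrixGetMultipliedIndex evaluates a single element of the clip matrix (projection times position) without forming the full product. A hedged sketch of the assumed behavior, using plain 64-bit math in place of the fx32 helpers:

//Illustration only; the real helper presumably uses fx32_mul/sfx32_shiftdown.
static s32 clipMatrixElement(const u32 index, const s32 *mtxA, const s32 *mtxB)
{
	const u32 row = index & 3;  //index % 4 (column-major storage)
	const u32 col = index >> 2; //index / 4
	s64 sum = 0;
	for (u32 i = 0; i < 4; i++)
		sum += (s64)mtxA[(i << 2) + row] * (s64)mtxB[(col << 2) + i];
	return (s32)(sum >> 12);    //one 12-bit fraction shifted back out
}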
s32 gfx3d_GetDirectionalMatrix(const u32 index)
@@ -1861,7 +1859,7 @@ s32 gfx3d_GetDirectionalMatrix(const u32 index)
const size_t _index = (((index / 3) * 4) + (index % 3));
//return (s32)(mtxCurrent[2][_index]*(1<<12));
return mtxCurrent[2][_index];
return mtxCurrent[MATRIXMODE_POSITION_VECTOR][_index];
}
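Worked example of the ((index / 3) * 4) + (index % 3) remap above, which reads the nine 3x3 directional-matrix entries out of 4x4 column-major storage by skipping the fourth entry of each column:

//  index:  0 1 2 | 3 4 5 | 6 7 8
//  _index: 0 1 2 | 4 5 6 | 8 9 10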
void gfx3d_glAlphaFunc(u32 v)

View File

@@ -1,6 +1,6 @@
/*
Copyright (C) 2006-2007 shash
Copyright (C) 2007-2017 DeSmuME team
Copyright (C) 2007-2018 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -24,118 +24,6 @@
#include "matrix.h"
#include "MMU.h"
void _NOSSE_MatrixMultVec4x4 (const float *matrix, float *vecPtr)
{
float x = vecPtr[0];
float y = vecPtr[1];
float z = vecPtr[2];
float w = vecPtr[3];
vecPtr[0] = x * matrix[0] + y * matrix[4] + z * matrix[ 8] + w * matrix[12];
vecPtr[1] = x * matrix[1] + y * matrix[5] + z * matrix[ 9] + w * matrix[13];
vecPtr[2] = x * matrix[2] + y * matrix[6] + z * matrix[10] + w * matrix[14];
vecPtr[3] = x * matrix[3] + y * matrix[7] + z * matrix[11] + w * matrix[15];
}
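The indexing in this (deleted) reference implementation documents the storage convention used throughout the file: matrices are column-major, so element (row r, column c) lives at matrix[c*4 + r], and entries 12-14 hold the translation column. A hypothetical accessor to make that explicit:

//Illustration only: column-major element access.
static inline float mtxElement(const float *matrix, const int row, const int col)
{
	return matrix[(col << 2) + row]; //col*4 + row
}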
void MatrixMultVec4x4 (const s32 *matrix, s32 *vecPtr)
{
const s32 x = vecPtr[0];
const s32 y = vecPtr[1];
const s32 z = vecPtr[2];
const s32 w = vecPtr[3];
vecPtr[0] = sfx32_shiftdown(fx32_mul(x,matrix[0]) + fx32_mul(y,matrix[4]) + fx32_mul(z,matrix [8]) + fx32_mul(w,matrix[12]));
vecPtr[1] = sfx32_shiftdown(fx32_mul(x,matrix[1]) + fx32_mul(y,matrix[5]) + fx32_mul(z,matrix[ 9]) + fx32_mul(w,matrix[13]));
vecPtr[2] = sfx32_shiftdown(fx32_mul(x,matrix[2]) + fx32_mul(y,matrix[6]) + fx32_mul(z,matrix[10]) + fx32_mul(w,matrix[14]));
vecPtr[3] = sfx32_shiftdown(fx32_mul(x,matrix[3]) + fx32_mul(y,matrix[7]) + fx32_mul(z,matrix[11]) + fx32_mul(w,matrix[15]));
}
void MatrixMultVec3x3_fixed(const s32 *matrix, s32 *vecPtr)
{
const s32 x = vecPtr[0];
const s32 y = vecPtr[1];
const s32 z = vecPtr[2];
vecPtr[0] = sfx32_shiftdown(fx32_mul(x,matrix[0]) + fx32_mul(y,matrix[4]) + fx32_mul(z,matrix[8]));
vecPtr[1] = sfx32_shiftdown(fx32_mul(x,matrix[1]) + fx32_mul(y,matrix[5]) + fx32_mul(z,matrix[9]));
vecPtr[2] = sfx32_shiftdown(fx32_mul(x,matrix[2]) + fx32_mul(y,matrix[6]) + fx32_mul(z,matrix[10]));
}
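A hedged sketch of the fixed-point helpers these functions lean on, assuming 20.12 fixed-point matrices: fx32_mul widens to 64 bits so a .12 x .12 = .24 product cannot overflow, fx32_shiftup promotes a .12 value to the .24 accumulator scale, and sfx32_shiftdown brings a .24 sum back down to .12 (the real sfx32_shiftdown may also saturate):

//Assumed semantics, for illustration; not the emulator's exact definitions.
static inline s64 fx32_mul_sketch(const s32 a, const s32 b) { return (s64)a * (s64)b; } //.12 * .12 = .24
static inline s64 fx32_shiftup_sketch(const s32 a)          { return (s64)a << 12; }    //.12 -> .24
static inline s32 sfx32_shiftdown_sketch(const s64 a)       { return (s32)(a >> 12); }  //.24 -> .12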
//-------------------------
//switched SSE functions: implementations for no SSE
#ifndef ENABLE_SSE
void MatrixMultVec4x4 (const float *matrix, float *vecPtr)
{
_NOSSE_MatrixMultVec4x4(matrix, vecPtr);
}
void MatrixMultVec3x3 (const float *matrix, float *vecPtr)
{
float x = vecPtr[0];
float y = vecPtr[1];
float z = vecPtr[2];
vecPtr[0] = x * matrix[0] + y * matrix[4] + z * matrix[ 8];
vecPtr[1] = x * matrix[1] + y * matrix[5] + z * matrix[ 9];
vecPtr[2] = x * matrix[2] + y * matrix[6] + z * matrix[10];
}
void MatrixMultiply (float *matrix, const float *rightMatrix)
{
float tmpMatrix[16];
tmpMatrix[0] = (matrix[0]*rightMatrix[0])+(matrix[4]*rightMatrix[1])+(matrix[8]*rightMatrix[2])+(matrix[12]*rightMatrix[3]);
tmpMatrix[1] = (matrix[1]*rightMatrix[0])+(matrix[5]*rightMatrix[1])+(matrix[9]*rightMatrix[2])+(matrix[13]*rightMatrix[3]);
tmpMatrix[2] = (matrix[2]*rightMatrix[0])+(matrix[6]*rightMatrix[1])+(matrix[10]*rightMatrix[2])+(matrix[14]*rightMatrix[3]);
tmpMatrix[3] = (matrix[3]*rightMatrix[0])+(matrix[7]*rightMatrix[1])+(matrix[11]*rightMatrix[2])+(matrix[15]*rightMatrix[3]);
tmpMatrix[4] = (matrix[0]*rightMatrix[4])+(matrix[4]*rightMatrix[5])+(matrix[8]*rightMatrix[6])+(matrix[12]*rightMatrix[7]);
tmpMatrix[5] = (matrix[1]*rightMatrix[4])+(matrix[5]*rightMatrix[5])+(matrix[9]*rightMatrix[6])+(matrix[13]*rightMatrix[7]);
tmpMatrix[6] = (matrix[2]*rightMatrix[4])+(matrix[6]*rightMatrix[5])+(matrix[10]*rightMatrix[6])+(matrix[14]*rightMatrix[7]);
tmpMatrix[7] = (matrix[3]*rightMatrix[4])+(matrix[7]*rightMatrix[5])+(matrix[11]*rightMatrix[6])+(matrix[15]*rightMatrix[7]);
tmpMatrix[8] = (matrix[0]*rightMatrix[8])+(matrix[4]*rightMatrix[9])+(matrix[8]*rightMatrix[10])+(matrix[12]*rightMatrix[11]);
tmpMatrix[9] = (matrix[1]*rightMatrix[8])+(matrix[5]*rightMatrix[9])+(matrix[9]*rightMatrix[10])+(matrix[13]*rightMatrix[11]);
tmpMatrix[10] = (matrix[2]*rightMatrix[8])+(matrix[6]*rightMatrix[9])+(matrix[10]*rightMatrix[10])+(matrix[14]*rightMatrix[11]);
tmpMatrix[11] = (matrix[3]*rightMatrix[8])+(matrix[7]*rightMatrix[9])+(matrix[11]*rightMatrix[10])+(matrix[15]*rightMatrix[11]);
tmpMatrix[12] = (matrix[0]*rightMatrix[12])+(matrix[4]*rightMatrix[13])+(matrix[8]*rightMatrix[14])+(matrix[12]*rightMatrix[15]);
tmpMatrix[13] = (matrix[1]*rightMatrix[12])+(matrix[5]*rightMatrix[13])+(matrix[9]*rightMatrix[14])+(matrix[13]*rightMatrix[15]);
tmpMatrix[14] = (matrix[2]*rightMatrix[12])+(matrix[6]*rightMatrix[13])+(matrix[10]*rightMatrix[14])+(matrix[14]*rightMatrix[15]);
tmpMatrix[15] = (matrix[3]*rightMatrix[12])+(matrix[7]*rightMatrix[13])+(matrix[11]*rightMatrix[14])+(matrix[15]*rightMatrix[15]);
memcpy (matrix, tmpMatrix, sizeof(float)*16);
}
void MatrixTranslate (float *matrix, const float *ptr)
{
matrix[12] += (matrix[0]*ptr[0])+(matrix[4]*ptr[1])+(matrix[ 8]*ptr[2]);
matrix[13] += (matrix[1]*ptr[0])+(matrix[5]*ptr[1])+(matrix[ 9]*ptr[2]);
matrix[14] += (matrix[2]*ptr[0])+(matrix[6]*ptr[1])+(matrix[10]*ptr[2]);
matrix[15] += (matrix[3]*ptr[0])+(matrix[7]*ptr[1])+(matrix[11]*ptr[2]);
}
void MatrixScale (float *matrix, const float *ptr)
{
matrix[0] *= ptr[0];
matrix[1] *= ptr[0];
matrix[2] *= ptr[0];
matrix[3] *= ptr[0];
matrix[4] *= ptr[1];
matrix[5] *= ptr[1];
matrix[6] *= ptr[1];
matrix[7] *= ptr[1];
matrix[8] *= ptr[2];
matrix[9] *= ptr[2];
matrix[10] *= ptr[2];
matrix[11] *= ptr[2];
}
#endif //switched c/asm functions
//-----------------------------------------
void MatrixInit (s32 *matrix)
{
@@ -345,51 +233,487 @@ void Vector4Copy(float *dst, const float *src)
dst[3] = src[3];
}
void _MatrixMultVec4x4_NoSIMD(const s32 *__restrict mtxPtr, float *__restrict vecPtr)
{
const CACHE_ALIGN float mtxFloat[16] = {
mtxPtr[ 0] / 4096.0f,
mtxPtr[ 1] / 4096.0f,
mtxPtr[ 2] / 4096.0f,
mtxPtr[ 3] / 4096.0f,
mtxPtr[ 4] / 4096.0f,
mtxPtr[ 5] / 4096.0f,
mtxPtr[ 6] / 4096.0f,
mtxPtr[ 7] / 4096.0f,
mtxPtr[ 8] / 4096.0f,
mtxPtr[ 9] / 4096.0f,
mtxPtr[10] / 4096.0f,
mtxPtr[11] / 4096.0f,
mtxPtr[12] / 4096.0f,
mtxPtr[13] / 4096.0f,
mtxPtr[14] / 4096.0f,
mtxPtr[15] / 4096.0f
};
const float x = vecPtr[0];
const float y = vecPtr[1];
const float z = vecPtr[2];
const float w = vecPtr[3];
vecPtr[0] = (x * mtxFloat[0]) + (y * mtxFloat[4]) + (z * mtxFloat[ 8]) + (w * mtxFloat[12]);
vecPtr[1] = (x * mtxFloat[1]) + (y * mtxFloat[5]) + (z * mtxFloat[ 9]) + (w * mtxFloat[13]);
vecPtr[2] = (x * mtxFloat[2]) + (y * mtxFloat[6]) + (z * mtxFloat[10]) + (w * mtxFloat[14]);
vecPtr[3] = (x * mtxFloat[3]) + (y * mtxFloat[7]) + (z * mtxFloat[11]) + (w * mtxFloat[15]);
}
void MatrixMultiply (s32 *matrix, const s32 *rightMatrix)
#ifdef ENABLE_SSE
void MatrixMultVec4x4(const s32 *__restrict mtxPtr, float *__restrict vecPtr)
{
const __m128 loadedVecPtr = _mm_load_ps(vecPtr);
const __m128 convertScalar = _mm_set1_ps(1.0f/4096.0f);
#ifdef ENABLE_SSE2
__m128 row[4] = {
_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 0)) ),
_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 4)) ),
_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 8)) ),
_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 12)) )
};
#else
const CACHE_ALIGN float mtxFloat[16] = {
(float)mtxPtr[0],
(float)mtxPtr[1],
(float)mtxPtr[2],
(float)mtxPtr[3],
(float)mtxPtr[4],
(float)mtxPtr[5],
(float)mtxPtr[6],
(float)mtxPtr[7],
(float)mtxPtr[8],
(float)mtxPtr[9],
(float)mtxPtr[10],
(float)mtxPtr[11],
(float)mtxPtr[12],
(float)mtxPtr[13],
(float)mtxPtr[14],
(float)mtxPtr[15]
};
__m128 row[4] = {
_mm_load_ps(mtxFloat + 0),
_mm_load_ps(mtxFloat + 4),
_mm_load_ps(mtxFloat + 8),
_mm_load_ps(mtxFloat + 12)
};
#endif
row[0] = _mm_mul_ps(row[0], convertScalar);
row[1] = _mm_mul_ps(row[1], convertScalar);
row[2] = _mm_mul_ps(row[2], convertScalar);
row[3] = _mm_mul_ps(row[3], convertScalar);
const __m128 vec[4] = {
_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x00),
_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x55),
_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0xAA),
_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0xFF)
};
const __m128 calcVec = _mm_add_ps( _mm_mul_ps(row[0], vec[0]), _mm_add_ps(_mm_mul_ps(row[1], vec[1]), _mm_add_ps(_mm_mul_ps(row[2], vec[2]), _mm_mul_ps(row[3], vec[3]))) );
_mm_store_ps(vecPtr, calcVec);
}
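The shuffle constants above broadcast one lane of the loaded vector across the whole register (0x00 -> xxxx, 0x55 -> yyyy, 0xAA -> zzzz, 0xFF -> wwww), which turns the four row-times-scalar multiply-adds into a column-major matrix-vector product. For instance:

#include <xmmintrin.h>

//Illustration: broadcasting lane 1 (y) across all four lanes.
static inline __m128 broadcastY(const __m128 v)
{
	return _mm_shuffle_ps(v, v, 0x55); //same result as _mm_set1_ps applied to y
}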
void MatrixMultVec3x3(const s32 *__restrict mtxPtr, float *__restrict vecPtr)
{
const __m128 loadedVecPtr = _mm_load_ps(vecPtr);
const __m128 convertScalar = _mm_set1_ps(1.0f/4096.0f);
#ifdef ENABLE_SSE2
__m128 row[3] = {
_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 0)) ),
_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 4)) ),
_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 8)) )
};
#else
const CACHE_ALIGN float mtxFloat[16] = {
(float)mtxPtr[0],
(float)mtxPtr[1],
(float)mtxPtr[2],
(float)mtxPtr[3],
(float)mtxPtr[4],
(float)mtxPtr[5],
(float)mtxPtr[6],
(float)mtxPtr[7],
(float)mtxPtr[8],
(float)mtxPtr[9],
(float)mtxPtr[10],
(float)mtxPtr[11],
(float)mtxPtr[12],
(float)mtxPtr[13],
(float)mtxPtr[14],
(float)mtxPtr[15]
};
__m128 row[3] = {
_mm_load_ps(mtxFloat + 0),
_mm_load_ps(mtxFloat + 4),
_mm_load_ps(mtxFloat + 8)
};
#endif
row[0] = _mm_mul_ps(row[0], convertScalar);
row[1] = _mm_mul_ps(row[1], convertScalar);
row[2] = _mm_mul_ps(row[2], convertScalar);
const __m128 vec[3] = {
_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x00),
_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x55),
_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0xAA)
};
const __m128 calcVec = _mm_add_ps( _mm_mul_ps(row[0], vec[0]), _mm_add_ps(_mm_mul_ps(row[1], vec[1]), _mm_mul_ps(row[2], vec[2])) );
_mm_store_ps(vecPtr, calcVec);
}
void MatrixTranslate(float *__restrict mtxPtr, const float *__restrict vecPtr)
{
__m128 xmm4 = _mm_load_ps(vecPtr);
__m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101));
__m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010));
xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000));
xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(mtxPtr));
xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(mtxPtr+4));
xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(mtxPtr+8));
xmm4 = _mm_add_ps(xmm4,xmm5);
xmm4 = _mm_add_ps(xmm4,xmm6);
xmm4 = _mm_add_ps(xmm4,_mm_load_ps(mtxPtr+12));
_mm_store_ps(mtxPtr+12,xmm4);
}
void MatrixScale(float *__restrict mtxPtr, const float *__restrict vecPtr)
{
__m128 xmm4 = _mm_load_ps(vecPtr);
__m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101));
__m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010));
xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000));
xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(mtxPtr));
xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(mtxPtr+4));
xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(mtxPtr+8));
_mm_store_ps(mtxPtr,xmm4);
_mm_store_ps(mtxPtr+4,xmm5);
_mm_store_ps(mtxPtr+8,xmm6);
}
void MatrixMultiply(float *__restrict mtxPtrA, const s32 *__restrict mtxPtrB)
{
const __m128 convertScale = _mm_set1_ps(1.0f/4096.0f);
#ifdef ENABLE_SSE2
__m128 rowB[4] = {
_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 0)) ),
_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 4)) ),
_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 8)) ),
_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 12)) )
};
#else
const CACHE_ALIGN float mtxFloatB[16] = {
(float)mtxPtrB[0],
(float)mtxPtrB[1],
(float)mtxPtrB[2],
(float)mtxPtrB[3],
(float)mtxPtrB[4],
(float)mtxPtrB[5],
(float)mtxPtrB[6],
(float)mtxPtrB[7],
(float)mtxPtrB[8],
(float)mtxPtrB[9],
(float)mtxPtrB[10],
(float)mtxPtrB[11],
(float)mtxPtrB[12],
(float)mtxPtrB[13],
(float)mtxPtrB[14],
(float)mtxPtrB[15]
};
__m128 rowB[4] = {
_mm_load_ps(mtxFloatB + 0),
_mm_load_ps(mtxFloatB + 4),
_mm_load_ps(mtxFloatB + 8),
_mm_load_ps(mtxFloatB + 12)
};
#endif
rowB[0] = _mm_mul_ps(rowB[0], convertScale);
rowB[1] = _mm_mul_ps(rowB[1], convertScale);
rowB[2] = _mm_mul_ps(rowB[2], convertScale);
rowB[3] = _mm_mul_ps(rowB[3], convertScale);
__m128 rowA[4] = {
_mm_load_ps(mtxPtrA + 0),
_mm_load_ps(mtxPtrA + 4),
_mm_load_ps(mtxPtrA + 8),
_mm_load_ps(mtxPtrA + 12)
};
__m128 vecB[4];
__m128 calcRow;
vecB[0] = _mm_shuffle_ps(rowB[0], rowB[0], 0x00);
vecB[1] = _mm_shuffle_ps(rowB[0], rowB[0], 0x55);
vecB[2] = _mm_shuffle_ps(rowB[0], rowB[0], 0xAA);
vecB[3] = _mm_shuffle_ps(rowB[0], rowB[0], 0xFF);
calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) );
_mm_store_ps(mtxPtrA + 0, calcRow);
vecB[0] = _mm_shuffle_ps(rowB[1], rowB[1], 0x00);
vecB[1] = _mm_shuffle_ps(rowB[1], rowB[1], 0x55);
vecB[2] = _mm_shuffle_ps(rowB[1], rowB[1], 0xAA);
vecB[3] = _mm_shuffle_ps(rowB[1], rowB[1], 0xFF);
calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) );
_mm_store_ps(mtxPtrA + 4, calcRow);
vecB[0] = _mm_shuffle_ps(rowB[2], rowB[2], 0x00);
vecB[1] = _mm_shuffle_ps(rowB[2], rowB[2], 0x55);
vecB[2] = _mm_shuffle_ps(rowB[2], rowB[2], 0xAA);
vecB[3] = _mm_shuffle_ps(rowB[2], rowB[2], 0xFF);
calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) );
_mm_store_ps(mtxPtrA + 8, calcRow);
vecB[0] = _mm_shuffle_ps(rowB[3], rowB[3], 0x00);
vecB[1] = _mm_shuffle_ps(rowB[3], rowB[3], 0x55);
vecB[2] = _mm_shuffle_ps(rowB[3], rowB[3], 0xAA);
vecB[3] = _mm_shuffle_ps(rowB[3], rowB[3], 0xFF);
calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) );
_mm_store_ps(mtxPtrA + 12, calcRow);
}
template<size_t NUM_ROWS>
FORCEINLINE void vector_fix2float(float *mtxPtr, const float divisor)
{
const __m128 divisor_v128 = _mm_set1_ps(divisor);
for (size_t i = 0; i < NUM_ROWS * 4; i+=4)
{
_mm_store_ps( mtxPtr + i, _mm_div_ps(_mm_load_ps(mtxPtr + i), divisor_v128) );
}
}
#else
void MatrixMultVec4x4(const s32 *__restrict mtxPtr, float *__restrict vecPtr)
{
_MatrixMultVec4x4_NoSIMD(mtxPtr, vecPtr);
}
void MatrixMultVec3x3(const s32 *__restrict mtxPtr, float *__restrict vecPtr)
{
const CACHE_ALIGN float mtxFloat[16] = {
mtxPtr[ 0] / 4096.0f,
mtxPtr[ 1] / 4096.0f,
mtxPtr[ 2] / 4096.0f,
mtxPtr[ 3] / 4096.0f,
mtxPtr[ 4] / 4096.0f,
mtxPtr[ 5] / 4096.0f,
mtxPtr[ 6] / 4096.0f,
mtxPtr[ 7] / 4096.0f,
mtxPtr[ 8] / 4096.0f,
mtxPtr[ 9] / 4096.0f,
mtxPtr[10] / 4096.0f,
mtxPtr[11] / 4096.0f,
mtxPtr[12] / 4096.0f,
mtxPtr[13] / 4096.0f,
mtxPtr[14] / 4096.0f,
mtxPtr[15] / 4096.0f
};
const float x = vecPtr[0];
const float y = vecPtr[1];
const float z = vecPtr[2];
vecPtr[0] = (x * mtxFloat[0]) + (y * mtxFloat[4]) + (z * mtxFloat[ 8]);
vecPtr[1] = (x * mtxFloat[1]) + (y * mtxFloat[5]) + (z * mtxFloat[ 9]);
vecPtr[2] = (x * mtxFloat[2]) + (y * mtxFloat[6]) + (z * mtxFloat[10]);
}
void MatrixTranslate(float *__restrict mtxPtr, const float *__restrict vecPtr)
{
mtxPtr[12] += (mtxPtr[0] * vecPtr[0]) + (mtxPtr[4] * vecPtr[1]) + (mtxPtr[ 8] * vecPtr[2]);
mtxPtr[13] += (mtxPtr[1] * vecPtr[0]) + (mtxPtr[5] * vecPtr[1]) + (mtxPtr[ 9] * vecPtr[2]);
mtxPtr[14] += (mtxPtr[2] * vecPtr[0]) + (mtxPtr[6] * vecPtr[1]) + (mtxPtr[10] * vecPtr[2]);
mtxPtr[15] += (mtxPtr[3] * vecPtr[0]) + (mtxPtr[7] * vecPtr[1]) + (mtxPtr[11] * vecPtr[2]);
}
void MatrixScale(float *__restrict mtxPtr, const float *__restrict vecPtr)
{
mtxPtr[ 0] *= vecPtr[0];
mtxPtr[ 1] *= vecPtr[0];
mtxPtr[ 2] *= vecPtr[0];
mtxPtr[ 3] *= vecPtr[0];
mtxPtr[ 4] *= vecPtr[1];
mtxPtr[ 5] *= vecPtr[1];
mtxPtr[ 6] *= vecPtr[1];
mtxPtr[ 7] *= vecPtr[1];
mtxPtr[ 8] *= vecPtr[2];
mtxPtr[ 9] *= vecPtr[2];
mtxPtr[10] *= vecPtr[2];
mtxPtr[11] *= vecPtr[2];
}
void MatrixMultiply(float *__restrict mtxPtrA, const s32 *__restrict mtxPtrB)
{
const CACHE_ALIGN float mtxFloatB[16] = {
(float)mtxPtrB[ 0],
(float)mtxPtrB[ 1],
(float)mtxPtrB[ 2],
(float)mtxPtrB[ 3],
(float)mtxPtrB[ 4],
(float)mtxPtrB[ 5],
(float)mtxPtrB[ 6],
(float)mtxPtrB[ 7],
(float)mtxPtrB[ 8],
(float)mtxPtrB[ 9],
(float)mtxPtrB[10],
(float)mtxPtrB[11],
(float)mtxPtrB[12],
(float)mtxPtrB[13],
(float)mtxPtrB[14],
(float)mtxPtrB[15]
};
float tmpMatrix[16];
tmpMatrix[0] = (mtxPtrA[ 0] * mtxFloatB[ 0]) + (mtxPtrA[ 4] * mtxFloatB[ 1]) + (mtxPtrA[ 8] * mtxFloatB[ 2]) + (mtxPtrA[12] * mtxFloatB[ 3]);
tmpMatrix[1] = (mtxPtrA[ 1] * mtxFloatB[ 0]) + (mtxPtrA[ 5] * mtxFloatB[ 1]) + (mtxPtrA[ 9] * mtxFloatB[ 2]) + (mtxPtrA[13] * mtxFloatB[ 3]);
tmpMatrix[2] = (mtxPtrA[ 2] * mtxFloatB[ 0]) + (mtxPtrA[ 6] * mtxFloatB[ 1]) + (mtxPtrA[10] * mtxFloatB[ 2]) + (mtxPtrA[14] * mtxFloatB[ 3]);
tmpMatrix[3] = (mtxPtrA[ 3] * mtxFloatB[ 0]) + (mtxPtrA[ 7] * mtxFloatB[ 1]) + (mtxPtrA[11] * mtxFloatB[ 2]) + (mtxPtrA[15] * mtxFloatB[ 3]);
tmpMatrix[4] = (mtxPtrA[ 0] * mtxFloatB[ 4]) + (mtxPtrA[ 4] * mtxFloatB[ 5]) + (mtxPtrA[ 8] * mtxFloatB[ 6]) + (mtxPtrA[12] * mtxFloatB[ 7]);
tmpMatrix[5] = (mtxPtrA[ 1] * mtxFloatB[ 4]) + (mtxPtrA[ 5] * mtxFloatB[ 5]) + (mtxPtrA[ 9] * mtxFloatB[ 6]) + (mtxPtrA[13] * mtxFloatB[ 7]);
tmpMatrix[6] = (mtxPtrA[ 2] * mtxFloatB[ 4]) + (mtxPtrA[ 6] * mtxFloatB[ 5]) + (mtxPtrA[10] * mtxFloatB[ 6]) + (mtxPtrA[14] * mtxFloatB[ 7]);
tmpMatrix[7] = (mtxPtrA[ 3] * mtxFloatB[ 4]) + (mtxPtrA[ 7] * mtxFloatB[ 5]) + (mtxPtrA[11] * mtxFloatB[ 6]) + (mtxPtrA[15] * mtxFloatB[ 7]);
tmpMatrix[8] = (mtxPtrA[ 0] * mtxFloatB[ 8]) + (mtxPtrA[ 4] * mtxFloatB[ 9]) + (mtxPtrA[ 8] * mtxFloatB[10]) + (mtxPtrA[12] * mtxFloatB[11]);
tmpMatrix[9] = (mtxPtrA[ 1] * mtxFloatB[ 8]) + (mtxPtrA[ 5] * mtxFloatB[ 9]) + (mtxPtrA[ 9] * mtxFloatB[10]) + (mtxPtrA[13] * mtxFloatB[11]);
tmpMatrix[10] = (mtxPtrA[ 2] * mtxFloatB[ 8]) + (mtxPtrA[ 6] * mtxFloatB[ 9]) + (mtxPtrA[10] * mtxFloatB[10]) + (mtxPtrA[14] * mtxFloatB[11]);
tmpMatrix[11] = (mtxPtrA[ 3] * mtxFloatB[ 8]) + (mtxPtrA[ 7] * mtxFloatB[ 9]) + (mtxPtrA[11] * mtxFloatB[10]) + (mtxPtrA[15] * mtxFloatB[11]);
tmpMatrix[12] = (mtxPtrA[ 0] * mtxFloatB[12]) + (mtxPtrA[ 4] * mtxFloatB[13]) + (mtxPtrA[ 8] * mtxFloatB[14]) + (mtxPtrA[12] * mtxFloatB[15]);
tmpMatrix[13] = (mtxPtrA[ 1] * mtxFloatB[12]) + (mtxPtrA[ 5] * mtxFloatB[13]) + (mtxPtrA[ 9] * mtxFloatB[14]) + (mtxPtrA[13] * mtxFloatB[15]);
tmpMatrix[14] = (mtxPtrA[ 2] * mtxFloatB[12]) + (mtxPtrA[ 6] * mtxFloatB[13]) + (mtxPtrA[10] * mtxFloatB[14]) + (mtxPtrA[14] * mtxFloatB[15]);
tmpMatrix[15] = (mtxPtrA[ 3] * mtxFloatB[12]) + (mtxPtrA[ 7] * mtxFloatB[13]) + (mtxPtrA[11] * mtxFloatB[14]) + (mtxPtrA[15] * mtxFloatB[15]);
memcpy(mtxPtrA, tmpMatrix, sizeof(float)*16);
}
template<size_t NUM_ROWS>
FORCEINLINE void vector_fix2float(float *mtxPtr, const float divisor)
{
for (size_t i = 0; i < NUM_ROWS * 4; i+=4)
{
mtxPtr[i+0] /= divisor;
mtxPtr[i+1] /= divisor;
mtxPtr[i+2] /= divisor;
mtxPtr[i+3] /= divisor;
}
}
#endif
void MatrixMultVec4x4(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr)
{
const s32 x = vecPtr[0];
const s32 y = vecPtr[1];
const s32 z = vecPtr[2];
const s32 w = vecPtr[3];
vecPtr[0] = sfx32_shiftdown( fx32_mul(x,mtxPtr[0]) + fx32_mul(y,mtxPtr[4]) + fx32_mul(z,mtxPtr[ 8]) + fx32_mul(w,mtxPtr[12]) );
vecPtr[1] = sfx32_shiftdown( fx32_mul(x,mtxPtr[1]) + fx32_mul(y,mtxPtr[5]) + fx32_mul(z,mtxPtr[ 9]) + fx32_mul(w,mtxPtr[13]) );
vecPtr[2] = sfx32_shiftdown( fx32_mul(x,mtxPtr[2]) + fx32_mul(y,mtxPtr[6]) + fx32_mul(z,mtxPtr[10]) + fx32_mul(w,mtxPtr[14]) );
vecPtr[3] = sfx32_shiftdown( fx32_mul(x,mtxPtr[3]) + fx32_mul(y,mtxPtr[7]) + fx32_mul(z,mtxPtr[11]) + fx32_mul(w,mtxPtr[15]) );
}
void MatrixMultVec3x3(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr)
{
const s32 x = vecPtr[0];
const s32 y = vecPtr[1];
const s32 z = vecPtr[2];
vecPtr[0] = sfx32_shiftdown( fx32_mul(x,mtxPtr[0]) + fx32_mul(y,mtxPtr[4]) + fx32_mul(z,mtxPtr[ 8]) );
vecPtr[1] = sfx32_shiftdown( fx32_mul(x,mtxPtr[1]) + fx32_mul(y,mtxPtr[5]) + fx32_mul(z,mtxPtr[ 9]) );
vecPtr[2] = sfx32_shiftdown( fx32_mul(x,mtxPtr[2]) + fx32_mul(y,mtxPtr[6]) + fx32_mul(z,mtxPtr[10]) );
}
void MatrixTranslate(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr)
{
mtxPtr[12] = sfx32_shiftdown( fx32_mul(mtxPtr[0], vecPtr[0]) + fx32_mul(mtxPtr[4], vecPtr[1]) + fx32_mul(mtxPtr[ 8], vecPtr[2]) + fx32_shiftup(mtxPtr[12]) );
mtxPtr[13] = sfx32_shiftdown( fx32_mul(mtxPtr[1], vecPtr[0]) + fx32_mul(mtxPtr[5], vecPtr[1]) + fx32_mul(mtxPtr[ 9], vecPtr[2]) + fx32_shiftup(mtxPtr[13]) );
mtxPtr[14] = sfx32_shiftdown( fx32_mul(mtxPtr[2], vecPtr[0]) + fx32_mul(mtxPtr[6], vecPtr[1]) + fx32_mul(mtxPtr[10], vecPtr[2]) + fx32_shiftup(mtxPtr[14]) );
mtxPtr[15] = sfx32_shiftdown( fx32_mul(mtxPtr[3], vecPtr[0]) + fx32_mul(mtxPtr[7], vecPtr[1]) + fx32_mul(mtxPtr[11], vecPtr[2]) + fx32_shiftup(mtxPtr[15]) );
}
void MatrixScale(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr)
{
mtxPtr[ 0] = sfx32_shiftdown( fx32_mul(mtxPtr[ 0], vecPtr[0]) );
mtxPtr[ 1] = sfx32_shiftdown( fx32_mul(mtxPtr[ 1], vecPtr[0]) );
mtxPtr[ 2] = sfx32_shiftdown( fx32_mul(mtxPtr[ 2], vecPtr[0]) );
mtxPtr[ 3] = sfx32_shiftdown( fx32_mul(mtxPtr[ 3], vecPtr[0]) );
mtxPtr[ 4] = sfx32_shiftdown( fx32_mul(mtxPtr[ 4], vecPtr[1]) );
mtxPtr[ 5] = sfx32_shiftdown( fx32_mul(mtxPtr[ 5], vecPtr[1]) );
mtxPtr[ 6] = sfx32_shiftdown( fx32_mul(mtxPtr[ 6], vecPtr[1]) );
mtxPtr[ 7] = sfx32_shiftdown( fx32_mul(mtxPtr[ 7], vecPtr[1]) );
mtxPtr[ 8] = sfx32_shiftdown( fx32_mul(mtxPtr[ 8], vecPtr[2]) );
mtxPtr[ 9] = sfx32_shiftdown( fx32_mul(mtxPtr[ 9], vecPtr[2]) );
mtxPtr[10] = sfx32_shiftdown( fx32_mul(mtxPtr[10], vecPtr[2]) );
mtxPtr[11] = sfx32_shiftdown( fx32_mul(mtxPtr[11], vecPtr[2]) );
}
void MatrixMultiply(s32 *__restrict mtxPtrA, const s32 *__restrict mtxPtrB)
{
s32 tmpMatrix[16];
tmpMatrix[0] = sfx32_shiftdown(fx32_mul(matrix[0],rightMatrix[0])+fx32_mul(matrix[4],rightMatrix[1])+fx32_mul(matrix[8],rightMatrix[2])+fx32_mul(matrix[12],rightMatrix[3]));
tmpMatrix[1] = sfx32_shiftdown(fx32_mul(matrix[1],rightMatrix[0])+fx32_mul(matrix[5],rightMatrix[1])+fx32_mul(matrix[9],rightMatrix[2])+fx32_mul(matrix[13],rightMatrix[3]));
tmpMatrix[2] = sfx32_shiftdown(fx32_mul(matrix[2],rightMatrix[0])+fx32_mul(matrix[6],rightMatrix[1])+fx32_mul(matrix[10],rightMatrix[2])+fx32_mul(matrix[14],rightMatrix[3]));
tmpMatrix[3] = sfx32_shiftdown(fx32_mul(matrix[3],rightMatrix[0])+fx32_mul(matrix[7],rightMatrix[1])+fx32_mul(matrix[11],rightMatrix[2])+fx32_mul(matrix[15],rightMatrix[3]));
tmpMatrix[4] = sfx32_shiftdown(fx32_mul(matrix[0],rightMatrix[4])+fx32_mul(matrix[4],rightMatrix[5])+fx32_mul(matrix[8],rightMatrix[6])+fx32_mul(matrix[12],rightMatrix[7]));
tmpMatrix[5] = sfx32_shiftdown(fx32_mul(matrix[1],rightMatrix[4])+fx32_mul(matrix[5],rightMatrix[5])+fx32_mul(matrix[9],rightMatrix[6])+fx32_mul(matrix[13],rightMatrix[7]));
tmpMatrix[6] = sfx32_shiftdown(fx32_mul(matrix[2],rightMatrix[4])+fx32_mul(matrix[6],rightMatrix[5])+fx32_mul(matrix[10],rightMatrix[6])+fx32_mul(matrix[14],rightMatrix[7]));
tmpMatrix[7] = sfx32_shiftdown(fx32_mul(matrix[3],rightMatrix[4])+fx32_mul(matrix[7],rightMatrix[5])+fx32_mul(matrix[11],rightMatrix[6])+fx32_mul(matrix[15],rightMatrix[7]));
tmpMatrix[8] = sfx32_shiftdown(fx32_mul(matrix[0],rightMatrix[8])+fx32_mul(matrix[4],rightMatrix[9])+fx32_mul(matrix[8],rightMatrix[10])+fx32_mul(matrix[12],rightMatrix[11]));
tmpMatrix[9] = sfx32_shiftdown(fx32_mul(matrix[1],rightMatrix[8])+fx32_mul(matrix[5],rightMatrix[9])+fx32_mul(matrix[9],rightMatrix[10])+fx32_mul(matrix[13],rightMatrix[11]));
tmpMatrix[10] = sfx32_shiftdown(fx32_mul(matrix[2],rightMatrix[8])+fx32_mul(matrix[6],rightMatrix[9])+fx32_mul(matrix[10],rightMatrix[10])+fx32_mul(matrix[14],rightMatrix[11]));
tmpMatrix[11] = sfx32_shiftdown(fx32_mul(matrix[3],rightMatrix[8])+fx32_mul(matrix[7],rightMatrix[9])+fx32_mul(matrix[11],rightMatrix[10])+fx32_mul(matrix[15],rightMatrix[11]));
tmpMatrix[12] = sfx32_shiftdown(fx32_mul(matrix[0],rightMatrix[12])+fx32_mul(matrix[4],rightMatrix[13])+fx32_mul(matrix[8],rightMatrix[14])+fx32_mul(matrix[12],rightMatrix[15]));
tmpMatrix[13] = sfx32_shiftdown(fx32_mul(matrix[1],rightMatrix[12])+fx32_mul(matrix[5],rightMatrix[13])+fx32_mul(matrix[9],rightMatrix[14])+fx32_mul(matrix[13],rightMatrix[15]));
tmpMatrix[14] = sfx32_shiftdown(fx32_mul(matrix[2],rightMatrix[12])+fx32_mul(matrix[6],rightMatrix[13])+fx32_mul(matrix[10],rightMatrix[14])+fx32_mul(matrix[14],rightMatrix[15]));
tmpMatrix[15] = sfx32_shiftdown(fx32_mul(matrix[3],rightMatrix[12])+fx32_mul(matrix[7],rightMatrix[13])+fx32_mul(matrix[11],rightMatrix[14])+fx32_mul(matrix[15],rightMatrix[15]));
memcpy(matrix,tmpMatrix,sizeof(s32)*16);
tmpMatrix[ 0] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[ 0])+fx32_mul(mtxPtrA[4],mtxPtrB[ 1])+fx32_mul(mtxPtrA[ 8],mtxPtrB[ 2])+fx32_mul(mtxPtrA[12],mtxPtrB[ 3]) );
tmpMatrix[ 1] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[ 0])+fx32_mul(mtxPtrA[5],mtxPtrB[ 1])+fx32_mul(mtxPtrA[ 9],mtxPtrB[ 2])+fx32_mul(mtxPtrA[13],mtxPtrB[ 3]) );
tmpMatrix[ 2] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[ 0])+fx32_mul(mtxPtrA[6],mtxPtrB[ 1])+fx32_mul(mtxPtrA[10],mtxPtrB[ 2])+fx32_mul(mtxPtrA[14],mtxPtrB[ 3]) );
tmpMatrix[ 3] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[ 0])+fx32_mul(mtxPtrA[7],mtxPtrB[ 1])+fx32_mul(mtxPtrA[11],mtxPtrB[ 2])+fx32_mul(mtxPtrA[15],mtxPtrB[ 3]) );
tmpMatrix[ 4] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[ 4])+fx32_mul(mtxPtrA[4],mtxPtrB[ 5])+fx32_mul(mtxPtrA[ 8],mtxPtrB[ 6])+fx32_mul(mtxPtrA[12],mtxPtrB[ 7]) );
tmpMatrix[ 5] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[ 4])+fx32_mul(mtxPtrA[5],mtxPtrB[ 5])+fx32_mul(mtxPtrA[ 9],mtxPtrB[ 6])+fx32_mul(mtxPtrA[13],mtxPtrB[ 7]) );
tmpMatrix[ 6] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[ 4])+fx32_mul(mtxPtrA[6],mtxPtrB[ 5])+fx32_mul(mtxPtrA[10],mtxPtrB[ 6])+fx32_mul(mtxPtrA[14],mtxPtrB[ 7]) );
tmpMatrix[ 7] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[ 4])+fx32_mul(mtxPtrA[7],mtxPtrB[ 5])+fx32_mul(mtxPtrA[11],mtxPtrB[ 6])+fx32_mul(mtxPtrA[15],mtxPtrB[ 7]) );
tmpMatrix[ 8] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[ 8])+fx32_mul(mtxPtrA[4],mtxPtrB[ 9])+fx32_mul(mtxPtrA[ 8],mtxPtrB[10])+fx32_mul(mtxPtrA[12],mtxPtrB[11]) );
tmpMatrix[ 9] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[ 8])+fx32_mul(mtxPtrA[5],mtxPtrB[ 9])+fx32_mul(mtxPtrA[ 9],mtxPtrB[10])+fx32_mul(mtxPtrA[13],mtxPtrB[11]) );
tmpMatrix[10] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[ 8])+fx32_mul(mtxPtrA[6],mtxPtrB[ 9])+fx32_mul(mtxPtrA[10],mtxPtrB[10])+fx32_mul(mtxPtrA[14],mtxPtrB[11]) );
tmpMatrix[11] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[ 8])+fx32_mul(mtxPtrA[7],mtxPtrB[ 9])+fx32_mul(mtxPtrA[11],mtxPtrB[10])+fx32_mul(mtxPtrA[15],mtxPtrB[11]) );
tmpMatrix[12] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[12])+fx32_mul(mtxPtrA[4],mtxPtrB[13])+fx32_mul(mtxPtrA[ 8],mtxPtrB[14])+fx32_mul(mtxPtrA[12],mtxPtrB[15]) );
tmpMatrix[13] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[12])+fx32_mul(mtxPtrA[5],mtxPtrB[13])+fx32_mul(mtxPtrA[ 9],mtxPtrB[14])+fx32_mul(mtxPtrA[13],mtxPtrB[15]) );
tmpMatrix[14] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[12])+fx32_mul(mtxPtrA[6],mtxPtrB[13])+fx32_mul(mtxPtrA[10],mtxPtrB[14])+fx32_mul(mtxPtrA[14],mtxPtrB[15]) );
tmpMatrix[15] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[12])+fx32_mul(mtxPtrA[7],mtxPtrB[13])+fx32_mul(mtxPtrA[11],mtxPtrB[14])+fx32_mul(mtxPtrA[15],mtxPtrB[15]) );
memcpy(mtxPtrA, tmpMatrix, sizeof(s32)*16);
}
void MatrixScale(s32 *matrix, const s32 *ptr)
{
//zero 21-sep-2010 - verified unrolling seems faster on my cpu
MACRODO_N(12,
matrix[X] = sfx32_shiftdown(fx32_mul(matrix[X],ptr[X>>2]))
);
}
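MACRODO_N is assumed to expand its body N times with X running 0..N-1, unrolled at compile time; X>>2 maps entries 0-3, 4-7, and 8-11 to the x, y, and z scale factors. A plain-loop equivalent of the unrolled scale, for illustration:

//Loop equivalent of the MACRODO_N unrolling above.
static void MatrixScale_loop(s32 *matrix, const s32 *ptr)
{
	for (int X = 0; X < 12; X++)
		matrix[X] = sfx32_shiftdown(fx32_mul(matrix[X], ptr[X >> 2]));
}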
void MatrixTranslate(s32 *matrix, const s32 *ptr)
{
MACRODO_N(4,
{
s64 temp = fx32_shiftup(matrix[X+12]);
temp += fx32_mul(matrix[X+0],ptr[0]);
temp += fx32_mul(matrix[X+4],ptr[1]);
temp += fx32_mul(matrix[X+8],ptr[2]);
matrix[X+12] = sfx32_shiftdown(temp);
});
}

View File

@@ -1,6 +1,6 @@
/*
Copyright (C) 2006-2007 shash
Copyright (C) 2007-2017 DeSmuME team
Copyright (C) 2007-2018 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -77,7 +77,24 @@ void Vector3Scale(float *dst, const float scale);
void Vector3Copy(float *dst, const float *src);
void Vector3Normalize(float *dst);
void Vector4Copy(float *dst, const float *src);
void _MatrixMultVec4x4_NoSIMD(const s32 *__restrict mtxPtr, float *__restrict vecPtr);
void MatrixMultVec4x4(const s32 *__restrict mtxPtr, float *__restrict vecPtr);
void MatrixMultVec3x3(const s32 *__restrict mtxPtr, float *__restrict vecPtr);
void MatrixTranslate(float *__restrict mtxPtr, const float *__restrict vecPtr);
void MatrixScale(float *__restrict mtxPtr, const float *__restrict vecPtr);
void MatrixMultiply(float *__restrict mtxPtrA, const s32 *__restrict mtxPtrB);
template<size_t NUM_ROWS> FORCEINLINE void vector_fix2float(float *mtxPtr, const float divisor);
void MatrixMultVec4x4(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr);
void MatrixMultVec3x3(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr);
void MatrixTranslate(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr);
void MatrixScale(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr);
void MatrixMultiply(s32 *__restrict mtxPtrA, const s32 *__restrict mtxPtrB);
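Taken together, the declarations above form overload sets keyed on the vector (or right-hand matrix) type, so a single call-site name covers both the float path and the 20.12 fixed-point path. A hedged usage sketch with hypothetical data:

//Usage sketch; mtx, vf, and vx are hypothetical.
static void overloadDemo()
{
	CACHE_ALIGN s32   mtx[16] = {0}; //a 20.12 fixed-point matrix
	CACHE_ALIGN float vf[4]   = { 1.0f, 0.0f, 0.0f, 1.0f };
	CACHE_ALIGN s32   vx[4]   = { 4096, 0, 0, 4096 }; //1.0 in 20.12
	MatrixMultVec4x4(mtx, vf); //float-vector overload
	MatrixMultVec4x4(mtx, vx); //fixed-point overload
}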
//these functions are an unreliable, inaccurate floor.
//they should only be used for positive numbers.
@@ -296,151 +313,4 @@ static void memset_u32_fast(void *dst, const u32 val)
#endif // SIMD Functions
// NOSSE version always used in gfx3d.cpp
void _NOSSE_MatrixMultVec4x4 (const float *matrix, float *vecPtr);
void MatrixMultVec3x3_fixed(const s32 *matrix, s32 *vecPtr);
//---------------------------
//switched SSE functions
#ifdef ENABLE_SSE
struct SSE_MATRIX
{
SSE_MATRIX(const float *matrix)
: row0(_mm_load_ps(matrix))
, row1(_mm_load_ps(matrix+4))
, row2(_mm_load_ps(matrix+8))
, row3(_mm_load_ps(matrix+12))
{}
union {
__m128 rows[4];
struct { __m128 row0; __m128 row1; __m128 row2; __m128 row3; };
};
};
FORCEINLINE __m128 _util_MatrixMultVec4x4_(const SSE_MATRIX &mat, __m128 vec)
{
__m128 xmm5 = _mm_shuffle_ps(vec, vec, B8(01010101));
__m128 xmm6 = _mm_shuffle_ps(vec, vec, B8(10101010));
__m128 xmm7 = _mm_shuffle_ps(vec, vec, B8(11111111));
__m128 xmm4 = _mm_shuffle_ps(vec, vec, B8(00000000));
xmm4 = _mm_mul_ps(xmm4,mat.row0);
xmm5 = _mm_mul_ps(xmm5,mat.row1);
xmm6 = _mm_mul_ps(xmm6,mat.row2);
xmm7 = _mm_mul_ps(xmm7,mat.row3);
xmm4 = _mm_add_ps(xmm4,xmm5);
xmm4 = _mm_add_ps(xmm4,xmm6);
xmm4 = _mm_add_ps(xmm4,xmm7);
return xmm4;
}
FORCEINLINE void MatrixMultiply(float * matrix, const float * rightMatrix)
{
//this seems to generate larger code, including many movaps, but maybe it is less harsh on the registers than the
//more hand-tailored approach
__m128 row0 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix));
__m128 row1 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix+4));
__m128 row2 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix+8));
__m128 row3 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix+12));
_mm_store_ps(matrix,row0);
_mm_store_ps(matrix+4,row1);
_mm_store_ps(matrix+8,row2);
_mm_store_ps(matrix+12,row3);
}
FORCEINLINE void MatrixMultVec4x4(const float *matrix, float *vecPtr)
{
_mm_store_ps(vecPtr,_util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(vecPtr)));
}
FORCEINLINE void MatrixMultVec3x3(const float * matrix, float * vecPtr)
{
const __m128 vec = _mm_load_ps(vecPtr);
__m128 xmm5 = _mm_shuffle_ps(vec, vec, B8(01010101));
__m128 xmm6 = _mm_shuffle_ps(vec, vec, B8(10101010));
__m128 xmm4 = _mm_shuffle_ps(vec, vec, B8(00000000));
const SSE_MATRIX mat(matrix);
xmm4 = _mm_mul_ps(xmm4,mat.row0);
xmm5 = _mm_mul_ps(xmm5,mat.row1);
xmm6 = _mm_mul_ps(xmm6,mat.row2);
xmm4 = _mm_add_ps(xmm4,xmm5);
xmm4 = _mm_add_ps(xmm4,xmm6);
_mm_store_ps(vecPtr,xmm4);
}
FORCEINLINE void MatrixTranslate(float *matrix, const float *ptr)
{
__m128 xmm4 = _mm_load_ps(ptr);
__m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101));
__m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010));
xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000));
xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(matrix));
xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(matrix+4));
xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(matrix+8));
xmm4 = _mm_add_ps(xmm4,xmm5);
xmm4 = _mm_add_ps(xmm4,xmm6);
xmm4 = _mm_add_ps(xmm4,_mm_load_ps(matrix+12));
_mm_store_ps(matrix+12,xmm4);
}
FORCEINLINE void MatrixScale(float *matrix, const float *ptr)
{
__m128 xmm4 = _mm_load_ps(ptr);
__m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101));
__m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010));
xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000));
xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(matrix));
xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(matrix+4));
xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(matrix+8));
_mm_store_ps(matrix,xmm4);
_mm_store_ps(matrix+4,xmm5);
_mm_store_ps(matrix+8,xmm6);
}
template<int NUM_ROWS>
FORCEINLINE void vector_fix2float(float* matrix, const float divisor)
{
CTASSERT(NUM_ROWS==3 || NUM_ROWS==4);
const __m128 val = _mm_set_ps1(divisor);
_mm_store_ps(matrix,_mm_div_ps(_mm_load_ps(matrix),val));
_mm_store_ps(matrix+4,_mm_div_ps(_mm_load_ps(matrix+4),val));
_mm_store_ps(matrix+8,_mm_div_ps(_mm_load_ps(matrix+8),val));
if(NUM_ROWS==4)
_mm_store_ps(matrix+12,_mm_div_ps(_mm_load_ps(matrix+12),val));
}
#else //no sse
void MatrixMultVec4x4 (const float *matrix, float *vecPtr);
void MatrixMultVec3x3(const float * matrix, float * vecPtr);
void MatrixMultiply(float * matrix, const float * rightMatrix);
void MatrixTranslate(float *matrix, const float *ptr);
void MatrixScale(float * matrix, const float * ptr);
template<int NUM_ROWS>
FORCEINLINE void vector_fix2float(float* matrix, const float divisor)
{
for(int i=0;i<NUM_ROWS*4;i++)
matrix[i] /= divisor;
}
#endif //switched SSE functions
void MatrixMultVec4x4 (const s32 *matrix, s32 *vecPtr);
void MatrixMultiply(s32* matrix, const s32* rightMatrix);
void MatrixScale(s32 *matrix, const s32 *ptr);
void MatrixTranslate(s32 *matrix, const s32 *ptr);
#endif // MATRIX_H