diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp
index 1eccf9657..515a909ac 100644
--- a/desmume/src/gfx3d.cpp
+++ b/desmume/src/gfx3d.cpp
@@ -609,11 +609,11 @@ void gfx3d_reset()
 	memset(colorRGB, 0, sizeof(colorRGB));
 	memset(&tempVertInfo, 0, sizeof(tempVertInfo));
 
-	MatrixInit (mtxCurrent[0]);
-	MatrixInit (mtxCurrent[1]);
-	MatrixInit (mtxCurrent[2]);
-	MatrixInit (mtxCurrent[3]);
-	MatrixInit (mtxTemporal);
+	MatrixInit(mtxCurrent[MATRIXMODE_PROJECTION]);
+	MatrixInit(mtxCurrent[MATRIXMODE_POSITION]);
+	MatrixInit(mtxCurrent[MATRIXMODE_POSITION_VECTOR]);
+	MatrixInit(mtxCurrent[MATRIXMODE_TEXTURE]);
+	MatrixInit(mtxTemporal);
 
 	MatrixStackInit(&mtxStack[0]);
 	MatrixStackInit(&mtxStack[1]);
@@ -727,13 +727,13 @@ static void SetVertex()
 	if (texCoordTransformMode == TextureTransformationMode_VertexSource)
 	{
 		//Tested by: Eledees The Adventures of Kai and Zero (E) [title screen and frontend menus]
-		last_s = (s32)(((s64)s16coord[0] * mtxCurrent[3][0] +
-						(s64)s16coord[1] * mtxCurrent[3][4] +
-						(s64)s16coord[2] * mtxCurrent[3][8] +
+		last_s = (s32)(((s64)s16coord[0] * mtxCurrent[MATRIXMODE_TEXTURE][0] +
+						(s64)s16coord[1] * mtxCurrent[MATRIXMODE_TEXTURE][4] +
+						(s64)s16coord[2] * mtxCurrent[MATRIXMODE_TEXTURE][8] +
 						(((s64)(_s))<<24))>>24);
-		last_t = (s32)(((s64)s16coord[0] * mtxCurrent[3][1] +
-						(s64)s16coord[1] * mtxCurrent[3][5] +
-						(s64)s16coord[2] * mtxCurrent[3][9] +
+		last_t = (s32)(((s64)s16coord[0] * mtxCurrent[MATRIXMODE_TEXTURE][1] +
+						(s64)s16coord[1] * mtxCurrent[MATRIXMODE_TEXTURE][5] +
+						(s64)s16coord[2] * mtxCurrent[MATRIXMODE_TEXTURE][9] +
 						(((s64)(_t))<<24))>>24);
 	}
 
@@ -744,8 +744,8 @@ static void SetVertex()
 	if(polylist->count >= POLYLIST_SIZE)
 		return;
 
-	GEM_TransformVertex(mtxCurrent[1],coordTransformed); //modelview
-	GEM_TransformVertex(mtxCurrent[0],coordTransformed); //projection
+	GEM_TransformVertex(mtxCurrent[MATRIXMODE_POSITION], coordTransformed); //modelview
+	GEM_TransformVertex(mtxCurrent[MATRIXMODE_PROJECTION], coordTransformed); //projection
 
 	//TODO - culling should be done here.
 	//TODO - viewport transform?
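Note (illustration, not part of the patch): the hunks above replace the hard-coded indices 0-3 used with mtxCurrent[] by named MATRIXMODE_* constants. The enum itself is declared elsewhere (gfx3d.h) and is not shown in this diff; a minimal sketch of the mapping implied by the one-for-one substitutions in gfx3d_reset(), assuming the enum is named MatrixMode, would be:

	enum MatrixMode
	{
		MATRIXMODE_PROJECTION      = 0,	// mtxCurrent[0]
		MATRIXMODE_POSITION        = 1,	// mtxCurrent[1], the "modelview" matrix
		MATRIXMODE_POSITION_VECTOR = 2,	// mtxCurrent[2], the directional/vector matrix
		MATRIXMODE_TEXTURE         = 3	// mtxCurrent[3]
	};

These values follow the DS MTX_MODE encoding (0 = projection, 1 = position, 2 = position & vector, 3 = texture), which is why the numeric indices can be swapped for the enum without changing behavior.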
@@ -930,7 +930,7 @@ static void gfx3d_glLightDirection_cache(const size_t index)
 	cacheLightDirection[index][3] = 0;
 
 	//Multiply the vector by the directional matrix
-	MatrixMultVec3x3_fixed(mtxCurrent[2], cacheLightDirection[index]);
+	MatrixMultVec3x3(mtxCurrent[MATRIXMODE_POSITION_VECTOR], cacheLightDirection[index]);
 
 	//Calculate the half angle vector
 	s32 lineOfSight[4] = {0, 0, (-1)<<12, 0};
@@ -1092,7 +1092,7 @@ static void gfx3d_glLoadIdentity()
 	GFX_DELAY(19);
 
 	if (mode == MATRIXMODE_POSITION_VECTOR)
-		MatrixIdentity(mtxCurrent[1]);
+		MatrixIdentity(mtxCurrent[MATRIXMODE_POSITION]);
 
 	//printf("identity: %d to: \n",mode); MatrixPrint(mtxCurrent[1]);
 }
@@ -1110,7 +1110,7 @@ static BOOL gfx3d_glLoadMatrix4x4(s32 v)
 	//vector_fix2float<4>(mtxCurrent[mode], 4096.f);
 
 	if (mode == MATRIXMODE_POSITION_VECTOR)
-		MatrixCopy(mtxCurrent[1], mtxCurrent[2]);
+		MatrixCopy(mtxCurrent[MATRIXMODE_POSITION], mtxCurrent[MATRIXMODE_POSITION_VECTOR]);
 
 	//printf("load4x4: matrix %d to: \n",mode); MatrixPrint(mtxCurrent[1]);
 	return TRUE;
@@ -1134,7 +1134,7 @@ static BOOL gfx3d_glLoadMatrix4x3(s32 v)
 	GFX_DELAY(30);
 
 	if (mode == MATRIXMODE_POSITION_VECTOR)
-		MatrixCopy(mtxCurrent[1], mtxCurrent[2]);
+		MatrixCopy(mtxCurrent[MATRIXMODE_POSITION], mtxCurrent[MATRIXMODE_POSITION_VECTOR]);
 
 	//printf("load4x3: matrix %d to: \n",mode); MatrixPrint(mtxCurrent[1]);
 	return TRUE;
 }
@@ -1155,7 +1155,7 @@ static BOOL gfx3d_glMultMatrix4x4(s32 v)
 
 	if (mode == MATRIXMODE_POSITION_VECTOR)
 	{
-		MatrixMultiply(mtxCurrent[1], mtxTemporal);
+		MatrixMultiply(mtxCurrent[MATRIXMODE_POSITION], mtxTemporal);
 		GFX_DELAY_M2(30);
 	}
@@ -1186,7 +1186,7 @@ static BOOL gfx3d_glMultMatrix4x3(s32 v)
 
 	if (mode == MATRIXMODE_POSITION_VECTOR)
 	{
-		MatrixMultiply (mtxCurrent[1], mtxTemporal);
+		MatrixMultiply (mtxCurrent[MATRIXMODE_POSITION], mtxTemporal);
 		GFX_DELAY_M2(30);
 	}
@@ -1219,7 +1219,7 @@ static BOOL gfx3d_glMultMatrix3x3(s32 v)
 
 	if (mode == MATRIXMODE_POSITION_VECTOR)
 	{
-		MatrixMultiply(mtxCurrent[1], mtxTemporal);
+		MatrixMultiply(mtxCurrent[MATRIXMODE_POSITION], mtxTemporal);
 		GFX_DELAY_M2(30);
 	}
@@ -1248,8 +1248,8 @@ static BOOL gfx3d_glScale(s32 v)
 	//note: pos-vector mode should not cause both matrices to scale.
 	//the whole purpose is to keep the vector matrix orthogonal
 	//so, I am leaving this commented out as an example of what not to do.
-	//if (mode == 2)
-	//	MatrixScale (mtxCurrent[1], scale);
+	//if (mode == MATRIXMODE_POSITION_VECTOR)
+	//	MatrixScale (mtxCurrent[MATRIXMODE_POSITION], scale);
 
 	return TRUE;
 }
@@ -1268,7 +1268,7 @@ static BOOL gfx3d_glTranslate(s32 v)
 
 	if (mode == MATRIXMODE_POSITION_VECTOR)
 	{
-		MatrixTranslate(mtxCurrent[1], trans);
+		MatrixTranslate(mtxCurrent[MATRIXMODE_POSITION], trans);
 		GFX_DELAY_M2(30);
 	}
@@ -1297,11 +1297,11 @@ static void gfx3d_glNormal(s32 v)
 	{
 		//SM64 highlight rendered star in main menu tests this
 		//also smackdown 2010 player textures tested this (needed cast on _s and _t)
-		last_s = (s32)(((s64)normal[0] * mtxCurrent[3][0] + (s64)normal[1] * mtxCurrent[3][4] + (s64)normal[2] * mtxCurrent[3][8] + (((s64)_s)<<24))>>24);
-		last_t = (s32)(((s64)normal[0] * mtxCurrent[3][1] + (s64)normal[1] * mtxCurrent[3][5] + (s64)normal[2] * mtxCurrent[3][9] + (((s64)_t)<<24))>>24);
+		last_s = (s32)(((s64)normal[0] * mtxCurrent[MATRIXMODE_TEXTURE][0] + (s64)normal[1] * mtxCurrent[MATRIXMODE_TEXTURE][4] + (s64)normal[2] * mtxCurrent[MATRIXMODE_TEXTURE][8] + (((s64)_s)<<24))>>24);
+		last_t = (s32)(((s64)normal[0] * mtxCurrent[MATRIXMODE_TEXTURE][1] + (s64)normal[1] * mtxCurrent[MATRIXMODE_TEXTURE][5] + (s64)normal[2] * mtxCurrent[MATRIXMODE_TEXTURE][9] + (((s64)_t)<<24))>>24);
 	}
 
-	MatrixMultVec3x3_fixed(mtxCurrent[2],normal);
+	MatrixMultVec3x3(mtxCurrent[MATRIXMODE_POSITION_VECTOR], normal);
 
 	//apply lighting model
 	u8 diffuse[3] = {
@@ -1395,8 +1395,8 @@ static void gfx3d_glTexCoord(s32 val)
 	if (texCoordTransformMode == TextureTransformationMode_TexCoordSource)
 	{
 		//dragon quest 4 overworld will test this
-		last_s = (s32) (( (s64)_s * mtxCurrent[3][0] + (s64)_t * mtxCurrent[3][4] + (s64)mtxCurrent[3][8] + (s64)mtxCurrent[3][12])>>12);
-		last_t = (s32) (( (s64)_s * mtxCurrent[3][1] + (s64)_t * mtxCurrent[3][5] + (s64)mtxCurrent[3][9] + (s64)mtxCurrent[3][13])>>12);
+		last_s = (s32) (( (s64)_s * mtxCurrent[MATRIXMODE_TEXTURE][0] + (s64)_t * mtxCurrent[MATRIXMODE_TEXTURE][4] + (s64)mtxCurrent[MATRIXMODE_TEXTURE][8] + (s64)mtxCurrent[MATRIXMODE_TEXTURE][12])>>12);
+		last_t = (s32) (( (s64)_s * mtxCurrent[MATRIXMODE_TEXTURE][1] + (s64)_t * mtxCurrent[MATRIXMODE_TEXTURE][5] + (s64)mtxCurrent[MATRIXMODE_TEXTURE][9] + (s64)mtxCurrent[MATRIXMODE_TEXTURE][13])>>12);
 	}
 	else if (texCoordTransformMode == TextureTransformationMode_None)
 	{
@@ -1684,13 +1684,13 @@ static BOOL gfx3d_glBoxTest(u32 v)
 		//MatrixMultVec4x4_M2(mtxCurrent[0], verts[i].coord);
 
 		//but change it all to floating point and do it that way instead
-		CACHE_ALIGN float temp1[16] = {mtxCurrent[1][0]/4096.0f,mtxCurrent[1][1]/4096.0f,mtxCurrent[1][2]/4096.0f,mtxCurrent[1][3]/4096.0f,mtxCurrent[1][4]/4096.0f,mtxCurrent[1][5]/4096.0f,mtxCurrent[1][6]/4096.0f,mtxCurrent[1][7]/4096.0f,mtxCurrent[1][8]/4096.0f,mtxCurrent[1][9]/4096.0f,mtxCurrent[1][10]/4096.0f,mtxCurrent[1][11]/4096.0f,mtxCurrent[1][12]/4096.0f,mtxCurrent[1][13]/4096.0f,mtxCurrent[1][14]/4096.0f,mtxCurrent[1][15]/4096.0f};
-		CACHE_ALIGN float temp0[16] = {mtxCurrent[0][0]/4096.0f,mtxCurrent[0][1]/4096.0f,mtxCurrent[0][2]/4096.0f,mtxCurrent[0][3]/4096.0f,mtxCurrent[0][4]/4096.0f,mtxCurrent[0][5]/4096.0f,mtxCurrent[0][6]/4096.0f,mtxCurrent[0][7]/4096.0f,mtxCurrent[0][8]/4096.0f,mtxCurrent[0][9]/4096.0f,mtxCurrent[0][10]/4096.0f,mtxCurrent[0][11]/4096.0f,mtxCurrent[0][12]/4096.0f,mtxCurrent[0][13]/4096.0f,mtxCurrent[0][14]/4096.0f,mtxCurrent[0][15]/4096.0f};
 
 		//DS_ALIGN(16) VERT_POS4f vert = { verts[i].x, verts[i].y, verts[i].z, verts[i].w };
-
-		_NOSSE_MatrixMultVec4x4(temp1,verts[i].coord);
-		_NOSSE_MatrixMultVec4x4(temp0,verts[i].coord);
+
+		//_MatrixMultVec4x4_NoSIMD(mtxCurrent[MATRIXMODE_POSITION], verts[i].coord);
+		//_MatrixMultVec4x4_NoSIMD(mtxCurrent[MATRIXMODE_PROJECTION], verts[i].coord);
+		MatrixMultVec4x4(mtxCurrent[MATRIXMODE_POSITION], verts[i].coord);
+		MatrixMultVec4x4(mtxCurrent[MATRIXMODE_PROJECTION], verts[i].coord);
 	}
 
 	//clip each poly
@@ -1742,12 +1742,9 @@ static BOOL gfx3d_glPosTest(u32 v)
 	PTind = 0;
 
 	PTcoords[3] = 1.0f;
-
-	CACHE_ALIGN float temp1[16] = {mtxCurrent[1][0]/4096.0f,mtxCurrent[1][1]/4096.0f,mtxCurrent[1][2]/4096.0f,mtxCurrent[1][3]/4096.0f,mtxCurrent[1][4]/4096.0f,mtxCurrent[1][5]/4096.0f,mtxCurrent[1][6]/4096.0f,mtxCurrent[1][7]/4096.0f,mtxCurrent[1][8]/4096.0f,mtxCurrent[1][9]/4096.0f,mtxCurrent[1][10]/4096.0f,mtxCurrent[1][11]/4096.0f,mtxCurrent[1][12]/4096.0f,mtxCurrent[1][13]/4096.0f,mtxCurrent[1][14]/4096.0f,mtxCurrent[1][15]/4096.0f};
-	CACHE_ALIGN float temp0[16] = {mtxCurrent[0][0]/4096.0f,mtxCurrent[0][1]/4096.0f,mtxCurrent[0][2]/4096.0f,mtxCurrent[0][3]/4096.0f,mtxCurrent[0][4]/4096.0f,mtxCurrent[0][5]/4096.0f,mtxCurrent[0][6]/4096.0f,mtxCurrent[0][7]/4096.0f,mtxCurrent[0][8]/4096.0f,mtxCurrent[0][9]/4096.0f,mtxCurrent[0][10]/4096.0f,mtxCurrent[0][11]/4096.0f,mtxCurrent[0][12]/4096.0f,mtxCurrent[0][13]/4096.0f,mtxCurrent[0][14]/4096.0f,mtxCurrent[0][15]/4096.0f};
-
-	MatrixMultVec4x4(temp1, PTcoords);
-	MatrixMultVec4x4(temp0, PTcoords);
+
+	MatrixMultVec4x4(mtxCurrent[MATRIXMODE_POSITION], PTcoords);
+	MatrixMultVec4x4(mtxCurrent[MATRIXMODE_PROJECTION], PTcoords);
 
 	MMU_new.gxstat.tb = 0;
@@ -1765,13 +1762,14 @@ static void gfx3d_glVecTest(u32 v)
 	//i am not sure exactly what it is doing, maybe it is testing to ensure
 	//that the normal vector for the point of interest is camera-facing.
 
-	CACHE_ALIGN float normal[4] = {	normalTable[v&1023],
-									normalTable[(v>>10)&1023],
-									normalTable[(v>>20)&1023],
-									0};
-
-	CACHE_ALIGN float temp[16] = {mtxCurrent[2][0]/4096.0f,mtxCurrent[2][1]/4096.0f,mtxCurrent[2][2]/4096.0f,mtxCurrent[2][3]/4096.0f,mtxCurrent[2][4]/4096.0f,mtxCurrent[2][5]/4096.0f,mtxCurrent[2][6]/4096.0f,mtxCurrent[2][7]/4096.0f,mtxCurrent[2][8]/4096.0f,mtxCurrent[2][9]/4096.0f,mtxCurrent[2][10]/4096.0f,mtxCurrent[2][11]/4096.0f,mtxCurrent[2][12]/4096.0f,mtxCurrent[2][13]/4096.0f,mtxCurrent[2][14]/4096.0f,mtxCurrent[2][15]/4096.0f};
-	MatrixMultVec4x4(temp, normal);
+	CACHE_ALIGN float normal[4] = {
+		normalTable[v&1023],
+		normalTable[(v>>10)&1023],
+		normalTable[(v>>20)&1023],
+		0
+	};
+
+	MatrixMultVec4x4(mtxCurrent[MATRIXMODE_POSITION_VECTOR], normal);
 
 	s16 x = (s16)(normal[0]*4096);
 	s16 y = (s16)(normal[1]*4096);
@@ -1853,7 +1851,7 @@ void gfx3d_UpdateToonTable(u8 offset, u32 val)
 s32 gfx3d_GetClipMatrix(const u32 index)
 {
 	//printf("reading clip matrix: %d\n",index);
-	return (s32)MatrixGetMultipliedIndex(index, mtxCurrent[0], mtxCurrent[1]);
+	return (s32)MatrixGetMultipliedIndex(index, mtxCurrent[MATRIXMODE_PROJECTION], mtxCurrent[MATRIXMODE_POSITION]);
 }
 
 s32 gfx3d_GetDirectionalMatrix(const u32 index)
@@ -1861,7 +1859,7 @@ s32 gfx3d_GetDirectionalMatrix(const u32 index)
 	const size_t _index = (((index / 3) * 4) + (index % 3));
 
 	//return (s32)(mtxCurrent[2][_index]*(1<<12));
-	return mtxCurrent[2][_index];
+	return mtxCurrent[MATRIXMODE_POSITION_VECTOR][_index];
 }
 
 void gfx3d_glAlphaFunc(u32 v)
diff --git a/desmume/src/matrix.cpp b/desmume/src/matrix.cpp
index b3c175aae..14f805207 100644
--- a/desmume/src/matrix.cpp
+++ b/desmume/src/matrix.cpp
@@ -1,6 +1,6 @@
 /*
 	Copyright (C) 2006-2007 shash
-	Copyright (C) 2007-2017 DeSmuME team
+	Copyright (C) 2007-2018 DeSmuME team
 
 	This file is free software: you can redistribute it and/or modify
 	it under the terms of the GNU General Public License as published by
@@ -24,118 +24,6 @@
 #include "matrix.h"
 #include "MMU.h"
 
-void _NOSSE_MatrixMultVec4x4 (const float *matrix, float *vecPtr)
-{
-	float x = vecPtr[0];
-	float y = vecPtr[1];
-	float z = vecPtr[2];
-	float w = vecPtr[3];
-
-	vecPtr[0] = x * matrix[0] + y * matrix[4] + z * matrix[ 8] + w * matrix[12];
-	vecPtr[1] = x * matrix[1] + y * matrix[5] + z * matrix[ 9] + w * matrix[13];
-	vecPtr[2] = x * matrix[2] + y * matrix[6] + z * matrix[10] + w * matrix[14];
-	vecPtr[3] = x * matrix[3] + y * matrix[7] + z * matrix[11] + w * matrix[15];
-}
-
-void MatrixMultVec4x4 (const s32 *matrix, s32 *vecPtr)
-{
-	const s32 x = vecPtr[0];
-	const s32 y = vecPtr[1];
-	const s32 z = vecPtr[2];
-	const s32 w = vecPtr[3];
-
-	vecPtr[0] = sfx32_shiftdown(fx32_mul(x,matrix[0]) + fx32_mul(y,matrix[4]) + fx32_mul(z,matrix [8]) + fx32_mul(w,matrix[12]));
-	vecPtr[1] = sfx32_shiftdown(fx32_mul(x,matrix[1]) + fx32_mul(y,matrix[5]) + fx32_mul(z,matrix[ 9]) + fx32_mul(w,matrix[13]));
-	vecPtr[2] = sfx32_shiftdown(fx32_mul(x,matrix[2]) + fx32_mul(y,matrix[6]) + fx32_mul(z,matrix[10]) + fx32_mul(w,matrix[14]));
-	vecPtr[3] = sfx32_shiftdown(fx32_mul(x,matrix[3]) + fx32_mul(y,matrix[7]) + fx32_mul(z,matrix[11]) + fx32_mul(w,matrix[15]));
-}
-
-void MatrixMultVec3x3_fixed(const s32 *matrix, s32 *vecPtr)
-{
-	const s32 x = vecPtr[0];
-	const s32 y = vecPtr[1];
-	const s32 z = vecPtr[2];
-
-	vecPtr[0] = sfx32_shiftdown(fx32_mul(x,matrix[0]) + fx32_mul(y,matrix[4]) + fx32_mul(z,matrix[8]));
-	vecPtr[1] = sfx32_shiftdown(fx32_mul(x,matrix[1]) + fx32_mul(y,matrix[5]) + fx32_mul(z,matrix[9]));
-	vecPtr[2] = sfx32_shiftdown(fx32_mul(x,matrix[2]) + fx32_mul(y,matrix[6]) + fx32_mul(z,matrix[10]));
-}
-
-//-------------------------
-//switched SSE functions: implementations for no SSE
-#ifndef ENABLE_SSE
-void MatrixMultVec4x4 (const float *matrix, float *vecPtr)
-{
-	_NOSSE_MatrixMultVec4x4(matrix, vecPtr);
-}
-
-
-void MatrixMultVec3x3 (const float *matrix, float *vecPtr)
-{
-	float x = vecPtr[0];
-	float y = vecPtr[1];
-	float z = vecPtr[2];
-
-	vecPtr[0] = x * matrix[0] + y * matrix[4] + z * matrix[ 8];
-	vecPtr[1] = x * matrix[1] + y * matrix[5] + z * matrix[ 9];
-	vecPtr[2] = x * matrix[2] + y * matrix[6] + z * matrix[10];
-}
-
-void MatrixMultiply (float *matrix, const float *rightMatrix)
-{
-	float tmpMatrix[16];
-
-	tmpMatrix[0] = (matrix[0]*rightMatrix[0])+(matrix[4]*rightMatrix[1])+(matrix[8]*rightMatrix[2])+(matrix[12]*rightMatrix[3]);
-	tmpMatrix[1] = (matrix[1]*rightMatrix[0])+(matrix[5]*rightMatrix[1])+(matrix[9]*rightMatrix[2])+(matrix[13]*rightMatrix[3]);
-	tmpMatrix[2] = (matrix[2]*rightMatrix[0])+(matrix[6]*rightMatrix[1])+(matrix[10]*rightMatrix[2])+(matrix[14]*rightMatrix[3]);
-	tmpMatrix[3] = (matrix[3]*rightMatrix[0])+(matrix[7]*rightMatrix[1])+(matrix[11]*rightMatrix[2])+(matrix[15]*rightMatrix[3]);
-
-	tmpMatrix[4] = (matrix[0]*rightMatrix[4])+(matrix[4]*rightMatrix[5])+(matrix[8]*rightMatrix[6])+(matrix[12]*rightMatrix[7]);
-	tmpMatrix[5] = (matrix[1]*rightMatrix[4])+(matrix[5]*rightMatrix[5])+(matrix[9]*rightMatrix[6])+(matrix[13]*rightMatrix[7]);
-	tmpMatrix[6] = (matrix[2]*rightMatrix[4])+(matrix[6]*rightMatrix[5])+(matrix[10]*rightMatrix[6])+(matrix[14]*rightMatrix[7]);
-	tmpMatrix[7] = (matrix[3]*rightMatrix[4])+(matrix[7]*rightMatrix[5])+(matrix[11]*rightMatrix[6])+(matrix[15]*rightMatrix[7]);
-
-	tmpMatrix[8] = (matrix[0]*rightMatrix[8])+(matrix[4]*rightMatrix[9])+(matrix[8]*rightMatrix[10])+(matrix[12]*rightMatrix[11]);
-	tmpMatrix[9] = (matrix[1]*rightMatrix[8])+(matrix[5]*rightMatrix[9])+(matrix[9]*rightMatrix[10])+(matrix[13]*rightMatrix[11]);
-	tmpMatrix[10] = (matrix[2]*rightMatrix[8])+(matrix[6]*rightMatrix[9])+(matrix[10]*rightMatrix[10])+(matrix[14]*rightMatrix[11]);
-	tmpMatrix[11] = (matrix[3]*rightMatrix[8])+(matrix[7]*rightMatrix[9])+(matrix[11]*rightMatrix[10])+(matrix[15]*rightMatrix[11]);
-
-	tmpMatrix[12] = (matrix[0]*rightMatrix[12])+(matrix[4]*rightMatrix[13])+(matrix[8]*rightMatrix[14])+(matrix[12]*rightMatrix[15]);
-	tmpMatrix[13] = (matrix[1]*rightMatrix[12])+(matrix[5]*rightMatrix[13])+(matrix[9]*rightMatrix[14])+(matrix[13]*rightMatrix[15]);
-	tmpMatrix[14] = (matrix[2]*rightMatrix[12])+(matrix[6]*rightMatrix[13])+(matrix[10]*rightMatrix[14])+(matrix[14]*rightMatrix[15]);
-	tmpMatrix[15] = (matrix[3]*rightMatrix[12])+(matrix[7]*rightMatrix[13])+(matrix[11]*rightMatrix[14])+(matrix[15]*rightMatrix[15]);
-
-	memcpy (matrix, tmpMatrix, sizeof(float)*16);
-}
-
-void MatrixTranslate (float *matrix, const float *ptr)
-{
-	matrix[12] += (matrix[0]*ptr[0])+(matrix[4]*ptr[1])+(matrix[ 8]*ptr[2]);
-	matrix[13] += (matrix[1]*ptr[0])+(matrix[5]*ptr[1])+(matrix[ 9]*ptr[2]);
-	matrix[14] += (matrix[2]*ptr[0])+(matrix[6]*ptr[1])+(matrix[10]*ptr[2]);
-	matrix[15] += (matrix[3]*ptr[0])+(matrix[7]*ptr[1])+(matrix[11]*ptr[2]);
-}
-
-void MatrixScale (float *matrix, const float *ptr)
-{
-	matrix[0] *= ptr[0];
-	matrix[1] *= ptr[0];
-	matrix[2] *= ptr[0];
-	matrix[3] *= ptr[0];
-
-	matrix[4] *= ptr[1];
-	matrix[5] *= ptr[1];
-	matrix[6] *= ptr[1];
-	matrix[7] *= ptr[1];
-
-	matrix[8] *= ptr[2];
-	matrix[9] *= ptr[2];
-	matrix[10] *= ptr[2];
-	matrix[11] *= ptr[2];
-}
-
-#endif //switched c/asm functions
-//-----------------------------------------
 
 void MatrixInit (s32 *matrix)
 {
@@ -345,51 +233,487 @@ void Vector4Copy(float *dst, const float *src)
 	dst[3] = src[3];
 }
 
+void _MatrixMultVec4x4_NoSIMD(const s32 *__restrict mtxPtr, float *__restrict vecPtr)
+{
+	const CACHE_ALIGN float mtxFloat[16] = {
+		mtxPtr[ 0] / 4096.0f,
+		mtxPtr[ 1] / 4096.0f,
+		mtxPtr[ 2] / 4096.0f,
+		mtxPtr[ 3] / 4096.0f,
+
+		mtxPtr[ 4] / 4096.0f,
+		mtxPtr[ 5] / 4096.0f,
+		mtxPtr[ 6] / 4096.0f,
+		mtxPtr[ 7] / 4096.0f,
+
+		mtxPtr[ 8] / 4096.0f,
+		mtxPtr[ 9] / 4096.0f,
+		mtxPtr[10] / 4096.0f,
+		mtxPtr[11] / 4096.0f,
+
+		mtxPtr[12] / 4096.0f,
+		mtxPtr[13] / 4096.0f,
+		mtxPtr[14] / 4096.0f,
+		mtxPtr[15] / 4096.0f
+	};
+
+	const float x = vecPtr[0];
+	const float y = vecPtr[1];
+	const float z = vecPtr[2];
+	const float w = vecPtr[3];
+
+	vecPtr[0] = (x * mtxFloat[0]) + (y * mtxFloat[4]) + (z * mtxFloat[ 8]) + (w * mtxFloat[12]);
+	vecPtr[1] = (x * mtxFloat[1]) + (y * mtxFloat[5]) + (z * mtxFloat[ 9]) + (w * mtxFloat[13]);
+	vecPtr[2] = (x * mtxFloat[2]) + (y * mtxFloat[6]) + (z * mtxFloat[10]) + (w * mtxFloat[14]);
+	vecPtr[3] = (x * mtxFloat[3]) + (y * mtxFloat[7]) + (z * mtxFloat[11]) + (w * mtxFloat[15]);
+}
 
-void MatrixMultiply (s32 *matrix, const s32 *rightMatrix)
+#ifdef ENABLE_SSE
+
+void MatrixMultVec4x4(const s32 *__restrict mtxPtr, float *__restrict vecPtr)
+{
+	const __m128 loadedVecPtr = _mm_load_ps(vecPtr);
+	const __m128 convertScalar = _mm_set1_ps(1.0f/4096.0f);
+
+#ifdef ENABLE_SSE2
+	__m128 row[4] = {
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 0)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 4)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 8)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 12)) )
+	};
+#else
+	const CACHE_ALIGN float mtxFloat[16] = {
+		(float)mtxPtr[0],
+		(float)mtxPtr[1],
+		(float)mtxPtr[2],
+		(float)mtxPtr[3],
+
+		(float)mtxPtr[4],
+		(float)mtxPtr[5],
+		(float)mtxPtr[6],
+		(float)mtxPtr[7],
+
+		(float)mtxPtr[8],
+		(float)mtxPtr[9],
+		(float)mtxPtr[10],
+		(float)mtxPtr[11],
+
+		(float)mtxPtr[12],
+		(float)mtxPtr[13],
+		(float)mtxPtr[14],
+		(float)mtxPtr[15]
+	};
+
+	__m128 row[4] = {
+		_mm_load_ps(mtxFloat + 0),
+		_mm_load_ps(mtxFloat + 4),
+		_mm_load_ps(mtxFloat + 8),
+		_mm_load_ps(mtxFloat + 12)
+	};
+#endif
+
+	row[0] = _mm_mul_ps(row[0], convertScalar);
+	row[1] = _mm_mul_ps(row[1], convertScalar);
+	row[2] = _mm_mul_ps(row[2], convertScalar);
+	row[3] = _mm_mul_ps(row[3], convertScalar);
+
+	const __m128 vec[4] = {
+		_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x00),
+		_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x55),
+		_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0xAA),
+		_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0xFF)
+	};
+
+	const __m128 calcVec = _mm_add_ps( _mm_mul_ps(row[0], vec[0]), _mm_add_ps(_mm_mul_ps(row[1], vec[1]), _mm_add_ps(_mm_mul_ps(row[2], vec[2]), _mm_mul_ps(row[3], vec[3]))) );
+	_mm_store_ps(vecPtr, calcVec);
+}
+
+void MatrixMultVec3x3(const s32 *__restrict mtxPtr, float *__restrict vecPtr)
+{
+	const __m128 loadedVecPtr = _mm_load_ps(vecPtr);
+	const __m128 convertScalar = _mm_set1_ps(1.0f/4096.0f);
+
+#ifdef ENABLE_SSE2
+	__m128 row[3] = {
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 0)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 4)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 8)) )
+	};
+#else
+	const CACHE_ALIGN float mtxFloat[16] = {
+		(float)mtxPtr[0],
+		(float)mtxPtr[1],
+		(float)mtxPtr[2],
+		(float)mtxPtr[3],
+
+		(float)mtxPtr[4],
+		(float)mtxPtr[5],
+		(float)mtxPtr[6],
+		(float)mtxPtr[7],
+
+		(float)mtxPtr[8],
+		(float)mtxPtr[9],
+		(float)mtxPtr[10],
+		(float)mtxPtr[11],
+
+		(float)mtxPtr[12],
+		(float)mtxPtr[13],
+		(float)mtxPtr[14],
+		(float)mtxPtr[15]
+	};
+
+	__m128 row[3] = {
+		_mm_load_ps(mtxFloat + 0),
+		_mm_load_ps(mtxFloat + 4),
+		_mm_load_ps(mtxFloat + 8)
+	};
+#endif
+
+	row[0] = _mm_mul_ps(row[0], convertScalar);
+	row[1] = _mm_mul_ps(row[1], convertScalar);
+	row[2] = _mm_mul_ps(row[2], convertScalar);
+
+	const __m128 vec[3] = {
+		_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x00),
+		_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x55),
+		_mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0xAA)
+	};
+
+	const __m128 calcVec = _mm_add_ps( _mm_mul_ps(row[0], vec[0]), _mm_add_ps(_mm_mul_ps(row[1], vec[1]), _mm_mul_ps(row[2], vec[2])) );
+	_mm_store_ps(vecPtr, calcVec);
+}
+
+void MatrixTranslate(float *__restrict mtxPtr, const float *__restrict vecPtr)
+{
+	__m128 xmm4 = _mm_load_ps(vecPtr);
+	__m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101));
+	__m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010));
+	xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000));
+
+	xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(mtxPtr));
+	xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(mtxPtr+4));
+	xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(mtxPtr+8));
+	xmm4 = _mm_add_ps(xmm4,xmm5);
+	xmm4 = _mm_add_ps(xmm4,xmm6);
+	xmm4 = _mm_add_ps(xmm4,_mm_load_ps(mtxPtr+12));
+	_mm_store_ps(mtxPtr+12,xmm4);
+}
+
+void MatrixScale(float *__restrict mtxPtr, const float *__restrict vecPtr)
+{
+	__m128 xmm4 = _mm_load_ps(vecPtr);
+	__m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101));
+	__m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010));
+	xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000));
+
+	xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(mtxPtr));
+	xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(mtxPtr+4));
+	xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(mtxPtr+8));
+	_mm_store_ps(mtxPtr,xmm4);
+	_mm_store_ps(mtxPtr+4,xmm5);
+	_mm_store_ps(mtxPtr+8,xmm6);
+}
+
+void MatrixMultiply(float *__restrict mtxPtrA, const s32 *__restrict mtxPtrB)
+{
+	const __m128 convertScale = _mm_set1_ps(1.0f/4096.0f);
+
+#ifdef ENABLE_SSE2
+	__m128 rowB[4] = {
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 0)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 4)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 8)) ),
+		_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 12)) )
+	};
+#else
+	const CACHE_ALIGN float mtxFloatB[16] = {
+		(float)mtxPtrB[0],
+		(float)mtxPtrB[1],
+		(float)mtxPtrB[2],
+		(float)mtxPtrB[3],
+
+		(float)mtxPtrB[4],
+		(float)mtxPtrB[5],
+		(float)mtxPtrB[6],
+		(float)mtxPtrB[7],
+
+		(float)mtxPtrB[8],
+		(float)mtxPtrB[9],
+		(float)mtxPtrB[10],
+		(float)mtxPtrB[11],
+
+		(float)mtxPtrB[12],
+		(float)mtxPtrB[13],
+		(float)mtxPtrB[14],
+		(float)mtxPtrB[15]
+	};
+
+	__m128 rowB[4] = {
+		_mm_load_ps(mtxFloatB + 0),
+		_mm_load_ps(mtxFloatB + 4),
+		_mm_load_ps(mtxFloatB + 8),
+		_mm_load_ps(mtxFloatB + 12)
+	};
+#endif
+
+	rowB[0] = _mm_mul_ps(rowB[0], convertScale);
+	rowB[1] = _mm_mul_ps(rowB[1], convertScale);
+	rowB[2] = _mm_mul_ps(rowB[2], convertScale);
+	rowB[3] = _mm_mul_ps(rowB[3], convertScale);
+
+	__m128 rowA[4] = {
+		_mm_load_ps(mtxPtrA + 0),
+		_mm_load_ps(mtxPtrA + 4),
+		_mm_load_ps(mtxPtrA + 8),
+		_mm_load_ps(mtxPtrA + 12)
+	};
+
+	__m128 vecB[4];
+	__m128 calcRow;
+
+	vecB[0] = _mm_shuffle_ps(rowB[0], rowB[0], 0x00);
+	vecB[1] = _mm_shuffle_ps(rowB[0], rowB[0], 0x55);
+	vecB[2] = _mm_shuffle_ps(rowB[0], rowB[0], 0xAA);
+	vecB[3] = _mm_shuffle_ps(rowB[0], rowB[0], 0xFF);
+	calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) );
+	_mm_store_ps(mtxPtrA + 0, calcRow);
+
+	vecB[0] = _mm_shuffle_ps(rowB[1], rowB[1], 0x00);
+	vecB[1] = _mm_shuffle_ps(rowB[1], rowB[1], 0x55);
+	vecB[2] = _mm_shuffle_ps(rowB[1], rowB[1], 0xAA);
+	vecB[3] = _mm_shuffle_ps(rowB[1], rowB[1], 0xFF);
+	calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) );
+	_mm_store_ps(mtxPtrA + 4, calcRow);
+
+	vecB[0] = _mm_shuffle_ps(rowB[2], rowB[2], 0x00);
+	vecB[1] = _mm_shuffle_ps(rowB[2], rowB[2], 0x55);
+	vecB[2] = _mm_shuffle_ps(rowB[2], rowB[2], 0xAA);
+	vecB[3] = _mm_shuffle_ps(rowB[2], rowB[2], 0xFF);
+	calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) );
+	_mm_store_ps(mtxPtrA + 8, calcRow);
+
+	vecB[0] = _mm_shuffle_ps(rowB[3], rowB[3], 0x00);
+	vecB[1] = _mm_shuffle_ps(rowB[3], rowB[3], 0x55);
+	vecB[2] = _mm_shuffle_ps(rowB[3], rowB[3], 0xAA);
+	vecB[3] = _mm_shuffle_ps(rowB[3], rowB[3], 0xFF);
+	calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) );
+	_mm_store_ps(mtxPtrA + 12, calcRow);
+}
+
+template <size_t NUM_ROWS>
+FORCEINLINE void vector_fix2float(float *mtxPtr, const float divisor)
+{
+	const __m128 divisor_v128 = _mm_set1_ps(divisor);
+
+	for (size_t i = 0; i < NUM_ROWS * 4; i+=4)
+	{
+		_mm_store_ps( mtxPtr + i, _mm_div_ps(_mm_load_ps(mtxPtr + i), divisor_v128) );
+	}
+}
+
+#else
+
+void MatrixMultVec4x4(const s32 *__restrict mtxPtr, float *__restrict vecPtr)
+{
+	_MatrixMultVec4x4_NoSIMD(mtxPtr, vecPtr);
+}
+
+void MatrixMultVec3x3(const s32 *__restrict mtxPtr, float *__restrict vecPtr)
+{
+	const CACHE_ALIGN float mtxFloat[16] = {
+		mtxPtr[ 0] / 4096.0f,
+		mtxPtr[ 1] / 4096.0f,
+		mtxPtr[ 2] / 4096.0f,
+		mtxPtr[ 3] / 4096.0f,
+
+		mtxPtr[ 4] / 4096.0f,
+		mtxPtr[ 5] / 4096.0f,
+		mtxPtr[ 6] / 4096.0f,
+		mtxPtr[ 7] / 4096.0f,
+
+		mtxPtr[ 8] / 4096.0f,
+		mtxPtr[ 9] / 4096.0f,
+		mtxPtr[10] / 4096.0f,
+		mtxPtr[11] / 4096.0f,
+
+		mtxPtr[12] / 4096.0f,
+		mtxPtr[13] / 4096.0f,
+		mtxPtr[14] / 4096.0f,
+		mtxPtr[15] / 4096.0f
+	};
+
+	const float x = vecPtr[0];
+	const float y = vecPtr[1];
+	const float z = vecPtr[2];
+
+	vecPtr[0] = (x * mtxFloat[0]) + (y * mtxFloat[4]) + (z * mtxFloat[ 8]);
+	vecPtr[1] = (x * mtxFloat[1]) + (y * mtxFloat[5]) + (z * mtxFloat[ 9]);
+	vecPtr[2] = (x * mtxFloat[2]) + (y * mtxFloat[6]) + (z * mtxFloat[10]);
+}
+
+void MatrixTranslate(float *__restrict mtxPtr, const float *__restrict vecPtr)
+{
+	mtxPtr[12] += (mtxPtr[0] * vecPtr[0]) + (mtxPtr[4] * vecPtr[1]) + (mtxPtr[ 8] * vecPtr[2]);
+	mtxPtr[13] += (mtxPtr[1] * vecPtr[0]) + (mtxPtr[5] * vecPtr[1]) + (mtxPtr[ 9] * vecPtr[2]);
+	mtxPtr[14] += (mtxPtr[2] * vecPtr[0]) + (mtxPtr[6] * vecPtr[1]) + (mtxPtr[10] * vecPtr[2]);
+	mtxPtr[15] += (mtxPtr[3] * vecPtr[0]) + (mtxPtr[7] * vecPtr[1]) + (mtxPtr[11] * vecPtr[2]);
+}
+
+void MatrixScale(float *__restrict mtxPtr, const float *__restrict vecPtr)
+{
+	mtxPtr[ 0] *= vecPtr[0];
+	mtxPtr[ 1] *= vecPtr[0];
+	mtxPtr[ 2] *= vecPtr[0];
+	mtxPtr[ 3] *= vecPtr[0];
+
+	mtxPtr[ 4] *= vecPtr[1];
+	mtxPtr[ 5] *= vecPtr[1];
+	mtxPtr[ 6] *= vecPtr[1];
+	mtxPtr[ 7] *= vecPtr[1];
+
+	mtxPtr[ 8] *= vecPtr[2];
+	mtxPtr[ 9] *= vecPtr[2];
+	mtxPtr[10] *= vecPtr[2];
+	mtxPtr[11] *= vecPtr[2];
+}
+
+void MatrixMultiply(float *__restrict mtxPtrA, const s32 *__restrict mtxPtrB)
+{
+	const CACHE_ALIGN float mtxFloatB[16] = {
+		(float)mtxPtrB[ 0],
+		(float)mtxPtrB[ 1],
+		(float)mtxPtrB[ 2],
+		(float)mtxPtrB[ 3],
+
+		(float)mtxPtrB[ 4],
+		(float)mtxPtrB[ 5],
+		(float)mtxPtrB[ 6],
+		(float)mtxPtrB[ 7],
+
+		(float)mtxPtrB[ 8],
+		(float)mtxPtrB[ 9],
+		(float)mtxPtrB[10],
+		(float)mtxPtrB[11],
+
+		(float)mtxPtrB[12],
+		(float)mtxPtrB[13],
+		(float)mtxPtrB[14],
+		(float)mtxPtrB[15]
+	};
+
+	float tmpMatrix[16];
+
+	tmpMatrix[0] = (mtxPtrA[ 0] * mtxFloatB[ 0]) + (mtxPtrA[ 4] * mtxFloatB[ 1]) + (mtxPtrA[ 8] * mtxFloatB[ 2]) + (mtxPtrA[12] * mtxFloatB[ 3]);
+	tmpMatrix[1] = (mtxPtrA[ 1] * mtxFloatB[ 0]) + (mtxPtrA[ 5] * mtxFloatB[ 1]) + (mtxPtrA[ 9] * mtxFloatB[ 2]) + (mtxPtrA[13] * mtxFloatB[ 3]);
+	tmpMatrix[2] = (mtxPtrA[ 2] * mtxFloatB[ 0]) + (mtxPtrA[ 6] * mtxFloatB[ 1]) + (mtxPtrA[10] * mtxFloatB[ 2]) + (mtxPtrA[14] * mtxFloatB[ 3]);
+	tmpMatrix[3] = (mtxPtrA[ 3] * mtxFloatB[ 0]) + (mtxPtrA[ 7] * mtxFloatB[ 1]) + (mtxPtrA[11] * mtxFloatB[ 2]) + (mtxPtrA[15] * mtxFloatB[ 3]);
+
+	tmpMatrix[4] = (mtxPtrA[ 0] * mtxFloatB[ 4]) + (mtxPtrA[ 4] * mtxFloatB[ 5]) + (mtxPtrA[ 8] * mtxFloatB[ 6]) + (mtxPtrA[12] * mtxFloatB[ 7]);
+	tmpMatrix[5] = (mtxPtrA[ 1] * mtxFloatB[ 4]) + (mtxPtrA[ 5] * mtxFloatB[ 5]) + (mtxPtrA[ 9] * mtxFloatB[ 6]) + (mtxPtrA[13] * mtxFloatB[ 7]);
+	tmpMatrix[6] = (mtxPtrA[ 2] * mtxFloatB[ 4]) + (mtxPtrA[ 6] * mtxFloatB[ 5]) + (mtxPtrA[10] * mtxFloatB[ 6]) + (mtxPtrA[14] * mtxFloatB[ 7]);
+	tmpMatrix[7] = (mtxPtrA[ 3] * mtxFloatB[ 4]) + (mtxPtrA[ 7] * mtxFloatB[ 5]) + (mtxPtrA[11] * mtxFloatB[ 6]) + (mtxPtrA[15] * mtxFloatB[ 7]);
+
+	tmpMatrix[8] = (mtxPtrA[ 0] * mtxFloatB[ 8]) + (mtxPtrA[ 4] * mtxFloatB[ 9]) + (mtxPtrA[ 8] * mtxFloatB[10]) + (mtxPtrA[12] * mtxFloatB[11]);
+	tmpMatrix[9] = (mtxPtrA[ 1] * mtxFloatB[ 8]) + (mtxPtrA[ 5] * mtxFloatB[ 9]) + (mtxPtrA[ 9] * mtxFloatB[10]) + (mtxPtrA[13] * mtxFloatB[11]);
+	tmpMatrix[10] = (mtxPtrA[ 2] * mtxFloatB[ 8]) + (mtxPtrA[ 6] * mtxFloatB[ 9]) + (mtxPtrA[10] * mtxFloatB[10]) + (mtxPtrA[14] * mtxFloatB[11]);
+	tmpMatrix[11] = (mtxPtrA[ 3] * mtxFloatB[ 8]) + (mtxPtrA[ 7] * mtxFloatB[ 9]) + (mtxPtrA[11] * mtxFloatB[10]) + (mtxPtrA[15] * mtxFloatB[11]);
+
+	tmpMatrix[12] = (mtxPtrA[ 0] * mtxFloatB[12]) + (mtxPtrA[ 4] * mtxFloatB[13]) + (mtxPtrA[ 8] * mtxFloatB[14]) + (mtxPtrA[12] * mtxFloatB[15]);
+	tmpMatrix[13] = (mtxPtrA[ 1] * mtxFloatB[12]) + (mtxPtrA[ 5] * mtxFloatB[13]) + (mtxPtrA[ 9] * mtxFloatB[14]) + (mtxPtrA[13] * mtxFloatB[15]);
+	tmpMatrix[14] = (mtxPtrA[ 2] * mtxFloatB[12]) + (mtxPtrA[ 6] * mtxFloatB[13]) + (mtxPtrA[10] * mtxFloatB[14]) + (mtxPtrA[14] * mtxFloatB[15]);
+	tmpMatrix[15] = (mtxPtrA[ 3] * mtxFloatB[12]) + (mtxPtrA[ 7] * mtxFloatB[13]) + (mtxPtrA[11] * mtxFloatB[14]) + (mtxPtrA[15] * mtxFloatB[15]);
+
+	memcpy(mtxPtrA, tmpMatrix, sizeof(float)*16);
+}
+
+template <size_t NUM_ROWS>
+FORCEINLINE void vector_fix2float(float *mtxPtr, const float divisor)
+{
+	for (size_t i = 0; i < NUM_ROWS * 4; i+=4)
+	{
+		mtxPtr[i+0] /= divisor;
+		mtxPtr[i+1] /= divisor;
+		mtxPtr[i+2] /= divisor;
+		mtxPtr[i+3] /= divisor;
+	}
+}
+
+#endif
+
+void MatrixMultVec4x4(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr)
+{
+	const s32 x = vecPtr[0];
+	const s32 y = vecPtr[1];
+	const s32 z = vecPtr[2];
+	const s32 w = vecPtr[3];
+
+	vecPtr[0] = sfx32_shiftdown( fx32_mul(x,mtxPtr[0]) + fx32_mul(y,mtxPtr[4]) + fx32_mul(z,mtxPtr[ 8]) + fx32_mul(w,mtxPtr[12]) );
+	vecPtr[1] = sfx32_shiftdown( fx32_mul(x,mtxPtr[1]) + fx32_mul(y,mtxPtr[5]) + fx32_mul(z,mtxPtr[ 9]) + fx32_mul(w,mtxPtr[13]) );
+	vecPtr[2] = sfx32_shiftdown( fx32_mul(x,mtxPtr[2]) + fx32_mul(y,mtxPtr[6]) + fx32_mul(z,mtxPtr[10]) + fx32_mul(w,mtxPtr[14]) );
+	vecPtr[3] = sfx32_shiftdown( fx32_mul(x,mtxPtr[3]) + fx32_mul(y,mtxPtr[7]) + fx32_mul(z,mtxPtr[11]) + fx32_mul(w,mtxPtr[15]) );
+}
+
+void MatrixMultVec3x3(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr)
+{
+	const s32 x = vecPtr[0];
+	const s32 y = vecPtr[1];
+	const s32 z = vecPtr[2];
+
+	vecPtr[0] = sfx32_shiftdown( fx32_mul(x,mtxPtr[0]) + fx32_mul(y,mtxPtr[4]) + fx32_mul(z,mtxPtr[ 8]) );
+	vecPtr[1] = sfx32_shiftdown( fx32_mul(x,mtxPtr[1]) + fx32_mul(y,mtxPtr[5]) + fx32_mul(z,mtxPtr[ 9]) );
+	vecPtr[2] = sfx32_shiftdown( fx32_mul(x,mtxPtr[2]) + fx32_mul(y,mtxPtr[6]) + fx32_mul(z,mtxPtr[10]) );
+}
+
+void MatrixTranslate(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr)
+{
+	mtxPtr[12] = sfx32_shiftdown( fx32_mul(mtxPtr[0], vecPtr[0]) + fx32_mul(mtxPtr[4], vecPtr[1]) + fx32_mul(mtxPtr[ 8], vecPtr[2]) + fx32_shiftup(mtxPtr[12]) );
+	mtxPtr[13] = sfx32_shiftdown( fx32_mul(mtxPtr[1], vecPtr[0]) + fx32_mul(mtxPtr[5], vecPtr[1]) + fx32_mul(mtxPtr[ 9], vecPtr[2]) + fx32_shiftup(mtxPtr[13]) );
+	mtxPtr[14] = sfx32_shiftdown( fx32_mul(mtxPtr[2], vecPtr[0]) + fx32_mul(mtxPtr[6], vecPtr[1]) + fx32_mul(mtxPtr[10], vecPtr[2]) + fx32_shiftup(mtxPtr[14]) );
+	mtxPtr[15] = sfx32_shiftdown( fx32_mul(mtxPtr[3], vecPtr[0]) + fx32_mul(mtxPtr[7], vecPtr[1]) + fx32_mul(mtxPtr[11], vecPtr[2]) + fx32_shiftup(mtxPtr[15]) );
+}
+
+void MatrixScale(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr)
+{
+	mtxPtr[ 0] = sfx32_shiftdown( fx32_mul(mtxPtr[ 0], vecPtr[0]) );
+	mtxPtr[ 1] = sfx32_shiftdown( fx32_mul(mtxPtr[ 1], vecPtr[0]) );
+	mtxPtr[ 2] = sfx32_shiftdown( fx32_mul(mtxPtr[ 2], vecPtr[0]) );
+	mtxPtr[ 3] = sfx32_shiftdown( fx32_mul(mtxPtr[ 3], vecPtr[0]) );
+
+	mtxPtr[ 4] = sfx32_shiftdown( fx32_mul(mtxPtr[ 4], vecPtr[1]) );
+	mtxPtr[ 5] = sfx32_shiftdown( fx32_mul(mtxPtr[ 5], vecPtr[1]) );
+	mtxPtr[ 6] = sfx32_shiftdown( fx32_mul(mtxPtr[ 6], vecPtr[1]) );
+	mtxPtr[ 7] = sfx32_shiftdown( fx32_mul(mtxPtr[ 7], vecPtr[1]) );
+
+	mtxPtr[ 8] = sfx32_shiftdown( fx32_mul(mtxPtr[ 8], vecPtr[2]) );
+	mtxPtr[ 9] = sfx32_shiftdown( fx32_mul(mtxPtr[ 9], vecPtr[2]) );
+	mtxPtr[10] = sfx32_shiftdown( fx32_mul(mtxPtr[10], vecPtr[2]) );
+	mtxPtr[11] = sfx32_shiftdown( fx32_mul(mtxPtr[11], vecPtr[2]) );
+}
+
+void MatrixMultiply(s32 *__restrict mtxPtrA, const s32 *__restrict mtxPtrB)
 {
 	s32 tmpMatrix[16];
-
-	tmpMatrix[0] = sfx32_shiftdown(fx32_mul(matrix[0],rightMatrix[0])+fx32_mul(matrix[4],rightMatrix[1])+fx32_mul(matrix[8],rightMatrix[2])+fx32_mul(matrix[12],rightMatrix[3]));
-	tmpMatrix[1] = sfx32_shiftdown(fx32_mul(matrix[1],rightMatrix[0])+fx32_mul(matrix[5],rightMatrix[1])+fx32_mul(matrix[9],rightMatrix[2])+fx32_mul(matrix[13],rightMatrix[3]));
-	tmpMatrix[2] = sfx32_shiftdown(fx32_mul(matrix[2],rightMatrix[0])+fx32_mul(matrix[6],rightMatrix[1])+fx32_mul(matrix[10],rightMatrix[2])+fx32_mul(matrix[14],rightMatrix[3]));
-	tmpMatrix[3] = sfx32_shiftdown(fx32_mul(matrix[3],rightMatrix[0])+fx32_mul(matrix[7],rightMatrix[1])+fx32_mul(matrix[11],rightMatrix[2])+fx32_mul(matrix[15],rightMatrix[3]));
-
-	tmpMatrix[4] = sfx32_shiftdown(fx32_mul(matrix[0],rightMatrix[4])+fx32_mul(matrix[4],rightMatrix[5])+fx32_mul(matrix[8],rightMatrix[6])+fx32_mul(matrix[12],rightMatrix[7]));
-	tmpMatrix[5] = sfx32_shiftdown(fx32_mul(matrix[1],rightMatrix[4])+fx32_mul(matrix[5],rightMatrix[5])+fx32_mul(matrix[9],rightMatrix[6])+fx32_mul(matrix[13],rightMatrix[7]));
-	tmpMatrix[6] = sfx32_shiftdown(fx32_mul(matrix[2],rightMatrix[4])+fx32_mul(matrix[6],rightMatrix[5])+fx32_mul(matrix[10],rightMatrix[6])+fx32_mul(matrix[14],rightMatrix[7]));
-	tmpMatrix[7] = sfx32_shiftdown(fx32_mul(matrix[3],rightMatrix[4])+fx32_mul(matrix[7],rightMatrix[5])+fx32_mul(matrix[11],rightMatrix[6])+fx32_mul(matrix[15],rightMatrix[7]));
-
-	tmpMatrix[8] = sfx32_shiftdown(fx32_mul(matrix[0],rightMatrix[8])+fx32_mul(matrix[4],rightMatrix[9])+fx32_mul(matrix[8],rightMatrix[10])+fx32_mul(matrix[12],rightMatrix[11]));
-	tmpMatrix[9] = sfx32_shiftdown(fx32_mul(matrix[1],rightMatrix[8])+fx32_mul(matrix[5],rightMatrix[9])+fx32_mul(matrix[9],rightMatrix[10])+fx32_mul(matrix[13],rightMatrix[11]));
-	tmpMatrix[10] = sfx32_shiftdown(fx32_mul(matrix[2],rightMatrix[8])+fx32_mul(matrix[6],rightMatrix[9])+fx32_mul(matrix[10],rightMatrix[10])+fx32_mul(matrix[14],rightMatrix[11]));
-	tmpMatrix[11] = sfx32_shiftdown(fx32_mul(matrix[3],rightMatrix[8])+fx32_mul(matrix[7],rightMatrix[9])+fx32_mul(matrix[11],rightMatrix[10])+fx32_mul(matrix[15],rightMatrix[11]));
-
-	tmpMatrix[12] = sfx32_shiftdown(fx32_mul(matrix[0],rightMatrix[12])+fx32_mul(matrix[4],rightMatrix[13])+fx32_mul(matrix[8],rightMatrix[14])+fx32_mul(matrix[12],rightMatrix[15]));
-	tmpMatrix[13] = sfx32_shiftdown(fx32_mul(matrix[1],rightMatrix[12])+fx32_mul(matrix[5],rightMatrix[13])+fx32_mul(matrix[9],rightMatrix[14])+fx32_mul(matrix[13],rightMatrix[15]));
-	tmpMatrix[14] = sfx32_shiftdown(fx32_mul(matrix[2],rightMatrix[12])+fx32_mul(matrix[6],rightMatrix[13])+fx32_mul(matrix[10],rightMatrix[14])+fx32_mul(matrix[14],rightMatrix[15]));
-	tmpMatrix[15] = sfx32_shiftdown(fx32_mul(matrix[3],rightMatrix[12])+fx32_mul(matrix[7],rightMatrix[13])+fx32_mul(matrix[11],rightMatrix[14])+fx32_mul(matrix[15],rightMatrix[15]));
-
-	memcpy(matrix,tmpMatrix,sizeof(s32)*16);
+
+	tmpMatrix[ 0] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[ 0])+fx32_mul(mtxPtrA[4],mtxPtrB[ 1])+fx32_mul(mtxPtrA[ 8],mtxPtrB[ 2])+fx32_mul(mtxPtrA[12],mtxPtrB[ 3]) );
+	tmpMatrix[ 1] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[ 0])+fx32_mul(mtxPtrA[5],mtxPtrB[ 1])+fx32_mul(mtxPtrA[ 9],mtxPtrB[ 2])+fx32_mul(mtxPtrA[13],mtxPtrB[ 3]) );
+	tmpMatrix[ 2] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[ 0])+fx32_mul(mtxPtrA[6],mtxPtrB[ 1])+fx32_mul(mtxPtrA[10],mtxPtrB[ 2])+fx32_mul(mtxPtrA[14],mtxPtrB[ 3]) );
+	tmpMatrix[ 3] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[ 0])+fx32_mul(mtxPtrA[7],mtxPtrB[ 1])+fx32_mul(mtxPtrA[11],mtxPtrB[ 2])+fx32_mul(mtxPtrA[15],mtxPtrB[ 3]) );
+
+	tmpMatrix[ 4] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[ 4])+fx32_mul(mtxPtrA[4],mtxPtrB[ 5])+fx32_mul(mtxPtrA[ 8],mtxPtrB[ 6])+fx32_mul(mtxPtrA[12],mtxPtrB[ 7]) );
+	tmpMatrix[ 5] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[ 4])+fx32_mul(mtxPtrA[5],mtxPtrB[ 5])+fx32_mul(mtxPtrA[ 9],mtxPtrB[ 6])+fx32_mul(mtxPtrA[13],mtxPtrB[ 7]) );
+	tmpMatrix[ 6] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[ 4])+fx32_mul(mtxPtrA[6],mtxPtrB[ 5])+fx32_mul(mtxPtrA[10],mtxPtrB[ 6])+fx32_mul(mtxPtrA[14],mtxPtrB[ 7]) );
+	tmpMatrix[ 7] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[ 4])+fx32_mul(mtxPtrA[7],mtxPtrB[ 5])+fx32_mul(mtxPtrA[11],mtxPtrB[ 6])+fx32_mul(mtxPtrA[15],mtxPtrB[ 7]) );
+
+	tmpMatrix[ 8] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[ 8])+fx32_mul(mtxPtrA[4],mtxPtrB[ 9])+fx32_mul(mtxPtrA[ 8],mtxPtrB[10])+fx32_mul(mtxPtrA[12],mtxPtrB[11]) );
+	tmpMatrix[ 9] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[ 8])+fx32_mul(mtxPtrA[5],mtxPtrB[ 9])+fx32_mul(mtxPtrA[ 9],mtxPtrB[10])+fx32_mul(mtxPtrA[13],mtxPtrB[11]) );
+	tmpMatrix[10] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[ 8])+fx32_mul(mtxPtrA[6],mtxPtrB[ 9])+fx32_mul(mtxPtrA[10],mtxPtrB[10])+fx32_mul(mtxPtrA[14],mtxPtrB[11]) );
+	tmpMatrix[11] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[ 8])+fx32_mul(mtxPtrA[7],mtxPtrB[ 9])+fx32_mul(mtxPtrA[11],mtxPtrB[10])+fx32_mul(mtxPtrA[15],mtxPtrB[11]) );
+
+	tmpMatrix[12] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[12])+fx32_mul(mtxPtrA[4],mtxPtrB[13])+fx32_mul(mtxPtrA[ 8],mtxPtrB[14])+fx32_mul(mtxPtrA[12],mtxPtrB[15]) );
+	tmpMatrix[13] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[12])+fx32_mul(mtxPtrA[5],mtxPtrB[13])+fx32_mul(mtxPtrA[ 9],mtxPtrB[14])+fx32_mul(mtxPtrA[13],mtxPtrB[15]) );
+	tmpMatrix[14] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[12])+fx32_mul(mtxPtrA[6],mtxPtrB[13])+fx32_mul(mtxPtrA[10],mtxPtrB[14])+fx32_mul(mtxPtrA[14],mtxPtrB[15]) );
+	tmpMatrix[15] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[12])+fx32_mul(mtxPtrA[7],mtxPtrB[13])+fx32_mul(mtxPtrA[11],mtxPtrB[14])+fx32_mul(mtxPtrA[15],mtxPtrB[15]) );
+
+	memcpy(mtxPtrA, tmpMatrix, sizeof(s32)*16);
 }
-
-void MatrixScale(s32 *matrix, const s32 *ptr)
-{
-	//zero 21-sep-2010 - verified unrolling seems faster on my cpu
-	MACRODO_N(12,
-		matrix[X] = sfx32_shiftdown(fx32_mul(matrix[X],ptr[X>>2]))
-	);
-}
-
-void MatrixTranslate(s32 *matrix, const s32 *ptr)
-{
-	MACRODO_N(4,
-	{
-		s64 temp = fx32_shiftup(matrix[X+12]);
-		temp += fx32_mul(matrix[X+0],ptr[0]);
-		temp += fx32_mul(matrix[X+4],ptr[1]);
-		temp += fx32_mul(matrix[X+8],ptr[2]);
-		matrix[X+12] = sfx32_shiftdown(temp);
-	});
-}
-
diff --git a/desmume/src/matrix.h b/desmume/src/matrix.h
index 1d0b667df..442e4ee87 100644
--- a/desmume/src/matrix.h
+++ b/desmume/src/matrix.h
@@ -1,6 +1,6 @@
 /*
 	Copyright (C) 2006-2007 shash
-	Copyright (C) 2007-2017 DeSmuME team
+	Copyright (C) 2007-2018 DeSmuME team
 
 	This file is free software: you can redistribute it and/or modify
 	it under the terms of the GNU General Public License as published by
@@ -77,7 +77,24 @@ void Vector3Scale(float *dst, const float scale);
 void Vector3Copy(float *dst, const float *src);
 void Vector3Normalize(float *dst);
 
-void Vector4Copy(float *dst, const float *src);
+void Vector4Copy(float *dst, const float *src);
+
+
+void _MatrixMultVec4x4_NoSIMD(const s32 *__restrict mtxPtr, float *__restrict vecPtr);
+
+void MatrixMultVec4x4(const s32 *__restrict mtxPtr, float *__restrict vecPtr);
+void MatrixMultVec3x3(const s32 *__restrict mtxPtr, float *__restrict vecPtr);
+void MatrixTranslate(float *__restrict mtxPtr, const float *__restrict vecPtr);
+void MatrixScale(float *__restrict mtxPtr, const float *__restrict vecPtr);
+void MatrixMultiply(float *__restrict mtxPtrA, const s32 *__restrict mtxPtrB);
+
+template <size_t NUM_ROWS> FORCEINLINE void vector_fix2float(float *mtxPtr, const float divisor);
+
+void MatrixMultVec4x4(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr);
+void MatrixMultVec3x3(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr);
+void MatrixTranslate(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr);
+void MatrixScale(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr);
+void MatrixMultiply(s32 *__restrict mtxPtrA, const s32 *__restrict mtxPtrB);
 
 //these functions are an unreliable, inaccurate floor.
 //it should only be used for positive numbers
@@ -296,151 +313,4 @@ static void memset_u32_fast(void *dst, const u32 val)
 
 #endif // SIMD Functions
 
-// NOSSE version always used in gfx3d.cpp
-void _NOSSE_MatrixMultVec4x4 (const float *matrix, float *vecPtr);
-void MatrixMultVec3x3_fixed(const s32 *matrix, s32 *vecPtr);
-
-//---------------------------
-//switched SSE functions
-#ifdef ENABLE_SSE
-
-struct SSE_MATRIX
-{
-	SSE_MATRIX(const float *matrix)
-		: row0(_mm_load_ps(matrix))
-		, row1(_mm_load_ps(matrix+4))
-		, row2(_mm_load_ps(matrix+8))
-		, row3(_mm_load_ps(matrix+12))
-	{}
-
-	union {
-		__m128 rows[4];
-		struct { __m128 row0; __m128 row1; __m128 row2; __m128 row3; };
-	};
-
-};
-
-FORCEINLINE __m128 _util_MatrixMultVec4x4_(const SSE_MATRIX &mat, __m128 vec)
-{
-	__m128 xmm5 = _mm_shuffle_ps(vec, vec, B8(01010101));
-	__m128 xmm6 = _mm_shuffle_ps(vec, vec, B8(10101010));
-	__m128 xmm7 = _mm_shuffle_ps(vec, vec, B8(11111111));
-	__m128 xmm4 = _mm_shuffle_ps(vec, vec, B8(00000000));
-
-	xmm4 = _mm_mul_ps(xmm4,mat.row0);
-	xmm5 = _mm_mul_ps(xmm5,mat.row1);
-	xmm6 = _mm_mul_ps(xmm6,mat.row2);
-	xmm7 = _mm_mul_ps(xmm7,mat.row3);
-	xmm4 = _mm_add_ps(xmm4,xmm5);
-	xmm4 = _mm_add_ps(xmm4,xmm6);
-	xmm4 = _mm_add_ps(xmm4,xmm7);
-	return xmm4;
-}
-
-FORCEINLINE void MatrixMultiply(float * matrix, const float * rightMatrix)
-{
-	//this seems to generate larger code, including many movaps, but maybe it is less harsh on the registers than the
-	//more hand-tailored approach
-	__m128 row0 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix));
-	__m128 row1 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix+4));
-	__m128 row2 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix+8));
-	__m128 row3 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix+12));
-	_mm_store_ps(matrix,row0);
-	_mm_store_ps(matrix+4,row1);
-	_mm_store_ps(matrix+8,row2);
-	_mm_store_ps(matrix+12,row3);
-}
-
-FORCEINLINE void MatrixMultVec4x4(const float *matrix, float *vecPtr)
-{
-	_mm_store_ps(vecPtr,_util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(vecPtr)));
-}
-
-FORCEINLINE void MatrixMultVec3x3(const float * matrix, float * vecPtr)
-{
-	const __m128 vec = _mm_load_ps(vecPtr);
-
-	__m128 xmm5 = _mm_shuffle_ps(vec, vec, B8(01010101));
-	__m128 xmm6 = _mm_shuffle_ps(vec, vec, B8(10101010));
-	__m128 xmm4 = _mm_shuffle_ps(vec, vec, B8(00000000));
-
-	const SSE_MATRIX mat(matrix);
-
-	xmm4 = _mm_mul_ps(xmm4,mat.row0);
-	xmm5 = _mm_mul_ps(xmm5,mat.row1);
-	xmm6 = _mm_mul_ps(xmm6,mat.row2);
-	xmm4 = _mm_add_ps(xmm4,xmm5);
-	xmm4 = _mm_add_ps(xmm4,xmm6);
-
-	_mm_store_ps(vecPtr,xmm4);
-}
-
-FORCEINLINE void MatrixTranslate(float *matrix, const float *ptr)
-{
-	__m128 xmm4 = _mm_load_ps(ptr);
-	__m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101));
-	__m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010));
-	xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000));
-
-	xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(matrix));
-	xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(matrix+4));
-	xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(matrix+8));
-	xmm4 = _mm_add_ps(xmm4,xmm5);
-	xmm4 = _mm_add_ps(xmm4,xmm6);
-	xmm4 = _mm_add_ps(xmm4,_mm_load_ps(matrix+12));
-	_mm_store_ps(matrix+12,xmm4);
-}
-
-FORCEINLINE void MatrixScale(float *matrix, const float *ptr)
-{
-	__m128 xmm4 = _mm_load_ps(ptr);
-	__m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101));
-	__m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010));
-	xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000));
-
-	xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(matrix));
-	xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(matrix+4));
-	xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(matrix+8));
-	_mm_store_ps(matrix,xmm4);
-	_mm_store_ps(matrix+4,xmm5);
-	_mm_store_ps(matrix+8,xmm6);
-}
-
-template<int NUM_ROWS>
-FORCEINLINE void vector_fix2float(float* matrix, const float divisor)
-{
-	CTASSERT(NUM_ROWS==3 || NUM_ROWS==4);
-
-	const __m128 val = _mm_set_ps1(divisor);
-
-	_mm_store_ps(matrix,_mm_div_ps(_mm_load_ps(matrix),val));
-	_mm_store_ps(matrix+4,_mm_div_ps(_mm_load_ps(matrix+4),val));
-	_mm_store_ps(matrix+8,_mm_div_ps(_mm_load_ps(matrix+8),val));
-	if(NUM_ROWS==4)
-		_mm_store_ps(matrix+12,_mm_div_ps(_mm_load_ps(matrix+12),val));
-}
-
-#else //no sse
-
-void MatrixMultVec4x4 (const float *matrix, float *vecPtr);
-void MatrixMultVec3x3(const float * matrix, float * vecPtr);
-void MatrixMultiply(float * matrix, const float * rightMatrix);
-void MatrixTranslate(float *matrix, const float *ptr);
-void MatrixScale(float * matrix, const float * ptr);
-
-template<int NUM_ROWS>
-FORCEINLINE void vector_fix2float(float* matrix, const float divisor)
-{
-	for(int i=0;i