From c7bb41e4b10a4c86ffff3823c76ad5e4537e86f5 Mon Sep 17 00:00:00 2001 From: rogerman Date: Mon, 19 Feb 2018 11:43:55 -0800 Subject: [PATCH] matrix.cpp: Rework all matrix function parameters for explicit array sizing in order to aid compiler optimization and (hopefully) aid in code readability. Also add SSE4.1 versions for the main matrix functions. --- desmume/src/MMU.cpp | 9 +- desmume/src/frontend/windows/matrixView.cpp | 8 +- desmume/src/gfx3d.cpp | 257 ++++-- desmume/src/gfx3d.h | 23 +- desmume/src/matrix.cpp | 940 +++++++++++++------- desmume/src/matrix.h | 86 +- 6 files changed, 853 insertions(+), 470 deletions(-) diff --git a/desmume/src/MMU.cpp b/desmume/src/MMU.cpp index e26a84404..5deece4a2 100644 --- a/desmume/src/MMU.cpp +++ b/desmume/src/MMU.cpp @@ -1,7 +1,7 @@ /* Copyright (C) 2006 yopyop Copyright (C) 2007 shash - Copyright (C) 2007-2017 DeSmuME team + Copyright (C) 2007-2018 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -1936,7 +1936,6 @@ static INLINE void write_timer(int proc, int timerIndex, u16 val) NDS_RescheduleTimers(); } -extern CACHE_ALIGN MatrixStack mtxStack[4]; u32 TGXSTAT::read32() { u32 ret = 0; @@ -1945,8 +1944,8 @@ u32 TGXSTAT::read32() // stack position always equal zero. possible timings is wrong // using in "The Wild West" - int proj_level = mtxStack[MATRIXMODE_PROJECTION].position & 1; - int mv_level = mtxStack[MATRIXMODE_POSITION].position & 31; + int proj_level = mtxStackProjection.position & 1; + int mv_level = mtxStackPosition.position & 31; ret |= ((proj_level << 13) | (mv_level << 8)); ret |= sb<<14; //stack busy @@ -1981,7 +1980,7 @@ void TGXSTAT::write32(const u32 val) // Writing "1" to Bit15 does reset the Error Flag (Bit15), // and additionally resets the Projection Stack Pointer (Bit13) // (and probably (?) also the Texture Stack Pointer)?? 
- mtxStack[0].position = 0; + mtxStackProjection.position = 0; se = 0; //clear stack error flag } //printf("gxstat write: %08X while gxfifo.size=%d\n",val,gxFIFO.size); diff --git a/desmume/src/frontend/windows/matrixView.cpp b/desmume/src/frontend/windows/matrixView.cpp index 060a9b343..de3d248e5 100644 --- a/desmume/src/frontend/windows/matrixView.cpp +++ b/desmume/src/frontend/windows/matrixView.cpp @@ -64,7 +64,7 @@ void MatrixView_OnPaintPositionMatrix(HWND hwnd) stackIndex = SendMessage(hStackCombo, CB_GETCURSEL, 0, 0) - 1; - gfx3d_glGetMatrix(MATRIXMODE_POSITION, stackIndex, matrix); + gfx3d_glGetMatrix(stackIndex, matrix); MatrixView_SetMatrix(hwnd, idcGroup, matrix); } @@ -87,7 +87,7 @@ void MatrixView_OnPaintDirectionMatrix(HWND hwnd) stackIndex = SendMessage(hStackCombo, CB_GETCURSEL, 0, 0) - 1; - gfx3d_glGetMatrix(MATRIXMODE_POSITION_VECTOR, stackIndex, matrix); + gfx3d_glGetMatrix(stackIndex, matrix); MatrixView_SetMatrix(hwnd, idcGroup, matrix); } @@ -106,7 +106,7 @@ void MatrixView_OnPaintProjectionMatrix(HWND hwnd) float mat[16]; - gfx3d_glGetMatrix(MATRIXMODE_PROJECTION, -1, mat); + gfx3d_glGetMatrix(-1, mat); MatrixView_SetMatrix(hwnd, idcGroup, mat); } @@ -125,7 +125,7 @@ void MatrixView_OnPaintTextureMatrix(HWND hwnd) float mat[16]; - gfx3d_glGetMatrix(MATRIXMODE_TEXTURE, -1, mat); + gfx3d_glGetMatrix(-1, mat); MatrixView_SetMatrix(hwnd, idcGroup, mat); } diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp index f7c52e56c..2bb267a53 100644 --- a/desmume/src/gfx3d.cpp +++ b/desmume/src/gfx3d.cpp @@ -1,6 +1,6 @@ /* Copyright (C) 2006 yopyop - Copyright (C) 2008-2017 DeSmuME team + Copyright (C) 2008-2018 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -288,12 +288,10 @@ static u16 *_gfx3d_color16 = NULL; // Matrix stack handling //TODO: decouple stack pointers from matrix stack type -CACHE_ALIGN MatrixStack mtxStack[4] = { - MatrixStack(1, 0), // Projection stack - MatrixStack(32, 1), // Coordinate stack - MatrixStack(32, 2), // Directional stack - MatrixStack(1, 3), // Texture stack -}; +CACHE_ALIGN MatrixStack mtxStackProjection; +CACHE_ALIGN MatrixStack mtxStackPosition; +CACHE_ALIGN MatrixStack mtxStackPositionVector; +CACHE_ALIGN MatrixStack mtxStackTexture; static CACHE_ALIGN s32 mtxCurrent[4][16]; static CACHE_ALIGN s32 mtxTemporal[16]; @@ -615,9 +613,10 @@ void gfx3d_reset() MatrixInit(mtxCurrent[MATRIXMODE_TEXTURE]); MatrixInit(mtxTemporal); - MatrixStackInit(&mtxStack[0]); - MatrixStackInit(&mtxStack[1]); - MatrixStackInit(&mtxStack[2]); + MatrixStackInit(&mtxStackProjection); + MatrixStackInit(&mtxStackPosition); + MatrixStackInit(&mtxStackPositionVector); + MatrixStackInit(&mtxStackTexture); clCmd = 0; clInd = 0; @@ -692,12 +691,12 @@ static s32 GEM_SaturateAndShiftdown36To32(const s64 val) return fx32_shiftdown(val); } -static void GEM_TransformVertex(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr) +static void GEM_TransformVertex(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4]) { - const s32 x = vecPtr[0]; - const s32 y = vecPtr[1]; - const s32 z = vecPtr[2]; - const s32 w = vecPtr[3]; + const s32 x = vec[0]; + const s32 y = vec[1]; + const s32 z = vec[2]; + const s32 w = vec[3]; //saturation logic is most carefully tested by: //+ spectrobes beyond the portals excavation blower and drill tools: sets very large overflowing +x,+y in the modelview matrix to push things offscreen @@ -709,10 +708,10 @@ static void GEM_TransformVertex(const 
s32 *__restrict mtxPtr, s32 *__restrict ve //+ SM64: outside castle skybox //+ NSMB: mario head screen wipe - vecPtr[0] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtxPtr[0]) + GEM_Mul32x32To64(y,mtxPtr[4]) + GEM_Mul32x32To64(z,mtxPtr[ 8]) + GEM_Mul32x32To64(w,mtxPtr[12]) ); - vecPtr[1] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtxPtr[1]) + GEM_Mul32x32To64(y,mtxPtr[5]) + GEM_Mul32x32To64(z,mtxPtr[ 9]) + GEM_Mul32x32To64(w,mtxPtr[13]) ); - vecPtr[2] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtxPtr[2]) + GEM_Mul32x32To64(y,mtxPtr[6]) + GEM_Mul32x32To64(z,mtxPtr[10]) + GEM_Mul32x32To64(w,mtxPtr[14]) ); - vecPtr[3] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtxPtr[3]) + GEM_Mul32x32To64(y,mtxPtr[7]) + GEM_Mul32x32To64(z,mtxPtr[11]) + GEM_Mul32x32To64(w,mtxPtr[15]) ); + vec[0] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtx[0]) + GEM_Mul32x32To64(y,mtx[4]) + GEM_Mul32x32To64(z,mtx[ 8]) + GEM_Mul32x32To64(w,mtx[12]) ); + vec[1] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtx[1]) + GEM_Mul32x32To64(y,mtx[5]) + GEM_Mul32x32To64(z,mtx[ 9]) + GEM_Mul32x32To64(w,mtx[13]) ); + vec[2] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtx[2]) + GEM_Mul32x32To64(y,mtx[6]) + GEM_Mul32x32To64(z,mtx[10]) + GEM_Mul32x32To64(w,mtx[14]) ); + vec[3] = GEM_SaturateAndShiftdown36To32( GEM_Mul32x32To64(x,mtx[3]) + GEM_Mul32x32To64(y,mtx[7]) + GEM_Mul32x32To64(z,mtx[11]) + GEM_Mul32x32To64(w,mtx[15]) ); } //--------------- @@ -975,25 +974,37 @@ static void gfx3d_glPushMatrix() //printf("%d %d %d %d -> ",mtxStack[0].position,mtxStack[1].position,mtxStack[2].position,mtxStack[3].position); //printf("PUSH mode: %d -> ",mode,mtxStack[mode].position); - if(mode == MATRIXMODE_PROJECTION || mode == MATRIXMODE_TEXTURE) + if (mode == MATRIXMODE_PROJECTION || mode == MATRIXMODE_TEXTURE) { - MatrixCopy(MatrixStackGetPos(&mtxStack[mode], 0), mtxCurrent[mode]); - - u32& index = mtxStack[mode].position; - if(index == 1) MMU_new.gxstat.se = 1; //unknown if this applies to the texture matrix - index += 1; - index &= 1; + if (mode == MATRIXMODE_PROJECTION) + { + MatrixCopy(mtxStackProjection.matrix[0], mtxCurrent[mode]); + + u32 &index = mtxStackProjection.position; + if (index == 1) MMU_new.gxstat.se = 1; + index += 1; + index &= 1; + } + else + { + MatrixCopy(mtxStackTexture.matrix[0], mtxCurrent[mode]); + + u32 &index = mtxStackTexture.position; + if (index == 1) MMU_new.gxstat.se = 1; //unknown if this applies to the texture matrix + index += 1; + index &= 1; + } } else { - u32& index = mtxStack[MATRIXMODE_POSITION].position; + u32 &index = mtxStackPosition.position; + + MatrixCopy(mtxStackPosition.matrix[index & 31], mtxCurrent[MATRIXMODE_POSITION]); + MatrixCopy(mtxStackPositionVector.matrix[index & 31], mtxCurrent[MATRIXMODE_POSITION_VECTOR]); - MatrixCopy(MatrixStackGetPos(&mtxStack[MATRIXMODE_POSITION], index&31), mtxCurrent[MATRIXMODE_POSITION]); - MatrixCopy(MatrixStackGetPos(&mtxStack[MATRIXMODE_POSITION_VECTOR], index&31), mtxCurrent[MATRIXMODE_POSITION_VECTOR]); - index += 1; index &= 63; - if(index >= 32) MMU_new.gxstat.se = 1; //(not sure, this might be off by 1) + if (index >= 32) MMU_new.gxstat.se = 1; //(not sure, this might be off by 1) } //printf("%d %d %d %d\n",mtxStack[0].position,mtxStack[1].position,mtxStack[2].position,mtxStack[3].position); @@ -1010,25 +1021,35 @@ static void gfx3d_glPopMatrix(u32 v) //printf("%d %d %d %d -> ",mtxStack[0].position,mtxStack[1].position,mtxStack[2].position,mtxStack[3].position); //printf("POP (%d): mode: %d -> 
",v,mode,mtxStack[mode].position); - if(mode == MATRIXMODE_PROJECTION || mode == MATRIXMODE_TEXTURE) + if (mode == MATRIXMODE_PROJECTION || mode == MATRIXMODE_TEXTURE) { //parameter is ignored and treated as sensible (always 1) - - u32& index = mtxStack[mode].position; - index ^= 1; - if(index == 1) MMU_new.gxstat.se = 1; //unknown if this applies to the texture matrix - MatrixCopy(mtxCurrent[mode], MatrixStackGetPos(&mtxStack[mode], 0)); + + if (mode == MATRIXMODE_PROJECTION) + { + u32 &index = mtxStackProjection.position; + index ^= 1; + if (index == 1) MMU_new.gxstat.se = 1; + MatrixCopy(mtxCurrent[mode], mtxStackProjection.matrix[0]); + } + else + { + u32 &index = mtxStackTexture.position; + index ^= 1; + if (index == 1) MMU_new.gxstat.se = 1; //unknown if this applies to the texture matrix + MatrixCopy(mtxCurrent[mode], mtxStackTexture.matrix[0]); + } } else { - u32& index = mtxStack[MATRIXMODE_POSITION].position; + u32 &index = mtxStackPosition.position; index -= v & 63; index &= 63; - if(index >= 32) MMU_new.gxstat.se = 1; //(not sure, this might be off by 1) - - MatrixCopy(mtxCurrent[MATRIXMODE_POSITION], MatrixStackGetPos(&mtxStack[MATRIXMODE_POSITION], index&31)); - MatrixCopy(mtxCurrent[MATRIXMODE_POSITION_VECTOR], MatrixStackGetPos(&mtxStack[MATRIXMODE_POSITION_VECTOR], index&31)); + if (index >= 32) MMU_new.gxstat.se = 1; //(not sure, this might be off by 1) + + MatrixCopy(mtxCurrent[MATRIXMODE_POSITION], mtxStackPosition.matrix[index & 31]); + MatrixCopy(mtxCurrent[MATRIXMODE_POSITION_VECTOR], mtxStackPositionVector.matrix[index & 31]); } //printf("%d %d %d %d\n",mtxStack[0].position,mtxStack[1].position,mtxStack[2].position,mtxStack[3].position); @@ -1041,22 +1062,29 @@ static void gfx3d_glStoreMatrix(u32 v) //printf("%d %d %d %d -> ",mtxStack[0].position,mtxStack[1].position,mtxStack[2].position,mtxStack[3].position); //printf("STORE (%d): mode: %d -> ",v,mode,mtxStack[mode].position); - if(mode == MATRIXMODE_PROJECTION || mode == MATRIXMODE_TEXTURE) + if (mode == MATRIXMODE_PROJECTION || mode == MATRIXMODE_TEXTURE) { //parameter ignored and treated as sensible v = 0; - - MatrixStackLoadMatrix(&mtxStack[mode], v, mtxCurrent[mode]); + + if (mode == MATRIXMODE_PROJECTION) + { + MatrixCopy(mtxStackProjection.matrix[0], mtxCurrent[MATRIXMODE_PROJECTION]); + } + else + { + MatrixCopy(mtxStackTexture.matrix[0], mtxCurrent[MATRIXMODE_TEXTURE]); + } } else { v &= 31; //out of bounds function fully properly, but set errors (not sure, this might be off by 1) - if(v >= 31) MMU_new.gxstat.se = 1; - - MatrixStackLoadMatrix(&mtxStack[MATRIXMODE_POSITION], v, mtxCurrent[MATRIXMODE_POSITION]); - MatrixStackLoadMatrix(&mtxStack[MATRIXMODE_POSITION_VECTOR], v, mtxCurrent[MATRIXMODE_POSITION_VECTOR]); + if (v >= 31) MMU_new.gxstat.se = 1; + + MatrixCopy(mtxStackPosition.matrix[v], mtxCurrent[MATRIXMODE_POSITION]); + MatrixCopy(mtxStackPositionVector.matrix[v], mtxCurrent[MATRIXMODE_POSITION_VECTOR]); } //printf("%d %d %d %d\n",mtxStack[0].position,mtxStack[1].position,mtxStack[2].position,mtxStack[3].position); @@ -1066,19 +1094,27 @@ static void gfx3d_glStoreMatrix(u32 v) static void gfx3d_glRestoreMatrix(u32 v) { - if(mode == MATRIXMODE_PROJECTION || mode == MATRIXMODE_TEXTURE) + if (mode == MATRIXMODE_PROJECTION || mode == MATRIXMODE_TEXTURE) { //parameter ignored and treated as sensible v = 0; - MatrixCopy(mtxCurrent[mode], MatrixStackGetPos(&mtxStack[mode], v)); + + if (mode == MATRIXMODE_PROJECTION) + { + MatrixCopy(mtxCurrent[MATRIXMODE_PROJECTION], mtxStackProjection.matrix[0]); + } + 
else + { + MatrixCopy(mtxCurrent[MATRIXMODE_TEXTURE], mtxStackTexture.matrix[0]); + } } else { //out of bounds errors function fully properly, but set errors - MMU_new.gxstat.se = v>=31; //(not sure, this might be off by 1) - - MatrixCopy(mtxCurrent[MATRIXMODE_POSITION], MatrixStackGetPos(&mtxStack[MATRIXMODE_POSITION], v)); - MatrixCopy(mtxCurrent[MATRIXMODE_POSITION_VECTOR], MatrixStackGetPos(&mtxStack[MATRIXMODE_POSITION_VECTOR], v)); + MMU_new.gxstat.se = (v >= 31) ? 1 : 0; //(not sure, this might be off by 1) + + MatrixCopy(mtxCurrent[MATRIXMODE_POSITION], mtxStackPosition.matrix[v]); + MatrixCopy(mtxCurrent[MATRIXMODE_POSITION_VECTOR], mtxStackPositionVector.matrix[v]); } @@ -1853,7 +1889,7 @@ void gfx3d_UpdateToonTable(u8 offset, u32 val) s32 gfx3d_GetClipMatrix(const u32 index) { //printf("reading clip matrix: %d\n",index); - return (s32)MatrixGetMultipliedIndex(index, mtxCurrent[MATRIXMODE_PROJECTION], mtxCurrent[MATRIXMODE_POSITION]); + return MatrixGetMultipliedIndex(index, mtxCurrent[MATRIXMODE_PROJECTION], mtxCurrent[MATRIXMODE_POSITION]); } s32 gfx3d_GetDirectionalMatrix(const u32 index) @@ -2491,20 +2527,37 @@ void gfx3d_sendCommand(u32 cmd, u32 param) //-------------- //other misc stuff -void gfx3d_glGetMatrix(const MatrixMode m_mode, int index, float *dst) +template +void gfx3d_glGetMatrix(const int index, float (&dst)[16]) { - //if(index == -1) - //{ - // MatrixCopy(dest, mtxCurrent[m_mode]); - // return; - //} - - //MatrixCopy(dest, MatrixStackGetPos(&mtxStack[m_mode], index)); - - const s32 *src = (index == -1) ? mtxCurrent[m_mode] : MatrixStackGetPos(&mtxStack[m_mode], index); - - for (size_t i = 0; i < 16; i++) - dst[i] = src[i]/4096.0f; + if (index == -1) + { + MatrixCopy(dst, mtxCurrent[MODE]); + } + else + { + switch (MODE) + { + case MATRIXMODE_PROJECTION: + MatrixCopy(dst, mtxStackProjection.matrix[0]); + break; + + case MATRIXMODE_POSITION: + MatrixCopy(dst, mtxStackPosition.matrix[0]); + break; + + case MATRIXMODE_POSITION_VECTOR: + MatrixCopy(dst, mtxStackPositionVector.matrix[0]); + break; + + case MATRIXMODE_TEXTURE: + MatrixCopy(dst, mtxStackTexture.matrix[0]); + break; + + default: + break; + } + } } void gfx3d_glGetLightDirection(const size_t index, u32 &dst) @@ -2632,12 +2685,35 @@ void gfx3d_savestate(EMUFILE &os) for (size_t i = 0; i < polylist->count; i++) polylist->list[i].save(os); - for (size_t i = 0; i < ARRAY_SIZE(mtxStack); i++) + // Write matrix stack data + os.write_32LE(mtxStackProjection.position); + for (size_t j = 0; j < 16; j++) { - os.write_32LE(mtxStack[i].position); - - for (size_t j = 0; j < mtxStack[i].size*16; j++) - os.write_32LE(mtxStack[i].matrix[j]); + os.write_32LE(mtxStackProjection.matrix[0][j]); + } + + os.write_32LE(mtxStackPosition.position); + for (size_t i = 0; i < MatrixStack::size; i++) + { + for (size_t j = 0; j < 16; j++) + { + os.write_32LE(mtxStackPosition.matrix[i][j]); + } + } + + os.write_32LE(mtxStackPositionVector.position); + for (size_t i = 0; i < MatrixStack::size; i++) + { + for (size_t j = 0; j < 16; j++) + { + os.write_32LE(mtxStackPositionVector.matrix[i][j]); + } + } + + os.write_32LE(mtxStackTexture.position); + for (size_t j = 0; j < 16; j++) + { + os.write_32LE(mtxStackTexture.matrix[0][j]); } gxf_hardware.savestate(os); @@ -2703,12 +2779,35 @@ bool gfx3d_loadstate(EMUFILE &is, int size) if (version >= 2) { - for (size_t i = 0; i < ARRAY_SIZE(mtxStack); i++) + // Read matrix stack data + is.read_32LE(mtxStackProjection.position); + for (size_t j = 0; j < 16; j++) { - 
is.read_32LE(mtxStack[i].position); - - for (size_t j = 0; j < mtxStack[i].size*16; j++) - is.read_32LE(mtxStack[i].matrix[j]); + is.read_32LE(mtxStackProjection.matrix[0][j]); + } + + is.read_32LE(mtxStackPosition.position); + for (size_t i = 0; i < MatrixStack::size; i++) + { + for (size_t j = 0; j < 16; j++) + { + is.read_32LE(mtxStackPosition.matrix[i][j]); + } + } + + is.read_32LE(mtxStackPositionVector.position); + for (size_t i = 0; i < MatrixStack::size; i++) + { + for (size_t j = 0; j < 16; j++) + { + is.read_32LE(mtxStackPositionVector.matrix[i][j]); + } + } + + is.read_32LE(mtxStackTexture.position); + for (size_t j = 0; j < 16; j++) + { + is.read_32LE(mtxStackTexture.matrix[0][j]); } } diff --git a/desmume/src/gfx3d.h b/desmume/src/gfx3d.h index b1b4580bd..ab6cdd577 100644 --- a/desmume/src/gfx3d.h +++ b/desmume/src/gfx3d.h @@ -1,6 +1,6 @@ /* Copyright (C) 2006 yopyop - Copyright (C) 2008-2017 DeSmuME team + Copyright (C) 2008-2018 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -23,7 +23,8 @@ #include #include -#include "types.h" +#include "types.h" +#include "matrix.h" #include "GPU.h" class EMUFILE; @@ -74,16 +75,12 @@ class EMUFILE; // 15-bit to 24-bit depth formula from http://nocash.emubase.de/gbatek.htm#ds3drearplane extern CACHE_ALIGN u32 dsDepthExtend_15bit_to_24bit[32768]; -#define DS_DEPTH15TO24(depth) ( dsDepthExtend_15bit_to_24bit[(depth) & 0x7FFF] ) - -// MATRIX MODES -enum MatrixMode -{ - MATRIXMODE_PROJECTION = 0, - MATRIXMODE_POSITION = 1, - MATRIXMODE_POSITION_VECTOR = 2, - MATRIXMODE_TEXTURE = 3 -}; +#define DS_DEPTH15TO24(depth) ( dsDepthExtend_15bit_to_24bit[(depth) & 0x7FFF] ) + +extern CACHE_ALIGN MatrixStack mtxStackProjection; +extern CACHE_ALIGN MatrixStack mtxStackPosition; +extern CACHE_ALIGN MatrixStack mtxStackPositionVector; +extern CACHE_ALIGN MatrixStack mtxStackTexture; // POLYGON PRIMITIVE TYPES enum PolygonPrimitiveType @@ -633,7 +630,7 @@ void gfx3d_sendCommandToFIFO(u32 val); void gfx3d_sendCommand(u32 cmd, u32 param); //other misc stuff -void gfx3d_glGetMatrix(const MatrixMode mode, int index, float *dst); +template void gfx3d_glGetMatrix(const int index, float (&dst)[16]); void gfx3d_glGetLightDirection(const size_t index, u32 &dst); void gfx3d_glGetLightColor(const size_t index, u32 &dst); diff --git a/desmume/src/matrix.cpp b/desmume/src/matrix.cpp index b7c95a5bb..0586908df 100644 --- a/desmume/src/matrix.cpp +++ b/desmume/src/matrix.cpp @@ -25,17 +25,17 @@ #include "MMU.h" -void MatrixInit(s32 *mtxPtr) +void MatrixInit(s32 (&mtx)[16]) { - MatrixIdentity(mtxPtr); + MatrixIdentity(mtx); } -void MatrixInit(float *mtxPtr) +void MatrixInit(float (&mtx)[16]) { - MatrixIdentity(mtxPtr); + MatrixIdentity(mtx); } -void MatrixIdentity(s32 *mtxPtr) +void MatrixIdentity(s32 (&mtx)[16]) { static const CACHE_ALIGN s32 mtxIdentity[16] = { (1 << 12), 0, 0, 0, @@ -44,10 +44,10 @@ void MatrixIdentity(s32 *mtxPtr) 0, 0, 0, (1 << 12) }; - memcpy(mtxPtr, mtxIdentity, sizeof(s32)*16); + memcpy(mtx, mtxIdentity, sizeof(s32)*16); } -void MatrixIdentity(float *mtxPtr) +void MatrixIdentity(float (&mtx)[16]) { static const CACHE_ALIGN float mtxIdentity[16] = { 1.0f, 0.0f, 0.0f, 0.0f, @@ -56,37 +56,37 @@ void MatrixIdentity(float *mtxPtr) 0.0f, 0.0f, 0.0f, 1.0f }; - memcpy(mtxPtr, mtxIdentity, sizeof(float)*16); + memcpy(mtx, mtxIdentity, sizeof(float)*16); } -void MatrixSet(s32 *mtxPtr, const size_t x, const size_t y, const s32 value) +void MatrixSet(s32 
(&mtx)[16], const size_t x, const size_t y, const s32 value) { - mtxPtr[x+(y<<2)] = value; + mtx[x+(y<<2)] = value; } -void MatrixSet(float *mtxPtr, const size_t x, const size_t y, const float value) +void MatrixSet(float (&mtx)[16], const size_t x, const size_t y, const float value) { - mtxPtr[x+(y<<2)] = value; + mtx[x+(y<<2)] = value; } -void MatrixSet(float *mtxPtr, const size_t x, const size_t y, const s32 value) +void MatrixSet(float (&mtx)[16], const size_t x, const size_t y, const s32 value) { - mtxPtr[x+(y<<2)] = value / 4096.0f; + mtx[x+(y<<2)] = (float)value / 4096.0f; } -void MatrixCopy(s32 *mtxDst, const s32 *mtxSrc) +void MatrixCopy(s32 (&mtxDst)[16], const s32 (&mtxSrc)[16]) { // We're going to assume that the two buffers are not the same. memcpy(mtxDst, mtxSrc, sizeof(s32)*16); } -void MatrixCopy(float *mtxDst, const float *mtxSrc) +void MatrixCopy(float (&mtxDst)[16], const float (&mtxSrc)[16]) { // We're going to assume that the two buffers are not the same. memcpy(mtxDst, mtxSrc, sizeof(float)*16); } -void MatrixCopy(float *mtxDst, const s32 *mtxSrc) +void MatrixCopy(float (&__restrict mtxDst)[16], const s32 (&__restrict mtxSrc)[16]) { mtxDst[ 0] = mtxSrc[ 0] / 4096.0f; mtxDst[ 1] = mtxSrc[ 1] / 4096.0f; @@ -109,76 +109,64 @@ void MatrixCopy(float *mtxDst, const s32 *mtxSrc) mtxDst[15] = mtxSrc[15] / 4096.0f; } -int MatrixCompare(const s32 *mtxDst, const s32 *mtxSrc) +int MatrixCompare(const s32 (&mtxDst)[16], const s32 (&mtxSrc)[16]) { return memcmp(mtxDst, mtxSrc, sizeof(s32)*16); } -int MatrixCompare(const float *mtxDst, const float *mtxSrc) +int MatrixCompare(const float (&mtxDst)[16], const float (&mtxSrc)[16]) { return memcmp(mtxDst, mtxSrc, sizeof(float)*16); } -s32 MatrixGetMultipliedIndex(const u32 index, s32 *matrix, s32 *rightMatrix) +s32 MatrixGetMultipliedIndex(const u32 index, const s32 (&mtxA)[16], const s32 (&mtxB)[16]) { - const size_t iMod = index%4, iDiv = (index>>2)<<2; - - s64 temp = ((s64)matrix[iMod ]*rightMatrix[iDiv ])+((s64)matrix[iMod+ 4]*rightMatrix[iDiv+1])+ - ((s64)matrix[iMod+8]*rightMatrix[iDiv+2])+((s64)matrix[iMod+12]*rightMatrix[iDiv+3]); - - return (s32)(temp>>12); + assert(index < 16); + + const size_t iMod = index % 4; + const size_t iDiv = (index >> 2) << 2; + + const s32 temp = sfx32_shiftdown( fx32_mul(mtxA[iMod ], mtxB[iDiv ]) + fx32_mul(mtxA[iMod+ 4], mtxB[iDiv+1]) + fx32_mul(mtxA[iMod+8], mtxB[iDiv+2]) + fx32_mul(mtxA[iMod+12], mtxB[iDiv+3]) ); + return temp; } -void MatrixStackInit(MatrixStack *stack) +float MatrixGetMultipliedIndex(const u32 index, const float (&mtxA)[16], const float (&mtxB)[16]) { - for (int i = 0; i < stack->size; i++) + assert(index < 16); + + const size_t iMod = index % 4; + const size_t iDiv = (index >> 2) << 2; + + const float temp = (mtxA[iMod ] * mtxB[iDiv ]) + (mtxA[iMod+ 4] * mtxB[iDiv+1]) + (mtxA[iMod+8] * mtxB[iDiv+2]) + (mtxA[iMod+12] * mtxB[iDiv+3]); + return temp; +} + +template +void MatrixStackInit(MatrixStack *stack) +{ + for (size_t i = 0; i < MatrixStack::size; i++) { - MatrixInit(&stack->matrix[i*16]); + MatrixInit(stack->matrix[i]); } + stack->position = 0; } -void MatrixStackSetMaxSize (MatrixStack *stack, int size) +template +s32* MatrixStackGet(MatrixStack *stack) { - int i; - - stack->size = size; - - if (stack->matrix != NULL) { - free (stack->matrix); - } - stack->matrix = new s32[stack->size*16*sizeof(s32)]; - - for (i = 0; i < stack->size; i++) - { - MatrixInit (&stack->matrix[i*16]); - } + return stack->matrix[stack->position]; } +template void MatrixStackInit(MatrixStack 
*stack); +template void MatrixStackInit(MatrixStack *stack); +template void MatrixStackInit(MatrixStack *stack); +template void MatrixStackInit(MatrixStack *stack); -MatrixStack::MatrixStack(int size, int type) -{ - MatrixStackSetMaxSize(this,size); - this->type = type; -} - - -s32* MatrixStackGetPos(MatrixStack *stack, const size_t pos) -{ - assert(pos < stack->size); - return &stack->matrix[pos*16]; -} - -s32* MatrixStackGet (MatrixStack *stack) -{ - return &stack->matrix[stack->position*16]; -} - -void MatrixStackLoadMatrix (MatrixStack *stack, const size_t pos, const s32 *ptr) -{ - assert(pos < stack->size); - MatrixCopy(&stack->matrix[pos*16], ptr); -} +template s32* MatrixStackGet(MatrixStack *stack); +template s32* MatrixStackGet(MatrixStack *stack); +template s32* MatrixStackGet(MatrixStack *stack); +template s32* MatrixStackGet(MatrixStack *stack); void Vector2Copy(float *dst, const float *src) { @@ -271,76 +259,76 @@ void Vector4Copy(float *dst, const float *src) dst[3] = src[3]; } -void _MatrixMultVec4x4_NoSIMD(const s32 *__restrict mtxPtr, float *__restrict vecPtr) +void _MatrixMultVec4x4_NoSIMD(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]) { const CACHE_ALIGN float mtxFloat[16] = { - mtxPtr[ 0] / 4096.0f, - mtxPtr[ 1] / 4096.0f, - mtxPtr[ 2] / 4096.0f, - mtxPtr[ 3] / 4096.0f, + mtx[ 0] / 4096.0f, + mtx[ 1] / 4096.0f, + mtx[ 2] / 4096.0f, + mtx[ 3] / 4096.0f, - mtxPtr[ 4] / 4096.0f, - mtxPtr[ 5] / 4096.0f, - mtxPtr[ 6] / 4096.0f, - mtxPtr[ 7] / 4096.0f, + mtx[ 4] / 4096.0f, + mtx[ 5] / 4096.0f, + mtx[ 6] / 4096.0f, + mtx[ 7] / 4096.0f, - mtxPtr[ 8] / 4096.0f, - mtxPtr[ 9] / 4096.0f, - mtxPtr[10] / 4096.0f, - mtxPtr[11] / 4096.0f, + mtx[ 8] / 4096.0f, + mtx[ 9] / 4096.0f, + mtx[10] / 4096.0f, + mtx[11] / 4096.0f, - mtxPtr[12] / 4096.0f, - mtxPtr[13] / 4096.0f, - mtxPtr[14] / 4096.0f, - mtxPtr[15] / 4096.0f + mtx[12] / 4096.0f, + mtx[13] / 4096.0f, + mtx[14] / 4096.0f, + mtx[15] / 4096.0f }; - const float x = vecPtr[0]; - const float y = vecPtr[1]; - const float z = vecPtr[2]; - const float w = vecPtr[3]; + const float x = vec[0]; + const float y = vec[1]; + const float z = vec[2]; + const float w = vec[3]; - vecPtr[0] = (x * mtxFloat[0]) + (y * mtxFloat[4]) + (z * mtxFloat[ 8]) + (w * mtxFloat[12]); - vecPtr[1] = (x * mtxFloat[1]) + (y * mtxFloat[5]) + (z * mtxFloat[ 9]) + (w * mtxFloat[13]); - vecPtr[2] = (x * mtxFloat[2]) + (y * mtxFloat[6]) + (z * mtxFloat[10]) + (w * mtxFloat[14]); - vecPtr[3] = (x * mtxFloat[3]) + (y * mtxFloat[7]) + (z * mtxFloat[11]) + (w * mtxFloat[15]); + vec[0] = (x * mtxFloat[0]) + (y * mtxFloat[4]) + (z * mtxFloat[ 8]) + (w * mtxFloat[12]); + vec[1] = (x * mtxFloat[1]) + (y * mtxFloat[5]) + (z * mtxFloat[ 9]) + (w * mtxFloat[13]); + vec[2] = (x * mtxFloat[2]) + (y * mtxFloat[6]) + (z * mtxFloat[10]) + (w * mtxFloat[14]); + vec[3] = (x * mtxFloat[3]) + (y * mtxFloat[7]) + (z * mtxFloat[11]) + (w * mtxFloat[15]); } #ifdef ENABLE_SSE -void MatrixMultVec4x4(const s32 *__restrict mtxPtr, float *__restrict vecPtr) +void MatrixMultVec4x4(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]) { - const __m128 loadedVecPtr = _mm_load_ps(vecPtr); + const __m128 loadedVec = _mm_load_ps(vec); const __m128 convertScalar = _mm_set1_ps(1.0f/4096.0f); #ifdef ENABLE_SSE2 __m128 row[4] = { - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 0)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 4)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 8)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 12)) ) + 
_mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtx + 0)) ), + _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtx + 4)) ), + _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtx + 8)) ), + _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtx + 12)) ) }; #else const CACHE_ALIGN float mtxFloat[16] = { - (float)mtxPtr[0], - (float)mtxPtr[1], - (float)mtxPtr[2], - (float)mtxPtr[3], + (float)mtx[0], + (float)mtx[1], + (float)mtx[2], + (float)mtx[3], - (float)mtxPtr[4], - (float)mtxPtr[5], - (float)mtxPtr[6], - (float)mtxPtr[7], + (float)mtx[4], + (float)mtx[5], + (float)mtx[6], + (float)mtx[7], - (float)mtxPtr[8], - (float)mtxPtr[9], - (float)mtxPtr[10], - (float)mtxPtr[11], + (float)mtx[8], + (float)mtx[9], + (float)mtx[10], + (float)mtx[11], - (float)mtxPtr[12], - (float)mtxPtr[13], - (float)mtxPtr[14], - (float)mtxPtr[15] + (float)mtx[12], + (float)mtx[13], + (float)mtx[14], + (float)mtx[15] }; __m128 row[4] = { @@ -356,49 +344,49 @@ void MatrixMultVec4x4(const s32 *__restrict mtxPtr, float *__restrict vecPtr) row[2] = _mm_mul_ps(row[2], convertScalar); row[3] = _mm_mul_ps(row[3], convertScalar); - const __m128 vec[4] = { - _mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x00), - _mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x55), - _mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0xAA), - _mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0xFF) + const __m128 scalar[4] = { + _mm_shuffle_ps(loadedVec, loadedVec, 0x00), + _mm_shuffle_ps(loadedVec, loadedVec, 0x55), + _mm_shuffle_ps(loadedVec, loadedVec, 0xAA), + _mm_shuffle_ps(loadedVec, loadedVec, 0xFF) }; - const __m128 calcVec = _mm_add_ps( _mm_mul_ps(row[0], vec[0]), _mm_add_ps(_mm_mul_ps(row[1], vec[1]), _mm_add_ps(_mm_mul_ps(row[2], vec[2]), _mm_mul_ps(row[3], vec[3]))) ); - _mm_store_ps(vecPtr, calcVec); + const __m128 calcVec = _mm_add_ps( _mm_mul_ps(row[0], scalar[0]), _mm_add_ps(_mm_mul_ps(row[1], scalar[1]), _mm_add_ps(_mm_mul_ps(row[2], scalar[2]), _mm_mul_ps(row[3], scalar[3]))) ); + _mm_store_ps(vec, calcVec); } -void MatrixMultVec3x3(const s32 *__restrict mtxPtr, float *__restrict vecPtr) +void MatrixMultVec3x3(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]) { - const __m128 loadedVecPtr = _mm_load_ps(vecPtr); + const __m128 loadedVec = _mm_load_ps(vec); const __m128 convertScalar = _mm_set1_ps(1.0f/4096.0f); #ifdef ENABLE_SSE2 __m128 row[3] = { - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 0)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 4)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtr + 8)) ) + _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtx + 0)) ), + _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtx + 4)) ), + _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtx + 8)) ) }; #else const CACHE_ALIGN float mtxFloat[16] = { - (float)mtxPtr[0], - (float)mtxPtr[1], - (float)mtxPtr[2], - (float)mtxPtr[3], + (float)mtx[0], + (float)mtx[1], + (float)mtx[2], + (float)mtx[3], - (float)mtxPtr[4], - (float)mtxPtr[5], - (float)mtxPtr[6], - (float)mtxPtr[7], + (float)mtx[4], + (float)mtx[5], + (float)mtx[6], + (float)mtx[7], - (float)mtxPtr[8], - (float)mtxPtr[9], - (float)mtxPtr[10], - (float)mtxPtr[11], + (float)mtx[8], + (float)mtx[9], + (float)mtx[10], + (float)mtx[11], - (float)mtxPtr[12], - (float)mtxPtr[13], - (float)mtxPtr[14], - (float)mtxPtr[15] + (float)mtx[12], + (float)mtx[13], + (float)mtx[14], + (float)mtx[15] }; __m128 row[3] = { @@ -412,79 +400,79 @@ void MatrixMultVec3x3(const s32 *__restrict mtxPtr, float *__restrict vecPtr) row[1] = _mm_mul_ps(row[1], convertScalar); row[2] = _mm_mul_ps(row[2], convertScalar); - const 
__m128 vec[3] = { - _mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x00), - _mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0x55), - _mm_shuffle_ps(loadedVecPtr, loadedVecPtr, 0xAA) + const __m128 scalar[3] = { + _mm_shuffle_ps(loadedVec, loadedVec, 0x00), + _mm_shuffle_ps(loadedVec, loadedVec, 0x55), + _mm_shuffle_ps(loadedVec, loadedVec, 0xAA) }; - const __m128 calcVec = _mm_add_ps( _mm_mul_ps(row[0], vec[0]), _mm_add_ps(_mm_mul_ps(row[1], vec[1]), _mm_mul_ps(row[2], vec[2])) ); - _mm_store_ps(vecPtr, calcVec); + const __m128 calcVec = _mm_add_ps( _mm_mul_ps(row[0], scalar[0]), _mm_add_ps(_mm_mul_ps(row[1], scalar[1]), _mm_mul_ps(row[2], scalar[2])) ); + _mm_store_ps(vec, calcVec); } -void MatrixTranslate(float *__restrict mtxPtr, const float *__restrict vecPtr) +void MatrixTranslate(float (&__restrict mtx)[16], const float (&__restrict vec)[4]) { - __m128 xmm4 = _mm_load_ps(vecPtr); + __m128 xmm4 = _mm_load_ps(vec); __m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101)); __m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010)); xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000)); - xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(mtxPtr)); - xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(mtxPtr+4)); - xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(mtxPtr+8)); + xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(mtx)); + xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(mtx+4)); + xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(mtx+8)); xmm4 = _mm_add_ps(xmm4,xmm5); xmm4 = _mm_add_ps(xmm4,xmm6); - xmm4 = _mm_add_ps(xmm4,_mm_load_ps(mtxPtr+12)); - _mm_store_ps(mtxPtr+12,xmm4); + xmm4 = _mm_add_ps(xmm4,_mm_load_ps(mtx+12)); + _mm_store_ps(mtx+12,xmm4); } -void MatrixScale(float *__restrict mtxPtr, const float *__restrict vecPtr) +void MatrixScale(float (&__restrict mtx)[16], const float (&__restrict vec)[4]) { - __m128 xmm4 = _mm_load_ps(vecPtr); + __m128 xmm4 = _mm_load_ps(vec); __m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101)); __m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010)); xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000)); - xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(mtxPtr)); - xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(mtxPtr+4)); - xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(mtxPtr+8)); - _mm_store_ps(mtxPtr,xmm4); - _mm_store_ps(mtxPtr+4,xmm5); - _mm_store_ps(mtxPtr+8,xmm6); + xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(mtx)); + xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(mtx+4)); + xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(mtx+8)); + _mm_store_ps(mtx,xmm4); + _mm_store_ps(mtx+4,xmm5); + _mm_store_ps(mtx+8,xmm6); } -void MatrixMultiply(float *__restrict mtxPtrA, const s32 *__restrict mtxPtrB) +void MatrixMultiply(float (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16]) { const __m128 convertScale = _mm_set1_ps(1.0f/4096.0f); #ifdef ENABLE_SSE2 __m128 rowB[4] = { - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 0)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 4)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 8)) ), - _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxPtrB + 12)) ) + _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxB + 0)) ), + _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxB + 4)) ), + _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxB + 8)) ), + _mm_cvtepi32_ps( _mm_load_si128((__m128i *)(mtxB + 12)) ) }; #else const CACHE_ALIGN float mtxFloatB[16] = { - (float)mtxPtrB[0], - (float)mtxPtrB[1], - (float)mtxPtrB[2], - (float)mtxPtrB[3], + (float)mtxB[ 0], + (float)mtxB[ 1], + (float)mtxB[ 2], + (float)mtxB[ 3], - (float)mtxPtrB[4], - (float)mtxPtrB[5], - (float)mtxPtrB[6], - (float)mtxPtrB[7], + (float)mtxB[ 4], + (float)mtxB[ 5], + (float)mtxB[ 6], + (float)mtxB[ 7], 
- (float)mtxPtrB[8], - (float)mtxPtrB[9], - (float)mtxPtrB[10], - (float)mtxPtrB[11], + (float)mtxB[ 8], + (float)mtxB[ 9], + (float)mtxB[10], + (float)mtxB[11], - (float)mtxPtrB[12], - (float)mtxPtrB[13], - (float)mtxPtrB[14], - (float)mtxPtrB[15] + (float)mtxB[12], + (float)mtxB[13], + (float)mtxB[14], + (float)mtxB[15] }; __m128 rowB[4] = { @@ -501,10 +489,10 @@ void MatrixMultiply(float *__restrict mtxPtrA, const s32 *__restrict mtxPtrB) rowB[3] = _mm_mul_ps(rowB[3], convertScale); __m128 rowA[4] = { - _mm_load_ps(mtxPtrA + 0), - _mm_load_ps(mtxPtrA + 4), - _mm_load_ps(mtxPtrA + 8), - _mm_load_ps(mtxPtrA + 12) + _mm_load_ps(mtxA + 0), + _mm_load_ps(mtxA + 4), + _mm_load_ps(mtxA + 8), + _mm_load_ps(mtxA + 12) }; __m128 vecB[4]; @@ -515,243 +503,533 @@ void MatrixMultiply(float *__restrict mtxPtrA, const s32 *__restrict mtxPtrB) vecB[2] = _mm_shuffle_ps(rowB[0], rowB[0], 0xAA); vecB[3] = _mm_shuffle_ps(rowB[0], rowB[0], 0xFF); calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) ); - _mm_store_ps(mtxPtrA + 0, calcRow); + _mm_store_ps(mtxA + 0, calcRow); vecB[0] = _mm_shuffle_ps(rowB[1], rowB[1], 0x00); vecB[1] = _mm_shuffle_ps(rowB[1], rowB[1], 0x55); vecB[2] = _mm_shuffle_ps(rowB[1], rowB[1], 0xAA); vecB[3] = _mm_shuffle_ps(rowB[1], rowB[1], 0xFF); calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) ); - _mm_store_ps(mtxPtrA + 4, calcRow); + _mm_store_ps(mtxA + 4, calcRow); vecB[0] = _mm_shuffle_ps(rowB[2], rowB[2], 0x00); vecB[1] = _mm_shuffle_ps(rowB[2], rowB[2], 0x55); vecB[2] = _mm_shuffle_ps(rowB[2], rowB[2], 0xAA); vecB[3] = _mm_shuffle_ps(rowB[2], rowB[2], 0xFF); calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) ); - _mm_store_ps(mtxPtrA + 8, calcRow); + _mm_store_ps(mtxA + 8, calcRow); vecB[0] = _mm_shuffle_ps(rowB[3], rowB[3], 0x00); vecB[1] = _mm_shuffle_ps(rowB[3], rowB[3], 0x55); vecB[2] = _mm_shuffle_ps(rowB[3], rowB[3], 0xAA); vecB[3] = _mm_shuffle_ps(rowB[3], rowB[3], 0xFF); calcRow = _mm_add_ps( _mm_mul_ps(rowA[0], vecB[0]), _mm_add_ps(_mm_mul_ps(rowA[1], vecB[1]), _mm_add_ps(_mm_mul_ps(rowA[2], vecB[2]), _mm_mul_ps(rowA[3], vecB[3]))) ); - _mm_store_ps(mtxPtrA + 12, calcRow); + _mm_store_ps(mtxA + 12, calcRow); } template -FORCEINLINE void vector_fix2float(float *mtxPtr, const float divisor) +FORCEINLINE void vector_fix2float(float (&mtx)[16], const float divisor) { const __m128 divisor_v128 = _mm_set1_ps(divisor); for (size_t i = 0; i < NUM_ROWS * 4; i+=4) { - _mm_store_ps( mtxPtr + i, _mm_div_ps(_mm_load_ps(mtxPtr + i), divisor_v128) ); + _mm_store_ps( mtx + i, _mm_div_ps(_mm_load_ps(mtx + i), divisor_v128) ); } } #else -void MatrixMultVec4x4(const s32 *__restrict mtxPtr, float *__restrict vecPtr) +void MatrixMultVec4x4(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]) { - _MatrixMultVec4x4_NoSIMD(mtxPtr, vecPtr); + _MatrixMultVec4x4_NoSIMD(mtx, vec); } -void MatrixMultVec3x3(const s32 *__restrict mtxPtr, float *__restrict vecPtr) +void MatrixMultVec3x3(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]) { const CACHE_ALIGN float mtxFloat[16] = { - mtxPtr[ 0] / 4096.0f, - mtxPtr[ 1] / 4096.0f, - mtxPtr[ 2] / 4096.0f, - mtxPtr[ 3] / 4096.0f, + mtx[ 0] / 4096.0f, + mtx[ 1] / 4096.0f, + mtx[ 2] / 4096.0f, + mtx[ 3] / 
4096.0f, - mtxPtr[ 4] / 4096.0f, - mtxPtr[ 5] / 4096.0f, - mtxPtr[ 6] / 4096.0f, - mtxPtr[ 7] / 4096.0f, + mtx[ 4] / 4096.0f, + mtx[ 5] / 4096.0f, + mtx[ 6] / 4096.0f, + mtx[ 7] / 4096.0f, - mtxPtr[ 8] / 4096.0f, - mtxPtr[ 9] / 4096.0f, - mtxPtr[10] / 4096.0f, - mtxPtr[11] / 4096.0f, + mtx[ 8] / 4096.0f, + mtx[ 9] / 4096.0f, + mtx[10] / 4096.0f, + mtx[11] / 4096.0f, - mtxPtr[12] / 4096.0f, - mtxPtr[13] / 4096.0f, - mtxPtr[14] / 4096.0f, - mtxPtr[15] / 4096.0f + mtx[12] / 4096.0f, + mtx[13] / 4096.0f, + mtx[14] / 4096.0f, + mtx[15] / 4096.0f }; - const float x = vecPtr[0]; - const float y = vecPtr[1]; - const float z = vecPtr[2]; + const float x = vec[0]; + const float y = vec[1]; + const float z = vec[2]; - vecPtr[0] = (x * mtxFloat[0]) + (y * mtxFloat[4]) + (z * mtxFloat[ 8]); - vecPtr[1] = (x * mtxFloat[1]) + (y * mtxFloat[5]) + (z * mtxFloat[ 9]); - vecPtr[2] = (x * mtxFloat[2]) + (y * mtxFloat[6]) + (z * mtxFloat[10]); + vec[0] = (x * mtxFloat[0]) + (y * mtxFloat[4]) + (z * mtxFloat[ 8]); + vec[1] = (x * mtxFloat[1]) + (y * mtxFloat[5]) + (z * mtxFloat[ 9]); + vec[2] = (x * mtxFloat[2]) + (y * mtxFloat[6]) + (z * mtxFloat[10]); } -void MatrixTranslate(float *__restrict mtxPtr, const float *__restrict vecPtr) +void MatrixTranslate(float (&__restrict mtx)[16], const float (&__restrict vec)[4]) { - mtxPtr[12] += (mtxPtr[0] * vecPtr[0]) + (mtxPtr[4] * vecPtr[1]) + (mtxPtr[ 8] * vecPtr[2]); - mtxPtr[13] += (mtxPtr[1] * vecPtr[0]) + (mtxPtr[5] * vecPtr[1]) + (mtxPtr[ 9] * vecPtr[2]); - mtxPtr[14] += (mtxPtr[2] * vecPtr[0]) + (mtxPtr[6] * vecPtr[1]) + (mtxPtr[10] * vecPtr[2]); - mtxPtr[15] += (mtxPtr[3] * vecPtr[0]) + (mtxPtr[7] * vecPtr[1]) + (mtxPtr[11] * vecPtr[2]); + mtx[12] += (mtx[0] * vec[0]) + (mtx[4] * vec[1]) + (mtx[ 8] * vec[2]); + mtx[13] += (mtx[1] * vec[0]) + (mtx[5] * vec[1]) + (mtx[ 9] * vec[2]); + mtx[14] += (mtx[2] * vec[0]) + (mtx[6] * vec[1]) + (mtx[10] * vec[2]); + mtx[15] += (mtx[3] * vec[0]) + (mtx[7] * vec[1]) + (mtx[11] * vec[2]); } -void MatrixScale(float *__restrict mtxPtr, const float *__restrict vecPtr) +void MatrixScale(float (&__restrict mtx)[16], const float (&__restrict vec)[4]) { - mtxPtr[ 0] *= vecPtr[0]; - mtxPtr[ 1] *= vecPtr[0]; - mtxPtr[ 2] *= vecPtr[0]; - mtxPtr[ 3] *= vecPtr[0]; + mtx[ 0] *= vec[0]; + mtx[ 1] *= vec[0]; + mtx[ 2] *= vec[0]; + mtx[ 3] *= vec[0]; - mtxPtr[ 4] *= vecPtr[1]; - mtxPtr[ 5] *= vecPtr[1]; - mtxPtr[ 6] *= vecPtr[1]; - mtxPtr[ 7] *= vecPtr[1]; + mtx[ 4] *= vec[1]; + mtx[ 5] *= vec[1]; + mtx[ 6] *= vec[1]; + mtx[ 7] *= vec[1]; - mtxPtr[ 8] *= vecPtr[2]; - mtxPtr[ 9] *= vecPtr[2]; - mtxPtr[10] *= vecPtr[2]; - mtxPtr[11] *= vecPtr[2]; + mtx[ 8] *= vec[2]; + mtx[ 9] *= vec[2]; + mtx[10] *= vec[2]; + mtx[11] *= vec[2]; } -void MatrixMultiply(float *__restrict mtxPtrA, const s32 *__restrict mtxPtrB) +void MatrixMultiply(float (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16]) { const CACHE_ALIGN float mtxFloatB[16] = { - (float)mtxPtrB[ 0], - (float)mtxPtrB[ 1], - (float)mtxPtrB[ 2], - (float)mtxPtrB[ 3], + (float)mtxB[ 0], + (float)mtxB[ 1], + (float)mtxB[ 2], + (float)mtxB[ 3], - (float)mtxPtrB[ 4], - (float)mtxPtrB[ 5], - (float)mtxPtrB[ 6], - (float)mtxPtrB[ 7], + (float)mtxB[ 4], + (float)mtxB[ 5], + (float)mtxB[ 6], + (float)mtxB[ 7], - (float)mtxPtrB[ 8], - (float)mtxPtrB[ 9], - (float)mtxPtrB[10], - (float)mtxPtrB[11], + (float)mtxB[ 8], + (float)mtxB[ 9], + (float)mtxB[10], + (float)mtxB[11], - (float)mtxPtrB[12], - (float)mtxPtrB[13], - (float)mtxPtrB[14], - (float)mtxPtrB[15] + (float)mtxB[12], + (float)mtxB[13], + 
(float)mtxB[14], + (float)mtxB[15] }; - float tmpMatrix[16]; + CACHE_ALIGN float tmpMatrix[16]; - tmpMatrix[0] = (mtxPtrA[ 0] * mtxFloatB[ 0]) + (mtxPtrA[ 4] * mtxFloatB[ 1]) + (mtxPtrA[ 8] * mtxFloatB[ 2]) + (mtxPtrA[12] * mtxFloatB[ 3]); - tmpMatrix[1] = (mtxPtrA[ 1] * mtxFloatB[ 0]) + (mtxPtrA[ 5] * mtxFloatB[ 1]) + (mtxPtrA[ 9] * mtxFloatB[ 2]) + (mtxPtrA[13] * mtxFloatB[ 3]); - tmpMatrix[2] = (mtxPtrA[ 2] * mtxFloatB[ 0]) + (mtxPtrA[ 6] * mtxFloatB[ 1]) + (mtxPtrA[10] * mtxFloatB[ 2]) + (mtxPtrA[14] * mtxFloatB[ 3]); - tmpMatrix[3] = (mtxPtrA[ 3] * mtxFloatB[ 0]) + (mtxPtrA[ 7] * mtxFloatB[ 1]) + (mtxPtrA[11] * mtxFloatB[ 2]) + (mtxPtrA[15] * mtxFloatB[ 3]); + tmpMatrix[0] = (mtxA[ 0] * mtxFloatB[ 0]) + (mtxA[ 4] * mtxFloatB[ 1]) + (mtxA[ 8] * mtxFloatB[ 2]) + (mtxA[12] * mtxFloatB[ 3]); + tmpMatrix[1] = (mtxA[ 1] * mtxFloatB[ 0]) + (mtxA[ 5] * mtxFloatB[ 1]) + (mtxA[ 9] * mtxFloatB[ 2]) + (mtxA[13] * mtxFloatB[ 3]); + tmpMatrix[2] = (mtxA[ 2] * mtxFloatB[ 0]) + (mtxA[ 6] * mtxFloatB[ 1]) + (mtxA[10] * mtxFloatB[ 2]) + (mtxA[14] * mtxFloatB[ 3]); + tmpMatrix[3] = (mtxA[ 3] * mtxFloatB[ 0]) + (mtxA[ 7] * mtxFloatB[ 1]) + (mtxA[11] * mtxFloatB[ 2]) + (mtxA[15] * mtxFloatB[ 3]); - tmpMatrix[4] = (mtxPtrA[ 0] * mtxFloatB[ 4]) + (mtxPtrA[ 4] * mtxFloatB[ 5]) + (mtxPtrA[ 8] * mtxFloatB[ 6]) + (mtxPtrA[12] * mtxFloatB[ 7]); - tmpMatrix[5] = (mtxPtrA[ 1] * mtxFloatB[ 4]) + (mtxPtrA[ 5] * mtxFloatB[ 5]) + (mtxPtrA[ 9] * mtxFloatB[ 6]) + (mtxPtrA[13] * mtxFloatB[ 7]); - tmpMatrix[6] = (mtxPtrA[ 2] * mtxFloatB[ 4]) + (mtxPtrA[ 6] * mtxFloatB[ 5]) + (mtxPtrA[10] * mtxFloatB[ 6]) + (mtxPtrA[14] * mtxFloatB[ 7]); - tmpMatrix[7] = (mtxPtrA[ 3] * mtxFloatB[ 4]) + (mtxPtrA[ 7] * mtxFloatB[ 5]) + (mtxPtrA[11] * mtxFloatB[ 6]) + (mtxPtrA[15] * mtxFloatB[ 7]); + tmpMatrix[4] = (mtxA[ 0] * mtxFloatB[ 4]) + (mtxA[ 4] * mtxFloatB[ 5]) + (mtxA[ 8] * mtxFloatB[ 6]) + (mtxA[12] * mtxFloatB[ 7]); + tmpMatrix[5] = (mtxA[ 1] * mtxFloatB[ 4]) + (mtxA[ 5] * mtxFloatB[ 5]) + (mtxA[ 9] * mtxFloatB[ 6]) + (mtxA[13] * mtxFloatB[ 7]); + tmpMatrix[6] = (mtxA[ 2] * mtxFloatB[ 4]) + (mtxA[ 6] * mtxFloatB[ 5]) + (mtxA[10] * mtxFloatB[ 6]) + (mtxA[14] * mtxFloatB[ 7]); + tmpMatrix[7] = (mtxA[ 3] * mtxFloatB[ 4]) + (mtxA[ 7] * mtxFloatB[ 5]) + (mtxA[11] * mtxFloatB[ 6]) + (mtxA[15] * mtxFloatB[ 7]); - tmpMatrix[8] = (mtxPtrA[ 0] * mtxFloatB[ 8]) + (mtxPtrA[ 4] * mtxFloatB[ 9]) + (mtxPtrA[ 8] * mtxFloatB[10]) + (mtxPtrA[12] * mtxFloatB[11]); - tmpMatrix[9] = (mtxPtrA[ 1] * mtxFloatB[ 8]) + (mtxPtrA[ 5] * mtxFloatB[ 9]) + (mtxPtrA[ 9] * mtxFloatB[10]) + (mtxPtrA[13] * mtxFloatB[11]); - tmpMatrix[10] = (mtxPtrA[ 2] * mtxFloatB[ 8]) + (mtxPtrA[ 6] * mtxFloatB[ 9]) + (mtxPtrA[10] * mtxFloatB[10]) + (mtxPtrA[14] * mtxFloatB[11]); - tmpMatrix[11] = (mtxPtrA[ 3] * mtxFloatB[ 8]) + (mtxPtrA[ 7] * mtxFloatB[ 9]) + (mtxPtrA[11] * mtxFloatB[10]) + (mtxPtrA[15] * mtxFloatB[11]); + tmpMatrix[8] = (mtxA[ 0] * mtxFloatB[ 8]) + (mtxA[ 4] * mtxFloatB[ 9]) + (mtxA[ 8] * mtxFloatB[10]) + (mtxA[12] * mtxFloatB[11]); + tmpMatrix[9] = (mtxA[ 1] * mtxFloatB[ 8]) + (mtxA[ 5] * mtxFloatB[ 9]) + (mtxA[ 9] * mtxFloatB[10]) + (mtxA[13] * mtxFloatB[11]); + tmpMatrix[10] = (mtxA[ 2] * mtxFloatB[ 8]) + (mtxA[ 6] * mtxFloatB[ 9]) + (mtxA[10] * mtxFloatB[10]) + (mtxA[14] * mtxFloatB[11]); + tmpMatrix[11] = (mtxA[ 3] * mtxFloatB[ 8]) + (mtxA[ 7] * mtxFloatB[ 9]) + (mtxA[11] * mtxFloatB[10]) + (mtxA[15] * mtxFloatB[11]); - tmpMatrix[12] = (mtxPtrA[ 0] * mtxFloatB[12]) + (mtxPtrA[ 4] * mtxFloatB[13]) + (mtxPtrA[ 8] * mtxFloatB[14]) + (mtxPtrA[12] * 
mtxFloatB[15]); - tmpMatrix[13] = (mtxPtrA[ 1] * mtxFloatB[12]) + (mtxPtrA[ 5] * mtxFloatB[13]) + (mtxPtrA[ 9] * mtxFloatB[14]) + (mtxPtrA[13] * mtxFloatB[15]); - tmpMatrix[14] = (mtxPtrA[ 2] * mtxFloatB[12]) + (mtxPtrA[ 6] * mtxFloatB[13]) + (mtxPtrA[10] * mtxFloatB[14]) + (mtxPtrA[14] * mtxFloatB[15]); - tmpMatrix[15] = (mtxPtrA[ 3] * mtxFloatB[12]) + (mtxPtrA[ 7] * mtxFloatB[13]) + (mtxPtrA[11] * mtxFloatB[14]) + (mtxPtrA[15] * mtxFloatB[15]); + tmpMatrix[12] = (mtxA[ 0] * mtxFloatB[12]) + (mtxA[ 4] * mtxFloatB[13]) + (mtxA[ 8] * mtxFloatB[14]) + (mtxA[12] * mtxFloatB[15]); + tmpMatrix[13] = (mtxA[ 1] * mtxFloatB[12]) + (mtxA[ 5] * mtxFloatB[13]) + (mtxA[ 9] * mtxFloatB[14]) + (mtxA[13] * mtxFloatB[15]); + tmpMatrix[14] = (mtxA[ 2] * mtxFloatB[12]) + (mtxA[ 6] * mtxFloatB[13]) + (mtxA[10] * mtxFloatB[14]) + (mtxA[14] * mtxFloatB[15]); + tmpMatrix[15] = (mtxA[ 3] * mtxFloatB[12]) + (mtxA[ 7] * mtxFloatB[13]) + (mtxA[11] * mtxFloatB[14]) + (mtxA[15] * mtxFloatB[15]); - memcpy(mtxPtrA, tmpMatrix, sizeof(float)*16); + memcpy(mtxA, tmpMatrix, sizeof(float)*16); } template -FORCEINLINE void vector_fix2float(float *mtxPtr, const float divisor) +FORCEINLINE void vector_fix2float(float (&mtx)[16], const float divisor) { for (size_t i = 0; i < NUM_ROWS * 4; i+=4) { - mtxPtr[i+0] /= divisor; - mtxPtr[i+1] /= divisor; - mtxPtr[i+2] /= divisor; - mtxPtr[i+3] /= divisor; + mtx[i+0] /= divisor; + mtx[i+1] /= divisor; + mtx[i+2] /= divisor; + mtx[i+3] /= divisor; } } #endif -void MatrixMultVec4x4(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr) +#ifdef ENABLE_SSE4_1 + +FORCEINLINE void _Vec4_MultiplyByMatrix(__m128i &outVec, + const __m128i &c0, const __m128i &c1, const __m128i &c2, const __m128i &c3, + const __m128i &rowLo0, const __m128i &rowLo1, const __m128i &rowLo2, const __m128i &rowLo3, + const __m128i &rowHi0, const __m128i &rowHi1, const __m128i &rowHi2, const __m128i &rowHi3) { - const s32 x = vecPtr[0]; - const s32 y = vecPtr[1]; - const s32 z = vecPtr[2]; - const s32 w = vecPtr[3]; + __m128i outVecLo = _mm_add_epi64( _mm_add_epi64(_mm_mul_epi32(rowLo0, c0), _mm_mul_epi32(rowLo1, c1)), _mm_add_epi64(_mm_mul_epi32(rowLo2, c2), _mm_mul_epi32(rowLo3, c3)) ); + outVecLo = _mm_srli_epi64(outVecLo, 12); + outVecLo = _mm_shuffle_epi32(outVecLo, 0xD8); - vecPtr[0] = sfx32_shiftdown( fx32_mul(x,mtxPtr[0]) + fx32_mul(y,mtxPtr[4]) + fx32_mul(z,mtxPtr[ 8]) + fx32_mul(w,mtxPtr[12]) ); - vecPtr[1] = sfx32_shiftdown( fx32_mul(x,mtxPtr[1]) + fx32_mul(y,mtxPtr[5]) + fx32_mul(z,mtxPtr[ 9]) + fx32_mul(w,mtxPtr[13]) ); - vecPtr[2] = sfx32_shiftdown( fx32_mul(x,mtxPtr[2]) + fx32_mul(y,mtxPtr[6]) + fx32_mul(z,mtxPtr[10]) + fx32_mul(w,mtxPtr[14]) ); - vecPtr[3] = sfx32_shiftdown( fx32_mul(x,mtxPtr[3]) + fx32_mul(y,mtxPtr[7]) + fx32_mul(z,mtxPtr[11]) + fx32_mul(w,mtxPtr[15]) ); + __m128i outVecHi = _mm_add_epi64( _mm_add_epi64(_mm_mul_epi32(rowHi0, c0), _mm_mul_epi32(rowHi1, c1)), _mm_add_epi64(_mm_mul_epi32(rowHi2, c2), _mm_mul_epi32(rowHi3, c3)) ); + outVecHi = _mm_srli_epi64(outVecHi, 12); + outVecHi = _mm_shuffle_epi32(outVecHi, 0x8D); + + outVec = _mm_blendv_epi8(outVecLo, outVecHi, _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)); } -void MatrixMultVec3x3(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr) +static s32 GEM_SaturateAndShiftdown36To32(const s64 val) { - const s32 x = vecPtr[0]; - const s32 y = vecPtr[1]; - const s32 z = vecPtr[2]; + if(val>(s64)0x000007FFFFFFFFFFULL) return (s32)0x7FFFFFFFU; + if(val<(s64)0xFFFFF80000000000ULL) return (s32)0x80000000U; - vecPtr[0] = sfx32_shiftdown( 
fx32_mul(x,mtxPtr[0]) + fx32_mul(y,mtxPtr[4]) + fx32_mul(z,mtxPtr[ 8]) ); - vecPtr[1] = sfx32_shiftdown( fx32_mul(x,mtxPtr[1]) + fx32_mul(y,mtxPtr[5]) + fx32_mul(z,mtxPtr[ 9]) ); - vecPtr[2] = sfx32_shiftdown( fx32_mul(x,mtxPtr[2]) + fx32_mul(y,mtxPtr[6]) + fx32_mul(z,mtxPtr[10]) ); + return fx32_shiftdown(val); } -void MatrixTranslate(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr) +FORCEINLINE void _Vec3_MultiplyByMatrix(__m128i &outVec, + const __m128i &c0, const __m128i &c1, const __m128i &c2, + const __m128i &rowLo0, const __m128i &rowLo1, const __m128i &rowLo2, + const __m128i &rowHi0, const __m128i &rowHi1, const __m128i &rowHi2) { - mtxPtr[12] = sfx32_shiftdown( fx32_mul(mtxPtr[0], vecPtr[0]) + fx32_mul(mtxPtr[4], vecPtr[1]) + fx32_mul(mtxPtr[ 8], vecPtr[2]) + fx32_shiftup(mtxPtr[12]) ); - mtxPtr[13] = sfx32_shiftdown( fx32_mul(mtxPtr[1], vecPtr[0]) + fx32_mul(mtxPtr[5], vecPtr[1]) + fx32_mul(mtxPtr[ 9], vecPtr[2]) + fx32_shiftup(mtxPtr[13]) ); - mtxPtr[14] = sfx32_shiftdown( fx32_mul(mtxPtr[2], vecPtr[0]) + fx32_mul(mtxPtr[6], vecPtr[1]) + fx32_mul(mtxPtr[10], vecPtr[2]) + fx32_shiftup(mtxPtr[14]) ); - mtxPtr[15] = sfx32_shiftdown( fx32_mul(mtxPtr[3], vecPtr[0]) + fx32_mul(mtxPtr[7], vecPtr[1]) + fx32_mul(mtxPtr[11], vecPtr[2]) + fx32_shiftup(mtxPtr[15]) ); + __m128i outVecLo = _mm_add_epi64( _mm_mul_epi32(rowLo0, c0), _mm_add_epi64(_mm_mul_epi32(rowLo1, c1), _mm_mul_epi32(rowLo2, c2)) ); + outVecLo = _mm_srli_epi64(outVecLo, 12); + outVecLo = _mm_shuffle_epi32(outVecLo, 0xD8); + + __m128i outVecHi = _mm_add_epi64( _mm_mul_epi32(rowHi0, c0), _mm_add_epi64(_mm_mul_epi32(rowLo1, c1), _mm_mul_epi32(rowHi2, c2)) ); + outVecHi = _mm_srli_epi64(outVecHi, 12); + outVecHi = _mm_shuffle_epi32(outVecHi, 0x8D); + + outVec = _mm_blendv_epi8(outVecLo, outVecHi, _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)); } -void MatrixScale(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr) +FORCEINLINE void _Vec4_Translate(__m128i &outVec, + const __m128i &c0, const __m128i &c1, const __m128i &c2, + const __m128i &rowLo0, const __m128i &rowLo1, const __m128i &rowLo2, const __m128i &rowLo3, + const __m128i &rowHi0, const __m128i &rowHi1, const __m128i &rowHi2, const __m128i &rowHi3) { - mtxPtr[ 0] = sfx32_shiftdown( fx32_mul(mtxPtr[ 0], vecPtr[0]) ); - mtxPtr[ 1] = sfx32_shiftdown( fx32_mul(mtxPtr[ 1], vecPtr[0]) ); - mtxPtr[ 2] = sfx32_shiftdown( fx32_mul(mtxPtr[ 2], vecPtr[0]) ); - mtxPtr[ 3] = sfx32_shiftdown( fx32_mul(mtxPtr[ 3], vecPtr[0]) ); + __m128i outVecLo = _mm_add_epi64( _mm_add_epi64(_mm_mul_epi32(rowLo0, c0), _mm_mul_epi32(rowLo1, c1)), _mm_add_epi64(_mm_mul_epi32(rowLo2, c2), _mm_slli_epi64(rowLo3, 12)) ); + outVecLo = _mm_srli_epi64(outVecLo, 12); + outVecLo = _mm_shuffle_epi32(outVecLo, 0xD8); - mtxPtr[ 4] = sfx32_shiftdown( fx32_mul(mtxPtr[ 4], vecPtr[1]) ); - mtxPtr[ 5] = sfx32_shiftdown( fx32_mul(mtxPtr[ 5], vecPtr[1]) ); - mtxPtr[ 6] = sfx32_shiftdown( fx32_mul(mtxPtr[ 6], vecPtr[1]) ); - mtxPtr[ 7] = sfx32_shiftdown( fx32_mul(mtxPtr[ 7], vecPtr[1]) ); + __m128i outVecHi = _mm_add_epi64( _mm_add_epi64(_mm_mul_epi32(rowHi0, c0), _mm_mul_epi32(rowHi1, c1)), _mm_add_epi64(_mm_mul_epi32(rowHi2, c2), _mm_slli_epi64(rowHi3, 12)) ); + outVecHi = _mm_srli_epi64(outVecHi, 12); + outVecHi = _mm_shuffle_epi32(outVecHi, 0x8D); - mtxPtr[ 8] = sfx32_shiftdown( fx32_mul(mtxPtr[ 8], vecPtr[2]) ); - mtxPtr[ 9] = sfx32_shiftdown( fx32_mul(mtxPtr[ 9], vecPtr[2]) ); - mtxPtr[10] = sfx32_shiftdown( fx32_mul(mtxPtr[10], vecPtr[2]) ); - mtxPtr[11] = sfx32_shiftdown( fx32_mul(mtxPtr[11], vecPtr[2]) ); 
+ outVec = _mm_blendv_epi8(outVecLo, outVecHi, _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)); } -void MatrixMultiply(s32 *__restrict mtxPtrA, const s32 *__restrict mtxPtrB) +FORCEINLINE void _Vec4_Scale(__m128i &inoutVec, const __m128i &scalar) { - s32 tmpMatrix[16]; + __m128i outVecLo = _mm_cvtepu32_epi64(inoutVec); + __m128i outVecHi = _mm_cvtepu32_epi64( _mm_srli_si128(inoutVec, 8) ); - tmpMatrix[ 0] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[ 0])+fx32_mul(mtxPtrA[4],mtxPtrB[ 1])+fx32_mul(mtxPtrA[ 8],mtxPtrB[ 2])+fx32_mul(mtxPtrA[12],mtxPtrB[ 3]) ); - tmpMatrix[ 1] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[ 0])+fx32_mul(mtxPtrA[5],mtxPtrB[ 1])+fx32_mul(mtxPtrA[ 9],mtxPtrB[ 2])+fx32_mul(mtxPtrA[13],mtxPtrB[ 3]) ); - tmpMatrix[ 2] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[ 0])+fx32_mul(mtxPtrA[6],mtxPtrB[ 1])+fx32_mul(mtxPtrA[10],mtxPtrB[ 2])+fx32_mul(mtxPtrA[14],mtxPtrB[ 3]) ); - tmpMatrix[ 3] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[ 0])+fx32_mul(mtxPtrA[7],mtxPtrB[ 1])+fx32_mul(mtxPtrA[11],mtxPtrB[ 2])+fx32_mul(mtxPtrA[15],mtxPtrB[ 3]) ); + outVecLo = _mm_mul_epi32(outVecLo, scalar); + outVecLo = _mm_srli_epi64(outVecLo, 12); + outVecLo = _mm_shuffle_epi32(outVecLo, 0xD8); - tmpMatrix[ 4] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[ 4])+fx32_mul(mtxPtrA[4],mtxPtrB[ 5])+fx32_mul(mtxPtrA[ 8],mtxPtrB[ 6])+fx32_mul(mtxPtrA[12],mtxPtrB[ 7]) ); - tmpMatrix[ 5] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[ 4])+fx32_mul(mtxPtrA[5],mtxPtrB[ 5])+fx32_mul(mtxPtrA[ 9],mtxPtrB[ 6])+fx32_mul(mtxPtrA[13],mtxPtrB[ 7]) ); - tmpMatrix[ 6] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[ 4])+fx32_mul(mtxPtrA[6],mtxPtrB[ 5])+fx32_mul(mtxPtrA[10],mtxPtrB[ 6])+fx32_mul(mtxPtrA[14],mtxPtrB[ 7]) ); - tmpMatrix[ 7] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[ 4])+fx32_mul(mtxPtrA[7],mtxPtrB[ 5])+fx32_mul(mtxPtrA[11],mtxPtrB[ 6])+fx32_mul(mtxPtrA[15],mtxPtrB[ 7]) ); + outVecHi = _mm_mul_epi32(outVecHi, scalar); + outVecHi = _mm_srli_epi64(outVecHi, 12); + outVecHi = _mm_shuffle_epi32(outVecHi, 0x8D); - tmpMatrix[ 8] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[ 8])+fx32_mul(mtxPtrA[4],mtxPtrB[ 9])+fx32_mul(mtxPtrA[ 8],mtxPtrB[10])+fx32_mul(mtxPtrA[12],mtxPtrB[11]) ); - tmpMatrix[ 9] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[ 8])+fx32_mul(mtxPtrA[5],mtxPtrB[ 9])+fx32_mul(mtxPtrA[ 9],mtxPtrB[10])+fx32_mul(mtxPtrA[13],mtxPtrB[11]) ); - tmpMatrix[10] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[ 8])+fx32_mul(mtxPtrA[6],mtxPtrB[ 9])+fx32_mul(mtxPtrA[10],mtxPtrB[10])+fx32_mul(mtxPtrA[14],mtxPtrB[11]) ); - tmpMatrix[11] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[ 8])+fx32_mul(mtxPtrA[7],mtxPtrB[ 9])+fx32_mul(mtxPtrA[11],mtxPtrB[10])+fx32_mul(mtxPtrA[15],mtxPtrB[11]) ); - - tmpMatrix[12] = sfx32_shiftdown( fx32_mul(mtxPtrA[0],mtxPtrB[12])+fx32_mul(mtxPtrA[4],mtxPtrB[13])+fx32_mul(mtxPtrA[ 8],mtxPtrB[14])+fx32_mul(mtxPtrA[12],mtxPtrB[15]) ); - tmpMatrix[13] = sfx32_shiftdown( fx32_mul(mtxPtrA[1],mtxPtrB[12])+fx32_mul(mtxPtrA[5],mtxPtrB[13])+fx32_mul(mtxPtrA[ 9],mtxPtrB[14])+fx32_mul(mtxPtrA[13],mtxPtrB[15]) ); - tmpMatrix[14] = sfx32_shiftdown( fx32_mul(mtxPtrA[2],mtxPtrB[12])+fx32_mul(mtxPtrA[6],mtxPtrB[13])+fx32_mul(mtxPtrA[10],mtxPtrB[14])+fx32_mul(mtxPtrA[14],mtxPtrB[15]) ); - tmpMatrix[15] = sfx32_shiftdown( fx32_mul(mtxPtrA[3],mtxPtrB[12])+fx32_mul(mtxPtrA[7],mtxPtrB[13])+fx32_mul(mtxPtrA[11],mtxPtrB[14])+fx32_mul(mtxPtrA[15],mtxPtrB[15]) ); - - memcpy(mtxPtrA, tmpMatrix, sizeof(s32)*16); + inoutVec = _mm_blendv_epi8(outVecLo, outVecHi, _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 
0, 0)); } + +void MatrixMultVec4x4(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4]) +{ + const __m128i inVec = _mm_load_si128((__m128i *)vec); + + const __m128i scalar[4] = { + _mm_shuffle_epi32(inVec, 0x00), + _mm_shuffle_epi32(inVec, 0x55), + _mm_shuffle_epi32(inVec, 0xAA), + _mm_shuffle_epi32(inVec, 0xFF) + }; + + const __m128i row[4] = { + _mm_load_si128((__m128i *)(mtx + 0)), + _mm_load_si128((__m128i *)(mtx + 4)), + _mm_load_si128((__m128i *)(mtx + 8)), + _mm_load_si128((__m128i *)(mtx + 12)) + }; + + const __m128i rowLo[4] = { + _mm_cvtepu32_epi64(row[0]), + _mm_cvtepu32_epi64(row[1]), + _mm_cvtepu32_epi64(row[2]), + _mm_cvtepu32_epi64(row[3]) + }; + + const __m128i rowHi[4] = { + _mm_cvtepu32_epi64( _mm_srli_si128(row[0], 8)), + _mm_cvtepu32_epi64( _mm_srli_si128(row[1], 8)), + _mm_cvtepu32_epi64( _mm_srli_si128(row[2], 8)), + _mm_cvtepu32_epi64( _mm_srli_si128(row[3], 8)) + }; + + __m128i outVec; + _Vec4_MultiplyByMatrix(outVec, + scalar[0], scalar[1], scalar[2], scalar[3], + rowLo[0], rowLo[1], rowLo[2], rowLo[3], + rowHi[0], rowHi[1], rowHi[2], rowHi[3]); + + _mm_store_si128((__m128i *)vec, outVec); +} + +void MatrixMultVec3x3(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4]) +{ + const __m128i inVec = _mm_load_si128((__m128i *)vec); + + const __m128i scalar[3] = { + _mm_shuffle_epi32(inVec, 0x00), + _mm_shuffle_epi32(inVec, 0x55), + _mm_shuffle_epi32(inVec, 0xAA) + }; + + const __m128i row[3] = { + _mm_load_si128((__m128i *)(mtx + 0)), + _mm_load_si128((__m128i *)(mtx + 4)), + _mm_load_si128((__m128i *)(mtx + 8)) + }; + + const __m128i rowLo[3] = { + _mm_cvtepu32_epi64(row[0]), + _mm_cvtepu32_epi64(row[1]), + _mm_cvtepu32_epi64(row[2]) + }; + + const __m128i rowHi[3] = { + _mm_cvtepu32_epi64( _mm_srli_si128(row[0], 8)), + _mm_cvtepu32_epi64( _mm_srli_si128(row[1], 8)), + _mm_cvtepu32_epi64( _mm_srli_si128(row[2], 8)) + }; + + __m128i outVec; + _Vec3_MultiplyByMatrix(outVec, + scalar[0], scalar[1], scalar[2], + rowLo[0], rowLo[1], rowLo[2], + rowHi[0], rowHi[1], rowHi[2]); + + outVec = _mm_blend_epi16(outVec, inVec, 0xC0); + _mm_store_si128((__m128i *)vec, outVec); +} + +void MatrixTranslate(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4]) +{ + const __m128i inVec = _mm_load_si128((__m128i *)vec); + + const __m128i scalar[3] = { + _mm_shuffle_epi32(inVec, 0x00), + _mm_shuffle_epi32(inVec, 0x55), + _mm_shuffle_epi32(inVec, 0xAA) + }; + + const __m128i row[4] = { + _mm_load_si128((__m128i *)(mtx + 0)), + _mm_load_si128((__m128i *)(mtx + 4)), + _mm_load_si128((__m128i *)(mtx + 8)), + _mm_load_si128((__m128i *)(mtx + 12)) + }; + + const __m128i rowLo[4] = { + _mm_cvtepu32_epi64(row[0]), + _mm_cvtepu32_epi64(row[1]), + _mm_cvtepu32_epi64(row[2]), + _mm_cvtepu32_epi64(row[3]) + }; + + const __m128i rowHi[4] = { + _mm_cvtepu32_epi64( _mm_srli_si128(row[0], 8)), + _mm_cvtepu32_epi64( _mm_srli_si128(row[1], 8)), + _mm_cvtepu32_epi64( _mm_srli_si128(row[2], 8)), + _mm_cvtepu32_epi64( _mm_srli_si128(row[3], 8)) + }; + + __m128i outVec; + _Vec4_Translate(outVec, + scalar[0], scalar[1], scalar[2], + rowLo[0], rowLo[1], rowLo[2], rowLo[3], + rowHi[0], rowHi[1], rowHi[2], rowHi[3]); + + _mm_store_si128((__m128i *)(mtx + 12), outVec); +} + +void MatrixScale(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4]) +{ + const __m128i inVec = _mm_load_si128((__m128i *)vec); + const __m128i scalar[3] = { + _mm_shuffle_epi32(inVec, 0x00), + _mm_shuffle_epi32(inVec, 0x55), + _mm_shuffle_epi32(inVec, 0xAA) + }; + + __m128i row[3] = { + _mm_load_si128((__m128i *)(mtx + 
+
+void MatrixScale(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4])
+{
+	const __m128i inVec = _mm_load_si128((__m128i *)vec);
+	const __m128i scalar[3] = {
+		_mm_shuffle_epi32(inVec, 0x00),
+		_mm_shuffle_epi32(inVec, 0x55),
+		_mm_shuffle_epi32(inVec, 0xAA)
+	};
+	
+	__m128i row[3] = {
+		_mm_load_si128((__m128i *)(mtx + 0)),
+		_mm_load_si128((__m128i *)(mtx + 4)),
+		_mm_load_si128((__m128i *)(mtx + 8))
+	};
+	
+	_Vec4_Scale(row[0], scalar[0]);
+	_mm_store_si128((__m128i *)(mtx + 0), row[0]);
+	
+	_Vec4_Scale(row[1], scalar[1]);
+	_mm_store_si128((__m128i *)(mtx + 4), row[1]);
+	
+	_Vec4_Scale(row[2], scalar[2]);
+	_mm_store_si128((__m128i *)(mtx + 8), row[2]);
+}
+
+void MatrixMultiply(s32 (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16])
+{
+	const __m128i rowA[4] = {
+		_mm_load_si128((__m128i *)(mtxA + 0)),
+		_mm_load_si128((__m128i *)(mtxA + 4)),
+		_mm_load_si128((__m128i *)(mtxA + 8)),
+		_mm_load_si128((__m128i *)(mtxA + 12))
+	};
+	
+	const __m128i rowB[4] = {
+		_mm_load_si128((__m128i *)(mtxB + 0)),
+		_mm_load_si128((__m128i *)(mtxB + 4)),
+		_mm_load_si128((__m128i *)(mtxB + 8)),
+		_mm_load_si128((__m128i *)(mtxB + 12))
+	};
+	
+	const __m128i rowLo[4] = {
+		_mm_cvtepu32_epi64(rowA[0]),
+		_mm_cvtepu32_epi64(rowA[1]),
+		_mm_cvtepu32_epi64(rowA[2]),
+		_mm_cvtepu32_epi64(rowA[3])
+	};
+	
+	const __m128i rowHi[4] = {
+		_mm_cvtepu32_epi64( _mm_srli_si128(rowA[0], 8)),
+		_mm_cvtepu32_epi64( _mm_srli_si128(rowA[1], 8)),
+		_mm_cvtepu32_epi64( _mm_srli_si128(rowA[2], 8)),
+		_mm_cvtepu32_epi64( _mm_srli_si128(rowA[3], 8))
+	};
+	
+	__m128i outVec;
+	__m128i scalar[4];
+	
+	scalar[0] = _mm_shuffle_epi32(rowB[0], 0x00);
+	scalar[1] = _mm_shuffle_epi32(rowB[0], 0x55);
+	scalar[2] = _mm_shuffle_epi32(rowB[0], 0xAA);
+	scalar[3] = _mm_shuffle_epi32(rowB[0], 0xFF);
+	_Vec4_MultiplyByMatrix(outVec,
+		scalar[0], scalar[1], scalar[2], scalar[3],
+		rowLo[0], rowLo[1], rowLo[2], rowLo[3],
+		rowHi[0], rowHi[1], rowHi[2], rowHi[3]);
+	_mm_store_si128((__m128i *)(mtxA + 0), outVec);
+	
+	scalar[0] = _mm_shuffle_epi32(rowB[1], 0x00);
+	scalar[1] = _mm_shuffle_epi32(rowB[1], 0x55);
+	scalar[2] = _mm_shuffle_epi32(rowB[1], 0xAA);
+	scalar[3] = _mm_shuffle_epi32(rowB[1], 0xFF);
+	_Vec4_MultiplyByMatrix(outVec,
+		scalar[0], scalar[1], scalar[2], scalar[3],
+		rowLo[0], rowLo[1], rowLo[2], rowLo[3],
+		rowHi[0], rowHi[1], rowHi[2], rowHi[3]);
+	_mm_store_si128((__m128i *)(mtxA + 4), outVec);
+	
+	scalar[0] = _mm_shuffle_epi32(rowB[2], 0x00);
+	scalar[1] = _mm_shuffle_epi32(rowB[2], 0x55);
+	scalar[2] = _mm_shuffle_epi32(rowB[2], 0xAA);
+	scalar[3] = _mm_shuffle_epi32(rowB[2], 0xFF);
+	_Vec4_MultiplyByMatrix(outVec,
+		scalar[0], scalar[1], scalar[2], scalar[3],
+		rowLo[0], rowLo[1], rowLo[2], rowLo[3],
+		rowHi[0], rowHi[1], rowHi[2], rowHi[3]);
+	_mm_store_si128((__m128i *)(mtxA + 8), outVec);
+	
+	scalar[0] = _mm_shuffle_epi32(rowB[3], 0x00);
+	scalar[1] = _mm_shuffle_epi32(rowB[3], 0x55);
+	scalar[2] = _mm_shuffle_epi32(rowB[3], 0xAA);
+	scalar[3] = _mm_shuffle_epi32(rowB[3], 0xFF);
+	_Vec4_MultiplyByMatrix(outVec,
+		scalar[0], scalar[1], scalar[2], scalar[3],
+		rowLo[0], rowLo[1], rowLo[2], rowLo[3],
+		rowHi[0], rowHi[1], rowHi[2], rowHi[3]);
+	_mm_store_si128((__m128i *)(mtxA + 12), outVec);
+}
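MatrixMultiply above reuses the same kernel: each four-element column of mtxB is broadcast and run through _Vec4_MultiplyByMatrix against mtxA, whose widened halves (rowLo/rowHi) stay in registers so the results can be stored straight back over mtxA in place. A scalar sketch of that loop structure, with tmpA standing in for the register-held copy (names are illustrative only, and saturation is omitted):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Sketch only: mtxA = mtxA * mtxB, column by column, 20.12 fixed point.
    static void matrix_multiply_sketch(int32_t (&mtxA)[16], const int32_t (&mtxB)[16])
    {
        int32_t tmpA[16];
        std::memcpy(tmpA, mtxA, sizeof(tmpA)); // copy A before overwriting it in place

        for (size_t col = 0; col < 4; col++)
        {
            const int32_t b0 = mtxB[col*4 + 0], b1 = mtxB[col*4 + 1];
            const int32_t b2 = mtxB[col*4 + 2], b3 = mtxB[col*4 + 3];
            for (size_t i = 0; i < 4; i++)
            {
                const int64_t sum = (int64_t)tmpA[i]      * b0 + (int64_t)tmpA[4 + i]  * b1
                                  + (int64_t)tmpA[8 + i]  * b2 + (int64_t)tmpA[12 + i] * b3;
                mtxA[col*4 + i] = (int32_t)(sum >> 12);
            }
        }
    }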
+
+#else
+
+FORCEINLINE void _Vec4_MultiplyByMatrix(s32 (&__restrict outVec)[4], const s32 (&__restrict inVec)[4], const s32 (&__restrict mtx)[16])
+{
+	outVec[0] = sfx32_shiftdown( fx32_mul(mtx[0],inVec[0]) + fx32_mul(mtx[4],inVec[1]) + fx32_mul(mtx[ 8],inVec[2]) + fx32_mul(mtx[12],inVec[3]) );
+	outVec[1] = sfx32_shiftdown( fx32_mul(mtx[1],inVec[0]) + fx32_mul(mtx[5],inVec[1]) + fx32_mul(mtx[ 9],inVec[2]) + fx32_mul(mtx[13],inVec[3]) );
+	outVec[2] = sfx32_shiftdown( fx32_mul(mtx[2],inVec[0]) + fx32_mul(mtx[6],inVec[1]) + fx32_mul(mtx[10],inVec[2]) + fx32_mul(mtx[14],inVec[3]) );
+	outVec[3] = sfx32_shiftdown( fx32_mul(mtx[3],inVec[0]) + fx32_mul(mtx[7],inVec[1]) + fx32_mul(mtx[11],inVec[2]) + fx32_mul(mtx[15],inVec[3]) );
+}
+
+FORCEINLINE void _Vec3_MultiplyByMatrix(s32 (&__restrict outVec)[4], const s32 (&__restrict inVec)[3], const s32 (&__restrict mtx)[16])
+{
+	outVec[0] = sfx32_shiftdown( fx32_mul(mtx[0],inVec[0]) + fx32_mul(mtx[4],inVec[1]) + fx32_mul(mtx[ 8],inVec[2]) );
+	outVec[1] = sfx32_shiftdown( fx32_mul(mtx[1],inVec[0]) + fx32_mul(mtx[5],inVec[1]) + fx32_mul(mtx[ 9],inVec[2]) );
+	outVec[2] = sfx32_shiftdown( fx32_mul(mtx[2],inVec[0]) + fx32_mul(mtx[6],inVec[1]) + fx32_mul(mtx[10],inVec[2]) );
+}
+
+FORCEINLINE void _Vec4_Scale(s32 (&inoutVec)[4], const s32 scalar)
+{
+	inoutVec[0] = sfx32_shiftdown( fx32_mul(inoutVec[0], scalar) );
+	inoutVec[1] = sfx32_shiftdown( fx32_mul(inoutVec[1], scalar) );
+	inoutVec[2] = sfx32_shiftdown( fx32_mul(inoutVec[2], scalar) );
+	inoutVec[3] = sfx32_shiftdown( fx32_mul(inoutVec[3], scalar) );
+}
+
+void MatrixMultVec4x4(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4])
+{
+	const CACHE_ALIGN s32 __restrict tmpVec[4] = {
+		vec[0], vec[1], vec[2], vec[3]
+	};
+	
+	_Vec4_MultiplyByMatrix(vec, tmpVec, mtx);
+}
+
+void MatrixMultVec3x3(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4])
+{
+	const CACHE_ALIGN s32 __restrict tmpVec[3] = {
+		vec[0], vec[1], vec[2]
+	};
+	
+	_Vec3_MultiplyByMatrix(vec, tmpVec, mtx);
+}
+
+void MatrixTranslate(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4])
+{
+	mtx[12] = sfx32_shiftdown( fx32_mul(mtx[0], vec[0]) + fx32_mul(mtx[4], vec[1]) + fx32_mul(mtx[ 8], vec[2]) + fx32_shiftup(mtx[12]) );
+	mtx[13] = sfx32_shiftdown( fx32_mul(mtx[1], vec[0]) + fx32_mul(mtx[5], vec[1]) + fx32_mul(mtx[ 9], vec[2]) + fx32_shiftup(mtx[13]) );
+	mtx[14] = sfx32_shiftdown( fx32_mul(mtx[2], vec[0]) + fx32_mul(mtx[6], vec[1]) + fx32_mul(mtx[10], vec[2]) + fx32_shiftup(mtx[14]) );
+	mtx[15] = sfx32_shiftdown( fx32_mul(mtx[3], vec[0]) + fx32_mul(mtx[7], vec[1]) + fx32_mul(mtx[11], vec[2]) + fx32_shiftup(mtx[15]) );
+}
+
+void MatrixScale(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4])
+{
+	_Vec4_Scale((s32 (&__restrict)[4])mtx[0], vec[0]);
+	_Vec4_Scale((s32 (&__restrict)[4])mtx[4], vec[1]);
+	_Vec4_Scale((s32 (&__restrict)[4])mtx[8], vec[2]);
+}
+
+void MatrixMultiply(s32 (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16])
+{
+	const CACHE_ALIGN s32 tmpMtxA[16] = {
+		mtxA[ 0], mtxA[ 1], mtxA[ 2], mtxA[ 3],
+		mtxA[ 4], mtxA[ 5], mtxA[ 6], mtxA[ 7],
+		mtxA[ 8], mtxA[ 9], mtxA[10], mtxA[11],
+		mtxA[12], mtxA[13], mtxA[14], mtxA[15]
+	};
+	
+	_Vec4_MultiplyByMatrix((s32 (&__restrict)[4])mtxA[ 0], (s32 (&__restrict)[4])mtxB[ 0], tmpMtxA);
+	_Vec4_MultiplyByMatrix((s32 (&__restrict)[4])mtxA[ 4], (s32 (&__restrict)[4])mtxB[ 4], tmpMtxA);
+	_Vec4_MultiplyByMatrix((s32 (&__restrict)[4])mtxA[ 8], (s32 (&__restrict)[4])mtxB[ 8], tmpMtxA);
+	_Vec4_MultiplyByMatrix((s32 (&__restrict)[4])mtxA[12], (s32 (&__restrict)[4])mtxB[12], tmpMtxA);
+}
+
+#endif
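One detail of the non-SSE MatrixTranslate above: the fx32_mul products carry 24 fraction bits, so the existing translation terms mtx[12..15] (12 fraction bits) are promoted with fx32_shiftup before being added, and the whole sum is shifted back down. Ignoring any saturation that sfx32_shiftdown may apply, one element works out roughly as follows (a sketch with illustrative names and plain integer types, assuming fx32_shiftup is a shift up by 12):

    #include <cstdint>

    // Sketch only: updates one translation element the way the scalar
    // MatrixTranslate above does, with 20.12 fixed-point inputs and no saturation.
    static inline int32_t translate_element(int32_t m_col0, int32_t m_col1, int32_t m_col2,
                                            int32_t m_trans, const int32_t (&vec)[4])
    {
        const int64_t sum = (int64_t)m_col0 * vec[0]
                          + (int64_t)m_col1 * vec[1]
                          + (int64_t)m_col2 * vec[2]
                          + ((int64_t)m_trans << 12); // align the old term to 24 fraction bits
        return (int32_t)(sum >> 12);                  // back down to 12 fraction bits
    }

    // e.g. mtx[12] = translate_element(mtx[0], mtx[4], mtx[8], mtx[12], vec);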
diff --git a/desmume/src/matrix.h b/desmume/src/matrix.h
index 12fe8f740..220638a39 100644
--- a/desmume/src/matrix.h
+++ b/desmume/src/matrix.h
@@ -31,42 +31,52 @@
 
 #ifdef ENABLE_SSE2
 #include <emmintrin.h>
+#endif
+
+#ifdef ENABLE_SSE4_1
+#include "smmintrin.h"
 #endif
- 
+
+enum MatrixMode
+{
+	MATRIXMODE_PROJECTION = 0,
+	MATRIXMODE_POSITION = 1,
+	MATRIXMODE_POSITION_VECTOR = 2,
+	MATRIXMODE_TEXTURE = 3
+};
+
+template <MatrixMode MODE>
 struct MatrixStack
 {
-	MatrixStack(int size, int type);
-	s32 *matrix;
-	u32 position;
-	s32 size;
-	u8 type;
-};
+	static const size_t size = ((MODE == MATRIXMODE_PROJECTION) || (MODE == MATRIXMODE_TEXTURE)) ? 1 : 32;
+	static const MatrixMode type = MODE;
+	
+	s32 matrix[size][16];
+	u32 position;
+};
 
-void MatrixInit(s32 *mtxPtr);
-void MatrixInit(float *mtxPtr);
+void MatrixInit(s32 (&mtx)[16]);
+void MatrixInit(float (&mtx)[16]);
 
-void MatrixIdentity(s32 *mtxPtr);
-void MatrixIdentity(float *mtxPtr);
+void MatrixIdentity(s32 (&mtx)[16]);
+void MatrixIdentity(float (&mtx)[16]);
 
-void MatrixSet(s32 *mtxPtr, const size_t x, const size_t y, const s32 value);
-void MatrixSet(float *mtxPtr, const size_t x, const size_t y, const float value);
-void MatrixSet(float *mtxPtr, const size_t x, const size_t y, const s32 value);
+void MatrixSet(s32 (&mtx)[16], const size_t x, const size_t y, const s32 value);
+void MatrixSet(float (&mtx)[16], const size_t x, const size_t y, const float value);
+void MatrixSet(float (&mtx)[16], const size_t x, const size_t y, const s32 value);
 
-void MatrixCopy(s32 *mtxDst, const s32 *mtxSrc);
-void MatrixCopy(float *mtxDst, const float *mtxSrc);
-void MatrixCopy(float *mtxDst, const s32 *mtxSrc);
+void MatrixCopy(s32 (&mtxDst)[16], const s32 (&mtxSrc)[16]);
+void MatrixCopy(float (&mtxDst)[16], const float (&mtxSrc)[16]);
+void MatrixCopy(float (&__restrict mtxDst)[16], const s32 (&__restrict mtxSrc)[16]);
 
-int MatrixCompare(const s32 *mtxDst, const s32 *mtxSrc);
-int MatrixCompare(const float *mtxDst, const float *mtxSrc);
+int MatrixCompare(const s32 (&mtxDst)[16], const s32 (&mtxSrc)[16]);
+int MatrixCompare(const float (&mtxDst)[16], const float (&mtxSrc)[16]);
 
-s32 MatrixGetMultipliedIndex(const u32 index, s32 *matrix, s32 *rightMatrix);
-float MatrixGetMultipliedIndex(const u32 index, float *matrix, float *rightMatrix);
+s32 MatrixGetMultipliedIndex(const u32 index, const s32 (&mtxA)[16], const s32 (&mtxB)[16]);
+float MatrixGetMultipliedIndex(const u32 index, const float (&mtxA)[16], const float (&mtxB)[16]);
 
-void MatrixStackInit (MatrixStack *stack);
-void MatrixStackSetMaxSize (MatrixStack *stack, int size);
-s32* MatrixStackGetPos (MatrixStack *stack, const size_t pos);
-s32* MatrixStackGet (MatrixStack *stack);
-void MatrixStackLoadMatrix (MatrixStack *stack, const size_t pos, const s32 *ptr);
+template <MatrixMode MODE> void MatrixStackInit(MatrixStack<MODE> *stack);
+template <MatrixMode MODE> s32* MatrixStackGet(MatrixStack<MODE> *stack);
 
 void Vector2Copy(float *dst, const float *src);
 void Vector2Add(float *dst, const float *src);
@@ -86,21 +96,21 @@ void Vector3Normalize(float *dst);
 
 void Vector4Copy(float *dst, const float *src);
 
-void _MatrixMultVec4x4_NoSIMD(const s32 *__restrict mtxPtr, float *__restrict vecPtr);
+void _MatrixMultVec4x4_NoSIMD(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]);
 
-void MatrixMultVec4x4(const s32 *__restrict mtxPtr, float *__restrict vecPtr);
-void MatrixMultVec3x3(const s32 *__restrict mtxPtr, float *__restrict vecPtr);
-void MatrixTranslate(float *__restrict mtxPtr, const float *__restrict vecPtr);
-void MatrixScale(float *__restrict mtxPtr, const float *__restrict vecPtr);
-void MatrixMultiply(float *__restrict mtxPtrA, const s32 *__restrict mtxPtrB);
+void MatrixMultVec4x4(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]);
+void MatrixMultVec3x3(const s32 (&__restrict mtx)[16], float (&__restrict vec)[4]);
+void MatrixTranslate(float (&__restrict mtx)[16], const float (&__restrict vec)[4]);
+void MatrixScale(float (&__restrict mtx)[16], const float (&__restrict vec)[4]);
+void MatrixMultiply(float (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16]);
 
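The declaration changes in this header swap raw pointer parameters for references to fixed-size arrays (s32 (&mtx)[16] instead of s32 *mtxPtr), so the 16-element extent is part of the function's type rather than an implicit convention, and mismatched arguments fail at compile time. A small illustration of the difference (the function names below are hypothetical, used only for contrast):

    #include <cstdint>

    static void TakesSizedMatrix(int32_t (&mtx)[16]) { (void)mtx; } // new style: extent is in the type
    static void TakesPointer(int32_t *mtx)           { (void)mtx; } // old style: any pointer compiles

    void Example()
    {
        int32_t full[16] = {};
        int32_t partial[4] = {};

        TakesSizedMatrix(full);        // OK
        // TakesSizedMatrix(partial);  // error: cannot bind int32_t[4] to int32_t (&)[16]
        TakesPointer(partial);         // compiles; a size mismatch only surfaces at runtime
    }

The fixed extent also documents the expected operand size to both the optimizer and the reader.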
-template <size_t NUM_ROWS> FORCEINLINE void vector_fix2float(float *mtxPtr, const float divisor);
+template <size_t NUM_ROWS> FORCEINLINE void vector_fix2float(float (&mtx)[16], const float divisor);
 
-void MatrixMultVec4x4(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr);
-void MatrixMultVec3x3(const s32 *__restrict mtxPtr, s32 *__restrict vecPtr);
-void MatrixTranslate(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr);
-void MatrixScale(s32 *__restrict mtxPtr, const s32 *__restrict vecPtr);
-void MatrixMultiply(s32 *__restrict mtxPtrA, const s32 *__restrict mtxPtrB);
+void MatrixMultVec4x4(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4]);
+void MatrixMultVec3x3(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4]);
+void MatrixTranslate(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4]);
+void MatrixScale(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4]);
+void MatrixMultiply(s32 (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16]);
 
 //these functions are an unreliable, inaccurate floor.
 //it should only be used for positive numbers