- Add many of NHerve's improvements into OGLRender because I was trying to fix all the 3d issues

- Track polycount better. Still worthless: at the very least, it doesn't account for clipping and culling - Carry w=1 from vertex() through the pipeline (this will be necessary for software 3D rendering) - Make GPU matrix mult and load commands clear out unused rows and cols to identity correctly - Make matrix 4x4 multiply routines use the W-coordinate.
2008-09-06 04:08:35 +00:00 · 2008-09-06 04:08:35 +00:00 · 5278185e73
parent 7d2fc8964e
commit 5278185e73
6 changed files with 2842 additions and 2749 deletions
--- a/desmume/ChangeLog
+++ b/desmume/ChangeLog
@ -27,6 +27,12 @@
 - Some fixes in 3D core OGL (fixed textures) [CrazyMax]
 - Added texture caching (speedup 3D core) [CrazyMax]
 - Fixes clear depth (ex. Castlevania now don't flipping) [NHerve]
+ - Make matrix 4x4 multiply routines use W-coordinate. [zeromus]
+ - Make GPU matrix mult and load commands clear out unused rows and cols to identity correctly; 
+   carry w=1 from vertex() through pipeline (this will be necessary for software 3d rendering) [zeromus]
+ - Track polycount better. Still worthless: at the very least, it doesn't account for clipping and culling [zeromus]
+ - Fix errors in matrix operations regarding projection mode and pos-vector mode [zeromus]
+ - Fix error in command unpacking which caused some display lists to totally blow up [zeromus]
 
 0.7.3 -> 0.8
 Cocoa:
--- a/desmume/src/matrix.c
+++ b/desmume/src/matrix.c
@ -1,228 +1,255 @@
-/*  
-	Copyright (C) 2006-2007 shash
-
-    This file is part of DeSmuME
-
-    DeSmuME is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    DeSmuME is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with DeSmuME; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-*/
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include "matrix.h"
-
-void MatrixInit  (float *matrix)
-{
-	memset (matrix, 0, sizeof(float)*16);
-	matrix[0] = matrix[5] = matrix[10] = matrix[15] = 1.f;
-}
-
-#ifdef SSE2
-void __fastcall MatrixIdentity	(float *matrix) //============== TODO
-{
-	memset (matrix, 0, sizeof(float)*16);
-	matrix[0] = matrix[5] = matrix[10] = matrix[15] = 1.f;
-}
-
-float __fastcall MatrixGetMultipliedIndex (int index, float *matrix, float *rightMatrix)
-{
-	int iMod = index%4, iDiv = (index>>2)<<2;
-
-	return	(matrix[iMod  ]*rightMatrix[iDiv  ])+(matrix[iMod+ 4]*rightMatrix[iDiv+1])+
-			(matrix[iMod+8]*rightMatrix[iDiv+2])+(matrix[iMod+12]*rightMatrix[iDiv+3]);
-}
-
-void __fastcall MatrixSet (float *matrix, int x, int y, float value)	// TODO
-{
-	matrix [x+(y<<2)] = value;
-}
-
-void __fastcall MatrixCopy (float *matrixDST, float *matrixSRC)
-{
-	memcpy (matrixDST, matrixSRC, sizeof(float)*16);
-}
-#else
-void MatrixMultVec4x4 (float *matrix, float *vecPtr)
-{
-	float x = vecPtr[0];
-	float y = vecPtr[1];
-	float z = vecPtr[2];
-
-	vecPtr[0] = x * matrix[0] + y * matrix[4] + z * matrix[ 8] + matrix[12];
-	vecPtr[1] = x * matrix[1] + y * matrix[5] + z * matrix[ 9] + matrix[13];
-	vecPtr[2] = x * matrix[2] + y * matrix[6] + z * matrix[10] + matrix[14];
-}
-
-void MatrixMultVec3x3 (float *matrix, float *vecPtr)
-{
-	float x = vecPtr[0];
-	float y = vecPtr[1];
-	float z = vecPtr[2];
-
-	vecPtr[0] = x * matrix[0] + y * matrix[4] + z * matrix[ 8];
-	vecPtr[1] = x * matrix[1] + y * matrix[5] + z * matrix[ 9];
-	vecPtr[2] = x * matrix[2] + y * matrix[6] + z * matrix[10];
-}
-
-void MatrixIdentity	(float *matrix)
-{
-	memset (matrix, 0, sizeof(float)*16);
-
-	matrix[0] = matrix[5] = matrix[10] = matrix[15] = 1.f;
-}
-
-void MatrixMultiply (float *matrix, float *rightMatrix)
-{
-	float tmpMatrix[16];
-
-	tmpMatrix[0]  = (matrix[0]*rightMatrix[0])+(matrix[4]*rightMatrix[1])+(matrix[8]*rightMatrix[2])+(matrix[12]*rightMatrix[3]);
-	tmpMatrix[1]  = (matrix[1]*rightMatrix[0])+(matrix[5]*rightMatrix[1])+(matrix[9]*rightMatrix[2])+(matrix[13]*rightMatrix[3]);
-	tmpMatrix[2]  = (matrix[2]*rightMatrix[0])+(matrix[6]*rightMatrix[1])+(matrix[10]*rightMatrix[2])+(matrix[14]*rightMatrix[3]);
-	tmpMatrix[3]  = (matrix[3]*rightMatrix[0])+(matrix[7]*rightMatrix[1])+(matrix[11]*rightMatrix[2])+(matrix[15]*rightMatrix[3]);
-
-	tmpMatrix[4]  = (matrix[0]*rightMatrix[4])+(matrix[4]*rightMatrix[5])+(matrix[8]*rightMatrix[6])+(matrix[12]*rightMatrix[7]);
-	tmpMatrix[5]  = (matrix[1]*rightMatrix[4])+(matrix[5]*rightMatrix[5])+(matrix[9]*rightMatrix[6])+(matrix[13]*rightMatrix[7]);
-	tmpMatrix[6]  = (matrix[2]*rightMatrix[4])+(matrix[6]*rightMatrix[5])+(matrix[10]*rightMatrix[6])+(matrix[14]*rightMatrix[7]);
-	tmpMatrix[7]  = (matrix[3]*rightMatrix[4])+(matrix[7]*rightMatrix[5])+(matrix[11]*rightMatrix[6])+(matrix[15]*rightMatrix[7]);
-
-	tmpMatrix[8]  = (matrix[0]*rightMatrix[8])+(matrix[4]*rightMatrix[9])+(matrix[8]*rightMatrix[10])+(matrix[12]*rightMatrix[11]);
-	tmpMatrix[9]  = (matrix[1]*rightMatrix[8])+(matrix[5]*rightMatrix[9])+(matrix[9]*rightMatrix[10])+(matrix[13]*rightMatrix[11]);
-	tmpMatrix[10] = (matrix[2]*rightMatrix[8])+(matrix[6]*rightMatrix[9])+(matrix[10]*rightMatrix[10])+(matrix[14]*rightMatrix[11]);
-	tmpMatrix[11] = (matrix[3]*rightMatrix[8])+(matrix[7]*rightMatrix[9])+(matrix[11]*rightMatrix[10])+(matrix[15]*rightMatrix[11]);
-
-	tmpMatrix[12] = (matrix[0]*rightMatrix[12])+(matrix[4]*rightMatrix[13])+(matrix[8]*rightMatrix[14])+(matrix[12]*rightMatrix[15]);
-	tmpMatrix[13] = (matrix[1]*rightMatrix[12])+(matrix[5]*rightMatrix[13])+(matrix[9]*rightMatrix[14])+(matrix[13]*rightMatrix[15]);
-	tmpMatrix[14] = (matrix[2]*rightMatrix[12])+(matrix[6]*rightMatrix[13])+(matrix[10]*rightMatrix[14])+(matrix[14]*rightMatrix[15]);
-	tmpMatrix[15] = (matrix[3]*rightMatrix[12])+(matrix[7]*rightMatrix[13])+(matrix[11]*rightMatrix[14])+(matrix[15]*rightMatrix[15]);
-
-	memcpy (matrix, tmpMatrix, sizeof(float)*16);
-}
-
-float MatrixGetMultipliedIndex (int index, float *matrix, float *rightMatrix)
-{
-	int iMod = index%4, iDiv = (index>>2)<<2;
-
-	return	(matrix[iMod  ]*rightMatrix[iDiv  ])+(matrix[iMod+ 4]*rightMatrix[iDiv+1])+
-			(matrix[iMod+8]*rightMatrix[iDiv+2])+(matrix[iMod+12]*rightMatrix[iDiv+3]);
-}
-
-void MatrixSet (float *matrix, int x, int y, float value)
-{
-	matrix [x+(y<<2)] = value;
-}
-
-void MatrixCopy (float *matrixDST, float *matrixSRC)
-{
-	memcpy (matrixDST, matrixSRC, sizeof(float)*16);
-}
-
-void MatrixTranslate	(float *matrix, float *ptr)
-{
-	matrix[12] += (matrix[0]*ptr[0])+(matrix[4]*ptr[1])+(matrix[ 8]*ptr[2]);
-	matrix[13] += (matrix[1]*ptr[0])+(matrix[5]*ptr[1])+(matrix[ 9]*ptr[2]);
-	matrix[14] += (matrix[2]*ptr[0])+(matrix[6]*ptr[1])+(matrix[10]*ptr[2]);
-	matrix[15] += (matrix[3]*ptr[0])+(matrix[7]*ptr[1])+(matrix[11]*ptr[2]);
-}
-
-void MatrixScale (float *matrix, float *ptr)
-{
-	matrix[0]  *= ptr[0];
-	matrix[1]  *= ptr[0];
-	matrix[2]  *= ptr[0];
-	matrix[3]  *= ptr[0];
-
-	matrix[4]  *= ptr[1];
-	matrix[5]  *= ptr[1];
-	matrix[6]  *= ptr[1];
-	matrix[7]  *= ptr[1];
-
-	matrix[8] *= ptr[2];
-	matrix[9] *= ptr[2];
-	matrix[10] *= ptr[2];
-	matrix[11] *= ptr[2];
-}
-#endif
-//-----------------------------------------
-
-void MatrixStackInit (MatrixStack *stack)
-{
-	stack->matrix	= NULL;
-	stack->position	= 0;
-	stack->size		= 0;
-}
-
-void MatrixStackSetMaxSize (MatrixStack *stack, int size)
-{
-	int i = 0;
-
-	stack->size = size;
-
-	if (stack->matrix == NULL)
-	{
-		stack->matrix = (float*) malloc (stack->size*16*sizeof(float));
-	}
-	else
-	{
-		free (stack->matrix);
-		stack->matrix = (float*) malloc (stack->size*16*sizeof(float));
-	}
-
-	for (i = 0; i < stack->size; i++)
-	{
-		MatrixInit (&stack->matrix[i*16]);
-	}
-
-	stack->size--;
-}
-
-
-void MatrixStackSetStackPosition (MatrixStack *stack, int pos)
-{
-	stack->position += pos;
-
-	if (stack->position < 0)
-		stack->position = 0;
-	else if (stack->position > stack->size)	
-		stack->position = stack->size;
-}
-
-void MatrixStackPushMatrix (MatrixStack *stack, float *ptr)
-{
-	MatrixCopy (&stack->matrix[stack->position*16], ptr);
-
-	MatrixStackSetStackPosition (stack, 1);
-}
-
-float * MatrixStackPopMatrix (MatrixStack *stack, int size)
-{
-	MatrixStackSetStackPosition(stack, -size);
-
-	return &stack->matrix[stack->position*16];
-}
-
-float * MatrixStackGetPos (MatrixStack *stack, int pos)
-{
-	return &stack->matrix[pos*16];
-}
-
-float * MatrixStackGet (MatrixStack *stack)
-{
-	return &stack->matrix[stack->position*16];
-}
-
-void MatrixStackLoadMatrix (MatrixStack *stack, int pos, float *ptr)
-{
-	MatrixCopy (&stack->matrix[pos*16], ptr);
-}
+/*  
+	Copyright (C) 2006-2007 shash
+
+    This file is part of DeSmuME
+
+    DeSmuME is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    DeSmuME is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with DeSmuME; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include "matrix.h"
+
+/* Reset a 4x4 float matrix (16 contiguous floats) to the identity matrix. */
+void MatrixInit  (float *matrix)
+{
+	int i;
+
+	/* The diagonal sits at indices 0, 5, 10 and 15; everything else is zero. */
+	for (i = 0; i < 16; i++)
+		matrix[i] = (i % 5 == 0) ? 1.f : 0.f;
+}
+
+#ifdef SSE2
+// SSE2-build variant of MatrixIdentity: loads the identity into a 4x4 float matrix.
+// Still plain C; the original author marked this function as TODO.
+void __fastcall MatrixIdentity	(float *matrix)
+{
+	int i;
+
+	for (i = 0; i < 16; i++)
+		matrix[i] = 0.f;
+
+	// diagonal entries live at indices 0, 5, 10 and 15
+	for (i = 0; i < 16; i += 5)
+		matrix[i] = 1.f;
+}
+
+// Returns element `index` (0..15) of the product that MatrixMultiply would store,
+// without computing the other fifteen elements.
+float __fastcall MatrixGetMultipliedIndex (int index, float *matrix, float *rightMatrix)
+{
+	// lhs walks `matrix` with a stride of 4 starting at index%4;
+	// rhs is the group of four consecutive floats of `rightMatrix` that contains `index`.
+	const float *lhs = matrix + (index%4);
+	const float *rhs = rightMatrix + ((index>>2)<<2);
+
+	return	(lhs[0]*rhs[0])+(lhs[4]*rhs[1])+(lhs[8]*rhs[2])+(lhs[12]*rhs[3]);
+}
+
+// Stores `value` at offset x + 4*y of the 16-float matrix. The original author marked this as TODO.
+void __fastcall MatrixSet (float *matrix, int x, int y, float value)
+{
+	float *cell = matrix + (x+(y<<2));
+
+	*cell = value;
+}
+
+// Copies all 16 floats of matrixSRC into matrixDST.
+void __fastcall MatrixCopy (float *matrixDST, float *matrixSRC)
+{
+	memcpy (matrixDST, matrixSRC, 16 * sizeof(*matrixSRC));
+}
+#else
+/*
+ * Transforms the 4-component vector at vecPtr in place:
+ *   out[i] = x*matrix[i] + y*matrix[i+4] + z*matrix[i+8] + w*matrix[i+12]
+ * All four inputs are read before any output is written.
+ */
+void MatrixMultVec4x4 (float *matrix, float *vecPtr)
+{
+	float in[4];
+	int i;
+
+	for (i = 0; i < 4; i++)
+		in[i] = vecPtr[i];
+
+	for (i = 0; i < 4; i++)
+		vecPtr[i] = in[0] * matrix[i] + in[1] * matrix[i+4] + in[2] * matrix[i+8] + in[3] * matrix[i+12];
+}
+
+/*
+ * Transforms the first three components of vecPtr in place, using only
+ * matrix[0..2], matrix[4..6] and matrix[8..10]:
+ *   out[i] = x*matrix[i] + y*matrix[i+4] + z*matrix[i+8]
+ * vecPtr[3] is neither read nor written.
+ */
+void MatrixMultVec3x3 (float *matrix, float *vecPtr)
+{
+	float in[3];
+	int i;
+
+	for (i = 0; i < 3; i++)
+		in[i] = vecPtr[i];
+
+	for (i = 0; i < 3; i++)
+		vecPtr[i] = in[0] * matrix[i] + in[1] * matrix[i+4] + in[2] * matrix[i+8];
+}
+
+/* Loads the identity into a 4x4 float matrix (non-SSE2 build). */
+void MatrixIdentity	(float *matrix)
+{
+	int i;
+
+	for (i = 0; i < 16; i++)
+		matrix[i] = 0.f;
+
+	/* diagonal entries live at indices 0, 5, 10 and 15 */
+	for (i = 0; i < 16; i += 5)
+		matrix[i] = 1.f;
+}
+
+/*
+ * Multiplies `matrix` by `rightMatrix` and stores the product back into `matrix`.
+ * For each group of four consecutive floats g of rightMatrix and each r in 0..3:
+ *   out[g+r] = matrix[r]*right[g] + matrix[r+4]*right[g+1]
+ *            + matrix[r+8]*right[g+2] + matrix[r+12]*right[g+3]
+ * The product is built in a temporary, so `matrix` is only overwritten once
+ * every element has been computed.
+ */
+void MatrixMultiply (float *matrix, float *rightMatrix)
+{
+	float product[16];
+	int group, r;
+
+	for (group = 0; group < 16; group += 4)
+	{
+		for (r = 0; r < 4; r++)
+		{
+			product[group+r] = (matrix[r]*rightMatrix[group])+(matrix[r+4]*rightMatrix[group+1])+(matrix[r+8]*rightMatrix[group+2])+(matrix[r+12]*rightMatrix[group+3]);
+		}
+	}
+
+	memcpy (matrix, product, sizeof(product));
+}
+
+/*
+ * Returns element `index` (0..15) of the product that MatrixMultiply would store,
+ * without computing the other fifteen elements.
+ */
+float MatrixGetMultipliedIndex (int index, float *matrix, float *rightMatrix)
+{
+	/* lhs walks `matrix` with a stride of 4 starting at index%4;
+	   rhs is the group of four consecutive floats of `rightMatrix` that contains `index`. */
+	const float *lhs = matrix + (index%4);
+	const float *rhs = rightMatrix + ((index>>2)<<2);
+
+	return	(lhs[0]*rhs[0])+(lhs[4]*rhs[1])+(lhs[8]*rhs[2])+(lhs[12]*rhs[3]);
+}
+
+/* Stores `value` at offset x + 4*y of the 16-float matrix. */
+void MatrixSet (float *matrix, int x, int y, float value)
+{
+	float *cell = matrix + (x+(y<<2));
+
+	*cell = value;
+}
+
+/* Copies all 16 floats of matrixSRC into matrixDST. */
+void MatrixCopy (float *matrixDST, float *matrixSRC)
+{
+	memcpy (matrixDST, matrixSRC, sizeof(float)*16);
+}
+
+/*
+ * Adds ptr[0]*matrix[0..3] + ptr[1]*matrix[4..7] + ptr[2]*matrix[8..11]
+ * to matrix[12..15]. Only the first three floats of ptr are read.
+ */
+void MatrixTranslate	(float *matrix, float *ptr)
+{
+	matrix[12] += (matrix[0]*ptr[0])+(matrix[4]*ptr[1])+(matrix[ 8]*ptr[2]);
+	matrix[13] += (matrix[1]*ptr[0])+(matrix[5]*ptr[1])+(matrix[ 9]*ptr[2]);
+	matrix[14] += (matrix[2]*ptr[0])+(matrix[6]*ptr[1])+(matrix[10]*ptr[2]);
+	matrix[15] += (matrix[3]*ptr[0])+(matrix[7]*ptr[1])+(matrix[11]*ptr[2]);
+}
+
+/*
+ * Scales matrix[0..3] by ptr[0], matrix[4..7] by ptr[1] and matrix[8..11] by ptr[2].
+ * matrix[12..15] is left unchanged.
+ */
+void MatrixScale (float *matrix, float *ptr)
+{
+	matrix[0]  *= ptr[0];
+	matrix[1]  *= ptr[0];
+	matrix[2]  *= ptr[0];
+	matrix[3]  *= ptr[0];
+
+	matrix[4]  *= ptr[1];
+	matrix[5]  *= ptr[1];
+	matrix[6]  *= ptr[1];
+	matrix[7]  *= ptr[1];
+
+	matrix[8] *= ptr[2];
+	matrix[9] *= ptr[2];
+	matrix[10] *= ptr[2];
+	matrix[11] *= ptr[2];
+}
+#endif
+
+/*
+ * Transposes a 4x4 float matrix in place by swapping the six element pairs
+ * that mirror each other across the diagonal:
+ *
+ *   0 1 2 3        0 4 8 C
+ *   4 5 6 7   ->   1 5 9 D
+ *   8 9 A B        2 6 A E
+ *   C D E F        3 7 B F
+ *
+ * Defined outside the SSE2 / non-SSE2 conditional because matrix.h declares it
+ * for both configurations; it previously existed only in the non-SSE2 branch,
+ * leaving SSE2 builds without a definition.
+ */
+void MatrixTranspose(float *matrix)
+{
+	int row, col;
+
+	for (row = 0; row < 4; row++)
+	{
+		for (col = row + 1; col < 4; col++)
+		{
+			float temp = matrix[row*4 + col];
+			matrix[row*4 + col] = matrix[col*4 + row];
+			matrix[col*4 + row] = temp;
+		}
+	}
+}
+//-----------------------------------------
+
+/* Puts a MatrixStack into its empty state: no storage, position 0, size 0. */
+void MatrixStackInit (MatrixStack *stack)
+{
+	stack->size		= 0;
+	stack->position	= 0;
+	stack->matrix	= NULL;
+}
+
+/*
+ * (Re)allocates the stack so it holds `size` matrices, each reset to identity.
+ * Any previous storage is released first, so existing contents are discarded.
+ * On success stack->size is set to size - 1, i.e. the highest valid position,
+ * which is the bound MatrixStackSetStackPosition clamps against, and the
+ * current position is clamped into the new range.
+ *
+ * If `size` is not positive or the allocation fails, the stack is left empty
+ * (matrix == NULL, size == 0, position == 0) rather than initialising through
+ * a NULL pointer. Callers that go on to use the stack must still check
+ * stack->matrix themselves.
+ */
+void MatrixStackSetMaxSize (MatrixStack *stack, int size)
+{
+	int i;
+
+	/* free(NULL) is a no-op, so no separate first-allocation branch is needed */
+	free (stack->matrix);
+	stack->matrix = NULL;
+
+	if (size > 0)
+		stack->matrix = (float*) malloc ((size_t)size*16*sizeof(float));
+
+	if (stack->matrix == NULL)
+	{
+		stack->size = 0;
+		stack->position = 0;
+		return;
+	}
+
+	for (i = 0; i < size; i++)
+	{
+		MatrixInit (&stack->matrix[i*16]);
+	}
+
+	/* size stores the top index, not the element count */
+	stack->size = size - 1;
+
+	/* a position left over from a larger stack must not point past the new storage */
+	if (stack->position < 0)
+		stack->position = 0;
+	else if (stack->position > stack->size)
+		stack->position = stack->size;
+}
+
+
+/*
+ * Moves the stack position by `pos` entries (negative values move down) and
+ * clamps the result to the range 0 .. stack->size.
+ */
+void MatrixStackSetStackPosition (MatrixStack *stack, int pos)
+{
+	int target = stack->position + pos;
+
+	if (target < 0)
+		target = 0;
+	else if (target > stack->size)
+		target = stack->size;
+
+	stack->position = target;
+}
+
+/*
+ * Copies the matrix at `ptr` into the entry at the current position, then
+ * advances the position by one (clamped by MatrixStackSetStackPosition).
+ */
+void MatrixStackPushMatrix (MatrixStack *stack, float *ptr)
+{
+	float *slot = &stack->matrix[stack->position*16];
+
+	MatrixCopy (slot, ptr);
+	MatrixStackSetStackPosition (stack, 1);
+}
+
+/*
+ * Moves the position down by `size` entries (clamped by
+ * MatrixStackSetStackPosition) and returns a pointer to the entry that is
+ * now current.
+ */
+float * MatrixStackPopMatrix (MatrixStack *stack, int size)
+{
+	float *top;
+
+	MatrixStackSetStackPosition(stack, -size);
+	top = stack->matrix + stack->position*16;
+
+	return top;
+}
+
+/* Returns a pointer to entry `pos` of the stack. `pos` is not range-checked. */
+float * MatrixStackGetPos (MatrixStack *stack, int pos)
+{
+	float *entry = &stack->matrix[pos*16];
+
+	return entry;
+}
+
+/* Returns a pointer to the entry at the stack's current position. */
+float * MatrixStackGet (MatrixStack *stack)
+{
+	float *current = &stack->matrix[stack->position*16];
+
+	return current;
+}
+
+/* Overwrites entry `pos` of the stack with the matrix at `ptr`. `pos` is not range-checked. */
+void MatrixStackLoadMatrix (MatrixStack *stack, int pos, float *ptr)
+{
+	float *slot = &stack->matrix[pos*16];
+
+	MatrixCopy (slot, ptr);
+}
--- a/desmume/src/matrix.h
+++ b/desmume/src/matrix.h
@ -1,79 +1,81 @@
-/*  
-	Copyright (C) 2006-2007 shash
-
-    This file is part of DeSmuME
-
-    DeSmuME is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    DeSmuME is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with DeSmuME; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-*/
-
-#ifndef MATRIX_H
-#define MATRIX_H
-
-#include "types.h"
-
-#ifdef SSE2
-	#include <xmmintrin.h>
-	#include <emmintrin.h>
-	//typedef __declspec(align(16)) float gMatrix[4][4];
-	//typedef float gMatrix[4][4];
-	typedef float gMatrix[16];
-#endif
-
-typedef struct MatrixStack
-{
-#ifdef SSE2
-	//gMatrix *matrix;
-	float	*matrix;
-#else
-	float	*matrix;
-#endif
-	int		position;
-	int		size;
-} MatrixStack;
-
-void	MatrixInit				(float *matrix);
-#ifdef SSE2
-extern void	__fastcall MatrixMultVec3x3		(const gMatrix matrix, const gMatrix vecPtr);
-extern void	__fastcall MatrixMultVec4x4		(const gMatrix matrix, const gMatrix vecPtr);
-void	__fastcall MatrixIdentity			(float *matrix);
-extern void	__fastcall MatrixMultiply		(const gMatrix matrix, const gMatrix rightMatrix);
-float	__fastcall MatrixGetMultipliedIndex	(int index, float *matrix, float *rightMatrix);
-void	__fastcall MatrixSet				(float *matrix, int x, int y, float value);
-void	__fastcall MatrixCopy				(const gMatrix matrixDST, const gMatrix matrixSRC);
-extern void __fastcall MatrixTranslate		(float *matrix, float *ptr);
-extern void	__fastcall MatrixScale			(const gMatrix matrix, const gMatrix ptr);
-void	__fastcall MatrixScale				(const gMatrix matrix, const gMatrix ptr);
-#else
-void	MatrixMultVec3x3		(float *matrix, float *vecPtr);
-void	MatrixMultVec4x4		(float *matrix, float *vecPtr);
-void	MatrixIdentity			(float *matrix);
-void	MatrixMultiply			(float *matrix, float *rightMatrix);
-float	MatrixGetMultipliedIndex(int index, float *matrix, float *rightMatrix);
-void	MatrixSet				(float *matrix, int x, int y, float value);
-void	MatrixCopy				(float *matrixDST, float *matrixSRC);
-void	MatrixTranslate			(float *matrix, float *ptr);
-void	MatrixScale				(float *matrix, float *ptr);
-#endif
-
-void	MatrixStackInit				(MatrixStack *stack);
-void	MatrixStackSetMaxSize		(MatrixStack *stack, int size);
-void	MatrixStackSetStackPosition (MatrixStack *stack, int pos);
-void	MatrixStackPushMatrix		(MatrixStack *stack, float *ptr);
-float*	MatrixStackPopMatrix		(MatrixStack *stack, int size);
-float*	MatrixStackGetPos			(MatrixStack *stack, int pos);
-float*	MatrixStackGet				(MatrixStack *stack);
-void	MatrixStackLoadMatrix		(MatrixStack *stack, int pos, float *ptr);
-
-#endif
+/*  
+	Copyright (C) 2006-2007 shash
+
+    This file is part of DeSmuME
+
+    DeSmuME is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    DeSmuME is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with DeSmuME; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#ifndef MATRIX_H
+#define MATRIX_H
+
+#include "types.h"
+
+#ifdef SSE2
+	#include <xmmintrin.h>
+	#include <emmintrin.h>
+	//typedef __declspec(align(16)) float gMatrix[4][4];
+	//typedef float gMatrix[4][4];
+	typedef float gMatrix[16];
+#endif
+
+// A stack of 4x4 float matrices stored back to back in one allocation.
+typedef struct MatrixStack
+{
+	float	*matrix;	// storage, 16 floats per entry (same type with or without SSE2)
+	int		position;	// index of the current entry
+	int		size;		// highest valid position (MatrixStackSetMaxSize stores count - 1)
+} MatrixStack;
+
+// Resets a 16-float matrix to identity (plain C in every configuration).
+void	MatrixInit				(float *matrix);
+#ifdef SSE2
+// SSE2 build: the `extern` routines are provided by the matrix_sse2-*.asm files,
+// the others by the SSE2 branch of matrix.c.
+extern void	__fastcall MatrixMultVec3x3		(const gMatrix matrix, const gMatrix vecPtr);
+extern void	__fastcall MatrixMultVec4x4		(const gMatrix matrix, const gMatrix vecPtr);
+void	__fastcall MatrixIdentity			(float *matrix);
+extern void	__fastcall MatrixMultiply		(const gMatrix matrix, const gMatrix rightMatrix);
+float	__fastcall MatrixGetMultipliedIndex	(int index, float *matrix, float *rightMatrix);
+void	__fastcall MatrixSet				(float *matrix, int x, int y, float value);
+// Parameter types match the definition in matrix.c (the destination is written to).
+void	__fastcall MatrixCopy				(float *matrixDST, float *matrixSRC);
+extern void __fastcall MatrixTranslate		(float *matrix, float *ptr);
+extern void	__fastcall MatrixScale			(const gMatrix matrix, const gMatrix ptr);
+#else
+// Non-SSE2 build: everything is implemented in matrix.c.
+void	MatrixMultVec3x3		(float *matrix, float *vecPtr);
+void	MatrixMultVec4x4		(float *matrix, float *vecPtr);
+void	MatrixIdentity			(float *matrix);
+void	MatrixMultiply			(float *matrix, float *rightMatrix);
+float	MatrixGetMultipliedIndex(int index, float *matrix, float *rightMatrix);
+void	MatrixSet				(float *matrix, int x, int y, float value);
+void	MatrixCopy				(float *matrixDST, float *matrixSRC);
+void	MatrixTranslate			(float *matrix, float *ptr);
+void	MatrixScale				(float *matrix, float *ptr);
+#endif
+
+// Declared for both configurations.
+void MatrixTranspose(float *matrix);
+
+void	MatrixStackInit				(MatrixStack *stack);
+void	MatrixStackSetMaxSize		(MatrixStack *stack, int size);
+void	MatrixStackSetStackPosition (MatrixStack *stack, int pos);
+void	MatrixStackPushMatrix		(MatrixStack *stack, float *ptr);
+float*	MatrixStackPopMatrix		(MatrixStack *stack, int size);
+float*	MatrixStackGetPos			(MatrixStack *stack, int pos);
+float*	MatrixStackGet				(MatrixStack *stack);
+void	MatrixStackLoadMatrix		(MatrixStack *stack, int pos, float *ptr);
+
+#endif
--- a/desmume/src/matrix_sse2-x64.asm
+++ b/desmume/src/matrix_sse2-x64.asm
@ -1,178 +1,180 @@
-;
-;	Copyright (C) 2006 yopyop
-;	Copyright (C) 2008 CrazyMax
-;
-;    This file is part of DeSmuME
-;
-;    DeSmuME is free software; you can redistribute it and/or modify
-;    it under the terms of the GNU General Public License as published by
-;    the Free Software Foundation; either version 2 of the License, or
-;    (at your option) any later version.
-;
-;    DeSmuME is distributed in the hope that it will be useful,
-;    but WITHOUT ANY WARRANTY; without even the implied warranty of
-;    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;    GNU General Public License for more details.
-;
-;    You should have received a copy of the GNU General Public License
-;    along with DeSmuME; if not, write to the Free Software
-;    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
-	TITLE	matrix_sse2-x64.asm
-	.code
-	
-MatrixMultVec4x4 PROC PUBLIC
-		movaps	xmm0, XMMWORD PTR [rcx]
-		movaps	xmm1, XMMWORD PTR [rcx+16]
-		movaps	xmm2, XMMWORD PTR [rcx+32]
-		movaps	xmm3, XMMWORD PTR [rcx+48]
-		movaps	xmm4, XMMWORD PTR [rdx]
-		movaps	xmm5, xmm4
-		movaps	xmm6, xmm4
-		movaps	xmm7, xmm4
-		shufps	xmm4, xmm4, 00000000b
-		shufps	xmm5, xmm5, 01010101b
-		shufps	xmm6, xmm6, 10101010b
-		mulps	xmm4, xmm0
-		mulps	xmm5, xmm1
-		mulps	xmm6, xmm2
-		addps	xmm4, xmm5
-		addps	xmm4, xmm6
-		addps	xmm4, xmm3
-		movaps	XMMWORD PTR [rdx], xmm4
-		ret		0
-MatrixMultVec4x4 ENDP
-
-MatrixMultVec3x3 PROC PUBLIC
-		movaps	xmm0, XMMWORD PTR [rcx]
-		movaps	xmm1, XMMWORD PTR [rcx+16]
-		movaps	xmm2, XMMWORD PTR [rcx+32]
-		movaps	xmm4, XMMWORD PTR [rdx]
-		movaps	xmm5, xmm4
-		movaps	xmm6, xmm4
-		movaps	xmm7, xmm4
-		shufps	xmm4, xmm4, 00000000b
-		shufps	xmm5, xmm5, 01010101b
-		shufps	xmm6, xmm6, 10101010b
-		mulps	xmm4, xmm0
-		mulps	xmm5, xmm1
-		mulps	xmm6, xmm2
-		addps	xmm4, xmm5
-		addps	xmm4, xmm6
-		movaps	XMMWORD PTR [rdx], xmm4
-MatrixMultVec3x3 ENDP
-
-MatrixMultiply PROC PUBLIC
-		movaps	xmm0, XMMWORD PTR [rcx]
-		movaps	xmm1, XMMWORD PTR [rcx+16]
-		movaps	xmm2, XMMWORD PTR [rcx+32]
-		movaps	xmm3, XMMWORD PTR [rcx+48]
-		movaps	xmm4, XMMWORD PTR [rdx]			; r00, r01, r02, r03
-		movaps	xmm5,xmm4
-		movaps	xmm6,xmm4
-		movaps	xmm7,xmm4
-		shufps	xmm4,xmm4,00000000b
-		shufps	xmm5,xmm5,01010101b
-		shufps	xmm6,xmm6,10101010b
-		shufps	xmm7,xmm7,11111111b
-		mulps	xmm4,xmm0
-		mulps	xmm5,xmm1
-		mulps	xmm6,xmm2
-		mulps	xmm7,xmm3
-		addps	xmm4,xmm5
-		addps	xmm4,xmm6
-		addps	xmm4,xmm7
-		movaps	XMMWORD PTR [rcx],xmm4
-		movaps	xmm4, XMMWORD PTR [rdx+16]		; r04, r05, r06, r07
-		movaps	xmm5,xmm4
-		movaps	xmm6,xmm4
-		movaps	xmm7,xmm4
-		shufps	xmm4,xmm4,00000000b
-		shufps	xmm5,xmm5,01010101b
-		shufps	xmm6,xmm6,10101010b
-		shufps	xmm7,xmm7,11111111b
-		mulps	xmm4,xmm0
-		mulps	xmm5,xmm1
-		mulps	xmm6,xmm2
-		mulps	xmm7,xmm3
-		addps	xmm4,xmm5
-		addps	xmm4,xmm6
-		addps	xmm4,xmm7
-		movaps	XMMWORD PTR [rcx+16],xmm4
-		movaps	xmm4, XMMWORD PTR [rdx+32]		; r08, r09, r10, r11
-		movaps	xmm5,xmm4
-		movaps	xmm6,xmm4
-		movaps	xmm7,xmm4
-		shufps	xmm4,xmm4,00000000b
-		shufps	xmm5,xmm5,01010101b
-		shufps	xmm6,xmm6,10101010b
-		shufps	xmm7,xmm7,11111111b
-		mulps	xmm4,xmm0
-		mulps	xmm5,xmm1
-		mulps	xmm6,xmm2
-		mulps	xmm7,xmm3
-		addps	xmm4,xmm5
-		addps	xmm4,xmm6
-		addps	xmm4,xmm7
-		movaps	XMMWORD PTR [rcx+32],xmm4
-		movaps	xmm4, XMMWORD PTR [rdx+48]		; r12, r13, r14, r15
-		movaps	xmm5,xmm4
-		movaps	xmm6,xmm4
-		movaps	xmm7,xmm4
-		shufps	xmm4,xmm4,00000000b
-		shufps	xmm5,xmm5,01010101b
-		shufps	xmm6,xmm6,10101010b
-		shufps	xmm7,xmm7,11111111b
-		mulps	xmm4,xmm0
-		mulps	xmm5,xmm1
-		mulps	xmm6,xmm2
-		mulps	xmm7,xmm3
-		addps	xmm4,xmm5
-		addps	xmm4,xmm6
-		addps	xmm4,xmm7
-		movaps	XMMWORD PTR [rcx+48],xmm4
-		ret		0
-MatrixMultiply ENDP
-
-MatrixTranslate PROC PUBLIC
-		movaps	xmm0, XMMWORD PTR [rcx]
-		movaps	xmm1, XMMWORD PTR [rcx+16]
-		movaps	xmm2, XMMWORD PTR [rcx+32]
-		movaps	xmm3, XMMWORD PTR [rcx+48]
-		movaps	xmm4, XMMWORD PTR [rdx]
-		movaps	xmm5, xmm4
-		movaps	xmm6, xmm4
-		movaps	xmm7, xmm4
-		shufps	xmm4, xmm4, 00000000b
-		shufps	xmm5, xmm5, 01010101b
-		shufps	xmm6, xmm6, 10101010b
-		mulps	xmm4, xmm0
-		mulps	xmm5, xmm1
-		mulps	xmm6, xmm2
-		addps	xmm4, xmm5
-		addps	xmm4, xmm6
-		addps	xmm4, xmm3
-		movaps	XMMWORD PTR [rcx+48], xmm4
-		ret		0
-MatrixTranslate ENDP
-
-MatrixScale PROC PUBLIC
-		movaps	xmm0, XMMWORD PTR [rcx]
-		movaps	xmm1, XMMWORD PTR [rcx+16]
-		movaps	xmm2, XMMWORD PTR [rcx+32]
-		movaps	xmm4, XMMWORD PTR [rdx]
-		movaps	xmm5, xmm4
-		movaps	xmm6, xmm4
-		shufps	xmm4, xmm4, 00000000b
-		shufps	xmm5, xmm5, 01010101b
-		shufps	xmm6, xmm6, 10101010b
-		mulps	xmm4, xmm0
-		mulps	xmm5, xmm1
-		mulps	xmm6, xmm2
-		movaps	XMMWORD PTR [rcx],xmm4
-		movaps	XMMWORD PTR [rcx+16],xmm5
-		movaps	XMMWORD PTR [rcx+32],xmm6
-		ret		0
-MatrixScale ENDP
-
-end
+;
+;	Copyright (C) 2006 yopyop
+;	Copyright (C) 2008 CrazyMax
+;
+;    This file is part of DeSmuME
+;
+;    DeSmuME is free software; you can redistribute it and/or modify
+;    it under the terms of the GNU General Public License as published by
+;    the Free Software Foundation; either version 2 of the License, or
+;    (at your option) any later version.
+;
+;    DeSmuME is distributed in the hope that it will be useful,
+;    but WITHOUT ANY WARRANTY; without even the implied warranty of
+;    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;    GNU General Public License for more details.
+;
+;    You should have received a copy of the GNU General Public License
+;    along with DeSmuME; if not, write to the Free Software
+;    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+	TITLE	matrix_sse2-x64.asm
+	.code
+	
+; MatrixMultVec4x4 -- transform a 4-float vector in place by a 4x4 matrix.
+; ABI:   Microsoft x64 (first argument in rcx, second in rdx)
+; In:    rcx = matrix (16 floats), rdx = vector (4 floats: x, y, z, w)
+;        Both are accessed with aligned SSE operations, so both must be 16-byte aligned.
+; Out:   [rdx] = x*matrix[0..3] + y*matrix[4..7] + z*matrix[8..11] + w*matrix[12..15]
+; Clobb: xmm0, xmm1, xmm4 (all volatile). xmm6/xmm7 are callee-saved in this ABI
+;        and are deliberately not used.
+MatrixMultVec4x4 PROC PUBLIC
+		movaps	xmm4, XMMWORD PTR [rdx]			; x, y, z, w
+		movaps	xmm0, xmm4
+		shufps	xmm0, xmm0, 00000000b			; x, x, x, x
+		mulps	xmm0, XMMWORD PTR [rcx]
+		movaps	xmm1, xmm4
+		shufps	xmm1, xmm1, 01010101b			; y, y, y, y
+		mulps	xmm1, XMMWORD PTR [rcx+16]
+		addps	xmm0, xmm1
+		movaps	xmm1, xmm4
+		shufps	xmm1, xmm1, 10101010b			; z, z, z, z
+		mulps	xmm1, XMMWORD PTR [rcx+32]
+		addps	xmm0, xmm1
+		shufps	xmm4, xmm4, 11111111b			; w, w, w, w
+		mulps	xmm4, XMMWORD PTR [rcx+48]
+		addps	xmm0, xmm4
+		movaps	XMMWORD PTR [rdx], xmm0			; every load precedes this single store
+		ret		0
+MatrixMultVec4x4 ENDP
+
+; MatrixMultVec3x3 -- transform a vector in place by the first three rows of a matrix.
+; ABI:   Microsoft x64 (first argument in rcx, second in rdx)
+; In:    rcx = matrix (only the first 12 floats are read), rdx = vector
+;        Both are accessed with aligned SSE operations, so both must be 16-byte aligned.
+; Out:   [rdx] = x*matrix[0..3] + y*matrix[4..7] + z*matrix[8..11]
+;        All four floats at [rdx] are stored, so the fourth one is overwritten too.
+;        NOTE(review): the C version in matrix.c leaves vecPtr[3] untouched -- confirm
+;        callers do not rely on it being preserved in SSE2 builds.
+; Clobb: xmm0, xmm1, xmm3 (all volatile). xmm6/xmm7 are callee-saved in this ABI
+;        and are deliberately not used.
+MatrixMultVec3x3 PROC PUBLIC
+		movaps	xmm3, XMMWORD PTR [rdx]			; x, y, z, (4th float)
+		movaps	xmm0, xmm3
+		shufps	xmm0, xmm0, 00000000b			; x, x, x, x
+		mulps	xmm0, XMMWORD PTR [rcx]
+		movaps	xmm1, xmm3
+		shufps	xmm1, xmm1, 01010101b			; y, y, y, y
+		mulps	xmm1, XMMWORD PTR [rcx+16]
+		addps	xmm0, xmm1
+		shufps	xmm3, xmm3, 10101010b			; z, z, z, z
+		mulps	xmm3, XMMWORD PTR [rcx+32]
+		addps	xmm0, xmm3
+		movaps	XMMWORD PTR [rdx], xmm0
+		ret		0								; was missing: execution fell through into MatrixMultiply
+MatrixMultVec3x3 ENDP
+
+; MATRIX_MUL_ROW -- helper for MatrixMultiply.
+; Expects xmm0..xmm3 = the four rows of the left matrix as they were on entry.
+; Reads the four floats r0..r3 at [rdx+rowOffset] one at a time, then stores
+;   r0*xmm0 + r1*xmm1 + r2*xmm2 + r3*xmm3
+; to [rcx+rowOffset]. All four floats are read before the store.
+; Clobbers xmm4 and xmm5.
+MATRIX_MUL_ROW MACRO rowOffset
+		movss	xmm4, DWORD PTR [rdx+rowOffset]
+		shufps	xmm4, xmm4, 00000000b			; broadcast r0
+		mulps	xmm4, xmm0
+		movss	xmm5, DWORD PTR [rdx+rowOffset+4]
+		shufps	xmm5, xmm5, 00000000b			; broadcast r1
+		mulps	xmm5, xmm1
+		addps	xmm4, xmm5
+		movss	xmm5, DWORD PTR [rdx+rowOffset+8]
+		shufps	xmm5, xmm5, 00000000b			; broadcast r2
+		mulps	xmm5, xmm2
+		addps	xmm4, xmm5
+		movss	xmm5, DWORD PTR [rdx+rowOffset+12]
+		shufps	xmm5, xmm5, 00000000b			; broadcast r3
+		mulps	xmm5, xmm3
+		addps	xmm4, xmm5
+		movaps	XMMWORD PTR [rcx+rowOffset], xmm4
+ENDM
+
+; MatrixMultiply -- multiply two 4x4 float matrices, result stored over the first.
+; ABI:   Microsoft x64 (first argument in rcx, second in rdx)
+; In:    rcx = matrix (16 floats, 16-byte aligned; read, then overwritten)
+;        rdx = rightMatrix (16 floats; read only)
+; Out:   for each group of four floats at byte offset k = 0, 16, 32, 48:
+;          [rcx+k] = r[0]*row0 + r[1]*row1 + r[2]*row2 + r[3]*row3
+;        where r = the four floats at [rdx+k] and row0..row3 = the original rows of matrix.
+; Clobb: xmm0-xmm5 (all volatile). The previous version also overwrote xmm6/xmm7,
+;        which are callee-saved in this ABI; they are no longer touched.
+MatrixMultiply PROC PUBLIC
+		movaps	xmm0, XMMWORD PTR [rcx]
+		movaps	xmm1, XMMWORD PTR [rcx+16]
+		movaps	xmm2, XMMWORD PTR [rcx+32]
+		movaps	xmm3, XMMWORD PTR [rcx+48]
+		MATRIX_MUL_ROW 0						; r00, r01, r02, r03
+		MATRIX_MUL_ROW 16						; r04, r05, r06, r07
+		MATRIX_MUL_ROW 32						; r08, r09, r10, r11
+		MATRIX_MUL_ROW 48						; r12, r13, r14, r15
+		ret		0
+MatrixMultiply ENDP
+
+; MatrixTranslate -- add a translation into the last row of a 4x4 matrix.
+; ABI:   Microsoft x64 (first argument in rcx, second in rdx)
+; In:    rcx = matrix (16 floats), rdx = vector (x, y, z used; 16 bytes are loaded)
+;        Both are accessed with aligned SSE operations, so both must be 16-byte aligned.
+; Out:   matrix[12..15] = x*matrix[0..3] + y*matrix[4..7] + z*matrix[8..11] + matrix[12..15]
+; Clobb: xmm0-xmm3 (all volatile). xmm6/xmm7 are callee-saved in this ABI
+;        and are deliberately not used.
+MatrixTranslate PROC PUBLIC
+		movaps	xmm3, XMMWORD PTR [rdx]			; x, y, z, (unused)
+		movaps	xmm0, xmm3
+		shufps	xmm0, xmm0, 00000000b			; x, x, x, x
+		mulps	xmm0, XMMWORD PTR [rcx]
+		movaps	xmm1, xmm3
+		shufps	xmm1, xmm1, 01010101b			; y, y, y, y
+		mulps	xmm1, XMMWORD PTR [rcx+16]
+		movaps	xmm2, xmm3
+		shufps	xmm2, xmm2, 10101010b			; z, z, z, z
+		mulps	xmm2, XMMWORD PTR [rcx+32]
+		addps	xmm0, xmm1
+		addps	xmm0, xmm2
+		addps	xmm0, XMMWORD PTR [rcx+48]
+		movaps	XMMWORD PTR [rcx+48], xmm0
+		ret		0
+MatrixTranslate ENDP
+
+; MatrixScale -- scale the first three rows of a 4x4 matrix.
+; ABI:   Microsoft x64 (first argument in rcx, second in rdx)
+; In:    rcx = matrix (16 floats), rdx = vector (x, y, z used; 16 bytes are loaded)
+;        Both are accessed with aligned SSE operations, so both must be 16-byte aligned.
+; Out:   matrix[0..3] *= x, matrix[4..7] *= y, matrix[8..11] *= z;
+;        matrix[12..15] is not written.
+; Clobb: xmm0, xmm1, xmm3 (all volatile). xmm6 is callee-saved in this ABI
+;        and is deliberately not used.
+MatrixScale PROC PUBLIC
+		movaps	xmm3, XMMWORD PTR [rdx]			; x, y, z, (unused)
+		movaps	xmm0, xmm3
+		shufps	xmm0, xmm0, 00000000b			; x, x, x, x
+		mulps	xmm0, XMMWORD PTR [rcx]
+		movaps	xmm1, xmm3
+		shufps	xmm1, xmm1, 01010101b			; y, y, y, y
+		mulps	xmm1, XMMWORD PTR [rcx+16]
+		shufps	xmm3, xmm3, 10101010b			; z, z, z, z
+		mulps	xmm3, XMMWORD PTR [rcx+32]
+		movaps	XMMWORD PTR [rcx], xmm0			; stores only after every load is done
+		movaps	XMMWORD PTR [rcx+16], xmm1
+		movaps	XMMWORD PTR [rcx+32], xmm3
+		ret		0
+MatrixScale ENDP
+
+end
--- a/desmume/src/matrix_sse2-x86.asm
+++ b/desmume/src/matrix_sse2-x86.asm
@ -1,183 +1,185 @@
-;
-;	Copyright (C) 2006 yopyop
-;	Copyright (C) 2008 CrazyMax
-;
-;    This file is part of DeSmuME
-;
-;    DeSmuME is free software; you can redistribute it and/or modify
-;    it under the terms of the GNU General Public License as published by
-;    the Free Software Foundation; either version 2 of the License, or
-;    (at your option) any later version.
-;
-;    DeSmuME is distributed in the hope that it will be useful,
-;    but WITHOUT ANY WARRANTY; without even the implied warranty of
-;    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;    GNU General Public License for more details.
-;
-;    You should have received a copy of the GNU General Public License
-;    along with DeSmuME; if not, write to the Free Software
-;    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
-	TITLE	matrix_sse2-x86.asm
-	.686P
-	.XMM
-	.model	flat
-	.code
-	
-@MatrixMultVec4x4@8 PROC PUBLIC
-		movaps	xmm0, XMMWORD PTR [ecx]
-		movaps	xmm1, XMMWORD PTR [ecx+16]
-		movaps	xmm2, XMMWORD PTR [ecx+32]
-		movaps	xmm3, XMMWORD PTR [ecx+48]
-		movaps	xmm4, XMMWORD PTR [edx]
-		movaps	xmm5, xmm4
-		movaps	xmm6, xmm4
-		movaps	xmm7, xmm4
-		shufps	xmm4, xmm4, 00000000b
-		shufps	xmm5, xmm5, 01010101b
-		shufps	xmm6, xmm6, 10101010b
-		mulps	xmm4, xmm0
-		mulps	xmm5, xmm1
-		mulps	xmm6, xmm2
-		addps	xmm4, xmm5
-		addps	xmm4, xmm6
-		addps	xmm4, xmm3
-		movaps	XMMWORD PTR [edx], xmm4
-		ret		0
-@MatrixMultVec4x4@8 ENDP
-
-@MatrixMultVec3x3@8 PROC PUBLIC
-		movaps	xmm0, XMMWORD PTR [ecx]
-		movaps	xmm1, XMMWORD PTR [ecx+16]
-		movaps	xmm2, XMMWORD PTR [ecx+32]
-		movaps	xmm4, XMMWORD PTR [edx]
-		movaps	xmm5, xmm4
-		movaps	xmm6, xmm4
-		movaps	xmm7, xmm4
-		shufps	xmm4, xmm4, 00000000b
-		shufps	xmm5, xmm5, 01010101b
-		shufps	xmm6, xmm6, 10101010b
-		mulps	xmm4, xmm0
-		mulps	xmm5, xmm1
-		mulps	xmm6, xmm2
-		addps	xmm4, xmm5
-		addps	xmm4, xmm6
-		movaps	XMMWORD PTR [edx], xmm4
-		ret		0
-@MatrixMultVec3x3@8 ENDP
-
-@MatrixMultiply@8 PROC PUBLIC
-		movaps	xmm0, XMMWORD PTR [ecx]
-		movaps	xmm1, XMMWORD PTR [ecx+16]
-		movaps	xmm2, XMMWORD PTR [ecx+32]
-		movaps	xmm3, XMMWORD PTR [ecx+48]
-		movaps	xmm4, XMMWORD PTR [edx]			; r00, r01, r02, r03
-		movaps	xmm5,xmm4
-		movaps	xmm6,xmm4
-		movaps	xmm7,xmm4
-		shufps	xmm4,xmm4,00000000b
-		shufps	xmm5,xmm5,01010101b
-		shufps	xmm6,xmm6,10101010b
-		shufps	xmm7,xmm7,11111111b
-		mulps	xmm4,xmm0
-		mulps	xmm5,xmm1
-		mulps	xmm6,xmm2
-		mulps	xmm7,xmm3
-		addps	xmm4,xmm5
-		addps	xmm4,xmm6
-		addps	xmm4,xmm7
-		movaps	XMMWORD PTR [ecx],xmm4
-		movaps	xmm4, XMMWORD PTR [edx+16]		; r04, r05, r06, r07
-		movaps	xmm5,xmm4
-		movaps	xmm6,xmm4
-		movaps	xmm7,xmm4
-		shufps	xmm4,xmm4,00000000b
-		shufps	xmm5,xmm5,01010101b
-		shufps	xmm6,xmm6,10101010b
-		shufps	xmm7,xmm7,11111111b
-		mulps	xmm4,xmm0
-		mulps	xmm5,xmm1
-		mulps	xmm6,xmm2
-		mulps	xmm7,xmm3
-		addps	xmm4,xmm5
-		addps	xmm4,xmm6
-		addps	xmm4,xmm7
-		movaps	XMMWORD PTR [ecx+16],xmm4
-		movaps	xmm4, XMMWORD PTR [edx+32]		; r08, r09, r10, r11
-		movaps	xmm5,xmm4
-		movaps	xmm6,xmm4
-		movaps	xmm7,xmm4
-		shufps	xmm4,xmm4,00000000b
-		shufps	xmm5,xmm5,01010101b
-		shufps	xmm6,xmm6,10101010b
-		shufps	xmm7,xmm7,11111111b
-		mulps	xmm4,xmm0
-		mulps	xmm5,xmm1
-		mulps	xmm6,xmm2
-		mulps	xmm7,xmm3
-		addps	xmm4,xmm5
-		addps	xmm4,xmm6
-		addps	xmm4,xmm7
-		movaps	XMMWORD PTR [ecx+32],xmm4
-		movaps	xmm4, XMMWORD PTR [edx+48]		; r12, r13, r14, r15
-		movaps	xmm5,xmm4
-		movaps	xmm6,xmm4
-		movaps	xmm7,xmm4
-		shufps	xmm4,xmm4,00000000b
-		shufps	xmm5,xmm5,01010101b
-		shufps	xmm6,xmm6,10101010b
-		shufps	xmm7,xmm7,11111111b
-		mulps	xmm4,xmm0
-		mulps	xmm5,xmm1
-		mulps	xmm6,xmm2
-		mulps	xmm7,xmm3
-		addps	xmm4,xmm5
-		addps	xmm4,xmm6
-		addps	xmm4,xmm7
-		movaps	XMMWORD PTR [ecx+48],xmm4
-		ret		0
-@MatrixMultiply@8 ENDP
-
-@MatrixTranslate@8 PROC PUBLIC
-		movaps	xmm0, XMMWORD PTR [ecx]
-		movaps	xmm1, XMMWORD PTR [ecx+16]
-		movaps	xmm2, XMMWORD PTR [ecx+32]
-		movaps	xmm3, XMMWORD PTR [ecx+48]
-		movaps	xmm4, XMMWORD PTR [edx]
-		movaps	xmm5, xmm4
-		movaps	xmm6, xmm4
-		movaps	xmm7, xmm4
-		shufps	xmm4, xmm4, 00000000b
-		shufps	xmm5, xmm5, 01010101b
-		shufps	xmm6, xmm6, 10101010b
-		mulps	xmm4, xmm0
-		mulps	xmm5, xmm1
-		mulps	xmm6, xmm2
-		addps	xmm4, xmm5
-		addps	xmm4, xmm6
-		addps	xmm4, xmm3
-		movaps	XMMWORD PTR [ecx+48], xmm4
-		ret		0
-@MatrixTranslate@8 ENDP
-
-@MatrixScale@8 PROC PUBLIC
-		movaps	xmm0, XMMWORD PTR [ecx]
-		movaps	xmm1, XMMWORD PTR [ecx+16]
-		movaps	xmm2, XMMWORD PTR [ecx+32]
-		movaps	xmm4, XMMWORD PTR [edx]
-		movaps	xmm5, xmm4
-		movaps	xmm6, xmm4
-		shufps	xmm4, xmm4, 00000000b
-		shufps	xmm5, xmm5, 01010101b
-		shufps	xmm6, xmm6, 10101010b
-		mulps	xmm4, xmm0
-		mulps	xmm5, xmm1
-		mulps	xmm6, xmm2
-		movaps	XMMWORD PTR [ecx],xmm4
-		movaps	XMMWORD PTR [ecx+16],xmm5
-		movaps	XMMWORD PTR [ecx+32],xmm6
-		ret		0
-@MatrixScale@8 ENDP
-
-end
-
+;
+;	Copyright (C) 2006 yopyop
+;	Copyright (C) 2008 CrazyMax
+;
+;    This file is part of DeSmuME
+;
+;    DeSmuME is free software; you can redistribute it and/or modify
+;    it under the terms of the GNU General Public License as published by
+;    the Free Software Foundation; either version 2 of the License, or
+;    (at your option) any later version.
+;
+;    DeSmuME is distributed in the hope that it will be useful,
+;    but WITHOUT ANY WARRANTY; without even the implied warranty of
+;    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;    GNU General Public License for more details.
+;
+;    You should have received a copy of the GNU General Public License
+;    along with DeSmuME; if not, write to the Free Software
+;    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+	TITLE	matrix_sse2-x86.asm
+	.686P
+	.XMM
+	.model	flat
+	.code
+	
+;-----------------------------------------------------------------------
+; @MatrixMultVec4x4@8
+; ABI:   32-bit __fastcall (per the @name@8 decoration); args in ecx, edx
+; In:    ecx = matrix, four rows of 4 floats at +0, +16, +32, +48
+;        edx = vector (x, y, z, w)
+; Out:   [edx] = x*row0 + y*row1 + z*row2 + w*row3   (vector updated in place)
+; Clobb: xmm0-xmm3
+; Note:  all memory accesses are aligned SSE accesses, so both pointers
+;        must be 16-byte aligned.
+;-----------------------------------------------------------------------
+@MatrixMultVec4x4@8 PROC PUBLIC
+		movaps	xmm0, XMMWORD PTR [edx]			; x, y, z, w
+		movaps	xmm1, xmm0
+		movaps	xmm2, xmm0
+		movaps	xmm3, xmm0
+		shufps	xmm0, xmm0, 000h				; x, x, x, x
+		shufps	xmm1, xmm1, 055h				; y, y, y, y
+		shufps	xmm2, xmm2, 0AAh				; z, z, z, z
+		shufps	xmm3, xmm3, 0FFh				; w, w, w, w
+		mulps	xmm0, XMMWORD PTR [ecx]			; x * row0
+		mulps	xmm1, XMMWORD PTR [ecx+16]		; y * row1
+		mulps	xmm2, XMMWORD PTR [ecx+32]		; z * row2
+		mulps	xmm3, XMMWORD PTR [ecx+48]		; w * row3
+		addps	xmm0, xmm1						; same summation order as before:
+		addps	xmm0, xmm2						; ((x*r0 + y*r1) + z*r2) + w*r3
+		addps	xmm0, xmm3
+		movaps	XMMWORD PTR [edx], xmm0
+		ret		0
+@MatrixMultVec4x4@8 ENDP
+
+;-----------------------------------------------------------------------
+; @MatrixMultVec3x3@8
+; ABI:   32-bit __fastcall (per the @name@8 decoration); args in ecx, edx
+; In:    ecx = matrix, rows of 4 floats at +0, +16, +32 (row at +48 unused)
+;        edx = vector; 16 bytes are read, lanes 0..2 (x, y, z) are used
+; Out:   [edx] = x*row0 + y*row1 + z*row2   (vector updated in place)
+; Clobb: xmm0-xmm2
+; Note:  all memory accesses are aligned SSE accesses, so both pointers
+;        must be 16-byte aligned.
+; NOTE(review): the full 16-byte result is stored, so lane 3 of the vector
+;        is overwritten with x*row0[3] + y*row1[3] + z*row2[3] rather than
+;        preserved. Confirm callers do not expect w to survive this call.
+;-----------------------------------------------------------------------
+@MatrixMultVec3x3@8 PROC PUBLIC
+		movaps	xmm0, XMMWORD PTR [edx]			; x, y, z, (ignored)
+		movaps	xmm1, xmm0
+		movaps	xmm2, xmm0
+		shufps	xmm0, xmm0, 00000000b			; x, x, x, x
+		shufps	xmm1, xmm1, 01010101b			; y, y, y, y
+		shufps	xmm2, xmm2, 10101010b			; z, z, z, z
+		mulps	xmm0, XMMWORD PTR [ecx]			; x * row0
+		mulps	xmm1, XMMWORD PTR [ecx+16]		; y * row1
+		mulps	xmm2, XMMWORD PTR [ecx+32]		; z * row2
+		addps	xmm0, xmm1
+		addps	xmm0, xmm2
+		movaps	XMMWORD PTR [edx], xmm0
+		ret		0
+@MatrixMultVec3x3@8 ENDP
+
+;-----------------------------------------------------------------------
+; MATMUL_ROW rowofs
+; Computes one output row of @MatrixMultiply@8.
+; Expects xmm0..xmm3 = the four original rows of the matrix at [ecx].
+; Reads the row (a0, a1, a2, a3) at [edx+rowofs] and stores
+;     ((a0*xmm0 + a1*xmm1) + a2*xmm2) + a3*xmm3
+; to [ecx+rowofs]. Clobbers xmm4, xmm5, xmm7.
+;-----------------------------------------------------------------------
+MATMUL_ROW MACRO rowofs
+		movaps	xmm7, XMMWORD PTR [edx+rowofs]	; a0, a1, a2, a3
+		movaps	xmm4, xmm7
+		shufps	xmm4, xmm4, 000h				; a0 splat
+		mulps	xmm4, xmm0
+		movaps	xmm5, xmm7
+		shufps	xmm5, xmm5, 055h				; a1 splat
+		mulps	xmm5, xmm1
+		addps	xmm4, xmm5
+		movaps	xmm5, xmm7
+		shufps	xmm5, xmm5, 0AAh				; a2 splat
+		mulps	xmm5, xmm2
+		addps	xmm4, xmm5
+		shufps	xmm7, xmm7, 0FFh				; a3 splat
+		mulps	xmm7, xmm3
+		addps	xmm4, xmm7
+		movaps	XMMWORD PTR [ecx+rowofs], xmm4
+ENDM
+
+;-----------------------------------------------------------------------
+; @MatrixMultiply@8
+; ABI:   32-bit __fastcall (per the @name@8 decoration); args in ecx, edx
+; In:    ecx = matrix, four rows of 4 floats; overwritten with the result
+;        edx = second matrix, four rows of 4 floats; only read
+; Out:   for each row i: [ecx+16*i] = sum over j of
+;            (element j of row i at edx) * (original row j at ecx)
+; Clobb: xmm0-xmm5, xmm7
+; Note:  the four rows at ecx are cached in registers before the first
+;        store, which is what makes updating [ecx] in place safe.
+;        All memory accesses are aligned SSE accesses, so both pointers
+;        must be 16-byte aligned.
+;-----------------------------------------------------------------------
+@MatrixMultiply@8 PROC PUBLIC
+		movaps	xmm0, XMMWORD PTR [ecx]
+		movaps	xmm1, XMMWORD PTR [ecx+16]
+		movaps	xmm2, XMMWORD PTR [ecx+32]
+		movaps	xmm3, XMMWORD PTR [ecx+48]
+		MATMUL_ROW 0							; r00, r01, r02, r03
+		MATMUL_ROW 16							; r04, r05, r06, r07
+		MATMUL_ROW 32							; r08, r09, r10, r11
+		MATMUL_ROW 48							; r12, r13, r14, r15
+		ret		0
+@MatrixMultiply@8 ENDP
+
+;-----------------------------------------------------------------------
+; @MatrixTranslate@8
+; ABI:   32-bit __fastcall (per the @name@8 decoration); args in ecx, edx
+; In:    ecx = matrix, four rows of 4 floats at +0, +16, +32, +48
+;        edx = vector; 16 bytes are read, only lanes 0..2 (x, y, z) are used
+; Out:   [ecx+48] = x*[ecx] + y*[ecx+16] + z*[ecx+32] + [ecx+48]
+;        rows 0..2 of the matrix are left untouched
+; Clobb: xmm0-xmm2
+; Note:  all memory accesses are aligned SSE accesses, so both pointers
+;        must be 16-byte aligned.
+;-----------------------------------------------------------------------
+@MatrixTranslate@8 PROC PUBLIC
+		movaps	xmm0, XMMWORD PTR [edx]			; x, y, z, (ignored)
+		movaps	xmm1, xmm0
+		movaps	xmm2, xmm0
+		shufps	xmm0, xmm0, 00000000b			; x, x, x, x
+		shufps	xmm1, xmm1, 01010101b			; y, y, y, y
+		shufps	xmm2, xmm2, 10101010b			; z, z, z, z
+		mulps	xmm0, XMMWORD PTR [ecx]			; x * row0
+		mulps	xmm1, XMMWORD PTR [ecx+16]		; y * row1
+		mulps	xmm2, XMMWORD PTR [ecx+32]		; z * row2
+		addps	xmm0, xmm1
+		addps	xmm0, xmm2
+		addps	xmm0, XMMWORD PTR [ecx+48]		; + old row3
+		movaps	XMMWORD PTR [ecx+48], xmm0
+		ret		0
+@MatrixTranslate@8 ENDP
+
+;-----------------------------------------------------------------------
+; @MatrixScale@8
+; ABI:   32-bit __fastcall (per the @name@8 decoration); args in ecx, edx
+; In:    ecx = matrix, rows of 4 floats at +0, +16, +32 (row at +48 is
+;              neither read nor written)
+;        edx = vector; 16 bytes are read, only lanes 0..2 (x, y, z) are used
+; Out:   [ecx]    = x * [ecx]
+;        [ecx+16] = y * [ecx+16]
+;        [ecx+32] = z * [ecx+32]
+; Clobb: xmm0-xmm2
+; Note:  all memory accesses are aligned SSE accesses, so both pointers
+;        must be 16-byte aligned.
+;-----------------------------------------------------------------------
+@MatrixScale@8 PROC PUBLIC
+		movaps	xmm0, XMMWORD PTR [edx]			; x, y, z, (ignored)
+		movaps	xmm1, xmm0
+		movaps	xmm2, xmm0
+		shufps	xmm0, xmm0, 000h				; x, x, x, x
+		shufps	xmm1, xmm1, 055h				; y, y, y, y
+		shufps	xmm2, xmm2, 0AAh				; z, z, z, z
+		mulps	xmm0, XMMWORD PTR [ecx]			; x * row0
+		mulps	xmm1, XMMWORD PTR [ecx+16]		; y * row1
+		mulps	xmm2, XMMWORD PTR [ecx+32]		; z * row2
+		movaps	XMMWORD PTR [ecx], xmm0
+		movaps	XMMWORD PTR [ecx+16], xmm1
+		movaps	XMMWORD PTR [ecx+32], xmm2
+		ret		0
+@MatrixScale@8 ENDP
+
+end
+
--- a/desmume/src/windows/OGLRender.c
+++ b/desmume/src/windows/OGLRender.c