Reconfigure SSE optimizations. All asm routines have been rewritten with intrinsics so that they inline more neatly; this gains a couple of FPS, works in gcc, and lets the paths be enabled separately via ENABLE_SSE and ENABLE_SSE2. Roughly 95% of the speedup comes from the plain SSE path, so SSE2 should not be necessary to enjoy the bulk of the optimizations.

This commit is contained in:
zeromus 2009-07-20 23:33:39 +00:00
parent 3abc58ae36
commit 0d36fecf93
14 changed files with 308 additions and 826 deletions
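
For context on the ENABLE_SSE / ENABLE_SSE2 switches described above, here is a minimal sketch of the switching scheme, assuming the guard logic from types.h and the inline style of matrix.h shown further down; it is illustrative, not a verbatim copy of either header:

#if defined(_MSC_VER) && !defined(NOSSE)
#define ENABLE_SSE                 // types.h turns the switches on by default; NOSSE/NOSSE2 veto them
#endif

#ifdef ENABLE_SSE
#include <xmmintrin.h>
// intrinsic path: an inlineable function the compiler can fold into each call site
static inline void MatrixMultVec4x4(const float* matrix, float* vecPtr)  // pointers must be 16-byte aligned
{
    __m128 v = _mm_load_ps(vecPtr);
    __m128 r =                _mm_mul_ps(_mm_shuffle_ps(v, v, 0x00), _mm_load_ps(matrix));
    r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(v, v, 0x55), _mm_load_ps(matrix + 4)));
    r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(v, v, 0xAA), _mm_load_ps(matrix + 8)));
    r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(v, v, 0xFF), _mm_load_ps(matrix + 12)));
    _mm_store_ps(vecPtr, r);
}
#else
// scalar fallback: same signature, so call sites do not change
static inline void MatrixMultVec4x4(const float* matrix, float* vecPtr)
{
    float x = vecPtr[0], y = vecPtr[1], z = vecPtr[2], w = vecPtr[3];
    for (int i = 0; i < 4; i++)
        vecPtr[i] = x*matrix[i] + y*matrix[i+4] + z*matrix[i+8] + w*matrix[i+12];
}
#endif

Because both variants share one name, callers such as gfx3d.cpp only depend on the configuration macros and never need per-call #ifdefs.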

View File

@ -1,9 +1,8 @@
include $(top_srcdir)/src/desmume.mk
AM_CPPFLAGS += $(SDL_CFLAGS) $(GTK_CFLAGS) $(GTHREAD_CFLAGS) $(X_CFLAGS) $(LUA_CFLAGS) $(ALSA_CFLAGS) $(LIBAGG_CFLAGS)
EXTRA_DIST = build.bat instruction_tabdef.inc thumb_tabdef.inc fs-linux.cpp fs-windows.cpp \
matrix_sse2-x64.asm matrix_sse2-x86.asm
EXTRA_DIST = build.bat instruction_tabdef.inc thumb_tabdef.inc fs-linux.cpp fs-windows.cpp
if HAVE_GDB_STUB
SUBDIRS = . gdbstub $(UI_DIR)
else

View File

@ -73,15 +73,8 @@ static void ENDGL() {
#include "shaders.h"
#include "texcache.h"
#ifndef CTASSERT
#define CTASSERT(x) typedef char __assert ## y[(x) ? 1 : -1]
#endif
static ALIGN(16) u8 GPU_screen3D [256*192*4];
static const unsigned short map3d_cull[4] = {GL_FRONT_AND_BACK, GL_FRONT, GL_BACK, 0};
static const int texEnv[4] = { GL_MODULATE, GL_DECAL, GL_MODULATE, GL_MODULATE };
static const int depthFunc[2] = { GL_LESS, GL_EQUAL };
@ -703,17 +696,15 @@ static void GL_ReadFramebuffer()
u16* dst = gfx3d_convertedScreen + (y<<8);
u8* dstAlpha = gfx3d_convertedAlpha + (y<<8);
#ifndef NOSSE2
//I dont know much about this kind of stuff, but this seems to help
//for some reason I couldnt make the intrinsics work
u8* u8screen3D = (u8*)&((u32*)GPU_screen3D)[i];
#define PREFETCH32(X,Y) __asm { prefetchnta [u8screen3D+32*0x##X##Y] }
//u8* u8screen3D = (u8*)&((u32*)GPU_screen3D)[i];
/*#define PREFETCH32(X,Y) __asm { prefetchnta [u8screen3D+32*0x##X##Y] }
#define PREFETCH128(X) PREFETCH32(X,0) PREFETCH32(X,1) PREFETCH32(X,2) PREFETCH32(X,3) \
PREFETCH32(X,4) PREFETCH32(X,5) PREFETCH32(X,6) PREFETCH32(X,7) \
PREFETCH32(X,8) PREFETCH32(X,9) PREFETCH32(X,A) PREFETCH32(X,B) \
PREFETCH32(X,C) PREFETCH32(X,D) PREFETCH32(X,E) PREFETCH32(X,F)
PREFETCH128(0); PREFETCH128(1);
#endif
PREFETCH128(0); PREFETCH128(1);*/
for(int x=0;x<256;x++,i++)
{

View File

@ -1,3 +1,4 @@
//2
/* Copyright (C) 2006 yopyop
yopyop156@ifrance.com
yopyop156.ifrance.com
@ -146,11 +147,6 @@ static float float10Table[1024];
static float float10RelTable[1024];
static float normalTable[1024];
#ifndef NOSSE2
float ALIGN(16) _fix2float_divizor_mask[4] = { 4096.f, 4096.f, 4096.f, 4096.f };
float ALIGN(16) _fix10_2float_divizor_mask[4] = { 512.f, 512.f, 512.f, 512.f };
#endif
#define fix2float(v) (((float)((s32)(v))) / (float)(1<<12))
#define fix10_2float(v) (((float)((s32)(v))) / (float)(1<<9))
@ -317,6 +313,20 @@ static void makeTables() {
void gfx3d_init()
{
//DWORD start = timeGetTime();
//for(int i=0;i<1000000000;i++)
// MatrixMultVec4x4(mtxCurrent[0],mtxCurrent[1]);
//DWORD end = timeGetTime();
//DWORD diff = end-start;
//start = timeGetTime();
//for(int i=0;i<1000000000;i++)
// MatrixMultVec4x4_b(mtxCurrent[0],mtxCurrent[1]);
//end = timeGetTime();
//DWORD diff2 = end-start;
//printf("SPEED TEST %d %d\n",diff,diff2);
if(polylists == NULL) { polylists = new POLYLIST[2]; polylist = &polylists[0]; }
if(vertlists == NULL) { vertlists = new VERTLIST[2]; vertlist = &vertlists[0]; }
makeTables();
@ -409,15 +419,11 @@ static void SetVertex()
if(polylist->count >= POLYLIST_SIZE)
return;
#ifdef NOSSE2
//apply modelview matrix
MatrixMultVec4x4 (mtxCurrent[1], coordTransformed);
//apply projection matrix
MatrixMultVec4x4 (mtxCurrent[0], coordTransformed);
#else
_sse2_MatrixMultVec4x4_M2(mtxCurrent[0], coordTransformed);
#endif
//TODO - think about keeping the clip matrix concatenated,
//so that we only have to multiply one matrix here
//(we could lazy cache the concatenated clip matrix and only generate it
//when we need to)
MatrixMultVec4x4_M2(mtxCurrent[0], coordTransformed);
//TODO - culling should be done here.
//TODO - viewport transform?
@ -694,11 +700,7 @@ void FORCEINLINE gfx3d_glLoadIdentity()
void FORCEINLINE gfx3d_glLoadMatrix4x4(s32 v)
{
#ifdef NOSSE2
mtxCurrent[mode][0] = fix2float(v);
#else
mtxCurrent[mode][0] = v;
#endif
for (int i = 1; i < 16; i++)
{
@ -707,16 +709,10 @@ void FORCEINLINE gfx3d_glLoadMatrix4x4(s32 v)
if (!GFX_PIPErecv(&cmd, &param)) break;
dEXEC("glLoadMatrix4x4", 0x16, cmd);
#ifdef NOSSE2
mtxCurrent[mode][i] = fix2float((s32)param);
#else
mtxCurrent[mode][i] = (s32)param;
#endif
}
#ifndef NOSSE2
_sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask);
#endif
vector_fix2float<4>(mtxCurrent[mode], 4096.f);
GFX_DELAY(19);
@ -726,11 +722,7 @@ void FORCEINLINE gfx3d_glLoadMatrix4x4(s32 v)
void FORCEINLINE gfx3d_glLoadMatrix4x3(s32 v)
{
#ifdef NOSSE2
mtxCurrent[mode][0] = fix2float(v);
#else
mtxCurrent[mode][0] = v;
#endif
for (int i = 1; i < 16; i++)
{
@ -740,16 +732,10 @@ void FORCEINLINE gfx3d_glLoadMatrix4x3(s32 v)
if (!GFX_PIPErecv(&cmd, &param)) break;
dEXEC("glLoadMatrix4x3", 0x17, cmd);
#ifdef NOSSE2
mtxCurrent[mode][i] = fix2float((s32)param);
#else
mtxCurrent[mode][i] = (s32)param;
#endif
}
#ifndef NOSSE2
_sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask);
#endif
vector_fix2float<4>(mtxCurrent[mode], 4096.f);
//fill in the unusued matrix values
mtxCurrent[mode][3] = mtxCurrent[mode][7] = mtxCurrent[mode][11] = 0.f;
@ -763,11 +749,7 @@ void FORCEINLINE gfx3d_glLoadMatrix4x3(s32 v)
void FORCEINLINE gfx3d_glMultMatrix4x4(s32 v)
{
#ifdef NOSSE2
mtxTemporal[0] = fix2float(v);
#else
mtxTemporal[0] = v;
#endif
for (int i = 1; i < 16; i++)
{
@ -776,16 +758,10 @@ void FORCEINLINE gfx3d_glMultMatrix4x4(s32 v)
if (!GFX_PIPErecv(&cmd, &param)) break;
dEXEC("glMultMatrix4x4", 0x18, cmd);
#ifdef NOSSE2
mtxTemporal[i] = fix2float((s32)param);
#else
mtxTemporal[i] = (s32)param;
#endif
}
#ifndef NOSSE2
_sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask);
#endif
vector_fix2float<4>(mtxTemporal, 4096.f);
MatrixMultiply (mtxCurrent[mode], mtxTemporal);
@ -802,11 +778,7 @@ void FORCEINLINE gfx3d_glMultMatrix4x4(s32 v)
void FORCEINLINE gfx3d_glMultMatrix4x3(s32 v)
{
#ifdef NOSSE2
mtxTemporal[0] = fix2float(v);
#else
mtxTemporal[0] = v;
#endif
for (int i = 1; i < 16; i++)
{
@ -816,16 +788,10 @@ void FORCEINLINE gfx3d_glMultMatrix4x3(s32 v)
if (!GFX_PIPErecv(&cmd, &param)) break;
dEXEC("glMultMatrix4x3", 0x19, cmd);
#ifdef NOSSE2
mtxTemporal[i] = fix2float((s32)param);
#else
mtxTemporal[i] = (s32)param;
#endif
}
#ifndef NOSSE2
_sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask);
#endif
vector_fix2float<4>(mtxTemporal, 4096.f);
//fill in the unusued matrix values
mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0.f;
@ -847,12 +813,7 @@ void FORCEINLINE gfx3d_glMultMatrix4x3(s32 v)
void FORCEINLINE gfx3d_glMultMatrix3x3(s32 v)
{
#ifdef NOSSE2
mtxTemporal[0] = fix2float(v);
#else
mtxTemporal[0] = v;
#endif
for (int i = 1; i < 12; i++)
{
@ -862,16 +823,10 @@ void FORCEINLINE gfx3d_glMultMatrix3x3(s32 v)
if (!GFX_PIPErecv(&cmd, &param)) break;
dEXEC("glMultMatrix3x3", 0x1A, cmd);
#ifdef NOSSE2
mtxTemporal[i] = fix2float((s32)param);
#else
mtxTemporal[i] = (s32)param;
#endif
}
#ifndef NOSSE2
_sse2_fix2float_12(mtxTemporal, _fix2float_divizor_mask);
#endif
vector_fix2float<3>(mtxTemporal, 4096.f);
//fill in the unusued matrix values
mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0;
@ -1276,12 +1231,7 @@ void FORCEINLINE gfx3d_glPosTest(u32 v)
PTcoords[2] = float16table[param & 0xFFFF];
PTcoords[3] = 1.0f;
#ifdef NOSSE2
MatrixMultVec4x4 (mtxCurrent[1], PTcoords);
MatrixMultVec4x4 (mtxCurrent[0], PTcoords);
#else
_sse2_MatrixMultVec4x4_M2(mtxCurrent[0], PTcoords);
#endif
MatrixMultVec4x4_M2(mtxCurrent[0], PTcoords);
gxstat &= 0xFFFFFFFE; // cleay busy bit
T1WriteLong(MMU.MMU_MEM[ARMCPU_ARM9][0x40], 0x600, gxstat);
@ -1413,11 +1363,7 @@ void gfx3d_glLoadIdentity()
BOOL gfx3d_glLoadMatrix4x4(s32 v)
{
#ifdef NOSSE2
mtxCurrent[mode][ML4x4ind] = fix2float(v);
#else
mtxCurrent[mode][ML4x4ind] = v;
#endif
++ML4x4ind;
if(ML4x4ind<16) return FALSE;
@ -1425,9 +1371,7 @@ BOOL gfx3d_glLoadMatrix4x4(s32 v)
GFX_DELAY(19);
#ifndef NOSSE2
_sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask);
#endif
vector_fix2float<4>(mtxCurrent[mode], 4096.f);
if (mode == 2)
MatrixCopy (mtxCurrent[1], mtxCurrent[2]);
@ -1436,20 +1380,14 @@ BOOL gfx3d_glLoadMatrix4x4(s32 v)
BOOL gfx3d_glLoadMatrix4x3(s32 v)
{
#ifdef NOSSE2
mtxCurrent[mode][ML4x3ind] = fix2float(v);
#else
mtxCurrent[mode][ML4x3ind] = v;
#endif
ML4x3ind++;
if((ML4x3ind & 0x03) == 3) ML4x3ind++;
if(ML4x3ind<16) return FALSE;
ML4x3ind = 0;
#ifndef NOSSE2
_sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask);
#endif
vector_fix2float<4>(mtxCurrent[mode], 4096.f);
//fill in the unusued matrix values
mtxCurrent[mode][3] = mtxCurrent[mode][7] = mtxCurrent[mode][11] = 0.f;
@ -1464,11 +1402,7 @@ BOOL gfx3d_glLoadMatrix4x3(s32 v)
BOOL gfx3d_glMultMatrix4x4(s32 v)
{
#ifdef NOSSE2
mtxTemporal[MM4x4ind] = fix2float(v);
#else
mtxTemporal[MM4x4ind] = v;
#endif
MM4x4ind++;
if(MM4x4ind<16) return FALSE;
@ -1476,9 +1410,7 @@ BOOL gfx3d_glMultMatrix4x4(s32 v)
GFX_DELAY(35);
#ifndef NOSSE2
_sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask);
#endif
vector_fix2float<4>(mtxTemporal, 4096.f);
MatrixMultiply (mtxCurrent[mode], mtxTemporal);
@ -1494,11 +1426,7 @@ BOOL gfx3d_glMultMatrix4x4(s32 v)
BOOL gfx3d_glMultMatrix4x3(s32 v)
{
#ifdef NOSSE2
mtxTemporal[MM4x3ind] = fix2float(v);
#else
mtxTemporal[MM4x3ind] = v;
#endif
MM4x3ind++;
if((MM4x3ind & 0x03) == 3) MM4x3ind++;
@ -1507,9 +1435,7 @@ BOOL gfx3d_glMultMatrix4x3(s32 v)
GFX_DELAY(31);
#ifndef NOSSE2
_sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask);
#endif
vector_fix2float<4>(mtxTemporal, 4096.f);
//fill in the unusued matrix values
mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0.f;
@ -1530,11 +1456,7 @@ BOOL gfx3d_glMultMatrix4x3(s32 v)
BOOL gfx3d_glMultMatrix3x3(s32 v)
{
#ifdef NOSSE2
mtxTemporal[MM3x3ind] = fix2float(v);
#else
mtxTemporal[MM3x3ind] = v;
#endif
MM3x3ind++;
@ -1544,9 +1466,7 @@ BOOL gfx3d_glMultMatrix3x3(s32 v)
GFX_DELAY(28);
#ifndef NOSSE2
_sse2_fix2float_12(mtxTemporal, _fix2float_divizor_mask);
#endif
vector_fix2float<3>(mtxTemporal, 4096.f);
//fill in the unusued matrix values
mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0;
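
As a quick sanity check on the 4096.f used by the vector_fix2float<4> calls above: the hardware supplies signed fixed-point values with 12 fractional bits (9 for the fix10 variant), so the conversion is just a divide, exactly as in the fix2float macro. A small standalone example with illustrative values:

#include <cstdint>
#include <cstdio>

int main()
{
    // fix2float(v) == ((float)((s32)(v))) / (1 << 12): 12 fractional bits
    int32_t one      = 0x1000;                // 1.0 in the fixed format
    int32_t minusOne = (int32_t)0xFFFFF000;   // -1.0 (two's complement)
    printf("%f %f\n", one / 4096.f, minusOne / 4096.f);  // prints 1.000000 -1.000000
    // fix10_2float(v) divides by (1 << 9) == 512 instead (9 fractional bits)
    return 0;
}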

View File

@ -25,17 +25,10 @@
#include <assert.h>
#include "matrix.h"
extern "C" {
void MatrixInit (float *matrix)
{
memset (matrix, 0, sizeof(float)*16);
matrix[0] = matrix[5] = matrix[10] = matrix[15] = 1.f;
}
#ifdef NOSSE2
void MATRIXFASTCALL MatrixMultVec4x4 (const float *matrix, float *vecPtr)
//-------------------------
//switched SSE functions: implementations for no SSE
#ifndef ENABLE_SSE
void MatrixMultVec4x4 (const float *matrix, float *vecPtr)
{
float x = vecPtr[0];
float y = vecPtr[1];
@ -48,7 +41,8 @@ void MATRIXFASTCALL MatrixMultVec4x4 (const float *matrix, float *vecPtr)
vecPtr[3] = x * matrix[3] + y * matrix[7] + z * matrix[11] + w * matrix[15];
}
void MATRIXFASTCALL MatrixMultVec3x3 (const float *matrix, float *vecPtr)
void MatrixMultVec3x3 (const float *matrix, float *vecPtr)
{
float x = vecPtr[0];
float y = vecPtr[1];
@ -59,7 +53,7 @@ void MATRIXFASTCALL MatrixMultVec3x3 (const float *matrix, float *vecPtr)
vecPtr[2] = x * matrix[2] + y * matrix[6] + z * matrix[10];
}
void MATRIXFASTCALL MatrixMultiply (float *matrix, const float *rightMatrix)
void MatrixMultiply (float *matrix, const float *rightMatrix)
{
float tmpMatrix[16];
@ -86,7 +80,7 @@ void MATRIXFASTCALL MatrixMultiply (float *matrix, const float *rightMatrix)
memcpy (matrix, tmpMatrix, sizeof(float)*16);
}
void MATRIXFASTCALL MatrixTranslate (float *matrix, const float *ptr)
void MatrixTranslate (float *matrix, const float *ptr)
{
matrix[12] += (matrix[0]*ptr[0])+(matrix[4]*ptr[1])+(matrix[ 8]*ptr[2]);
matrix[13] += (matrix[1]*ptr[0])+(matrix[5]*ptr[1])+(matrix[ 9]*ptr[2]);
@ -94,7 +88,7 @@ void MATRIXFASTCALL MatrixTranslate (float *matrix, const float *ptr)
matrix[15] += (matrix[3]*ptr[0])+(matrix[7]*ptr[1])+(matrix[11]*ptr[2]);
}
void MATRIXFASTCALL MatrixScale (float *matrix, const float *ptr)
void MatrixScale (float *matrix, const float *ptr)
{
matrix[0] *= ptr[0];
matrix[1] *= ptr[0];
@ -111,9 +105,16 @@ void MATRIXFASTCALL MatrixScale (float *matrix, const float *ptr)
matrix[10] *= ptr[2];
matrix[11] *= ptr[2];
}
#endif //switched c/asm functions
//-----------------------------------------
void MatrixInit (float *matrix)
{
memset (matrix, 0, sizeof(float)*16);
matrix[0] = matrix[5] = matrix[10] = matrix[15] = 1.f;
}
void MatrixTranspose(float *matrix)
{
float temp;
@ -127,7 +128,7 @@ void MatrixTranspose(float *matrix)
#undef swap
}
void MATRIXFASTCALL MatrixIdentity (float *matrix)
void MatrixIdentity (float *matrix)
{
matrix[1] = matrix[2] = matrix[3] = matrix[4] = 0.0f;
matrix[6] = matrix[7] = matrix[8] = matrix[9] = 0.0f;
@ -135,7 +136,7 @@ void MATRIXFASTCALL MatrixIdentity (float *matrix)
matrix[0] = matrix[5] = matrix[10] = matrix[15] = 1.f;
}
float MATRIXFASTCALL MatrixGetMultipliedIndex (int index, float *matrix, float *rightMatrix)
float MatrixGetMultipliedIndex (int index, float *matrix, float *rightMatrix)
{
int iMod = index%4, iDiv = (index>>2)<<2;
@ -143,12 +144,12 @@ float MATRIXFASTCALL MatrixGetMultipliedIndex (int index, float *matrix, float *
(matrix[iMod+8]*rightMatrix[iDiv+2])+(matrix[iMod+12]*rightMatrix[iDiv+3]);
}
void MATRIXFASTCALL MatrixSet (float *matrix, int x, int y, float value) // TODO
void MatrixSet (float *matrix, int x, int y, float value) // TODO
{
matrix [x+(y<<2)] = value;
}
void MATRIXFASTCALL MatrixCopy (float* matrixDST, const float* matrixSRC)
void MatrixCopy (float* matrixDST, const float* matrixSRC)
{
matrixDST[0] = matrixSRC[0];
matrixDST[1] = matrixSRC[1];
@ -169,7 +170,7 @@ void MATRIXFASTCALL MatrixCopy (float* matrixDST, const float* matrixSRC)
}
int MATRIXFASTCALL MatrixCompare (const float* matrixDST, const float* matrixSRC)
int MatrixCompare (const float* matrixDST, const float* matrixSRC)
{
return memcmp((void*)matrixDST, matrixSRC, sizeof(float)*16);
}
@ -340,5 +341,4 @@ void Vector4Copy(float *dst, const float *src)
dst[3] = src[3];
}
} //extern "C"

View File

@ -1,5 +1,5 @@
/*
Copyright (C) 2006-2007 shash
/* Copyright (C) 2006-2007 shash
Copyright (C) 2009 DeSmuME team
This file is part of DeSmuME
@ -27,17 +27,14 @@
#include "types.h"
#include "mem.h"
#if !defined(NOSSE2) && !defined(SSE2_NOINTRIN)
#define SSE2_INTRIN
#ifdef ENABLE_SSE
#include <xmmintrin.h>
#endif
#ifdef SSE2_INTRIN
#include <xmmintrin.h>
#ifdef ENABLE_SSE2
#include <emmintrin.h>
#endif
extern "C" {
struct MatrixStack
{
MatrixStack(int size);
@ -48,42 +45,15 @@ struct MatrixStack
void MatrixInit (float *matrix);
#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
#define MATRIXFASTCALL __fastcall
#else
#define MATRIXFASTCALL
#endif
//In order to conditionally use these asm optimized functions in visual studio
//without having to make new build types to exclude the assembly files.
//a bit sloppy, but there aint much to it
#ifndef NOSSE2
#define SSE2_FUNC(X) _sse2_##X
#define MatrixMultVec4x4 _sse2_MatrixMultVec4x4
#define MatrixMultVec3x3 _sse2_MatrixMultVec3x3
#define MatrixMultiply _sse2_MatrixMultiply
#define MatrixTranslate _sse2_MatrixTranslate
#define MatrixScale _sse2_MatrixScale
void MATRIXFASTCALL _sse2_fix2float_16 (float* matrix, float* divizor_mask);
void MATRIXFASTCALL _sse2_fix2float_12 (float* matrix, float* divizor_mask);
void MATRIXFASTCALL _sse2_MatrixMultVec4x4_M2 (const float * matrix, float * vecPtr); // mode 2
#else
#define SSE2_FUNC(X) X
#endif
void MATRIXFASTCALL SSE2_FUNC(MatrixMultVec3x3) (const float * matrix, float * vecPtr);
void MATRIXFASTCALL SSE2_FUNC(MatrixMultVec4x4) (const float * matrix, float * vecPtr);
void MATRIXFASTCALL SSE2_FUNC(MatrixMultiply) (float * matrix, const float * rightMatrix);
void MATRIXFASTCALL SSE2_FUNC(MatrixTranslate) (float *matrix, const float *ptr);
void MATRIXFASTCALL SSE2_FUNC(MatrixScale) (float * matrix, const float * ptr);
float MATRIXFASTCALL MatrixGetMultipliedIndex (int index, float *matrix, float *rightMatrix);
void MATRIXFASTCALL MatrixSet (float *matrix, int x, int y, float value);
void MATRIXFASTCALL MatrixCopy (float * matrixDST, const float * matrixSRC);
int MATRIXFASTCALL MatrixCompare (const float * matrixDST, const float * matrixSRC);
void MATRIXFASTCALL MatrixIdentity (float *matrix);
float MatrixGetMultipliedIndex (int index, float *matrix, float *rightMatrix);
void MatrixSet (float *matrix, int x, int y, float value);
void MatrixCopy (float * matrixDST, const float * matrixSRC);
int MatrixCompare (const float * matrixDST, const float * matrixSRC);
void MatrixIdentity (float *matrix);
void MatrixTranspose (float *matrix);
void MatrixStackInit (MatrixStack *stack);
@ -112,27 +82,21 @@ void Vector3Normalize(float *dst);
void Vector4Copy(float *dst, const float *src);
} //extern "C"
//these functions are an unreliable, inaccurate floor.
//it should only be used for positive numbers
//this isnt as fast as it could be if we used a visual c++ intrinsic, but those appear not to be universally available
FORCEINLINE u32 u32floor(float f)
{
#if defined(SSE2_INTRIN)
return (u32)_mm_cvttss_si32(_mm_set_ss(f));
#elif !defined(NOSSE2)
__asm cvttss2si eax, f;
#ifdef ENABLE_SSE2
return (u32)_mm_cvtt_ss2si(_mm_set_ss(f));
#else
return (u32)f;
#endif
}
FORCEINLINE u32 u32floor(double d)
{
#if defined(SSE2_INTRIN)
#ifdef ENABLE_SSE2
return (u32)_mm_cvttsd_si32(_mm_set_sd(d));
#elif !defined(NOSSE2)
__asm cvttsd2si eax, d;
#else
return (u32)d;
#endif
@ -142,66 +106,212 @@ FORCEINLINE u32 u32floor(double d)
//be sure that the results are the same thing as floorf!
FORCEINLINE s32 s32floor(float f)
{
#if defined(SSE2_INTRIN)
#ifdef ENABLE_SSE2
return _mm_cvtss_si32( _mm_add_ss(_mm_set_ss(-0.5f),_mm_add_ss(_mm_set_ss(f), _mm_set_ss(f))) ) >> 1;
#elif !defined(NOSSE2)
static const float c = -0.5f;
__asm
{
movss xmm0, f;
addss xmm0, xmm0;
addss xmm0, c;
cvtss2si eax, xmm0
sar eax, 1
}
#else
return (s32)floorf(f);
#endif
}
//now comes some sse2 functions coded solely with intrinsics.
//let's wait and see how many people this upsets.
//they can always #define SSE2_NOINTRIN in their userconfig.h....
#ifdef SSE2_INTRIN
//switched SSE2 functions
//-------------
#ifdef ENABLE_SSE
template<int NUM>
static FORCEINLINE void memset_u16_le(void* dst, u16 val)
FORCEINLINE void memset_u16_le(void* dst, u16 val)
{
u32 u32val;
//just for the endian safety
T1WriteWord((u8*)&u32val,0,val);
T1WriteWord((u8*)&u32val,2,val);
const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
MACRODO_N(NUM/8,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), temp));
////const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
__m128 temp; temp.m128_i32[0] = u32val;
//MACRODO_N(NUM/8,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), temp));
MACRODO_N(NUM/8,_mm_store_ps1((float*)((u8*)dst+(X)*16), temp));
}
#else
#else //no sse2
template<int NUM>
static FORCEINLINE void memset_u16_le(void* dst, u16 val)
{
for(int i=0;i<NUM;i++)
T1WriteWord((u8*)dst,i<<1,val);
}
#endif
//---------------------------
//switched SSE functions
#ifdef ENABLE_SSE
struct SSE_MATRIX
{
SSE_MATRIX(const float *matrix)
: row0(_mm_load_ps(matrix))
, row1(_mm_load_ps(matrix+4))
, row2(_mm_load_ps(matrix+8))
, row3(_mm_load_ps(matrix+12))
{}
union {
__m128 rows[4];
struct { __m128 row0; __m128 row1; __m128 row2; __m128 row3; };
};
};
FORCEINLINE __m128 _util_MatrixMultVec4x4_(const SSE_MATRIX &mat, __m128 vec)
{
__m128 xmm5 = _mm_shuffle_ps(vec, vec, B8(01010101));
__m128 xmm6 = _mm_shuffle_ps(vec, vec, B8(10101010));
__m128 xmm7 = _mm_shuffle_ps(vec, vec, B8(11111111));
__m128 xmm4 = _mm_shuffle_ps(vec, vec, B8(00000000));
xmm4 = _mm_mul_ps(xmm4,mat.row0);
xmm5 = _mm_mul_ps(xmm5,mat.row1);
xmm6 = _mm_mul_ps(xmm6,mat.row2);
xmm7 = _mm_mul_ps(xmm7,mat.row3);
xmm4 = _mm_add_ps(xmm4,xmm5);
xmm4 = _mm_add_ps(xmm4,xmm6);
xmm4 = _mm_add_ps(xmm4,xmm7);
return xmm4;
}
FORCEINLINE void MatrixMultiply(float * matrix, const float * rightMatrix)
{
//this seems to generate larger code, including many movaps, but maybe it is less harsh on the registers than the
//more hand-tailored approach
__m128 row0 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix));
__m128 row1 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix+4));
__m128 row2 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix+8));
__m128 row3 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix+12));
_mm_store_ps(matrix,row0);
_mm_store_ps(matrix+4,row1);
_mm_store_ps(matrix+8,row2);
_mm_store_ps(matrix+12,row3);
}
FORCEINLINE void MatrixMultVec4x4(const float *matrix, float *vecPtr)
{
_mm_store_ps(vecPtr,_util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(vecPtr)));
}
FORCEINLINE void MatrixMultVec4x4_M2(const float *matrix, float *vecPtr)
{
//there are hardly any gains from merging these manually
MatrixMultVec4x4(matrix+16,vecPtr);
MatrixMultVec4x4(matrix,vecPtr);
}
FORCEINLINE void MatrixMultVec3x3(const float * matrix, float * vecPtr)
{
const __m128 vec = _mm_load_ps(vecPtr);
__m128 xmm5 = _mm_shuffle_ps(vec, vec, B8(01010101));
__m128 xmm6 = _mm_shuffle_ps(vec, vec, B8(10101010));
__m128 xmm4 = _mm_shuffle_ps(vec, vec, B8(00000000));
const SSE_MATRIX mat(matrix);
xmm4 = _mm_mul_ps(xmm4,mat.row0);
xmm5 = _mm_mul_ps(xmm5,mat.row1);
xmm6 = _mm_mul_ps(xmm6,mat.row2);
xmm4 = _mm_add_ps(xmm4,xmm5);
xmm4 = _mm_add_ps(xmm4,xmm6);
_mm_store_ps(vecPtr,xmm4);
}
FORCEINLINE void MatrixTranslate(float *matrix, const float *ptr)
{
__m128 xmm4 = _mm_load_ps(ptr);
__m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101));
__m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010));
xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000));
xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(matrix));
xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(matrix+4));
xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(matrix+8));
xmm4 = _mm_add_ps(xmm4,xmm5);
xmm4 = _mm_add_ps(xmm4,xmm6);
xmm4 = _mm_add_ps(xmm4,_mm_load_ps(matrix+12));
_mm_store_ps(matrix+12,xmm4);
}
FORCEINLINE void MatrixScale(float *matrix, const float *ptr)
{
__m128 xmm4 = _mm_load_ps(ptr);
__m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101));
__m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010));
xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000));
xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(matrix));
xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(matrix+4));
xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(matrix+8));
_mm_store_ps(matrix,xmm4);
_mm_store_ps(matrix+4,xmm5);
_mm_store_ps(matrix+8,xmm6);
}
template<int NUM_ROWS>
FORCEINLINE void vector_fix2float(float* matrix, const float divisor)
{
CTASSERT(NUM_ROWS==3 || NUM_ROWS==4);
const __m128 val = _mm_set_ps1(divisor);
_mm_store_ps(matrix,_mm_div_ps(_mm_load_ps(matrix),val));
_mm_store_ps(matrix+4,_mm_div_ps(_mm_load_ps(matrix+4),val));
_mm_store_ps(matrix+8,_mm_div_ps(_mm_load_ps(matrix+8),val));
if(NUM_ROWS==4)
_mm_store_ps(matrix+12,_mm_div_ps(_mm_load_ps(matrix+12),val));
}
//WARNING: I do not think this is as fast as a memset, for some reason.
//at least in vc2005 with sse enabled. better figure out why before using it
#ifdef SSE2_INTRIN
template<int NUM>
static FORCEINLINE void memset_u8(void* _dst, u8 val)
{
const u8* dst = (u8*)_dst;
u32 u32val = (val<<24)|(val<<16)|(val<<8)|val;
const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
MACRODO_N(NUM/16,_mm_store_si128((__m128i*)(dst+(X)*16), temp));
memset(_dst,val,NUM);
//const u8* dst = (u8*)_dst;
//u32 u32val = (val<<24)|(val<<16)|(val<<8)|val;
//const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
//MACRODO_N(NUM/16,_mm_store_si128((__m128i*)(dst+(X)*16), temp));
}
#else
#else //no sse
void MatrixMultVec4x4 (const float *matrix, float *vecPtr);
void MatrixMultVec3x3(const float * matrix, float * vecPtr);
void MatrixMultiply(float * matrix, const float * rightMatrix);
void MatrixTranslate(float *matrix, const float *ptr);
void MatrixScale(float * matrix, const float * ptr);
FORCEINLINE void MatrixMultVec4x4_M2(const float *matrix, float *vecPtr)
{
//there are hardly any gains from merging these manually
MatrixMultVec4x4(matrix+16,vecPtr);
MatrixMultVec4x4(matrix,vecPtr);
}
template<int NUM_ROWS>
FORCEINLINE void vector_fix2float(float* matrix, const float divisor)
{
for(int i=0;i<NUM_ROWS*4;i++)
matrix[i] /= divisor;
}
template<int NUM>
static FORCEINLINE void memset_u8(void* dst, u8 val)
{
memset(dst,val,NUM);
}
#endif
#endif //switched SSE functions
#endif
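
One note on the s32floor intrinsic path above: cvtss2si uses the current MXCSR rounding mode (round-to-nearest-even by default), so the routine rounds 2*f - 0.5 and then arithmetic-shifts right by one, which matches floorf for the moderate magnitudes the 3D pipeline feeds it. A standalone check under that assumption (hypothetical test harness, needs an SSE-capable build):

#include <xmmintrin.h>
#include <cmath>
#include <cstdio>

static int s32floor_sse(float f)
{
    // round-to-nearest(2*f - 0.5), then arithmetic shift right: equals floor(f)
    return _mm_cvtss_si32(_mm_add_ss(_mm_set_ss(-0.5f),
                          _mm_add_ss(_mm_set_ss(f), _mm_set_ss(f)))) >> 1;
}

int main()
{
    const float tests[] = { -1.3f, -0.5f, 0.0f, 0.5f, 1.5f, 2.0f, 123.999f };
    for (float f : tests)
        printf("%8.3f -> %d (floorf: %d)\n", f, s32floor_sse(f), (int)floorf(f));
    return 0;
}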

View File

@ -1,182 +0,0 @@
;
; Copyright (C) 2006 yopyop
; Copyright (C) 2008 CrazyMax
;
; This file is part of DeSmuME
;
; DeSmuME is free software; you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation; either version 2 of the License, or
; (at your option) any later version.
;
; DeSmuME is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with DeSmuME; if not, write to the Free Software
; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
TITLE matrix_sse2-x64.asm
.code
_sse2_MatrixMultVec4x4 PROC PUBLIC
movaps xmm0, XMMWORD PTR [rcx]
movaps xmm1, XMMWORD PTR [rcx+16]
movaps xmm2, XMMWORD PTR [rcx+32]
movaps xmm3, XMMWORD PTR [rcx+48]
movaps xmm4, XMMWORD PTR [rdx]
movaps xmm5, xmm4
movaps xmm6, xmm4
movaps xmm7, xmm4
shufps xmm4, xmm4, 00000000b
shufps xmm5, xmm5, 01010101b
shufps xmm6, xmm6, 10101010b
shufps xmm7, xmm7, 11111111b
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
mulps xmm7, xmm3
addps xmm4, xmm5
addps xmm4, xmm6
addps xmm4, xmm7
movaps XMMWORD PTR [rdx], xmm4
ret 0
_sse2_MatrixMultVec4x4 ENDP
_sse2_MatrixMultVec3x3 PROC PUBLIC
movaps xmm0, XMMWORD PTR [rcx]
movaps xmm1, XMMWORD PTR [rcx+16]
movaps xmm2, XMMWORD PTR [rcx+32]
movaps xmm4, XMMWORD PTR [rdx]
movaps xmm5, xmm4
movaps xmm6, xmm4
movaps xmm7, xmm4
shufps xmm4, xmm4, 00000000b
shufps xmm5, xmm5, 01010101b
shufps xmm6, xmm6, 10101010b
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
addps xmm4, xmm5
addps xmm4, xmm6
movaps XMMWORD PTR [rdx], xmm4
ret 0
_sse2_MatrixMultVec3x3 ENDP
_sse2_MatrixMultiply PROC PUBLIC
movaps xmm0, XMMWORD PTR [rcx]
movaps xmm1, XMMWORD PTR [rcx+16]
movaps xmm2, XMMWORD PTR [rcx+32]
movaps xmm3, XMMWORD PTR [rcx+48]
movaps xmm4, XMMWORD PTR [rdx] ; r00, r01, r02, r03
movaps xmm8, XMMWORD PTR [rdx+16] ; r04, r05, r06, r07
movaps xmm5,xmm4
movaps xmm6,xmm4
movaps xmm7,xmm4
movaps xmm9,xmm8 ;
movaps xmm10,xmm8
movaps xmm11,xmm8
shufps xmm4,xmm4,00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
shufps xmm8, xmm8, 00000000b ;
shufps xmm9, xmm9, 01010101b
shufps xmm10,xmm10,10101010b
shufps xmm11,xmm11,11111111b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
mulps xmm8, xmm0 ;
mulps xmm9, xmm1
mulps xmm10,xmm2
mulps xmm11,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
addps xmm8,xmm9 ;
addps xmm8,xmm10
addps xmm8,xmm11
movaps XMMWORD PTR [rcx],xmm4
movaps XMMWORD PTR [rcx+16],xmm8
movaps xmm4, XMMWORD PTR [rdx+32] ; r00, r01, r02, r03
movaps xmm8, XMMWORD PTR [rdx+48] ; r04, r05, r06, r07
movaps xmm5,xmm4
movaps xmm6,xmm4
movaps xmm7,xmm4
movaps xmm9,xmm8 ;
movaps xmm10,xmm8
movaps xmm11,xmm8
shufps xmm4,xmm4,00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
shufps xmm8, xmm8, 00000000b ;
shufps xmm9, xmm9, 01010101b
shufps xmm10,xmm10,10101010b
shufps xmm11,xmm11,11111111b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
mulps xmm8, xmm0 ;
mulps xmm9, xmm1
mulps xmm10,xmm2
mulps xmm11,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
addps xmm8,xmm9 ;
addps xmm8,xmm10
addps xmm8,xmm11
movaps XMMWORD PTR [rcx+32],xmm4
movaps XMMWORD PTR [rcx+48],xmm8
ret 0
_sse2_MatrixMultiply ENDP
_sse2_MatrixTranslate PROC PUBLIC
movaps xmm0, XMMWORD PTR [rcx]
movaps xmm1, XMMWORD PTR [rcx+16]
movaps xmm2, XMMWORD PTR [rcx+32]
movaps xmm3, XMMWORD PTR [rcx+48]
movaps xmm4, XMMWORD PTR [rdx]
movaps xmm5, xmm4
movaps xmm6, xmm4
movaps xmm7, xmm4
shufps xmm4, xmm4, 00000000b
shufps xmm5, xmm5, 01010101b
shufps xmm6, xmm6, 10101010b
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
addps xmm4, xmm5
addps xmm4, xmm6
addps xmm4, xmm3
movaps XMMWORD PTR [rcx+48], xmm4
ret 0
_sse2_MatrixTranslate ENDP
_sse2_MatrixScale PROC PUBLIC
movaps xmm0, XMMWORD PTR [rcx]
movaps xmm1, XMMWORD PTR [rcx+16]
movaps xmm2, XMMWORD PTR [rcx+32]
movaps xmm4, XMMWORD PTR [rdx]
movaps xmm5, xmm4
movaps xmm6, xmm4
shufps xmm4, xmm4, 00000000b
shufps xmm5, xmm5, 01010101b
shufps xmm6, xmm6, 10101010b
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
movaps XMMWORD PTR [rcx],xmm4
movaps XMMWORD PTR [rcx+16],xmm5
movaps XMMWORD PTR [rcx+32],xmm6
ret 0
_sse2_MatrixScale ENDP
end

View File

@ -1,214 +0,0 @@
;
; Copyright (C) 2006 yopyop
; Copyright (C) 2008 CrazyMax
;
; This file is part of DeSmuME
;
; DeSmuME is free software; you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation; either version 2 of the License, or
; (at your option) any later version.
;
; DeSmuME is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with DeSmuME; if not, write to the Free Software
; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
TITLE matrix_sse2-x86.asm
.686P
.XMM
.model flat
.code
@_sse2_MatrixMultVec4x4@8 PROC PUBLIC
movaps xmm4, XMMWORD PTR [edx]
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
pshufd xmm7, xmm4, 11111111b
shufps xmm4, xmm4, 00000000b
mulps xmm4, XMMWORD PTR [ecx]
mulps xmm5, XMMWORD PTR [ecx+16]
mulps xmm6, XMMWORD PTR [ecx+32]
mulps xmm7, XMMWORD PTR [ecx+48]
addps xmm4, xmm5
addps xmm4, xmm6
addps xmm4, xmm7
movaps XMMWORD PTR [edx], xmm4
ret 0
@_sse2_MatrixMultVec4x4@8 ENDP
@_sse2_MatrixMultVec4x4_M2@8 PROC PUBLIC
movaps xmm4, XMMWORD PTR [edx]
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
pshufd xmm7, xmm4, 11111111b
shufps xmm4, xmm4, 00000000b
mulps xmm4, XMMWORD PTR [ecx+64]
mulps xmm5, XMMWORD PTR [ecx+80]
mulps xmm6, XMMWORD PTR [ecx+96]
mulps xmm7, XMMWORD PTR [ecx+112]
addps xmm4, xmm5
addps xmm4, xmm6
addps xmm4, xmm7
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
pshufd xmm7, xmm4, 11111111b
shufps xmm4, xmm4, 00000000b
mulps xmm4, XMMWORD PTR [ecx]
mulps xmm5, XMMWORD PTR [ecx+16]
mulps xmm6, XMMWORD PTR [ecx+32]
mulps xmm7, XMMWORD PTR [ecx+48]
addps xmm4, xmm5
addps xmm4, xmm6
addps xmm4, xmm7
movaps XMMWORD PTR [edx], xmm4
ret 0
@_sse2_MatrixMultVec4x4_M2@8 ENDP
@_sse2_MatrixMultVec3x3@8 PROC PUBLIC
movaps xmm4, XMMWORD PTR [edx]
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
shufps xmm4, xmm4, 00000000b
mulps xmm4, XMMWORD PTR [ecx]
mulps xmm5, XMMWORD PTR [ecx+16]
mulps xmm6, XMMWORD PTR [ecx+32]
addps xmm4, xmm5
addps xmm4, xmm6
movaps XMMWORD PTR [edx], xmm4
ret 0
@_sse2_MatrixMultVec3x3@8 ENDP
@_sse2_MatrixMultiply@8 PROC PUBLIC
movaps xmm0, XMMWORD PTR [ecx]
movaps xmm1, XMMWORD PTR [ecx+16]
movaps xmm2, XMMWORD PTR [ecx+32]
movaps xmm3, XMMWORD PTR [ecx+48]
movaps xmm4, XMMWORD PTR [edx] ; r00, r01, r02, r03
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
pshufd xmm7, xmm4, 11111111b
shufps xmm4, xmm4, 00000000b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
movaps XMMWORD PTR [ecx],xmm4
movaps xmm4, XMMWORD PTR [edx+16] ; r04, r05, r06, r07
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
pshufd xmm7, xmm4, 11111111b
shufps xmm4, xmm4, 00000000b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
movaps XMMWORD PTR [ecx+16],xmm4
movaps xmm4, XMMWORD PTR [edx+32] ; r08, r09, r10, r11
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
pshufd xmm7, xmm4, 11111111b
shufps xmm4, xmm4, 00000000b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
movaps XMMWORD PTR [ecx+32],xmm4
movaps xmm4, XMMWORD PTR [edx+48] ; r12, r13, r14, r15
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
pshufd xmm7, xmm4, 11111111b
shufps xmm4, xmm4, 00000000b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
movaps XMMWORD PTR [ecx+48],xmm4
ret 0
@_sse2_MatrixMultiply@8 ENDP
@_sse2_MatrixTranslate@8 PROC PUBLIC
movaps xmm4, XMMWORD PTR [edx]
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
shufps xmm4, xmm4, 00000000b
mulps xmm4, XMMWORD PTR [ecx]
mulps xmm5, XMMWORD PTR [ecx+16]
mulps xmm6, XMMWORD PTR [ecx+32]
addps xmm4, xmm5
addps xmm4, xmm6
addps xmm4, XMMWORD PTR [ecx+48]
movaps XMMWORD PTR [ecx+48], xmm4
ret 0
@_sse2_MatrixTranslate@8 ENDP
@_sse2_MatrixScale@8 PROC PUBLIC
movaps xmm4, XMMWORD PTR [edx]
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
shufps xmm4, xmm4, 00000000b
mulps xmm4, XMMWORD PTR [ecx]
mulps xmm5, XMMWORD PTR [ecx+16]
mulps xmm6, XMMWORD PTR [ecx+32]
movaps XMMWORD PTR [ecx], xmm4
movaps XMMWORD PTR [ecx+16], xmm5
movaps XMMWORD PTR [ecx+32], xmm6
ret 0
@_sse2_MatrixScale@8 ENDP
@_sse2_fix2float_12@8 PROC PUBLIC
movaps xmm0, XMMWORD PTR[ecx]
movaps xmm1, XMMWORD PTR[ecx+16]
movaps xmm2, XMMWORD PTR[ecx+32]
movaps xmm4, XMMWORD PTR [edx]
;prefetchnta [ecx+64]
divps xmm0, xmm4
divps xmm1, xmm4
divps xmm2, xmm4
movaps XMMWORD PTR[ecx], xmm0
movaps XMMWORD PTR[ecx+16],xmm1
movaps XMMWORD PTR[ecx+32],xmm2
ret 0
@_sse2_fix2float_12@8 ENDP
@_sse2_fix2float_16@8 PROC PUBLIC
movaps xmm0, XMMWORD PTR[ecx]
movaps xmm1, XMMWORD PTR[ecx+16]
movaps xmm2, XMMWORD PTR[ecx+32]
movaps xmm3, XMMWORD PTR[ecx+48]
movaps xmm4, XMMWORD PTR [edx]
;prefetchnta [ecx+64]
divps xmm0, xmm4
divps xmm1, xmm4
divps xmm2, xmm4
divps xmm3, xmm4
movaps XMMWORD PTR[ecx], xmm0
movaps XMMWORD PTR[ecx+16],xmm1
movaps XMMWORD PTR[ecx+32],xmm2
movaps XMMWORD PTR[ecx+48],xmm3
ret 0
@_sse2_fix2float_16@8 ENDP
end

View File

@ -1022,12 +1022,10 @@ static void SoftRastConvertFramebuffer()
for(int i=0,y=0;y<192;y++)
{
#ifndef NOSSE2
u8* wanx = (u8*)&src[i];
#define ASS(X,Y) __asm { prefetchnta [wanx+32*0x##X##Y] }
#define PUNK(X) ASS(X,0) ASS(X,1) ASS(X,2) ASS(X,3) ASS(X,4) ASS(X,5) ASS(X,6) ASS(X,7) ASS(X,8) ASS(X,9) ASS(X,A) ASS(X,B) ASS(X,C) ASS(X,D) ASS(X,E) ASS(X,F)
PUNK(0); PUNK(1);
#endif
// u8* wanx = (u8*)&src[i];
// #define ASS(X,Y) __asm { prefetchnta [wanx+32*0x##X##Y] }
// #define PUNK(X) ASS(X,0) ASS(X,1) ASS(X,2) ASS(X,3) ASS(X,4) ASS(X,5) ASS(X,6) ASS(X,7) ASS(X,8) ASS(X,9) ASS(X,A) ASS(X,B) ASS(X,C) ASS(X,D) ASS(X,E) ASS(X,F)
// PUNK(0); PUNK(1);
for(int x=0;x<256;x++,i++)
{

View File

@ -10,11 +10,9 @@ enum TexCache_TexFormat
};
#define MAX_TEXTURE 500
#ifndef NOSSE2
struct ALIGN(16) TextureCache
#else
struct ALIGN(8) TextureCache
#endif
struct CACHE_ALIGN TextureCache
{
u32 id;
u32 frm;
@ -33,7 +31,6 @@ struct ALIGN(8) TextureCache
//set if this texture is suspected be invalid due to a vram reconfigure
bool suspectedInvalid;
};
extern TextureCache *texcache;

View File

@ -26,13 +26,17 @@
#include "config.h"
#endif
#ifndef _MSC_VER
#define NOSSE2
#ifdef _MSC_VER
#define ENABLE_SSE
#define ENABLE_SSE2
#endif
//if theres no sse2, also enforce no intrinsics
#if defined(NOSSE2)
#define SSE2_NOINTRIN
#ifdef NOSSE
#undef ENABLE_SSE
#endif
#ifdef NOSSE2
#undef ENABLE_SSE2
#endif
#ifdef _WIN32
@ -92,20 +96,6 @@
#endif
#endif
//#ifndef _PREFETCH
//#if (defined(_MSC_VER) || defined(__INTEL_COMPILER)) && !defined(NOSSE2)
//#include <xmmintrin.h>
//#include <intrin.h>
//#define _PREFETCH(X) _mm_prefetch((char*)(X),_MM_HINT_T0);
//#define _PREFETCHNTA(X) _mm_prefetch((char*)(X),_MM_HINT_NTA);
//#else
#define _PREFETCH(X) {}
#define _PREFETCHNTA(X) {}
//#endif
//#endif
#if defined(__LP64__)
typedef unsigned char u8;
typedef unsigned short u16;
@ -360,7 +350,45 @@ char (*BLAHBLAHBLAH( UNALIGNED T (&)[N] ))[N];
if((N)&0x001) MACRODO1((N)&(0x100|0x080|0x040|0x020|0x010|0x008|0x004|0x002),TODO); \
}
//---------------------------
//Binary constant generator macro By Tom Torfs - donated to the public domain
//turn a numeric literal into a hex constant
//(avoids problems with leading zeroes)
//8-bit constants max value 0x11111111, always fits in unsigned long
#define HEX__(n) 0x##n##LU
//8-bit conversion function
#define B8__(x) ((x&0x0000000FLU)?1:0) \
+((x&0x000000F0LU)?2:0) \
+((x&0x00000F00LU)?4:0) \
+((x&0x0000F000LU)?8:0) \
+((x&0x000F0000LU)?16:0) \
+((x&0x00F00000LU)?32:0) \
+((x&0x0F000000LU)?64:0) \
+((x&0xF0000000LU)?128:0)
//for upto 8-bit binary constants
#define B8(d) ((unsigned char)B8__(HEX__(d)))
// for upto 16-bit binary constants, MSB first
#define B16(dmsb,dlsb) (((unsigned short)B8(dmsb)<<8) \
+ B8(dlsb))
// for upto 32-bit binary constants, MSB first */
#define B32(dmsb,db2,db3,dlsb) (((unsigned long)B8(dmsb)<<24) \
+ ((unsigned long)B8(db2)<<16) \
+ ((unsigned long)B8(db3)<<8) \
+ B8(dlsb))
//Sample usage:
//B8(01010101) = 85
//B16(10101010,01010101) = 43605
//B32(10000000,11111111,10101010,01010101) = 2164238933
//---------------------------
#ifndef CTASSERT
#define CTASSERT(x) typedef char __assert ## y[(x) ? 1 : -1]
#endif
#endif

View File

@ -41,8 +41,12 @@
#endif
#endif
#ifdef NOSSE2
#define DESMUME_CPUEXT_STRING " NOSSE2"
#ifndef ENABLE_SSE2
#ifndef ENABLE_SSE
#define DESMUME_CPUEXT_STRING " NOSSE"
#else
#define DESMUME_CPUEXT_STRING " NOSSE2"
#endif
#else
#define DESMUME_CPUEXT_STRING ""
#endif

View File

@ -710,42 +710,6 @@
</File>
</Filter>
</Filter>
<Filter
Name="asm"
>
<File
RelativePath="..\matrix_sse2-x64.asm"
>
<FileConfiguration
Name="Debug|Win32"
ExcludedFromBuild="true"
>
<Tool
Name="MASM"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
ExcludedFromBuild="true"
>
<Tool
Name="MASM"
/>
</FileConfiguration>
<FileConfiguration
Name="Release FastBuild|Win32"
ExcludedFromBuild="true"
>
<Tool
Name="MASM"
/>
</FileConfiguration>
</File>
<File
RelativePath="..\matrix_sse2-x86.asm"
>
</File>
</Filter>
<Filter
Name="addons"
>

View File

@ -521,142 +521,6 @@
</File>
</Filter>
</Filter>
<Filter
Name="asm"
>
<File
RelativePath="..\matrix_sse2-x64.asm"
>
<FileConfiguration
Name="Debug|Win32"
ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
Description="Assembling x64..."
CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
Description="Assembling x64..."
CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release FastBuild|Win32"
ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
Description="Assembling x64..."
CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|x64"
ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
Description="Assembling x64..."
CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
/>
</FileConfiguration>
<FileConfiguration
Name="Interim SSE2|x64"
>
<Tool
Name="VCCustomBuildTool"
Description="Assembling x64..."
CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Interim|x64"
ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
Description="Assembling x64..."
CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release (public)|x64"
ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
Description="Assembling x64..."
CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release SSE2 (public)|x64"
>
<Tool
Name="VCCustomBuildTool"
Description="Assembling x64..."
CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
RelativePath="..\matrix_sse2-x86.asm"
>
<FileConfiguration
Name="Debug|x64"
ExcludedFromBuild="true"
>
<Tool
Name="MASM"
/>
</FileConfiguration>
<FileConfiguration
Name="Interim SSE2|x64"
ExcludedFromBuild="true"
>
<Tool
Name="MASM"
/>
</FileConfiguration>
<FileConfiguration
Name="Interim|x64"
ExcludedFromBuild="true"
>
<Tool
Name="MASM"
/>
</FileConfiguration>
<FileConfiguration
Name="Release (public)|x64"
ExcludedFromBuild="true"
>
<Tool
Name="MASM"
/>
</FileConfiguration>
<FileConfiguration
Name="Release SSE2 (public)|x64"
ExcludedFromBuild="true"
>
<Tool
Name="MASM"
/>
</FileConfiguration>
</File>
</Filter>
<Filter
Name="addons"
>

View File

@ -5,10 +5,13 @@
//to customize your build, place a customized copy in the userconfig directory
//(alongside this defaultconfig directory)
//#define NOSSE2 //disables SSE2 optimizations (better change it in the vc++ codegen options too)
//disables SSE and SSE2 optimizations (better change it in the vc++ codegen options too)
//note that you may have to use this if your compiler doesn't support standard SSE intrinsics
//#define NOSSE
//#define NOSSE2
//#define DEVELOPER //enables dev+ features
//#define GDB_STUB //enables the gdb stub. for some reason this is separate from dev+ for now
//#define SSE2_NOINTRIN //indicates that you have a crippled compiler with no sse2 intrinsics (only relevant for SSE2 builds)
#endif //_USERCONFIG_H
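
As the comments above describe, a userconfig.h placed in the userconfig directory overrides these defaults. A minimal hypothetical example that keeps the SSE routines but opts out of the SSE2-only ones:

// userconfig/userconfig.h (hypothetical contents)
// opt out of the SSE2-only code while keeping the SSE matrix routines
#define NOSSE2
// or disable both and fall back to the plain C paths:
//#define NOSSE
//#define NOSSE2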