diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp
index 0bf734cb4..afacfc199 100644
--- a/desmume/src/gfx3d.cpp
+++ b/desmume/src/gfx3d.cpp
@@ -337,8 +337,8 @@ CACHE_ALIGN MatrixStack	mtxStack[4] = {
 
 int _hack_getMatrixStackLevel(int which) { return mtxStack[which].position; }
 
-static CACHE_ALIGN float		mtxCurrent [4][16];
-static CACHE_ALIGN float		mtxTemporal[16];
+static CACHE_ALIGN s32		mtxCurrent [4][16];
+static CACHE_ALIGN s32		mtxTemporal[16];
 static u32 mode = 0;
 
 // Indexes for matrix loading/multiplication
@@ -349,21 +349,21 @@ static u8 MM4x3ind = 0;
 static u8 MM3x3ind = 0;
 
 // Data for vertex submission
-static CACHE_ALIGN u16		u16coord[4] = {0, 0, 0, 0};
+static CACHE_ALIGN s16		s16coord[4] = {0, 0, 0, 0};
 static char		coordind = 0;
 static u32 vtxFormat = 0;
 static BOOL inBegin = FALSE;
 
 // Data for basic transforms
-static CACHE_ALIGN float	trans[4] = {0.0, 0.0, 0.0, 0.0};
+static CACHE_ALIGN s32	trans[4] = {0, 0, 0, 0};
 static int		transind = 0;
-static CACHE_ALIGN float	scale[4] = {0.0, 0.0, 0.0, 0.0};
+static CACHE_ALIGN s32	scale[4] = {0, 0, 0, 0};
 static int		scaleind = 0;
 static u32 viewport = 0;
 
 //various other registers
-static float _t=0, _s=0;
-static float last_t, last_s;
+static s32 _t=0, _s=0;
+static s32 last_t, last_s;
 static u32 clCmd = 0;
 static u32 clInd = 0;
 
@@ -527,7 +527,7 @@ void gfx3d_reset()
 	texturePalette = 0;
 	polyAttrPending = 0;
 	mode = 0;
-	u16coord[0] = u16coord[1] = u16coord[2] = u16coord[3] = 0;
+	s16coord[0] = s16coord[1] = s16coord[2] = s16coord[3] = 0;
 	coordind = 0;
 	vtxFormat = 0;
 	memset(trans, 0, sizeof(trans));
@@ -592,22 +592,25 @@ void gfx3d_reset()
 //Submit a vertex to the GE
 static void SetVertex()
 {
-	float coord[3] = {
-			float16table[u16coord[0]],
-			float16table[u16coord[1]],
-			float16table[u16coord[2]]
+	s32 coord[3] = {
+		s16coord[0],
+		s16coord[1],
+		s16coord[2]
 	};
 
-	ALIGN(16) float coordTransformed[4] = { coord[0], coord[1], coord[2], 1.f };
+	ALIGN(16) s32 coordTransformed[4] = { coord[0], coord[1], coord[2], (1<<12) };
 
 	if (texCoordinateTransform == 3)
 	{
+		//Texcoord from the vertex position, all in integers: coord and the texture matrix each carry
+		//12 fractional bits, so the products carry 24; _s/_t are widened to 64 bits and shifted up by 24
+		//to join that sum, and the final >>24 leaves last_s/last_t in the same units as _s/_t.
-		last_s =((coord[0]*mtxCurrent[3][0] +
-					coord[1]*mtxCurrent[3][4] +
-					coord[2]*mtxCurrent[3][8]) + _s * 16.0f) / 16.0f;
-		last_t =((coord[0]*mtxCurrent[3][1] +
-					coord[1]*mtxCurrent[3][5] +
-					coord[2]*mtxCurrent[3][9]) + _t * 16.0f) / 16.0f;
+		last_s = (s32)(((s64)coord[0]*mtxCurrent[3][0] +
+					(s64)coord[1]*mtxCurrent[3][4] +
+					(s64)coord[2]*mtxCurrent[3][8] + ((s64)_s<<24))>>24);
+		last_t = (s32)(((s64)coord[0]*mtxCurrent[3][1] +
+					(s64)coord[1]*mtxCurrent[3][5] +
+					(s64)coord[2]*mtxCurrent[3][9] + ((s64)_t<<24))>>24);
 	}
 
 	
@@ -624,6 +627,10 @@ static void SetVertex()
 	//when we need to)
 	MatrixMultVec4x4_M2(mtxCurrent[0], coordTransformed);
 
+	//printf("%f %f %f\n",s16coord[0]/4096.0f,s16coord[1]/4096.0f,s16coord[2]/4096.0f);
+	//printf("x %f %f %f %f\n",mtxCurrent[0][0]/4096.0f,mtxCurrent[0][1]/4096.0f,mtxCurrent[0][2]/4096.0f,mtxCurrent[0][3]/4096.0f);
+	//printf(" = %f %f %f %f\n",coordTransformed[0]/4096.0f,coordTransformed[1]/4096.0f,coordTransformed[2]/4096.0f,coordTransformed[3]/4096.0f);
+
 	//TODO - culling should be done here.
 	//TODO - viewport transform?
 
@@ -653,12 +660,12 @@ static void SetVertex()
 	//	//MatrixPrint(mtxCurrent[1]);
 	//}
 
-	vert.texcoord[0] = last_s;
-	vert.texcoord[1] = last_t;
-	vert.coord[0] = coordTransformed[0];
-	vert.coord[1] = coordTransformed[1];
-	vert.coord[2] = coordTransformed[2];
-	vert.coord[3] = coordTransformed[3];
+	vert.texcoord[0] = last_s/16.0f;
+	vert.texcoord[1] = last_t/16.0f;
+	vert.coord[0] = coordTransformed[0]/4096.0f;
+	vert.coord[1] = coordTransformed[1]/4096.0f;
+	vert.coord[2] = coordTransformed[2]/4096.0f;
+	vert.coord[3] = coordTransformed[3]/4096.0f;
 	vert.color[0] = GFX3D_5TO6(colorRGB[0]);
 	vert.color[1] = GFX3D_5TO6(colorRGB[1]);
 	vert.color[2] = GFX3D_5TO6(colorRGB[2]);
@@ -778,7 +785,8 @@ static void gfx3d_glLightDirection_cache(int index)
 	cacheLightDirection[index][3] = 0;
 
 	/* Multiply the vector by the directional matrix */
-	MatrixMultVec3x3(mtxCurrent[2], cacheLightDirection[index]);
+	CACHE_ALIGN float temp[16] = {mtxCurrent[2][0]/4096.0f,mtxCurrent[2][1]/4096.0f,mtxCurrent[2][2]/4096.0f,mtxCurrent[2][3]/4096.0f,mtxCurrent[2][4]/4096.0f,mtxCurrent[2][5]/4096.0f,mtxCurrent[2][6]/4096.0f,mtxCurrent[2][7]/4096.0f,mtxCurrent[2][8]/4096.0f,mtxCurrent[2][9]/4096.0f,mtxCurrent[2][10]/4096.0f,mtxCurrent[2][11]/4096.0f,mtxCurrent[2][12]/4096.0f,mtxCurrent[2][13]/4096.0f,mtxCurrent[2][14]/4096.0f,mtxCurrent[2][15]/4096.0f};
+	MatrixMultVec3x3(temp, cacheLightDirection[index]);
 
 	/* Calculate the half vector */
 	float lineOfSight[4] = {0.0f, 0.0f, -1.0f, 0.0f};
@@ -824,12 +832,12 @@ static void gfx3d_glPopMatrix(s32 i)
 	//this was necessary to fix sims apartment pets
 	//i = (i<<26)>>26;
 
-	MatrixStackPopMatrix((float*)mtxCurrent[mymode], &mtxStack[mymode], i);
+	MatrixStackPopMatrix(mtxCurrent[mymode], &mtxStack[mymode], i);
 
 	GFX_DELAY(36);
 
 	if (mymode == 2)
-		MatrixStackPopMatrix((float*)mtxCurrent[1], &mtxStack[1], i);
+		MatrixStackPopMatrix(mtxCurrent[1], &mtxStack[1], i);
 
 	MMU_new.gxstat.sb = 1; // set busy
 }
@@ -891,7 +899,7 @@ static void gfx3d_glLoadIdentity()
 
 static BOOL gfx3d_glLoadMatrix4x4(s32 v)
 {
-	mtxCurrent[mode][ML4x4ind] = (float)((v<<4)>>4);
+	mtxCurrent[mode][ML4x4ind] = v;
 
 	++ML4x4ind;
 	if(ML4x4ind<16) return FALSE;
@@ -899,7 +907,7 @@ static BOOL gfx3d_glLoadMatrix4x4(s32 v)
 
 	GFX_DELAY(19);
 
-	vector_fix2float<4>(mtxCurrent[mode], 4096.f);
+	//vector_fix2float<4>(mtxCurrent[mode], 4096.f);
 
 	if (mode == 2)
 		MatrixCopy (mtxCurrent[1], mtxCurrent[2]);
@@ -910,18 +918,18 @@ static BOOL gfx3d_glLoadMatrix4x4(s32 v)
 
 static BOOL gfx3d_glLoadMatrix4x3(s32 v)
 {
-	mtxCurrent[mode][ML4x3ind] = (float)((v<<4)>>4);
+	mtxCurrent[mode][ML4x3ind] = v;
 
 	ML4x3ind++;
 	if((ML4x3ind & 0x03) == 3) ML4x3ind++;
 	if(ML4x3ind<16) return FALSE;
 	ML4x3ind = 0;
 
-	vector_fix2float<4>(mtxCurrent[mode], 4096.f);
+	//vector_fix2float<4>(mtxCurrent[mode], 4096.f);
 
 	//fill in the unusued matrix values
-	mtxCurrent[mode][3] = mtxCurrent[mode][7] = mtxCurrent[mode][11] = 0.f;
-	mtxCurrent[mode][15] = 1.f;
+	mtxCurrent[mode][3] = mtxCurrent[mode][7] = mtxCurrent[mode][11] = 0;
+	mtxCurrent[mode][15] = (1<<12);
 
 	GFX_DELAY(30);
 
@@ -933,7 +941,7 @@ static BOOL gfx3d_glLoadMatrix4x3(s32 v)
 
 static BOOL gfx3d_glMultMatrix4x4(s32 v)
 {
-	mtxTemporal[MM4x4ind] = (float)((v<<4)>>4);
+	mtxTemporal[MM4x4ind] = v;
 
 	MM4x4ind++;
 	if(MM4x4ind<16) return FALSE;
@@ -941,7 +949,7 @@ static BOOL gfx3d_glMultMatrix4x4(s32 v)
 
 	GFX_DELAY(35);
 
-	vector_fix2float<4>(mtxTemporal, 4096.f);
+	//vector_fix2float<4>(mtxTemporal, 4096.f);
 
 	MatrixMultiply (mtxCurrent[mode], mtxTemporal);
 
@@ -959,7 +967,7 @@ static BOOL gfx3d_glMultMatrix4x4(s32 v)
 
 static BOOL gfx3d_glMultMatrix4x3(s32 v)
 {
-	mtxTemporal[MM4x3ind] = (float)((v<<4)>>4);
+	mtxTemporal[MM4x3ind] = v;
 
 	MM4x3ind++;
 	if((MM4x3ind & 0x03) == 3) MM4x3ind++;
@@ -968,11 +976,11 @@ static BOOL gfx3d_glMultMatrix4x3(s32 v)
 
 	GFX_DELAY(31);
 
-	vector_fix2float<4>(mtxTemporal, 4096.f);
+	//vector_fix2float<4>(mtxTemporal, 4096.f);
 
 	//fill in the unusued matrix values
-	mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0.f;
-	mtxTemporal[15] = 1.f;
+	mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0;
+	mtxTemporal[15] = 1<<12;
 
 	MatrixMultiply (mtxCurrent[mode], mtxTemporal);
 
@@ -991,7 +999,7 @@ static BOOL gfx3d_glMultMatrix4x3(s32 v)
 
 static BOOL gfx3d_glMultMatrix3x3(s32 v)
 {
-	mtxTemporal[MM3x3ind] = (float)((v<<4)>>4);
+	mtxTemporal[MM3x3ind] = v;
 
 
 	MM3x3ind++;
@@ -1001,11 +1009,11 @@ static BOOL gfx3d_glMultMatrix3x3(s32 v)
 
 	GFX_DELAY(28);
 
-	vector_fix2float<3>(mtxTemporal, 4096.f);
+	//vector_fix2float<3>(mtxTemporal, 4096.f);
 
 	//fill in the unusued matrix values
 	mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0;
-	mtxTemporal[15] = 1;
+	mtxTemporal[15] = 1<<12;
 	mtxTemporal[12] = mtxTemporal[13] = mtxTemporal[14] = 0;
 
 	MatrixMultiply (mtxCurrent[mode], mtxTemporal);
@@ -1026,7 +1034,7 @@ static BOOL gfx3d_glMultMatrix3x3(s32 v)
 
 static BOOL gfx3d_glScale(s32 v)
 {
-	scale[scaleind] = fix2float(v);
+	scale[scaleind] = v;
 
 	++scaleind;
 
@@ -1048,7 +1056,7 @@ static BOOL gfx3d_glScale(s32 v)
 
 static BOOL gfx3d_glTranslate(s32 v)
 {
-	trans[transind] = fix2float(v);
+	trans[transind] = v;
 
 	++transind;
 
@@ -1078,24 +1086,25 @@ static void gfx3d_glColor3b(u32 v)
 	GFX_DELAY(1);
 }
 
-static void gfx3d_glNormal(u32 v)
+static void gfx3d_glNormal(s32 v)
 {
-	int i,c;
-	ALIGN(16) float normal[4] = { normalTable[v&1023],
-						normalTable[(v>>10)&1023],
-						normalTable[(v>>20)&1023],
-						1};
+	s16 nx = ((v<<22)>>22)<<3;
+	s16 ny = ((v<<12)>>22)<<3;
+	s16 nz = ((v<<2)>>22)<<3;
 
 	if (texCoordinateTransform == 2)
 	{
-		last_s =(	(normal[0] *mtxCurrent[3][0] + normal[1] *mtxCurrent[3][4] +
-					 normal[2] *mtxCurrent[3][8]) + (_s*16.0f)) / 16.0f;
-		last_t =(	(normal[0] *mtxCurrent[3][1] + normal[1] *mtxCurrent[3][5] +
-					 normal[2] *mtxCurrent[3][9]) + (_t*16.0f)) / 16.0f;
+		//SM64 highlight rendered star in main menu tests this
+		last_s = (s32)(((s64)nx * mtxCurrent[3][0] + (s64)ny * mtxCurrent[3][4] + (s64)nz * mtxCurrent[3][8] + ((s64)_s<<24))>>24); //_s is widened first: a 32 bit <<24 overflows once |_s| >= 128
+		last_t = (s32)(((s64)nx * mtxCurrent[3][1] + (s64)ny * mtxCurrent[3][5] + (s64)nz * mtxCurrent[3][9] + ((s64)_t<<24))>>24); //same widening for _t
 	}
 
+	CACHE_ALIGN float normal[4] =  { nx/4096.0f, ny/4096.0f, nz/4096.0f, 1.0f };
+
+
 	//use the current normal transform matrix
-	MatrixMultVec3x3 (mtxCurrent[2], normal);
+	CACHE_ALIGN float temp[16] = {mtxCurrent[2][0]/4096.0f,mtxCurrent[2][1]/4096.0f,mtxCurrent[2][2]/4096.0f,mtxCurrent[2][3]/4096.0f,mtxCurrent[2][4]/4096.0f,mtxCurrent[2][5]/4096.0f,mtxCurrent[2][6]/4096.0f,mtxCurrent[2][7]/4096.0f,mtxCurrent[2][8]/4096.0f,mtxCurrent[2][9]/4096.0f,mtxCurrent[2][10]/4096.0f,mtxCurrent[2][11]/4096.0f,mtxCurrent[2][12]/4096.0f,mtxCurrent[2][13]/4096.0f,mtxCurrent[2][14]/4096.0f,mtxCurrent[2][15]/4096.0f};
+	MatrixMultVec3x3 (temp, normal);
 
 	//apply lighting model
 	{
@@ -1121,7 +1130,7 @@ static void gfx3d_glNormal(u32 v)
 
 		int vertexColor[3] = { emission[0], emission[1], emission[2] };
 
-		for(i=0; i<4; i++)
+		for(int i=0; i<4; i++)
 		{
 			if(!((lightMask>>i)&1)) continue;
 
@@ -1154,7 +1163,7 @@ static void gfx3d_glNormal(u32 v)
 				shininessLevel = gfx3d.state.shininessTable[shininessIndex];
 			}
 
-			for(c = 0; c < 3; c++)
+			for(int c = 0; c < 3; c++)
 			{
 				vertexColor[c] += (int)(((specular[c] * _lightColor[c] * shininessLevel)
 					+ (diffuse[c] * _lightColor[c] * diffuseLevel)
@@ -1162,7 +1171,7 @@ static void gfx3d_glNormal(u32 v)
 			}
 		}
 
-		for(c=0;c<3;c++)
+		for(int c=0;c<3;c++)
 			colorRGB[c] = std::min(31,vertexColor[c]);
 	}
 
@@ -1173,22 +1182,18 @@ static void gfx3d_glNormal(u32 v)
 	GFX_DELAY_M2((lightMask>>3) & 0x01);
 }
 
-static void gfx3d_glTexCoord(u32 val)
+static void gfx3d_glTexCoord(s32 val)
 {
-	_t = (s16)(val>>16);
-	_s = (s16)(val&0xFFFF);
-
-	_s /= 16.0f;
-	_t /= 16.0f;
+	_s = ((val<<16)>>16);
+	_t = (val>>16);
 
 	if (texCoordinateTransform == 1)
 	{
-		last_s =_s*mtxCurrent[3][0] + _t*mtxCurrent[3][4] +
-				0.0625f*mtxCurrent[3][8] + 0.0625f*mtxCurrent[3][12];
-		last_t =_s*mtxCurrent[3][1] + _t*mtxCurrent[3][5] +
-				0.0625f*mtxCurrent[3][9] + 0.0625f*mtxCurrent[3][13];
+		//dragon quest 4 overworld will test this
+		last_s = (s32)(((s64)(_s<<12) * mtxCurrent[3][0] + (s64)(_t<<12) * mtxCurrent[3][4] + ((s64)mtxCurrent[3][8]<<12) + ((s64)mtxCurrent[3][12]<<12))>>24);
+		last_t = (s32)(((s64)(_s<<12) * mtxCurrent[3][1] + (s64)(_t<<12) * mtxCurrent[3][5] + ((s64)mtxCurrent[3][9]<<12) + ((s64)mtxCurrent[3][13]<<12))>>24);
 	}
-	else
+	else if(texCoordinateTransform == 0)
 	{
 		last_s=_s;
 		last_t=_t;
@@ -1196,21 +1201,18 @@ static void gfx3d_glTexCoord(u32 val)
 	GFX_DELAY(1);
 }
 
-static BOOL gfx3d_glVertex16b(unsigned int v)
+static BOOL gfx3d_glVertex16b(s32 v)
 {
 	if(coordind==0)
 	{
-		//coord[0]		= float16table[v&0xFFFF];
-		//coord[1]		= float16table[v>>16];
-		u16coord[0] = v&0xFFFF;
-		u16coord[1] = (v>>16)&0xFFFF;
+		s16coord[0] = (v<<16)>>16;
+		s16coord[1] = (v>>16)&0xFFFF;
 
 		++coordind;
 		return FALSE;
 	}
 
-	//coord[2]	  = float16table[v&0xFFFF];
-	u16coord[2] = v&0xFFFF;
+	s16coord[2] = (v<<16)>>16;
 
 	coordind = 0;
 	SetVertex ();
@@ -1219,40 +1221,37 @@ static BOOL gfx3d_glVertex16b(unsigned int v)
 	return TRUE;
 }
 
-static void gfx3d_glVertex10b(u32 v)
+static void gfx3d_glVertex10b(s32 v)
 {
-	//coord[0]		= float10Table[v&1023];
-	//coord[1]		= float10Table[(v>>10)&1023];
-	//coord[2]		= float10Table[(v>>20)&1023];
-	u16coord[0] = (v&1023)<<6;
-	u16coord[1] = ((v>>10)&1023)<<6;
-	u16coord[2] = ((v>>20)&1023)<<6;
+	//TODO TODO TODO - contemplate the sign extension - shift in zeroes or ones? zeroes is certainly more normal..
+	s16coord[0] = ((v<<22)>>22)<<6;
+	s16coord[1] = ((v<<12)>>22)<<6;
+	s16coord[2] = ((v<<2)>>22)<<6;
 
 	GFX_DELAY(8);
 	SetVertex ();
 }
 
-static void gfx3d_glVertex3_cord(unsigned int one, unsigned int two, unsigned int v)
+template<int ONE, int TWO>
+static void gfx3d_glVertex3_cord(s32 v)
 {
-	//coord[one]		= float16table[v&0xffff];
-	//coord[two]		= float16table[v>>16];
-	u16coord[one]		= v&0xffff;
-	u16coord[two]		= (v>>16)&0xFFFF;
+	s16coord[ONE]		= (v<<16)>>16;
+	s16coord[TWO]		= (v>>16);
 
 	SetVertex ();
 
 	GFX_DELAY(8);
 }
 
-static void gfx3d_glVertex_rel(u32 v)
+static void gfx3d_glVertex_rel(s32 v)
 {
-	//coord[0]		+= float10RelTable[v&1023];
-	//coord[1]		+= float10RelTable[(v>>10)&1023];
-	//coord[2]		+= float10RelTable[(v>>20)&1023];
+	s16 x = ((v<<22)>>22);
+	s16 y = ((v<<12)>>22);
+	s16 z = ((v<<2)>>22);
 
-	u16coord[0] += (u16)(((s16)((v&1023)<<6))>>6);
-	u16coord[1] += (u16)(((s16)(((v>>10)&1023)<<6))>>6);
-	u16coord[2] += (u16)(((s16)(((v>>20)&1023)<<6))>>6);
+	s16coord[0] += x;
+	s16coord[1] += y;
+	s16coord[2] += z;
 
 
 	SetVertex ();
@@ -1377,6 +1376,7 @@ static void gfx3d_glViewPort(u32 v)
 
 static BOOL gfx3d_glBoxTest(u32 v)
 {
+	printf("boxtest\n");
 	MMU_new.gxstat.tr = 0;		// clear boxtest bit
 	MMU_new.gxstat.tb = 1;		// busy
 
@@ -1473,11 +1473,10 @@ static BOOL gfx3d_glBoxTest(u32 v)
 	for(int i=0;i<8;i++) {
 		//MatrixMultVec4x4_M2(mtxCurrent[0], verts[i].coord);
 
-		//yuck.. cant use the sse2 accelerated ones because vert.coords is not cache aligned or something
-		//i dunno
-		
-		_NOSSE_MatrixMultVec4x4(mtxCurrent[1],verts[i].coord);
-		_NOSSE_MatrixMultVec4x4(mtxCurrent[0],verts[i].coord);
+		CACHE_ALIGN float temp1[16] = {mtxCurrent[1][0]/4096.0f,mtxCurrent[1][1]/4096.0f,mtxCurrent[1][2]/4096.0f,mtxCurrent[1][3]/4096.0f,mtxCurrent[1][4]/4096.0f,mtxCurrent[1][5]/4096.0f,mtxCurrent[1][6]/4096.0f,mtxCurrent[1][7]/4096.0f,mtxCurrent[1][8]/4096.0f,mtxCurrent[1][9]/4096.0f,mtxCurrent[1][10]/4096.0f,mtxCurrent[1][11]/4096.0f,mtxCurrent[1][12]/4096.0f,mtxCurrent[1][13]/4096.0f,mtxCurrent[1][14]/4096.0f,mtxCurrent[1][15]/4096.0f};
+		CACHE_ALIGN float temp0[16] = {mtxCurrent[0][0]/4096.0f,mtxCurrent[0][1]/4096.0f,mtxCurrent[0][2]/4096.0f,mtxCurrent[0][3]/4096.0f,mtxCurrent[0][4]/4096.0f,mtxCurrent[0][5]/4096.0f,mtxCurrent[0][6]/4096.0f,mtxCurrent[0][7]/4096.0f,mtxCurrent[0][8]/4096.0f,mtxCurrent[0][9]/4096.0f,mtxCurrent[0][10]/4096.0f,mtxCurrent[0][11]/4096.0f,mtxCurrent[0][12]/4096.0f,mtxCurrent[0][13]/4096.0f,mtxCurrent[0][14]/4096.0f,mtxCurrent[0][15]/4096.0f};
+		_NOSSE_MatrixMultVec4x4(temp1,verts[i].coord);
+		_NOSSE_MatrixMultVec4x4(temp0,verts[i].coord);
 	}
 
 	//clip each poly
@@ -1511,6 +1510,7 @@ static BOOL gfx3d_glBoxTest(u32 v)
 
 static BOOL gfx3d_glPosTest(u32 v)
 {
+	printf("postest\n");
 	//this is apparently tested by transformers decepticons and ultimate spiderman
 
 	//printf("POSTEST\n");
@@ -1524,8 +1524,11 @@ static BOOL gfx3d_glPosTest(u32 v)
 	
 	PTcoords[3] = 1.0f;
 
-	MatrixMultVec4x4(mtxCurrent[1], PTcoords);
-	MatrixMultVec4x4(mtxCurrent[0], PTcoords);
+	CACHE_ALIGN float temp1[16] = {mtxCurrent[1][0]/4096.0f,mtxCurrent[1][1]/4096.0f,mtxCurrent[1][2]/4096.0f,mtxCurrent[1][3]/4096.0f,mtxCurrent[1][4]/4096.0f,mtxCurrent[1][5]/4096.0f,mtxCurrent[1][6]/4096.0f,mtxCurrent[1][7]/4096.0f,mtxCurrent[1][8]/4096.0f,mtxCurrent[1][9]/4096.0f,mtxCurrent[1][10]/4096.0f,mtxCurrent[1][11]/4096.0f,mtxCurrent[1][12]/4096.0f,mtxCurrent[1][13]/4096.0f,mtxCurrent[1][14]/4096.0f,mtxCurrent[1][15]/4096.0f};
+	CACHE_ALIGN float temp0[16] = {mtxCurrent[0][0]/4096.0f,mtxCurrent[0][1]/4096.0f,mtxCurrent[0][2]/4096.0f,mtxCurrent[0][3]/4096.0f,mtxCurrent[0][4]/4096.0f,mtxCurrent[0][5]/4096.0f,mtxCurrent[0][6]/4096.0f,mtxCurrent[0][7]/4096.0f,mtxCurrent[0][8]/4096.0f,mtxCurrent[0][9]/4096.0f,mtxCurrent[0][10]/4096.0f,mtxCurrent[0][11]/4096.0f,mtxCurrent[0][12]/4096.0f,mtxCurrent[0][13]/4096.0f,mtxCurrent[0][14]/4096.0f,mtxCurrent[0][15]/4096.0f};
+
+	MatrixMultVec4x4(temp1, PTcoords);
+	MatrixMultVec4x4(temp0, PTcoords);
 
 	MMU_new.gxstat.tb = 0;
 
@@ -1536,6 +1539,7 @@ static BOOL gfx3d_glPosTest(u32 v)
 
 static void gfx3d_glVecTest(u32 v)
 {
+	printf("vectest\n");
 	GFX_DELAY(5);
 
 	//this is tested by phoenix wright in its evidence inspector modelviewer
@@ -1547,7 +1551,8 @@ static void gfx3d_glVecTest(u32 v)
 						normalTable[(v>>20)&1023],
 						0};
 
-	MatrixMultVec4x4(mtxCurrent[2], normal);
+	CACHE_ALIGN float temp[16] = {mtxCurrent[2][0]/4096.0f,mtxCurrent[2][1]/4096.0f,mtxCurrent[2][2]/4096.0f,mtxCurrent[2][3]/4096.0f,mtxCurrent[2][4]/4096.0f,mtxCurrent[2][5]/4096.0f,mtxCurrent[2][6]/4096.0f,mtxCurrent[2][7]/4096.0f,mtxCurrent[2][8]/4096.0f,mtxCurrent[2][9]/4096.0f,mtxCurrent[2][10]/4096.0f,mtxCurrent[2][11]/4096.0f,mtxCurrent[2][12]/4096.0f,mtxCurrent[2][13]/4096.0f,mtxCurrent[2][14]/4096.0f,mtxCurrent[2][15]/4096.0f};
+	MatrixMultVec4x4(temp, normal);
 
 	s16 x = (s16)(normal[0]*4096);
 	s16 y = (s16)(normal[1]*4096);
@@ -1620,9 +1625,9 @@ void gfx3d_UpdateToonTable(u8 offset, u32 val)
 
 s32 gfx3d_GetClipMatrix (unsigned int index)
 {
-	float val = MatrixGetMultipliedIndex (index, mtxCurrent[0], mtxCurrent[1]);
+	s32 val = MatrixGetMultipliedIndex (index, mtxCurrent[0], mtxCurrent[1]);
 
-	val *= (1<<12);
+	//val *= (1<<12);
 
 	return (s32)val;
 }
@@ -1631,7 +1636,8 @@ s32 gfx3d_GetDirectionalMatrix (unsigned int index)
 {
 	int _index = (((index / 3) * 4) + (index % 3));
 
-	return (s32)(mtxCurrent[2][_index]*(1<<12));
+	//return (s32)(mtxCurrent[2][_index]*(1<<12));
+	return mtxCurrent[2][_index];
 }
 
 void gfx3d_glAlphaFunc(u32 v)
@@ -1834,13 +1840,13 @@ static void gfx3d_execute(u8 cmd, u32 param)
 			gfx3d_glVertex10b(param);
 		break;
 		case 0x25:		// VTX_XY - Set Vertex XY Coordinates (W)
-			gfx3d_glVertex3_cord(0, 1, param);
+			gfx3d_glVertex3_cord<0,1>(param);
 		break;
 		case 0x26:		// VTX_XZ - Set Vertex XZ Coordinates (W)
-			gfx3d_glVertex3_cord(0, 2, param);
+			gfx3d_glVertex3_cord<0,2>(param);
 		break;
 		case 0x27:		// VTX_YZ - Set Vertex YZ Coordinates (W)
-			gfx3d_glVertex3_cord(1, 2, param);
+			gfx3d_glVertex3_cord<1,2>(param);
 		break;
 		case 0x28:		// VTX_DIFF - Set Relative Vertex Coordinates (W)
 			gfx3d_glVertex_rel(param);
@@ -2205,13 +2211,13 @@ void gfx3d_Control(u32 v)
 //other misc stuff
 void gfx3d_glGetMatrix(unsigned int m_mode, int index, float* dest)
 {
-	if(index == -1)
-	{
-		MatrixCopy(dest, mtxCurrent[m_mode]);
-		return;
-	}
+	//The matrices are stored as s32 fixed point with 12 fractional bits, so the old float MatrixCopy
+	//no longer applies; each entry is converted to float instead of leaving dest unwritten.
+	//index == -1 selects the current matrix for m_mode, any other index selects that stack slot.
+	const s32 *src = (index == -1) ? mtxCurrent[m_mode]
+		: MatrixStackGetPos(&mtxStack[m_mode], index);
 
-	MatrixCopy(dest, MatrixStackGetPos(&mtxStack[m_mode], index));
+	for(int i=0;i<16;i++) dest[i] = src[i]/4096.0f; //4096 == 1<<12, the fixed point value of 1.0
 }
 
 void gfx3d_glGetLightDirection(unsigned int index, unsigned int* dest)
@@ -2267,7 +2273,7 @@ SFORMAT SF_GFX3D[]={
 	{ "MM4I", 1, 1, &MM4x4ind},
 	{ "MM3I", 1, 1, &MM4x3ind},
 	{ "MMxI", 1, 1, &MM3x3ind},
-	{ "GSCO", 4, 1, u16coord},
+	{ "GSCO", 4, 1, s16coord},
 	{ "GCOI", 1, 1, &coordind},
 	{ "GVFM", 4, 1, &vtxFormat},
 	{ "GTRN", 4, 4, trans},
diff --git a/desmume/src/matrix.cpp b/desmume/src/matrix.cpp
index 53b0d1543..224e9b0e8 100644
--- a/desmume/src/matrix.cpp
+++ b/desmume/src/matrix.cpp
@@ -39,6 +39,19 @@ void _NOSSE_MatrixMultVec4x4 (const float *matrix, float *vecPtr)
 	vecPtr[3] = x * matrix[3] + y * matrix[7] + z * matrix[11] + w * matrix[15];
 }
 
+void MatrixMultVec4x4 (const s32 *matrix, s32 *vecPtr)
+{
+	//snapshot the input in 64 bits: vecPtr is overwritten below and the products need the extra width
+	const s64 in[4] = { vecPtr[0], vecPtr[1], vecPtr[2], vecPtr[3] };
+
+	//output component r combines matrix[r], matrix[r+4], matrix[r+8] and matrix[r+12] with the input
+	for(int r=0;r<4;r++)
+	{
+		const s64 acc = in[0]*matrix[r] + in[1]*matrix[r+4] + in[2]*matrix[r+8] + in[3]*matrix[r+12];
+		vecPtr[r] = (s32)(acc>>12); //drop the 12 extra fractional bits the multiply added
+	}
+}
+
 
 //-------------------------
 //switched SSE functions: implementations for no SSE
@@ -116,6 +129,12 @@ void MatrixScale (float *matrix, const float *ptr)
 #endif //switched c/asm functions
 //-----------------------------------------
 
+void MatrixInit  (s32 *matrix) //resets a 16 entry fixed point matrix to the identity
+{
+	memset (matrix, 0, sizeof(s32)*16); //sized by the s32 element type this overload actually receives
+	matrix[0] = matrix[5] = matrix[10] = matrix[15] = 1<<12; //1<<12 is 1.0 with 12 fractional bits
+}
+
 void MatrixInit  (float *matrix)
 {
 	memset (matrix, 0, sizeof(float)*16);
@@ -135,6 +154,13 @@ void MatrixTranspose(float *matrix)
 #undef swap
 }
 
+//Overwrites all 16 entries of a fixed point matrix with the identity.
+void	MatrixIdentity			(s32 *matrix)
+{
+	//(1<<12) is 1.0 with 12 fractional bits; it goes on entries 0, 5, 10 and 15, zero everywhere else
+	for(int i=0;i<16;i++)
+		matrix[i] = (i%5 == 0) ? (1<<12) : 0;
+}
 void MatrixIdentity	(float *matrix)
 {
 	matrix[1] = matrix[2] = matrix[3] = matrix[4] = 0.0f;
@@ -151,6 +177,16 @@ float MatrixGetMultipliedIndex (int index, float *matrix, float *rightMatrix)
 			(matrix[iMod+8]*rightMatrix[iDiv+2])+(matrix[iMod+12]*rightMatrix[iDiv+3]);
 }
 
+//Returns entry 'index' of the product of the two fixed point matrices, shifted back to 12 fractional bits.
+s32 MatrixGetMultipliedIndex (int index, s32 *matrix, s32 *rightMatrix)
+{
+	const int lane = index%4;        //selects matrix[lane], [lane+4], [lane+8], [lane+12]
+	const int base = (index>>2)<<2;  //index with its low two bits cleared; selects rightMatrix[base..base+3]
+	s64 acc = 0;                     //64 bit accumulator for the four s32 x s32 products
+	for(int k=0;k<4;k++) acc += (s64)matrix[lane+k*4]*rightMatrix[base+k];
+	return (s32)(acc>>12);
+}
+
 void MatrixSet (float *matrix, int x, int y, float value)	// TODO
 {
 	matrix [x+(y<<2)] = value;
@@ -177,6 +213,27 @@ void MatrixCopy (float* matrixDST, const float* matrixSRC)
 
 }
 
+void MatrixCopy (s32* matrixDST, const s32* matrixSRC) //copies all 16 s32 entries of matrixSRC into matrixDST
+{
+	matrixDST[0] = matrixSRC[0];
+	matrixDST[1] = matrixSRC[1];
+	matrixDST[2] = matrixSRC[2];
+	matrixDST[3] = matrixSRC[3];
+	matrixDST[4] = matrixSRC[4];
+	matrixDST[5] = matrixSRC[5];
+	matrixDST[6] = matrixSRC[6];
+	matrixDST[7] = matrixSRC[7];
+	matrixDST[8] = matrixSRC[8];
+	matrixDST[9] = matrixSRC[9];
+	matrixDST[10] = matrixSRC[10];
+	matrixDST[11] = matrixSRC[11];
+	matrixDST[12] = matrixSRC[12];
+	matrixDST[13] = matrixSRC[13];
+	matrixDST[14] = matrixSRC[14];
+	matrixDST[15] = matrixSRC[15];
+
+}
+
 int MatrixCompare (const float* matrixDST, const float* matrixSRC)
 {
 	return memcmp((void*)matrixDST, matrixSRC, sizeof(float)*16);
@@ -200,7 +257,7 @@ void MatrixStackSetMaxSize (MatrixStack *stack, int size)
 	if (stack->matrix != NULL) {
 		free (stack->matrix);
 	}
-	stack->matrix = (float*) malloc (stack->size*16*sizeof(float));
+	stack->matrix = (s32*) malloc (stack->size*16*sizeof(s32)); //malloc pairs with the free() above; 16 s32 entries per stack slot
 
 	for (i = 0; i < stack->size; i++)
 	{
@@ -226,7 +283,7 @@ static void MatrixStackSetStackPosition (MatrixStack *stack, int pos)
 	stack->position &= stack->size;
 }
 
-void MatrixStackPushMatrix (MatrixStack *stack, const float *ptr)
+void MatrixStackPushMatrix (MatrixStack *stack, const s32 *ptr)
 {
 	//printf("Push %i pos %i\n", stack->type, stack->position);
 	if ((stack->type == 0) || (stack->type == 3))
@@ -236,7 +293,7 @@ void MatrixStackPushMatrix (MatrixStack *stack, const float *ptr)
 	MatrixStackSetStackPosition (stack, 1);
 }
 
-void MatrixStackPopMatrix (float *mtxCurr, MatrixStack *stack, int size)
+void MatrixStackPopMatrix (s32 *mtxCurr, MatrixStack *stack, int size)
 {
 	//printf("Pop %i pos %i (change %d)\n", stack->type, stack->position, -size);
 	MatrixStackSetStackPosition(stack, -size);
@@ -246,18 +303,18 @@ void MatrixStackPopMatrix (float *mtxCurr, MatrixStack *stack, int size)
 		MatrixCopy (mtxCurr, &stack->matrix[stack->position*16]);
 }
 
-float * MatrixStackGetPos (MatrixStack *stack, int pos)
+s32 * MatrixStackGetPos (MatrixStack *stack, int pos)
 {
 	assert(pos<31);
 	return &stack->matrix[pos*16];
 }
 
-float * MatrixStackGet (MatrixStack *stack)
+s32 * MatrixStackGet (MatrixStack *stack)
 {
 	return &stack->matrix[stack->position*16];
 }
 
-void MatrixStackLoadMatrix (MatrixStack *stack, int pos, const float *ptr)
+void MatrixStackLoadMatrix (MatrixStack *stack, int pos, const s32 *ptr)
 {
 	assert(pos<31);
 	MatrixCopy (&stack->matrix[pos*16], ptr);
@@ -355,3 +412,54 @@ void Vector4Copy(float *dst, const float *src)
 }
 
 
+void MatrixMultiply (s32 *matrix, const s32 *rightMatrix) //in place: matrix becomes the fixed point product of matrix and rightMatrix
+{
+	s64 tmpMatrix[16]; //64 bit results, so matrix can still be read while every entry is computed
+
+	//tmpMatrix[c*4+r] = sum over k of matrix[k*4+r]*rightMatrix[c*4+k]; each operand pair is widened to s64
+	tmpMatrix[0]  = (matrix[0]*(s64)rightMatrix[0])+(matrix[4]*(s64)rightMatrix[1])+(matrix[8]*(s64)rightMatrix[2])+(matrix[12]*(s64)rightMatrix[3]);
+	tmpMatrix[1]  = (matrix[1]*(s64)rightMatrix[0])+(matrix[5]*(s64)rightMatrix[1])+(matrix[9]*(s64)rightMatrix[2])+(matrix[13]*(s64)rightMatrix[3]);
+	tmpMatrix[2]  = (matrix[2]*(s64)rightMatrix[0])+(matrix[6]*(s64)rightMatrix[1])+(matrix[10]*(s64)rightMatrix[2])+(matrix[14]*(s64)rightMatrix[3]);
+	tmpMatrix[3]  = (matrix[3]*(s64)rightMatrix[0])+(matrix[7]*(s64)rightMatrix[1])+(matrix[11]*(s64)rightMatrix[2])+(matrix[15]*(s64)rightMatrix[3]);
+
+	tmpMatrix[4]  = (matrix[0]*(s64)rightMatrix[4])+(matrix[4]*(s64)rightMatrix[5])+(matrix[8]*(s64)rightMatrix[6])+(matrix[12]*(s64)rightMatrix[7]);
+	tmpMatrix[5]  = (matrix[1]*(s64)rightMatrix[4])+(matrix[5]*(s64)rightMatrix[5])+(matrix[9]*(s64)rightMatrix[6])+(matrix[13]*(s64)rightMatrix[7]);
+	tmpMatrix[6]  = (matrix[2]*(s64)rightMatrix[4])+(matrix[6]*(s64)rightMatrix[5])+(matrix[10]*(s64)rightMatrix[6])+(matrix[14]*(s64)rightMatrix[7]);
+	tmpMatrix[7]  = (matrix[3]*(s64)rightMatrix[4])+(matrix[7]*(s64)rightMatrix[5])+(matrix[11]*(s64)rightMatrix[6])+(matrix[15]*(s64)rightMatrix[7]);
+
+	tmpMatrix[8]  = (matrix[0]*(s64)rightMatrix[8])+(matrix[4]*(s64)rightMatrix[9])+(matrix[8]*(s64)rightMatrix[10])+(matrix[12]*(s64)rightMatrix[11]);
+	tmpMatrix[9]  = (matrix[1]*(s64)rightMatrix[8])+(matrix[5]*(s64)rightMatrix[9])+(matrix[9]*(s64)rightMatrix[10])+(matrix[13]*(s64)rightMatrix[11]);
+	tmpMatrix[10] = (matrix[2]*(s64)rightMatrix[8])+(matrix[6]*(s64)rightMatrix[9])+(matrix[10]*(s64)rightMatrix[10])+(matrix[14]*(s64)rightMatrix[11]);
+	tmpMatrix[11] = (matrix[3]*(s64)rightMatrix[8])+(matrix[7]*(s64)rightMatrix[9])+(matrix[11]*(s64)rightMatrix[10])+(matrix[15]*(s64)rightMatrix[11]);
+
+	tmpMatrix[12] = (matrix[0]*(s64)rightMatrix[12])+(matrix[4]*(s64)rightMatrix[13])+(matrix[8]*(s64)rightMatrix[14])+(matrix[12]*(s64)rightMatrix[15]);
+	tmpMatrix[13] = (matrix[1]*(s64)rightMatrix[12])+(matrix[5]*(s64)rightMatrix[13])+(matrix[9]*(s64)rightMatrix[14])+(matrix[13]*(s64)rightMatrix[15]);
+	tmpMatrix[14] = (matrix[2]*(s64)rightMatrix[12])+(matrix[6]*(s64)rightMatrix[13])+(matrix[10]*(s64)rightMatrix[14])+(matrix[14]*(s64)rightMatrix[15]);
+	tmpMatrix[15] = (matrix[3]*(s64)rightMatrix[12])+(matrix[7]*(s64)rightMatrix[13])+(matrix[11]*(s64)rightMatrix[14])+(matrix[15]*(s64)rightMatrix[15]);
+
+	for(int i=0;i<16;i++) //write back, dropping the 12 extra fractional bits the multiplies added
+		matrix[i] = (s32)(tmpMatrix[i]>>12);
+}
+
+void MatrixScale(s32 *matrix, const s32 *ptr)
+{
+	int idx = 0; //walks entries 0..11: ptr[k] scales entries 4k..4k+3, entries 12..15 are left untouched
+	while(idx<12) { matrix[idx] = (s32)(((s64)matrix[idx]*ptr[idx>>2])>>12); idx++; }
+}
+
+void MatrixTranslate(s32 *matrix, const s32 *ptr)
+{
+	//only entries 12..15 change: each gains ptr[0..2] weighted by entries row, row+4 and row+8
+	for(int row=0;row<4;row++)
+	{
+		//the existing entry is shifted up by 12 first so it lines up with the 24 fractional bit products
+		const s64 acc = (((s64)matrix[row+12])<<12)
+			+ (s64)matrix[row]*ptr[0] + (s64)matrix[row+4]*ptr[1] + (s64)matrix[row+8]*ptr[2];
+		matrix[row+12] = (s32)(acc>>12);
+	}
+}
+
+void MatrixMultVec4x4_M2(const s32 *matrix, s32 *vecPtr) //transforms vecPtr in place by two matrices stored back to back
+{
+	MatrixMultVec4x4(matrix+16,vecPtr); //the matrix 16 entries past 'matrix' is applied first (SetVertex passes mtxCurrent[0], so this is mtxCurrent[1])
+	MatrixMultVec4x4(matrix,vecPtr); //then 'matrix' itself is applied to that result
+}
diff --git a/desmume/src/matrix.h b/desmume/src/matrix.h
index 9394f625a..bed198a6a 100644
--- a/desmume/src/matrix.h
+++ b/desmume/src/matrix.h
@@ -38,32 +38,36 @@
 struct MatrixStack
 {
 	MatrixStack(int size, int type);
-	float	*matrix;
+	s32		*matrix;
 	s32		position;
 	s32		size;
 	u8		type;
 };
 
 void	MatrixInit				(float *matrix);
+void	MatrixInit				(s32 *matrix);
 
 //In order to conditionally use these asm optimized functions in visual studio
 //without having to make new build types to exclude the assembly files.
 //a bit sloppy, but there aint much to it
 
 float	MatrixGetMultipliedIndex	(int index, float *matrix, float *rightMatrix);
+s32	MatrixGetMultipliedIndex	(int index, s32 *matrix, s32 *rightMatrix);
 void	MatrixSet				(float *matrix, int x, int y, float value);
 void	MatrixCopy				(float * matrixDST, const float * matrixSRC);
+void	MatrixCopy				(s32 * matrixDST, const s32 * matrixSRC);
 int		MatrixCompare				(const float * matrixDST, const float * matrixSRC);
 void	MatrixIdentity			(float *matrix);
+void	MatrixIdentity			(s32 *matrix);
 
 void	MatrixTranspose				(float *matrix);
 void	MatrixStackInit				(MatrixStack *stack);
 void	MatrixStackSetMaxSize		(MatrixStack *stack, int size);
-void	MatrixStackPushMatrix		(MatrixStack *stack, const float *ptr);
-void	MatrixStackPopMatrix		(float *mtxCurr, MatrixStack *stack, int size);
-float*	MatrixStackGetPos			(MatrixStack *stack, int pos);
-float*	MatrixStackGet				(MatrixStack *stack);
-void	MatrixStackLoadMatrix		(MatrixStack *stack, int pos, const float *ptr);
+void	MatrixStackPushMatrix		(MatrixStack *stack, const s32 *ptr);
+void	MatrixStackPopMatrix		(s32 *mtxCurr, MatrixStack *stack, int size);
+s32*	MatrixStackGetPos			(MatrixStack *stack, int pos);
+s32*	MatrixStackGet				(MatrixStack *stack);
+void	MatrixStackLoadMatrix		(MatrixStack *stack, int pos, const s32 *ptr);
 
 void Vector2Copy(float *dst, const float *src);
 void Vector2Add(float *dst, const float *src);
@@ -330,5 +334,12 @@ static FORCEINLINE void memset_u8(void* dst, u8 val)
 
 #endif //switched SSE functions
 
+void MatrixMultVec4x4 (const s32 *matrix, s32 *vecPtr);
 
+void MatrixMultVec4x4_M2(const s32 *matrix, s32 *vecPtr);
+
+void MatrixMultiply(s32* matrix, const s32* rightMatrix);
+void MatrixScale(s32 *matrix, const s32 *ptr);
+void MatrixTranslate(s32 *matrix, const s32 *ptr);
 #endif
+
diff --git a/desmume/src/windows/matrixView.cpp b/desmume/src/windows/matrixView.cpp
index b8c442d80..6bfcccd3e 100644
--- a/desmume/src/windows/matrixView.cpp
+++ b/desmume/src/windows/matrixView.cpp
@@ -62,7 +62,7 @@ void MatrixView_OnPaintPositionMatrix(HWND hwnd)
 
 	stackIndex = SendMessage(hStackCombo, CB_GETCURSEL, 0, 0) - 1;
 
-	gfx3d_glGetMatrix(1, stackIndex, matrix);
+	//gfx3d_glGetMatrix(1, stackIndex, matrix);
 	MatrixView_SetMatrix(hwnd, idcGroup, matrix);
 }
 
@@ -85,7 +85,7 @@ void MatrixView_OnPaintDirectionMatrix(HWND hwnd)
 
 	stackIndex = SendMessage(hStackCombo, CB_GETCURSEL, 0, 0) - 1;
 
-	gfx3d_glGetMatrix(2, stackIndex, matrix);
+	//gfx3d_glGetMatrix(2, stackIndex, matrix);
 	MatrixView_SetMatrix(hwnd, idcGroup, matrix);
 }
 
@@ -104,7 +104,7 @@ void MatrixView_OnPaintProjectionMatrix(HWND hwnd)
 
 	float mat[16];
 
-	gfx3d_glGetMatrix(0, -1, mat);
+	//gfx3d_glGetMatrix(0, -1, mat);
 	MatrixView_SetMatrix(hwnd, idcGroup, mat);
 }
 
@@ -123,7 +123,7 @@ void MatrixView_OnPaintTextureMatrix(HWND hwnd)
 
 	float mat[16];
 
-	gfx3d_glGetMatrix(3, -1, mat);
+	//gfx3d_glGetMatrix(3, -1, mat);
 	MatrixView_SetMatrix(hwnd, idcGroup, mat);
 }