Glide64: Update to make asm closer to original code

2013-04-20 12:14:46 +10:00 · 2013-04-20 12:14:46 +10:00 · 8f16f58311
parent 047eab9b70
commit 8f16f58311
17 changed files with 675 additions and 637 deletions
--- a/Source/Glide64/3dmath.cpp
+++ b/Source/Glide64/3dmath.cpp
@ -203,400 +203,17 @@ TRANSFORMVECTOR InverseTransformVector = InverseTransformVectorC;
 DOTPRODUCT DotProduct = DotProductC;
 NORMALIZEVECTOR NormalizeVector = NormalizeVectorC;

-void  TransformVectorSSE(float *src, float *dst, float mat[4][4])
-{
-	__asm
-	{
-		mov       ecx,[src]
-		mov       eax,[dst]
-		mov       edx,[mat]
-
-		movss     xmm0,[ecx]    ; 0 0 0 src[0]
-		movss     xmm5,[edx]    ; 0 0 0 mat[0][0]
-		movhps    xmm5,[edx+4]  ; mat[0][2] mat[0][1] 0 mat[0][0]
-		shufps    xmm0,xmm0, 0  ; src[0] src[0] src[0] src[0]
-		movss     xmm1,[ecx+4]  ; 0 0 0 src[1]
-		movss     xmm3,[edx+16] ; 0 0 0 mat[1][0]
-		movhps    xmm3,[edx+20] ; mat[1][2] mat[1][1] 0 mat[1][0]
-		shufps    xmm1,xmm1, 0  ; src[1] src[1] src[1] src[1]
-		mulps     xmm0,xmm5     ; mat[0][2]*src[0] mat[0][1]*src[0] 0 mat[0][0]*src[0]
-		mulps     xmm1,xmm3     ; mat[1][2]*src[1] mat[1][1]*src[1] 0 mat[1][0]*src[1]
-		movss     xmm2,[ecx+8]  ; 0 0 0 src[2]
-		shufps    xmm2,xmm2, 0  ; src[2] src[2] src[2] src[2]
-		movss     xmm4,[edx+32] ; 0 0 0 mat[2][0]
-		movhps    xmm4,[edx+36] ; mat[2][2] mat[2][1] 0 mat[2][0]
-		addps     xmm0,xmm1     ; mat[0][2]*src[0]+mat[1][2]*src[1] mat[0][1]*src[0]+mat[1][1]*src[1] 0 mat[0][0]*src[0]+mat[1][0]*src[1]
-		mulps     xmm2,xmm4     ; mat[2][2]*src[2] mat[2][1]*src[2] 0 mat[2][0]*src[2]
-		addps     xmm0,xmm2     ; mat[0][2]*src[0]+mat[1][2]*src[1]+mat[2][2]*src[2] mat[0][1]*src[0]+mat[1][1]*src[1]+mat[2][1]*src[2] 0 mat[0][0]*src[0]+mat[1][0]*src[1]+mat[2][0]*src[2]
-		movss     [eax],xmm0    ; mat[0][0]*src[0]+mat[1][0]*src[1]+mat[2][0]*src[2]
-		movhps    [eax+4],xmm0  ; mat[0][2]*src[0]+mat[1][2]*src[1]+mat[2][2]*src[2] mat[0][1]*src[0]+mat[1][1]*src[1]+mat[2][1]*src[2]
-	}
-}
-
-void TransformVector3DNOW(float *src, float *dst, float mat[4][4])
-{
-	_asm {
-		femms
-		mov         ecx,[src]
-		mov         eax,[dst]
-		mov         edx,[mat]
-		movq        mm0,[ecx]     ; src[1] src[0]
-		movd        mm2,[ecx+8]   ; 0 src[2]
-		movq        mm1,mm0       ; src[1] src[0]
-		punpckldq   mm0,mm0       ; src[0] src[0]
-		punpckhdq   mm1,mm1       ; src[1] src[1]
-		punpckldq   mm2,mm2       ; src[2] src[2]
-		movq        mm3,mm0       ; src[0] src[0]
-		movq        mm4,mm1       ; src[1] src[1]
-		movq        mm5,mm2       ; src[2] src[2]
-		pfmul       mm0,[edx]     ; src[0]*mat[0][1] src[0]*mat[0][0]
-		pfmul       mm3,[edx+8]   ; 0 src[0]*mat[0][2]
-		pfmul       mm1,[edx+16]  ; src[1]*mat[1][1] src[1]*mat[1][0]
-		pfmul       mm4,[edx+24]  ; 0 src[1]*mat[1][2]
-		pfmul       mm2,[edx+32]  ; src[2]*mat[2][1] src[2]*mat[2][0]
-		pfmul       mm5,[edx+40]  ; 0 src[2]*mat[2][2]
-		pfadd       mm0,mm1       ; src[0]*mat[0][1]+src[1]*mat[1][1] src[0]*mat[0][0]+src[1]*mat[1][0]
-		pfadd       mm3,mm4       ; 0 src[0]*mat[0][2]+src[1]*mat[1][2]
-		pfadd       mm0,mm2       ; src[0]*mat[0][1]+src[1]*mat[1][1]+src[2]*mat[2][1] src[0]*mat[0][0]+src[1]*mat[1][0]+src[2]*mat[2][0]
-		pfadd       mm3,mm5       ; 0 src[0]*mat[0][2]+src[1]*mat[1][2]+src[2]*mat[2][2]
-		movq        [eax],mm0     ; mat[0][1]*src[0]+mat[1][1]*src[1]+mat[2][1]*src[2] mat[0][0]*src[0]+mat[1][0]*src[1]+mat[2][0]*src[2]
-		movd        [eax+8],mm3   ; mat[0][2]*src[0]+mat[1][2]*src[1]+mat[2][2]*src[2]
-		femms
-	}
-}
-
-void InverseTransformVector3DNOW(float *src, float *dst, float mat[4][4])
-{
-	_asm {
-		femms
-		mov         ecx,[src]
-		mov         eax,[dst]
-		mov         edx,[mat]
-		movq        mm0,[ecx]     ; src[1] src[0]
-		movd        mm4,[ecx+8]   ; 0 src[2]
-		movq        mm1,mm0       ; src[1] src[0]
-		pfmul       mm0,[edx]     ; src[1]*mat[0][1] src[0]*mat[0][0]
-		movq        mm5,mm4       ; 0 src[2]
-		pfmul       mm4,[edx+8]   ; 0 src[2]*mat[0][2]
-		movq        mm2,mm1       ; src[1] src[0]
-		pfmul       mm1,[edx+16]  ; src[1]*mat[1][1] src[0]*mat[1][0]
-		movq        mm6,mm5       ; 0 src[2]
-		pfmul       mm5,[edx+24]  ; 0 src[2]*mat[1][2]
-		movq        mm3,mm2       ; src[1] src[0]
-		pfmul       mm2,[edx+32]  ; src[1]*mat[2][1] src[0]*mat[2][0]
-		movq        mm7,mm6       ; 0 src[2]
-		pfmul       mm6,[edx+40]  ; 0 src[2]*mat[2][2]
-		pfacc       mm0,mm4       ; src[2]*mat[0][2] src[1]*mat[0][1]+src[0]*mat[0][0]
-		pfacc       mm1,mm5       ; src[2]*mat[1][2] src[1]*mat[1][1]+src[0]*mat[1][0]
-		pfacc       mm2,mm6       ; src[2]*mat[2][2] src[1]*mat[2][1]+src[0]*mat[2][0]
-		pfacc       mm0,mm1       ; src[2]*mat[1][2]+src[1]*mat[1][1]+src[0]*mat[1][0] src[2]*mat[0][2]+src[1]*mat[0][1]+src[0]*mat[0][0]
-		pfacc       mm2,mm3       ; 0 src[2]*mat[2][2]+src[1]*mat[2][1]+src[0]*mat[2][0]
-		movq        [eax],mm0     ; mat[1][0]*src[0]+mat[1][1]*src[1]+mat[1][2]*src[2] mat[0][0]*src[0]+mat[0][1]*src[1]+mat[0][2]*src[2]
-		movd        [eax+8],mm2   ; mat[2][0]*src[0]+mat[2][1]*src[1]+mat[2][2]*src[2]
-		femms                    
-	}
-}
-
-void  MulMatricesSSE(float m1[4][4],float m2[4][4],float r[4][4])
-{
-	__asm
-	{
-		mov       eax,[r]      
-		mov       ecx,[m1]
-		mov       edx,[m2]
-
-		movaps    xmm0,[edx]
-		movaps    xmm1,[edx+16]
-		movaps    xmm2,[edx+32]
-		movaps    xmm3,[edx+48]
-
-		; r[0][0],r[0][1],r[0][2],r[0][3]
-
-		movaps    xmm4,[ecx]
-		movaps    xmm5,xmm4
-		movaps    xmm6,xmm4
-		movaps    xmm7,xmm4
-
-		shufps    xmm4,xmm4,00000000b
-		shufps    xmm5,xmm5,01010101b
-		shufps    xmm6,xmm6,10101010b
-		shufps    xmm7,xmm7,11111111b
-
-		mulps     xmm4,xmm0
-		mulps     xmm5,xmm1
-		mulps     xmm6,xmm2
-		mulps     xmm7,xmm3
-
-		addps     xmm4,xmm5
-		addps     xmm4,xmm6
-		addps     xmm4,xmm7
-
-		movaps    [eax],xmm4
-
-		; r[1][0],r[1][1],r[1][2],r[1][3]
-
-		movaps    xmm4,[ecx+16]
-		movaps    xmm5,xmm4
-		movaps    xmm6,xmm4
-		movaps    xmm7,xmm4
-
-		shufps    xmm4,xmm4,00000000b
-		shufps    xmm5,xmm5,01010101b
-		shufps    xmm6,xmm6,10101010b
-		shufps    xmm7,xmm7,11111111b
-
-		mulps     xmm4,xmm0
-		mulps     xmm5,xmm1
-		mulps     xmm6,xmm2
-		mulps     xmm7,xmm3
-
-		addps     xmm4,xmm5
-		addps     xmm4,xmm6
-		addps     xmm4,xmm7
-
-		movaps    [eax+16],xmm4
-
-
-		; r[2][0],r[2][1],r[2][2],r[2][3]
-
-		movaps    xmm4,[ecx+32]
-		movaps    xmm5,xmm4
-		movaps    xmm6,xmm4
-		movaps    xmm7,xmm4
-
-		shufps    xmm4,xmm4,00000000b
-		shufps    xmm5,xmm5,01010101b
-		shufps    xmm6,xmm6,10101010b
-		shufps    xmm7,xmm7,11111111b
-
-		mulps     xmm4,xmm0
-		mulps     xmm5,xmm1
-		mulps     xmm6,xmm2
-		mulps     xmm7,xmm3
-
-		addps     xmm4,xmm5
-		addps     xmm4,xmm6
-		addps     xmm4,xmm7
-
-		movaps    [eax+32],xmm4
-
-		; r[3][0],r[3][1],r[3][2],r[3][3]
-
-		movaps    xmm4,[ecx+48]
-		movaps    xmm5,xmm4
-		movaps    xmm6,xmm4
-		movaps    xmm7,xmm4
-
-		shufps    xmm4,xmm4,00000000b
-		shufps    xmm5,xmm5,01010101b
-		shufps    xmm6,xmm6,10101010b
-		shufps    xmm7,xmm7,11111111b
-
-		mulps     xmm4,xmm0
-		mulps     xmm5,xmm1
-		mulps     xmm6,xmm2
-		mulps     xmm7,xmm3
-
-		addps     xmm4,xmm5
-		addps     xmm4,xmm6
-		addps     xmm4,xmm7
-
-		movaps    [eax+48],xmm4
-	}
-}
-
-void  MulMatrices3DNOW(float m1[4][4],float m2[4][4],float r[4][4])
-{
-	_asm 
-	{
-		femms
-		mov         ecx,[m1]
-		mov         eax,[r]
-		mov         edx,[m2]
-
-		movq        mm0,[ecx]
-		movq        mm1,[ecx+8]
-		movq        mm4,[edx]
-		punpckhdq   mm2,mm0
-		movq        mm5,[edx+16]
-		punpckhdq   mm3,mm1
-		movq        mm6,[edx+32]
-		punpckldq   mm0,mm0
-		punpckldq   mm1,mm1
-		pfmul       mm4,mm0
-		punpckhdq   mm2,mm2
-		pfmul       mm0,[edx+8]
-		movq        mm7,[edx+48]
-		pfmul       mm5,mm2
-		punpckhdq   mm3,mm3
-		pfmul       mm2,[edx+24]
-		pfmul       mm6,mm1
-		pfadd       mm5,mm4
-		pfmul       mm1,[edx+40]
-		pfadd       mm2,mm0
-		pfmul       mm7,mm3
-		pfadd       mm6,mm5
-		pfmul       mm3,[edx+56]
-		pfadd       mm2,mm1
-		pfadd       mm7,mm6
-		movq        mm0,[ecx+16]
-		pfadd       mm3,mm2
-		movq        mm1,[ecx+24]
-		movq        [eax],mm7
-		movq        mm4,[edx]
-		movq        [eax+8],mm3
-
-		punpckhdq   mm2,mm0
-		movq        mm5,[edx+16]
-		punpckhdq   mm3,mm1
-		movq        mm6,[edx+32]
-		punpckldq   mm0,mm0
-		punpckldq   mm1,mm1
-		pfmul       mm4,mm0
-		punpckhdq   mm2,mm2
-		pfmul       mm0,[edx+8]
-		movq        mm7,[edx+48]
-		pfmul       mm5,mm2
-		punpckhdq   mm3,mm3
-		pfmul       mm2,[edx+24]
-		pfmul       mm6,mm1
-		pfadd       mm5,mm4
-		pfmul       mm1,[edx+40]
-		pfadd       mm2,mm0
-		pfmul       mm7,mm3
-		pfadd       mm6,mm5
-		pfmul       mm3,[edx+56]
-		pfadd       mm2,mm1
-		pfadd       mm7,mm6
-		movq        mm0,[ecx+32]
-		pfadd       mm3,mm2
-		movq        mm1,[ecx+40]
-		movq        [eax+16],mm7
-		movq        mm4,[edx]
-		movq        [eax+24],mm3
-
-		punpckhdq   mm2,mm0
-		movq        mm5,[edx+16]
-		punpckhdq   mm3,mm1
-		movq        mm6,[edx+32]
-		punpckldq   mm0,mm0
-		punpckldq   mm1,mm1
-		pfmul       mm4,mm0
-		punpckhdq   mm2,mm2
-		pfmul       mm0,[edx+8]
-		movq        mm7,[edx+48]
-		pfmul       mm5,mm2
-		punpckhdq   mm3,mm3
-		pfmul       mm2,[edx+24]
-		pfmul       mm6,mm1
-		pfadd       mm5,mm4
-		pfmul       mm1,[edx+40]
-		pfadd       mm2,mm0
-		pfmul       mm7,mm3
-		pfadd       mm6,mm5
-		pfmul       mm3,[edx+56]
-		pfadd       mm2,mm1
-		pfadd       mm7,mm6
-		movq        mm0,[ecx+48]
-		pfadd       mm3,mm2
-		movq        mm1,[ecx+56]
-		movq        [eax+32],mm7
-		movq        mm4,[edx]
-		movq        [eax+40],mm3
-
-		punpckhdq   mm2,mm0
-		movq        mm5,[edx+16]
-		punpckhdq   mm3,mm1
-		movq        mm6,[edx+32]
-		punpckldq   mm0,mm0
-		punpckldq   mm1,mm1
-		pfmul       mm4,mm0
-		punpckhdq   mm2,mm2
-		pfmul       mm0,[edx+8]
-		movq        mm7,[edx+48]
-		pfmul       mm5,mm2
-		punpckhdq   mm3,mm3
-		pfmul       mm2,[edx+24]
-		pfmul       mm6,mm1
-		pfadd       mm5,mm4
-		pfmul       mm1,[edx+40]
-		pfadd       mm2,mm0
-		pfmul       mm7,mm3
-		pfadd       mm6,mm5
-		pfmul       mm3,[edx+56]
-		pfadd       mm2,mm1
-		pfadd       mm7,mm6
-		pfadd       mm3,mm2
-		movq        [eax+48],mm7
-		movq        [eax+56],mm3
-		femms
-	}
-}
-
-float DotProductSSE3(register float *v1, register float *v2)
-{
-	_asm {
-		mov eax,[v1]
-		mov edx,[v2]
-		movaps xmm0, [eax]
-		mulps xmm0, [edx]
-		haddps xmm0, xmm0
-		haddps xmm0, xmm0
-		;      movss eax, xmm0
-	}
-}
-
+extern "C" void  TransformVectorSSE(float *src, float *dst, float mat[4][4]);
+extern "C" void  TransformVector3DNOW(float *src, float *dst, float mat[4][4]);
+extern "C" void  InverseTransformVector3DNOW(float *src, float *dst, float mat[4][4]);
+extern "C" void  MulMatricesSSE(float m1[4][4],float m2[4][4],float r[4][4]);
+extern "C" void  MulMatrices3DNOW(float m1[4][4],float m2[4][4],float r[4][4]);
+extern "C" float DotProductSSE3(register float *v1, register float *v2);
 extern "C" float DotProduct3DNOW(register float *v1, register float *v2);
 extern "C" void NormalizeVectorSSE(float *v);
+extern "C" void NormalizeVector3DNOW(float *v);

-void NormalizeVector3DNOW(float *v)
-{
-	_asm{
-		femms
-		mov          edx,[v]
-	movq         mm0,[edx]
-	movq         mm3,[edx+8]
-	movq         mm1,mm0
-		movq         mm2,mm3
-		pfmul        mm0,mm0
-		pfmul        mm3,mm3
-		pfacc        mm0,mm0
-		pfadd        mm0,mm3
-		;movq mm4,mm0 ; prepare for 24bit precision
-		;punpckldq mm4,mm4 ; prepare for 24bit precision
-		pfrsqrt      mm0,mm0 ; 15bit precision 1/sqrtf(v)
-		;movq mm3,mm0
-		;pfmul mm0,mm0
-		;pfrsqit1 mm0,mm4
-		;pfrcpit2 mm0,mm3 ; 24bit precision 1/sqrtf(v)
-		pfmul        mm1,mm0
-		pfmul        mm2,mm0
-		movq         [edx],mm1
-		movq         [edx+8],mm2
-		femms
-	}
-}
-
-void DetectSIMD(int func, int * iedx, int * iecx)
-{
-	unsigned long reg, reg2;
-	__asm
-	{
-		mov eax, func
-		cpuid
-		mov reg, edx
-		mov reg2, ecx
-	}
-
-	if (iedx)
-	{
-		*iedx = reg;
-	}
-	if (iecx)
-	{
-		*iecx = reg2;
-	}
-}
+extern "C" void DetectSIMD(int function, int * iedx, int * iecx);

 void math_init()
 {
--- a/Source/Glide64/3dmathSIMD.asm
+++ b/Source/Glide64/3dmathSIMD.asm
@ -1,109 +0,0 @@
-;/*
-;* Glide64 - Glide video plugin for Nintendo 64 emulators.
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-;*/
-;
-;****************************************************************
-;
-; Glide64 - Glide Plugin for Nintendo 64 emulators
-; Project started on December 29th, 2001
-;
-; Authors:
-; Dave2001, original author, founded the project in 2001, left it in 2002
-; Gugaman, joined the project in 2002, left it in 2002
-; Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002
-; Hiroshi 'KoolSmoky' Morii, joined the project in 2007
-;
-;****************************************************************
-;
-; To modify Glide64:
-; * Write your name and (optional)email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.
-; * Do NOT send me the whole project or file that you modified.  Take out your modified code sections, and tell me where to put them.  If people sent the whole thing, I would have many different versions, but no idea how to combine them all.
-;
-;****************************************************************
-
-%include "inc/c32.mac"
-
-segment .text
-
-;****************************************************************
-;
-;                     ******** SSE ********
-;
-;****************************************************************
-
-
-proc MulMatricesSSE
-CPU P3 
-endproc ;MulMatricesSSE
-
-proc NormalizeVectorSSE
-CPU P3
-      %$v arg
-      
-      mov edx, [ebp + %$v]
-      movaps xmm0, [edx]      ; x y z 0
-      movaps xmm2, xmm0       ; x y z 0
-      mulps  xmm0, xmm0       ; x*x y*y z*z 0
-      movaps xmm1, xmm0       ; x*x y*y z*z 0
-      shufps xmm0, xmm1, 0x4e ; z*z 0 x*x y*y
-      addps  xmm0, xmm1       ; x*x+z*z y*y z*z+x*x y*y
-      movaps xmm1, xmm0       ; x*x+z*z y*y z*z+x*x y*y
-      shufps xmm1, xmm1, 0x11 ; y*y z*z+x*x y*y z*z+x*x
-      addps  xmm0, xmm1       ; x*x+z*z+y*y
-      rsqrtps xmm0, xmm0      ; 1.0/sqrt(x*x+z*z+y*y)
-      mulps  xmm2, xmm0       ; x/sqrt(x*x+z*z+y*y) y/sqrt(x*x+z*z+y*y) z/sqrt(x*x+z*z+y*y) 0
-      movaps [edx], xmm2
-      
-endproc ;NormalizeVectorSSE
-
-;****************************************************************
-;
-;                     ******** SSE3 ********
-;
-;****************************************************************
-
-
-;****************************************************************
-;
-;                     ******** 3DNOW ********
-;
-;****************************************************************
-
-
-
-
-proc DotProduct3DNOW
-CPU 586
-      %$v1        arg
-      %$v2        arg
-      
-      femms
-      mov         edx,[ebp + %$v1]
-      mov         eax,[ebp + %$v2]
-      movq        mm0,[edx]
-      movq        mm3,[eax]
-      pfmul       mm0,mm3
-      movq        mm2,[edx+8]
-      movq        mm1,[eax+8]
-      pfacc       mm0,mm0
-      pfmul       mm1,mm2
-      pfadd       mm0,mm1
-      movd        eax,mm0
-      femms
-
-endproc ;DotProduct3DNOW
-
--- a/Source/Glide64/3dmathSIMD.asm.cpp
+++ b/Source/Glide64/3dmathSIMD.asm.cpp
@ -0,0 +1,527 @@
+;/*
+;* Glide64 - Glide video plugin for Nintendo 64 emulators.
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+;
+;****************************************************************
+;
+; Glide64 - Glide Plugin for Nintendo 64 emulators
+; Project started on December 29th, 2001
+;
+; Authors:
+; Dave2001, original author, founded the project in 2001, left it in 2002
+; Gugaman, joined the project in 2002, left it in 2002
+; Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002
+; Hiroshi 'KoolSmoky' Morii, joined the project in 2007
+;
+;****************************************************************
+;
+; To modify Glide64:
+; * Write your name and (optional)email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.
+; * Do NOT send me the whole project or file that you modified.  Take out your modified code sections, and tell me where to put them.  If people sent the whole thing, I would have many different versions, but no idea how to combine them all.
+;
+;****************************************************************
+
+%include "inc/c32.mac"
+
+segment .text
+*/
+
+// DetectSIMD - execute CPUID for leaf `func` and hand back the EDX and ECX results.
+//   func : CPUID leaf placed in eax before the instruction
+//   iedx : receives EDX; may be NULL, in which case EDX is discarded
+//   iecx : receives ECX; may be NULL, in which case ECX is discarded
+// Naked function: no prolog/epilog is generated, so the frame is built by hand
+// (parameter names are addressed through ebp) and callee-saved registers must
+// be preserved explicitly.
+extern "C" void __declspec(naked) DetectSIMD(int func, int * iedx, int * iecx)
+{
+	_asm {
+		push ebp
+		mov ebp,esp
+		push ebx                     // cpuid overwrites ebx, which the caller expects preserved
+		mov       eax,[func]
+		cpuid
+		mov       eax,[iedx]
+		test      eax,eax            // optional output: skip the store when NULL
+		jz        DetectSIMD_no_edx
+		mov       [eax],edx
+	DetectSIMD_no_edx:
+		mov       eax,[iecx]
+		test      eax,eax            // optional output: skip the store when NULL
+		jz        DetectSIMD_no_ecx
+		mov       [eax],ecx
+	DetectSIMD_no_ecx:
+		pop ebx                      // esp == ebp again after this pop
+		leave
+		ret
+	}
+}
+
+/****************************************************************
+;
+;                     ******** SSE ********
+;
+;****************************************************************/
+
+// TransformVectorSSE - multiply a 3-component row vector by the upper-left 3x3 of mat:
+//   dst[j] = src[0]*mat[0][j] + src[1]*mat[1][j] + src[2]*mat[2][j]    for j = 0..2
+// Only src[0..2], dst[0..2] and the first three floats of mat rows 0..2 are accessed.
+// All memory accesses are movss/movhps, so no 16-byte alignment is needed.
+// Naked function: the frame is built by hand so the parameter names can be
+// addressed through ebp. Registers used: eax, ecx, edx, xmm0-xmm5.
+extern "C" void __declspec(naked) TransformVectorSSE(float *src, float *dst, float mat[4][4])
+{
+	__asm
+	{
+		push ebp
+		mov ebp,esp
+
+		mov       ecx,[src]
+		mov       eax,[dst]
+		mov       edx,[mat]
+		           
+		// Lane comments below list the register from the highest lane to the lowest.
+		movss     xmm0,[ecx]    ; 0 0 0 src[0]
+		movss     xmm5,[edx]    ; 0 0 0 mat[0][0]
+		movhps    xmm5,[edx+4]  ; mat[0][2] mat[0][1] 0 mat[0][0]
+		shufps    xmm0,xmm0, 0  ; src[0] src[0] src[0] src[0]
+		movss     xmm1,[ecx+4]  ; 0 0 0 src[1]
+		movss     xmm3,[edx+16] ; 0 0 0 mat[1][0]
+		movhps    xmm3,[edx+20] ; mat[1][2] mat[1][1] 0 mat[1][0]
+		shufps    xmm1,xmm1, 0  ; src[1] src[1] src[1] src[1]
+		mulps     xmm0,xmm5     ; mat[0][2]*src[0] mat[0][1]*src[0] 0 mat[0][0]*src[0]
+		mulps     xmm1,xmm3     ; mat[1][2]*src[1] mat[1][1]*src[1] 0 mat[1][0]*src[1]
+		movss     xmm2,[ecx+8]  ; 0 0 0 src[2]
+		shufps    xmm2,xmm2, 0  ; src[2] src[2] src[2] src[2]
+		movss     xmm4,[edx+32] ; 0 0 0 mat[2][0]
+		movhps    xmm4,[edx+36] ; mat[2][2] mat[2][1] 0 mat[2][0]
+		addps     xmm0,xmm1     ; mat[0][2]*src[0]+mat[1][2]*src[1] mat[0][1]*src[0]+mat[1][1]*src[1] 0 mat[0][0]*src[0]+mat[1][0]*src[1]
+		mulps     xmm2,xmm4     ; mat[2][2]*src[2] mat[2][1]*src[2] 0 mat[2][0]*src[2]
+		addps     xmm0,xmm2     ; mat[0][2]*src[0]+mat[1][2]*src[1]+mat[2][2]*src[2] mat[0][1]*src[0]+mat[1][1]*src[1]+mat[2][1]*src[2] 0 mat[0][0]*src[0]+mat[1][0]*src[1]+mat[2][0]*src[2]
+		movss     [eax],xmm0    ; mat[0][0]*src[0]+mat[1][0]*src[1]+mat[2][0]*src[2]
+		movhps    [eax+4],xmm0  ; mat[0][2]*src[0]+mat[1][2]*src[1]+mat[2][2]*src[2] mat[0][1]*src[0]+mat[1][1]*src[1]+mat[2][1]*src[2]
+		leave
+		ret
+	}
+}
+
+// MulMatricesSSE - 4x4 single-precision matrix product r = m1 * m2:
+//   r[i][j] = m1[i][0]*m2[0][j] + m1[i][1]*m2[1][j] + m1[i][2]*m2[2][j] + m1[i][3]*m2[3][j]
+// Every load and store is movaps, so m1, m2 and r must each be 16-byte aligned.
+// The four rows of m2 are loaded into xmm0-xmm3 once and reused for each row of r.
+// Each unrolled section broadcasts the four elements of one m1 row
+// (shufps 0x00 / 0x55 / 0xAA / 0xFF), multiplies them by the m2 rows and sums.
+// Naked function with a hand-built ebp frame. Registers used: eax, ecx, edx, xmm0-xmm7.
+extern "C" void __declspec(naked) MulMatricesSSE(float m1[4][4],float m2[4][4],float r[4][4])
+{
+	__asm
+	{
+		push ebp
+		mov ebp,esp
+
+      mov       eax,[r]      
+      mov       ecx,[m1]
+      mov       edx,[m2]
+      
+      movaps    xmm0,[edx]
+      movaps    xmm1,[edx+16]
+      movaps    xmm2,[edx+32]
+      movaps    xmm3,[edx+48]
+      
+      ; r[0][0],r[0][1],r[0][2],r[0][3]
+      
+      movaps    xmm4,[ecx]
+      movaps    xmm5,xmm4
+      movaps    xmm6,xmm4
+      movaps    xmm7,xmm4
+      
+      shufps    xmm4,xmm4,00000000b
+      shufps    xmm5,xmm5,01010101b
+      shufps    xmm6,xmm6,10101010b
+      shufps    xmm7,xmm7,11111111b
+      
+      mulps     xmm4,xmm0
+      mulps     xmm5,xmm1
+      mulps     xmm6,xmm2
+      mulps     xmm7,xmm3
+      
+      addps     xmm4,xmm5
+      addps     xmm4,xmm6
+      addps     xmm4,xmm7
+      
+      movaps    [eax],xmm4
+      
+      ; r[1][0],r[1][1],r[1][2],r[1][3]
+      
+      movaps    xmm4,[ecx+16]
+      movaps    xmm5,xmm4
+      movaps    xmm6,xmm4
+      movaps    xmm7,xmm4
+      
+      shufps    xmm4,xmm4,00000000b
+      shufps    xmm5,xmm5,01010101b
+      shufps    xmm6,xmm6,10101010b
+      shufps    xmm7,xmm7,11111111b
+      
+      mulps     xmm4,xmm0
+      mulps     xmm5,xmm1
+      mulps     xmm6,xmm2
+      mulps     xmm7,xmm3
+      
+      addps     xmm4,xmm5
+      addps     xmm4,xmm6
+      addps     xmm4,xmm7
+      
+      movaps    [eax+16],xmm4
+      
+      
+      ; r[2][0],r[2][1],r[2][2],r[2][3]
+      
+      movaps    xmm4,[ecx+32]
+      movaps    xmm5,xmm4
+      movaps    xmm6,xmm4
+      movaps    xmm7,xmm4
+      
+      shufps    xmm4,xmm4,00000000b
+      shufps    xmm5,xmm5,01010101b
+      shufps    xmm6,xmm6,10101010b
+      shufps    xmm7,xmm7,11111111b
+      
+      mulps     xmm4,xmm0
+      mulps     xmm5,xmm1
+      mulps     xmm6,xmm2
+      mulps     xmm7,xmm3
+      
+      addps     xmm4,xmm5
+      addps     xmm4,xmm6
+      addps     xmm4,xmm7
+      
+      movaps    [eax+32],xmm4
+      
+      ; r[3][0],r[3][1],r[3][2],r[3][3]
+      
+      movaps    xmm4,[ecx+48]
+      movaps    xmm5,xmm4
+      movaps    xmm6,xmm4
+      movaps    xmm7,xmm4
+      
+      shufps    xmm4,xmm4,00000000b
+      shufps    xmm5,xmm5,01010101b
+      shufps    xmm6,xmm6,10101010b
+      shufps    xmm7,xmm7,11111111b
+      
+      mulps     xmm4,xmm0
+      mulps     xmm5,xmm1
+      mulps     xmm6,xmm2
+      mulps     xmm7,xmm3
+      
+      addps     xmm4,xmm5
+      addps     xmm4,xmm6
+      addps     xmm4,xmm7
+      
+      movaps    [eax+48],xmm4
+	  leave
+	  ret
+	}
+}
+
+// NormalizeVectorSSE - scale the vector at v in place by an approximate
+// 1/sqrt(v[0]^2 + v[1]^2 + v[2]^2 + v[3]^2).
+// All four floats are loaded, squared and summed; the lane comments below
+// assume v[3] == 0 so that the sum is x*x + y*y + z*z. v[3] is scaled and
+// written back as well.
+// rsqrtps is an approximation, so the result is not exact to full single
+// precision. movaps requires v to be 16-byte aligned.
+// Naked function with a hand-built ebp frame. Registers used: edx, xmm0-xmm2.
+extern "C" void __declspec(naked) NormalizeVectorSSE (float *v)
+{
+	_asm
+	{
+		push ebp
+		mov ebp,esp
+
+      mov edx, [v]
+      movaps xmm0, [edx]      ; x y z 0
+      movaps xmm2, xmm0       ; x y z 0
+      mulps  xmm0, xmm0       ; x*x y*y z*z 0
+      movaps xmm1, xmm0       ; x*x y*y z*z 0
+      shufps xmm0, xmm1, 0x4e ; z*z 0 x*x y*y
+      addps  xmm0, xmm1       ; x*x+z*z y*y z*z+x*x y*y
+      movaps xmm1, xmm0       ; x*x+z*z y*y z*z+x*x y*y
+      shufps xmm1, xmm1, 0x11 ; y*y z*z+x*x y*y z*z+x*x
+      addps  xmm0, xmm1       ; x*x+z*z+y*y
+      rsqrtps xmm0, xmm0      ; 1.0/sqrt(x*x+z*z+y*y)
+      mulps  xmm2, xmm0       ; x/sqrt(x*x+z*z+y*y) y/sqrt(x*x+z*z+y*y) z/sqrt(x*x+z*z+y*y) 0
+      movaps [edx], xmm2
+	  leave
+	  ret
+	}
+}
+
+/*****************************************************************
+;
+;                     ******** SSE3 ********
+;
+;****************************************************************/
+
+// DotProductSSE3 - four-lane dot product:
+//   v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2] + v1[3]*v2[3]
+// Both pointers must be 16-byte aligned (movaps load / mulps memory operand).
+// NOTE(review): all four lanes are summed, so callers that want a 3-component
+// dot product presumably keep the fourth element of one operand at 0 - confirm.
+// extern "C" matches the declaration used by the caller and the other routines
+// in this file. The result is returned on the x87 stack (ST(0)), which is where
+// 32-bit callers read a float return value; a naked function gets no
+// compiler-generated return code, so it is loaded there explicitly.
+extern "C" float __declspec(naked) DotProductSSE3(register float *v1, register float *v2)
+{
+	_asm {
+		push ebp
+		mov ebp,esp
+
+      mov eax,[v1]
+      mov edx,[v2]
+      movaps xmm0, [eax]
+      mulps xmm0, [edx]
+      haddps xmm0, xmm0
+      haddps xmm0, xmm0              // every lane now holds the full sum
+      sub esp,4                      // 4 bytes of scratch; released by leave
+      movss dword ptr [esp], xmm0
+      fld dword ptr [esp]            // float return value goes in ST(0)
+		leave
+		ret
+	}
+}
+
+/****************************************************************
+;
+;                     ******** 3DNOW ********
+;
+;****************************************************************/
+
+// TransformVector3DNOW - 3DNow! version of the 3-component row-vector transform:
+//   dst[j] = src[0]*mat[0][j] + src[1]*mat[1][j] + src[2]*mat[2][j]    for j = 0..2
+// Reads src[0..2] and all four floats of mat rows 0..2; writes dst[0..2].
+// The high lane of mm3/mm4/mm5 holds the src[i]*mat[i][3] terms (the "0" in the
+// lane comments assumes mat[i][3] == 0); that lane is never stored.
+// femms is issued on entry and exit to clear the MMX state.
+// Naked function with a hand-built ebp frame. Registers used: eax, ecx, edx, mm0-mm5.
+extern "C" void __declspec(naked) TransformVector3DNOW(float *src, float *dst, float mat[4][4])
+{
+	_asm {
+		push ebp
+		mov ebp,esp
+
+    femms
+      mov         ecx,[src]
+      mov         eax,[dst]
+      mov         edx,[mat]
+      movq        mm0,[ecx]     ; src[1] src[0]
+      movd        mm2,[ecx+8]   ; 0 src[2]
+      movq        mm1,mm0       ; src[1] src[0]
+      punpckldq   mm0,mm0       ; src[0] src[0]
+      punpckhdq   mm1,mm1       ; src[1] src[1]
+      punpckldq   mm2,mm2       ; src[2] src[2]
+      movq        mm3,mm0       ; src[0] src[0]
+      movq        mm4,mm1       ; src[1] src[1]
+      movq        mm5,mm2       ; src[2] src[2]
+      pfmul       mm0,[edx]     ; src[0]*mat[0][1] src[0]*mat[0][0]
+      pfmul       mm3,[edx+8]   ; 0 src[0]*mat[0][2]
+      pfmul       mm1,[edx+16]  ; src[1]*mat[1][1] src[1]*mat[1][0]
+      pfmul       mm4,[edx+24]  ; 0 src[1]*mat[1][2]
+      pfmul       mm2,[edx+32]  ; src[2]*mat[2][1] src[2]*mat[2][0]
+      pfmul       mm5,[edx+40]  ; 0 src[2]*mat[2][2]
+      pfadd       mm0,mm1       ; src[0]*mat[0][1]+src[1]*mat[1][1] src[0]*mat[0][0]+src[1]*mat[1][0]
+      pfadd       mm3,mm4       ; 0 src[0]*mat[0][2]+src[1]*mat[1][2]
+      pfadd       mm0,mm2       ; src[0]*mat[0][1]+src[1]*mat[1][1]+src[2]*mat[2][1] src[0]*mat[0][0]+src[1]*mat[1][0]+src[2]*mat[2][0]
+      pfadd       mm3,mm5       ; 0 src[0]*mat[0][2]+src[1]*mat[1][2]+src[2]*mat[2][2]
+      movq        [eax],mm0     ; mat[0][1]*src[0]+mat[1][1]*src[1]+mat[2][1]*src[2] mat[0][0]*src[0]+mat[1][0]*src[1]+mat[2][0]*src[2]
+      movd        [eax+8],mm3   ; mat[0][2]*src[0]+mat[1][2]*src[1]+mat[2][2]*src[2]
+      femms
+		  leave
+		  ret
+	}
+}
+
+// InverseTransformVector3DNOW - multiply src by the transpose of the upper-left 3x3 of mat:
+//   dst[i] = src[0]*mat[i][0] + src[1]*mat[i][1] + src[2]*mat[i][2]    for i = 0..2
+// Reads src[0..2] and all four floats of mat rows 0..2; writes dst[0..2].
+// src[2] is loaded with movd, so its high lane is 0 and the mat[i][3] terms
+// drop out of the products.
+// The final "pfacc mm2,mm3" only needs its low lane; the high lane (built from
+// the saved src pair in mm3) is never stored. mm7 is written but not read.
+// femms is issued on entry and exit to clear the MMX state.
+// Naked function with a hand-built ebp frame. Registers used: eax, ecx, edx, mm0-mm7.
+extern "C" void __declspec(naked) InverseTransformVector3DNOW(float *src, float *dst, float mat[4][4])
+{
+	_asm {
+		push ebp
+		mov ebp,esp
+
+    femms
+      mov         ecx,[src]
+      mov         eax,[dst]
+      mov         edx,[mat]
+      movq        mm0,[ecx]     ; src[1] src[0]
+      movd        mm4,[ecx+8]   ; 0 src[2]
+      movq        mm1,mm0       ; src[1] src[0]
+      pfmul       mm0,[edx]     ; src[1]*mat[0][1] src[0]*mat[0][0]
+      movq        mm5,mm4       ; 0 src[2]
+      pfmul       mm4,[edx+8]   ; 0 src[2]*mat[0][2]
+      movq        mm2,mm1       ; src[1] src[0]
+      pfmul       mm1,[edx+16]  ; src[1]*mat[1][1] src[0]*mat[1][0]
+      movq        mm6,mm5       ; 0 src[2]
+      pfmul       mm5,[edx+24]  ; 0 src[2]*mat[1][2]
+      movq        mm3,mm2       ; src[1] src[0]
+      pfmul       mm2,[edx+32]  ; src[1]*mat[2][1] src[0]*mat[2][0]
+      movq        mm7,mm6       ; 0 src[2]
+      pfmul       mm6,[edx+40]  ; 0 src[2]*mat[2][2]
+      pfacc       mm0,mm4       ; src[2]*mat[0][2] src[1]*mat[0][1]+src[0]*mat[0][0]
+      pfacc       mm1,mm5       ; src[2]*mat[1][2] src[1]*mat[1][1]+src[0]*mat[1][0]
+      pfacc       mm2,mm6       ; src[2]*mat[2][2] src[1]*mat[2][1]+src[0]*mat[2][0]
+      pfacc       mm0,mm1       ; src[2]*mat[1][2]+src[1]*mat[1][1]+src[0]*mat[1][0] src[2]*mat[0][2]+src[1]*mat[0][1]+src[0]*mat[0][0]
+      pfacc       mm2,mm3       ; 0 src[2]*mat[2][2]+src[1]*mat[2][1]+src[0]*mat[2][0]
+      movq        [eax],mm0     ; mat[1][0]*src[0]+mat[1][1]*src[1]+mat[1][2]*src[2] mat[0][0]*src[0]+mat[0][1]*src[1]+mat[0][2]*src[2]
+      movd        [eax+8],mm2   ; mat[2][0]*src[0]+mat[2][1]*src[1]+mat[2][2]*src[2]
+      femms                    
+		  leave
+		  ret
+	}
+}
+
+// MulMatrices3DNOW - 3DNow! 4x4 single-precision matrix product r = m1 * m2:
+//   r[i][j] = m1[i][0]*m2[0][j] + m1[i][1]*m2[1][j] + m1[i][2]*m2[2][j] + m1[i][3]*m2[3][j]
+// The body is four unrolled sections, one per row of r. Within a section
+// mm0..mm3 hold the four elements of the current m1 row, each duplicated into
+// both lanes; mm7 accumulates r[i][0..1] and mm3 accumulates r[i][2..3].
+// The loads of the next m1 row are interleaved ahead of the stores of the
+// current r row, so the instruction order must not be changed casually.
+// m2 is re-read from memory in every section, after earlier rows of r have
+// already been stored, so r must not overlap m2.
+// femms is issued on entry and exit to clear the MMX state.
+// Naked function with a hand-built ebp frame. Registers used: eax, ecx, edx, mm0-mm7.
+extern "C" void  __declspec(naked) MulMatrices3DNOW(float m1[4][4],float m2[4][4],float r[4][4])
+{
+	_asm {
+		push ebp
+		mov ebp,esp
+
+    femms
+      mov         ecx,[m1]
+      mov         eax,[r]
+      mov         edx,[m2]
+      
+      // row 0 of r (m1 row 0 is loaded here; later rows are preloaded by the previous section)
+      movq        mm0,[ecx]
+      movq        mm1,[ecx+8]
+      movq        mm4,[edx]
+      punpckhdq   mm2,mm0
+      movq        mm5,[edx+16]
+      punpckhdq   mm3,mm1
+      movq        mm6,[edx+32]
+      punpckldq   mm0,mm0
+      punpckldq   mm1,mm1
+      pfmul       mm4,mm0
+      punpckhdq   mm2,mm2
+      pfmul       mm0,[edx+8]
+      movq        mm7,[edx+48]
+      pfmul       mm5,mm2
+      punpckhdq   mm3,mm3
+      pfmul       mm2,[edx+24]
+      pfmul       mm6,mm1
+      pfadd       mm5,mm4
+      pfmul       mm1,[edx+40]
+      pfadd       mm2,mm0
+      pfmul       mm7,mm3
+      pfadd       mm6,mm5
+      pfmul       mm3,[edx+56]
+      pfadd       mm2,mm1
+      pfadd       mm7,mm6
+      movq        mm0,[ecx+16]
+      pfadd       mm3,mm2
+      movq        mm1,[ecx+24]
+      movq        [eax],mm7
+      movq        mm4,[edx]
+      movq        [eax+8],mm3
+      
+      // row 1 of r
+      punpckhdq   mm2,mm0
+      movq        mm5,[edx+16]
+      punpckhdq   mm3,mm1
+      movq        mm6,[edx+32]
+      punpckldq   mm0,mm0
+      punpckldq   mm1,mm1
+      pfmul       mm4,mm0
+      punpckhdq   mm2,mm2
+      pfmul       mm0,[edx+8]
+      movq        mm7,[edx+48]
+      pfmul       mm5,mm2
+      punpckhdq   mm3,mm3
+      pfmul       mm2,[edx+24]
+      pfmul       mm6,mm1
+      pfadd       mm5,mm4
+      pfmul       mm1,[edx+40]
+      pfadd       mm2,mm0
+      pfmul       mm7,mm3
+      pfadd       mm6,mm5
+      pfmul       mm3,[edx+56]
+      pfadd       mm2,mm1
+      pfadd       mm7,mm6
+      movq        mm0,[ecx+32]
+      pfadd       mm3,mm2
+      movq        mm1,[ecx+40]
+      movq        [eax+16],mm7
+      movq        mm4,[edx]
+      movq        [eax+24],mm3
+      
+      // row 2 of r
+      punpckhdq   mm2,mm0
+      movq        mm5,[edx+16]
+      punpckhdq   mm3,mm1
+      movq        mm6,[edx+32]
+      punpckldq   mm0,mm0
+      punpckldq   mm1,mm1
+      pfmul       mm4,mm0
+      punpckhdq   mm2,mm2
+      pfmul       mm0,[edx+8]
+      movq        mm7,[edx+48]
+      pfmul       mm5,mm2
+      punpckhdq   mm3,mm3
+      pfmul       mm2,[edx+24]
+      pfmul       mm6,mm1
+      pfadd       mm5,mm4
+      pfmul       mm1,[edx+40]
+      pfadd       mm2,mm0
+      pfmul       mm7,mm3
+      pfadd       mm6,mm5
+      pfmul       mm3,[edx+56]
+      pfadd       mm2,mm1
+      pfadd       mm7,mm6
+      movq        mm0,[ecx+48]
+      pfadd       mm3,mm2
+      movq        mm1,[ecx+56]
+      movq        [eax+32],mm7
+      movq        mm4,[edx]
+      movq        [eax+40],mm3
+      
+      // row 3 of r
+      punpckhdq   mm2,mm0
+      movq        mm5,[edx+16]
+      punpckhdq   mm3,mm1
+      movq        mm6,[edx+32]
+      punpckldq   mm0,mm0
+      punpckldq   mm1,mm1
+      pfmul       mm4,mm0
+      punpckhdq   mm2,mm2
+      pfmul       mm0,[edx+8]
+      movq        mm7,[edx+48]
+      pfmul       mm5,mm2
+      punpckhdq   mm3,mm3
+      pfmul       mm2,[edx+24]
+      pfmul       mm6,mm1
+      pfadd       mm5,mm4
+      pfmul       mm1,[edx+40]
+      pfadd       mm2,mm0
+      pfmul       mm7,mm3
+      pfadd       mm6,mm5
+      pfmul       mm3,[edx+56]
+      pfadd       mm2,mm1
+      pfadd       mm7,mm6
+      pfadd       mm3,mm2
+      movq        [eax+48],mm7
+      movq        [eax+56],mm3
+      femms
+	  leave
+	  ret
+	}
+}
+
+// DotProduct3DNOW - 3-component dot product using 3DNow!:
+//   v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2]
+// Eight bytes are loaded at offsets 0 and 8 of each vector, so v1[3] and v2[3]
+// are read, but their product only reaches the high lane, which is discarded.
+// The result is returned on the x87 stack (ST(0)), which is where 32-bit
+// callers read a float return value; a naked function gets no
+// compiler-generated return code, so it is loaded there explicitly. The bit
+// pattern is also left in eax, as before.
+// femms is issued on entry, and again before the x87 load so that the shared
+// MMX/x87 register file is empty when fld executes.
+extern "C" float  __declspec(naked) DotProduct3DNOW(register float *v1, register float *v2)
+{
+	_asm {
+		push ebp
+		mov ebp,esp
+      
+      femms
+      mov         edx,[v1]
+      mov         eax,[v2]
+      movq        mm0,[edx]
+      movq        mm3,[eax]
+      pfmul       mm0,mm3
+      movq        mm2,[edx+8]
+      movq        mm1,[eax+8]
+      pfacc       mm0,mm0          // both lanes = v1[0]*v2[0] + v1[1]*v2[1]
+      pfmul       mm1,mm2
+      pfadd       mm0,mm1          // low lane += v1[2]*v2[2]
+      movd        eax,mm0
+      femms
+      push        eax              // spill the result; the slot is released by leave
+      fld         dword ptr [esp]  // float return value goes in ST(0)
+	  leave
+	  ret
+	}
+}
+
+// NormalizeVector3DNOW - scale the vector at v in place by an approximate
+// 1/sqrt(v[0]^2 + v[1]^2 + v[2]^2), using 3DNow!.
+// All four floats are read and written back: v[3] is multiplied by the same
+// factor. Its square lands only in the high lane of the sum; the scale factor
+// is taken from the low lane.
+// Only the low-precision pfrsqrt estimate is used; the refinement sequence
+// (pfrsqit1 / pfrcpit2) is left commented out below.
+// femms is issued on entry and exit to clear the MMX state.
+// Naked function with a hand-built ebp frame. Registers used: edx, mm0-mm3.
+extern "C" void __declspec(naked) NormalizeVector3DNOW(float *v)
+{
+	_asm {
+		push ebp
+		mov ebp,esp      
+      femms
+      mov          edx,[v]
+      movq         mm0,[edx]
+      movq         mm3,[edx+8]
+      movq         mm1,mm0
+      movq         mm2,mm3
+      pfmul        mm0,mm0
+      pfmul        mm3,mm3
+      pfacc        mm0,mm0
+      pfadd        mm0,mm3
+      ;movq mm4,mm0 ; prepare for 24bit precision
+      ;punpckldq mm4,mm4 ; prepare for 24bit precision
+      pfrsqrt      mm0,mm0 ; 15bit precision 1/sqrtf(v)
+      ;movq mm3,mm0
+      ;pfmul mm0,mm0
+      ;pfrsqit1 mm0,mm4
+      ;pfrcpit2 mm0,mm3 ; 24bit precision 1/sqrtf(v)
+      pfmul        mm1,mm0
+      pfmul        mm2,mm0
+      movq         [edx],mm1
+      movq         [edx+8],mm2
+      femms
+	  leave
+	  ret
+	}
+}
--- a/Source/Glide64/Config.h
+++ b/Source/Glide64/Config.h
@ -44,8 +44,6 @@

 // -*- C++ -*- generated by wxGlade 0.6.3 on Wed Oct 08 18:56:23 2008

-#define wxNO_QA_LIB
-
 #include <wx/wx.h>
 #include <wx/image.h>
 // begin wxGlade: ::dependencies
--- a/Source/Glide64/DepthBufferRender.cpp
+++ b/Source/Glide64/DepthBufferRender.cpp
@ -85,37 +85,9 @@ static int right_height, left_height;
 static int right_x, right_dxdy, left_x, left_dxdy;
 static int left_z, left_dzdy;

-int imul16(int x, int y)
-{
-	_asm {
-		mov   eax, [x]
-		mov   edx, [y]
-		imul  edx        
-		shrd  eax,edx,16
-	}
-}
-
-int imul14(int x, int y)
-{
-	_asm {
-		mov   eax, [x]
-		mov   edx, [y]
-		imul  edx        
-		shrd  eax,edx,14
-	}
-}
-
-int idiv16(int x, int y)
-{
-	_asm {
-		mov   eax, [x]
-		mov   ebx, [y]
-		mov   edx,eax   
-		sar   edx,16
-		shl   eax,16    
-		idiv  ebx  
-	}
-}
+extern "C" int imul16(int x, int y);
+extern "C" int imul14(int x, int y);
+extern "C" int idiv16(int x, int y);

 __inline int iceil(int x)
 {
--- a/Source/Glide64/FixedPoint.asm
+++ b/Source/Glide64/FixedPoint.asm
@ -1,51 +0,0 @@
-;/*
-;* Glide64 - Glide video plugin for Nintendo 64 emulators.
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-;*/
-;
-;****************************************************************
-;
-; Glide64 - Glide Plugin for Nintendo 64 emulators
-; Project started on December 29th, 2001
-;
-; Authors:
-; Dave2001, original author, founded the project in 2001, left it in 2002
-; Gugaman, joined the project in 2002, left it in 2002
-; Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002
-; Hiroshi 'KoolSmoky' Morii, joined the project in 2007
-;
-;****************************************************************
-;
-; To modify Glide64:
-; * Write your name and (optional)email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.
-; * Do NOT send me the whole project or file that you modified.  Take out your modified code sections, and tell me where to put them.  If people sent the whole thing, I would have many different versions, but no idea how to combine them all.
-;
-;****************************************************************
-
-%include "inc/c32.mac"
-
-segment .text
-
-endproc ;imul14
-
-;(x << 16) / y
-proc idiv16
-CPU 586
-
-  %$x   arg 
-  %$y   arg 
-
-endproc ;idiv16
--- a/Source/Glide64/FixedPoint.asm.cpp
+++ b/Source/Glide64/FixedPoint.asm.cpp
@ -0,0 +1,83 @@
+/*
+* Glide64 - Glide video plugin for Nintendo 64 emulators.
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+/****************************************************************
+;
+; Glide64 - Glide Plugin for Nintendo 64 emulators
+; Project started on December 29th, 2001
+;
+; Authors:
+; Dave2001, original author, founded the project in 2001, left it in 2002
+; Gugaman, joined the project in 2002, left it in 2002
+; Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002
+; Hiroshi 'KoolSmoky' Morii, joined the project in 2007
+;
+;****************************************************************
+;
+; To modify Glide64:
+; * Write your name and (optional)email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.
+; * Do NOT send me the whole project or file that you modified.  Take out your modified code sections, and tell me where to put them.  If people sent the whole thing, I would have many different versions, but no idea how to combine them all.
+;
+;****************************************************************/
+
+// 16.16 fixed-point multiply: forms the signed 64-bit product x * y and
+// returns bits 16..47 of it, i.e. (int)(((long long)x * y) >> 16).
+// Naked function: the frame is set up by hand so the parameter names resolve
+// relative to ebp; the result is returned in eax.
+extern "C" int __declspec(naked) imul16(int x, int y)
+{
+	_asm {
+		push  ebp
+		mov   ebp, esp
+		mov   eax, [x]
+		imul  dword ptr [y]        ; edx:eax = x * y (signed 64-bit product)
+		shrd  eax, edx, 16         ; eax = low 32 bits of (edx:eax >> 16)
+		pop   ebp                  ; esp == ebp here, so this matches leave
+		ret
+	}
+}
+
+// Fixed-point multiply with 14 fractional bits: forms the signed 64-bit
+// product x * y and returns bits 14..45 of it, i.e.
+// (int)(((long long)x * y) >> 14).
+// Naked function: the frame is set up by hand so the parameter names resolve
+// relative to ebp; the result is returned in eax.
+extern "C" int  __declspec(naked) imul14(int x, int y)
+{
+	_asm {
+		push  ebp
+		mov   ebp, esp
+		mov   eax, [x]
+		imul  dword ptr [y]        ; edx:eax = x * y (signed 64-bit product)
+		shrd  eax, edx, 14         ; eax = low 32 bits of (edx:eax >> 14)
+		pop   ebp                  ; esp == ebp here, so this matches leave
+		ret
+	}
+}
+
+// 16.16 fixed-point divide: returns (int)(((long long)x << 16) / y).
+//
+// x is widened to 64 bits in edx:eax (edx = x >> 16 arithmetic, eax = x << 16)
+// and divided by y with a signed 32-bit IDIV; the quotient is returned in eax.
+//
+// This is a naked function, so the compiler emits no prologue/epilogue and
+// does not save any registers for us. The divisor is therefore held in ecx
+// (caller-saved) rather than ebx, which the calling convention requires a
+// callee to preserve.
+//
+// NOTE: y == 0, or a quotient that does not fit in 32 bits, raises the CPU
+// divide-error exception; no guard is applied here.
+extern "C" int __declspec(naked) idiv16(int x, int y)
+{
+	_asm {
+		push ebp
+		mov ebp,esp
+		mov   eax, [x]
+		mov   ecx, [y]             ; divisor in a volatile register
+		mov   edx,eax
+		sar   edx,16               ; edx = high 32 bits of (x << 16), sign-extended
+		shl   eax,16               ; eax = low 32 bits of (x << 16)
+		idiv  ecx                  ; eax = edx:eax / y
+		leave
+		ret
+	}
+}
--- a/Source/Glide64/Glide64.vcproj
+++ b/Source/Glide64/Glide64.vcproj
@ -345,7 +345,7 @@
 			>
 		</File>
 		<File
-			RelativePath="3dmathSIMD.asm"
+			RelativePath=".\3dmathSIMD.asm.cpp"
 			>
 		</File>
 		<File
@ -389,7 +389,7 @@
 			>
 		</File>
 		<File
-			RelativePath="FixedPoint.asm"
+			RelativePath=".\FixedPoint.asm.cpp"
 			>
 		</File>
 		<File
--- a/Source/Glide64/Main.cpp
+++ b/Source/Glide64/Main.cpp
@ -36,6 +36,7 @@
 // * Do NOT send me the whole project or file that you modified.  Take out your modified code sections, and tell me where to put them.  If people sent the whole thing, I would have many different versions, but no idea how to combine them all.
 //
 //****************************************************************
+
 #include "Gfx #1.3.h"
 #include <wx/fileconf.h>
 #include <wx/wfstream.h>
--- a/Source/Glide64/MiClWr16b.h
+++ b/Source/Glide64/MiClWr16b.h
@ -37,9 +37,9 @@
 //
 //****************************************************************

-void asmMirror16bS (int tex, int start, int width, int height, int mask, int line, int full, int count);
-void asmWrap16bS (int tex, int start, int height, int mask, int line, int full, int count);
-void asmClamp16bS (int tex, int constant, int height,int line, int full, int count);
+extern "C" void asmMirror16bS (int tex, int start, int width, int height, int mask, int line, int full, int count);
+extern "C" void asmWrap16bS (int tex, int start, int height, int mask, int line, int full, int count);
+extern "C" void asmClamp16bS (int tex, int constant, int height,int line, int full, int count);

 //****************************************************************
 // 16-bit Horizontal Mirror
--- a/Source/Glide64/MiClWr32b.h
+++ b/Source/Glide64/MiClWr32b.h
@ -41,9 +41,9 @@
 //
 //****************************************************************

-void asmMirror32bS (int tex, int start, int width, int height, int mask, int line, int full, int count);
-void asmWrap32bS (int tex, int start, int height, int mask, int line, int full, int count);
-void asmClamp32bS (int tex, int constant, int height,int line, int full, int count);
+extern "C" void asmMirror32bS (int tex, int start, int width, int height, int mask, int line, int full, int count);
+extern "C" void asmWrap32bS (int tex, int start, int height, int mask, int line, int full, int count);
+extern "C" void asmClamp32bS (int tex, int constant, int height,int line, int full, int count);

 //****************************************************************
 // 32-bit Horizontal Mirror
--- a/Source/Glide64/MiClWr8b.h
+++ b/Source/Glide64/MiClWr8b.h
@ -40,9 +40,9 @@
 //****************************************************************
 // 8-bit Horizontal Mirror

-void asmMirror8bS (int tex, int start, int width, int height, int mask, int line, int full, int count);
-void asmWrap8bS (int tex, int start, int height, int mask, int line, int full, int count);
-void asmClamp8bS (int tex, int constant, int height,int line, int full, int count);
+extern "C" void asmMirror8bS (int tex, int start, int width, int height, int mask, int line, int full, int count);
+extern "C" void asmWrap8bS (int tex, int start, int height, int mask, int line, int full, int count);
+extern "C" void asmClamp8bS (int tex, int constant, int height,int line, int full, int count);

 void Mirror8bS (wxUint32 tex, wxUint32 mask, wxUint32 max_width, wxUint32 real_width, wxUint32 height)
 {
--- a/Source/Glide64/TexConv.h
+++ b/Source/Glide64/TexConv.h
@ -37,10 +37,10 @@
 //
 //****************************************************************

-void asmTexConv_ARGB1555_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int size);
-void asmTexConv_AI88_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int size);
-void asmTexConv_AI44_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int size);
-void asmTexConv_A8_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int size);
+extern "C" void asmTexConv_ARGB1555_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int size);
+extern "C" void asmTexConv_AI88_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int size);
+extern "C" void asmTexConv_AI44_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int size);
+extern "C" void asmTexConv_A8_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int size);

 void TexConv_ARGB1555_ARGB4444 (wxUIntPtr src, wxUIntPtr dst, int width, int height)
 {
--- a/Source/Glide64/TexLoad16b.h
+++ b/Source/Glide64/TexLoad16b.h
@ -37,15 +37,15 @@
 //
 //****************************************************************

-void asmLoad16bRGBA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext);
-void asmLoad16bIA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext);
+extern "C" void asmLoad16bRGBA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext);
+extern "C" void asmLoad16bIA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext);


 //****************************************************************
 // Size: 2, Format: 0
 //

-wxUint32 Load16bRGBA (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int line, int real_width, int tile)
+wxUint32 Load16bRGBA (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int line, int real_width, int /*tile*/)
 {
  if (wid_64 < 1) wid_64 = 1;
  if (height < 1) height = 1;
@ -60,7 +60,7 @@ wxUint32 Load16bRGBA (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int
 // Size: 2, Format: 3
 //

-wxUint32 Load16bIA (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int line, int real_width, int tile)
+wxUint32 Load16bIA (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int line, int real_width, int /*tile*/)
 {
  if (wid_64 < 1) wid_64 = 1;
  if (height < 1) height = 1;
@ -115,7 +115,7 @@ wxUint16 yuv_to_rgb565(wxUint8 y, wxUint8 u, wxUint8 v)
 // Size: 2, Format: 1
 //

-wxUint32 Load16bYUV (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int line, int real_width, int tile)
+wxUint32 Load16bYUV (wxUIntPtr dst, wxUIntPtr src, int wid_64, int /*height*/, int /*line*/, int /*real_width*/, int tile)
 {
  wxUint32 * mb = (wxUint32*)(gfx.RDRAM+rdp.addr[rdp.tiles[tile].t_mem]); //pointer to the macro block
  wxUint16 * tex = (wxUint16*)dst;
--- a/Source/Glide64/TexLoad4b.h
+++ b/Source/Glide64/TexLoad4b.h
@ -37,10 +37,10 @@
 //
 //****************************************************************

-void asmLoad4bCI (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal);
-void asmLoad4bIAPal (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal);
-void asmLoad4bIA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext);
-void asmLoad4bI (wxUIntPtr src, int dst, wxUIntPtr wid_64, int height, int line, int ext);
+extern "C" void asmLoad4bCI (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal);
+extern "C" void asmLoad4bIAPal (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal);
+extern "C" void asmLoad4bIA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext);
+extern "C" void asmLoad4bI (wxUIntPtr src, int dst, wxUIntPtr wid_64, int height, int line, int ext);

 //****************************************************************
 // Size: 0, Format: 2
--- a/Source/Glide64/TexLoad8b.h
+++ b/Source/Glide64/TexLoad8b.h
@ -37,16 +37,16 @@
 //
 //****************************************************************

-void asmLoad8bCI (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal);
-void asmLoad8bIA8 (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal);
-void asmLoad8bIA4 (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext);
-void asmLoad8bI (wxUIntPtr src, int dst, wxUIntPtr wid_64, int height, int line, int ext);
+extern "C" void asmLoad8bCI (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal);
+extern "C" void asmLoad8bIA8 (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal);
+extern "C" void asmLoad8bIA4 (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext);
+extern "C" void asmLoad8bI (wxUIntPtr src, int dst, wxUIntPtr wid_64, int height, int line, int ext);

 //****************************************************************
 // Size: 1, Format: 2
 //

-wxUint32 Load8bCI (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int line, int real_width, int tile)
+wxUint32 Load8bCI (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int line, int real_width, int /*tile*/)
 {
  if (wid_64 < 1) wid_64 = 1;
  if (height < 1) height = 1;
--- a/Source/Glide64/Texture.asm.cpp
+++ b/Source/Glide64/Texture.asm.cpp
@ -54,7 +54,7 @@
 Size: 0, Format: 2
 2009 ported to NASM - Sergey (Gonetz) Lipski
 *****************************************************************/
-void __declspec(naked) asmLoad4bCI (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal)
+extern "C" void __declspec(naked) asmLoad4bCI (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal)
 {
 	_asm {
 		push ebp
@ -430,7 +430,7 @@ end_y_loop:
 	}
 }

-void  __declspec(naked) asmLoad4bIAPal (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal)
+extern "C" void  __declspec(naked) asmLoad4bIAPal (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal)
 {
 	_asm {
 		push ebp
@ -812,7 +812,7 @@ end_y_loop:
 ** BY GUGAMAN **
 2009 ported to NASM - Sergey (Gonetz) Lipski
 *****************************************************************/
-void  __declspec(naked) asmLoad4bIA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext)
+extern "C" void  __declspec(naked) asmLoad4bIA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext)
 {
 	_asm {
 		push ebp
@ -1623,7 +1623,7 @@ end_y_loop:
 // Size: 0, Format: 4
 // 2009 ported to NASM - Sergey (Gonetz) Lipski

-void  __declspec(naked) asmLoad4bI (wxUIntPtr src, int dst, wxUIntPtr wid_64, int height, int line, int ext)
+extern "C" void  __declspec(naked) asmLoad4bI (wxUIntPtr src, int dst, wxUIntPtr wid_64, int height, int line, int ext)
 {
 	_asm {
 		push ebp
@ -1966,7 +1966,7 @@ end_y_loop:
 // 2008.03.29 cleaned up - H.Morii
 // 2009 ported to NASM - Sergey (Gonetz) Lipski

-void  __declspec(naked) asmLoad8bCI (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal)
+extern "C" void  __declspec(naked) asmLoad8bCI (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal)
 {
 	_asm {
 		push ebp
@ -2198,7 +2198,7 @@ end_y_loop:
 	}
 }

-void  __declspec(naked) asmLoad8bIA8 (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal)
+extern "C" void  __declspec(naked) asmLoad8bIA8 (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal)
 {
 	_asm {
 		push ebp
@ -2423,7 +2423,7 @@ end_y_loop:
 // 2008.03.29 cleaned up - H.Morii
 // 2009 ported to NASM - Sergey (Gonetz) Lipski

-void  __declspec(naked) asmLoad8bIA4 (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext)
+extern "C" void  __declspec(naked) asmLoad8bIA4 (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext)
 {
 	_asm {
 		push ebp
@ -2531,7 +2531,7 @@ end_y_loop:
 // ** by Gugaman **
 // 2009 ported to NASM - Sergey (Gonetz) Lipski

-void  __declspec(naked) asmLoad8bI (wxUIntPtr src, int dst, wxUIntPtr wid_64, int height, int line, int ext)
+extern "C" void  __declspec(naked) asmLoad8bI (wxUIntPtr src, int dst, wxUIntPtr wid_64, int height, int line, int ext)
 {
 	_asm {
 		push ebp
@ -2616,7 +2616,7 @@ end_y_loop:
 // 2008.03.29 cleaned up - H.Morii
 // 2009 ported to NASM - Sergey (Gonetz) Lipski

-void __declspec(naked) asmLoad16bRGBA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext)
+extern "C" void __declspec(naked) asmLoad16bRGBA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext)
 {
 	_asm {
 		align 4
@ -2720,7 +2720,7 @@ end_y_loop:
 // 2008.03.29 cleaned up - H.Morii
 // 2009 ported to NASM - Sergey (Gonetz) Lipski

-void  __declspec(naked) asmLoad16bIA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext)
+extern "C" void  __declspec(naked) asmLoad16bIA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext)
 {
 	_asm {
 		ALIGN 4
@ -2795,7 +2795,7 @@ end_y_loop:
 //8b textures mirror/clamp/wrap
 //****************************************************************

-void  __declspec(naked) asmMirror8bS (int tex, int start, int width, int height, int mask, int line, int full, int count)
+extern "C" void  __declspec(naked) asmMirror8bS (int tex, int start, int width, int height, int mask, int line, int full, int count)
 {
 	_asm{
 		ALIGN 4
@ -2856,7 +2856,7 @@ end_mirror_check:
 	}
 }

-void  __declspec(naked) asmWrap8bS (int tex, int start, int height, int mask, int line, int full, int count)
+extern "C" void  __declspec(naked) asmWrap8bS (int tex, int start, int height, int mask, int line, int full, int count)
 {
 	_asm {
 		align 4
@ -2903,7 +2903,7 @@ loop_x:
 	}
 }

-void  __declspec(naked) asmClamp8bS (int tex, int constant, int height,int line, int full, int count)
+extern "C" void  __declspec(naked) asmClamp8bS (int tex, int constant, int height,int line, int full, int count)
 {
 	_asm {
 		align 4
@ -2949,7 +2949,7 @@ x_loop:
 //16b textures mirror/clamp/wrap
 //****************************************************************

-void  __declspec(naked) asmMirror16bS (int tex, int start, int width, int height, int mask, int line, int full, int count)
+extern "C" void  __declspec(naked) asmMirror16bS (int tex, int start, int width, int height, int mask, int line, int full, int count)
 {
 	_asm {
 		align 4
@ -3011,7 +3011,7 @@ end_mirror_check:
 	}
 }

-void  __declspec(naked) asmWrap16bS (int tex, int start, int height, int mask, int line, int full, int count)
+extern "C" void  __declspec(naked) asmWrap16bS (int tex, int start, int height, int mask, int line, int full, int count)
 {
 	_asm {
 		align 4
@ -3058,7 +3058,7 @@ loop_x:
 	}
 }

-void  __declspec(naked) asmClamp16bS (int tex, int constant, int height,int line, int full, int count)
+extern "C" void  __declspec(naked) asmClamp16bS (int tex, int constant, int height,int line, int full, int count)
 {
 	_asm {
 		align 4
@ -3104,7 +3104,7 @@ x_loop:
 //32b textures mirror/clamp/wrap
 //****************************************************************

-void  __declspec(naked) asmMirror32bS (int tex, int start, int width, int height, int mask, int line, int full, int count)
+extern "C" void  __declspec(naked) asmMirror32bS (int tex, int start, int width, int height, int mask, int line, int full, int count)
 {
 	_asm {
 		align 4
@ -3166,7 +3166,7 @@ end_mirror_check:
 	}
 }

-void  __declspec(naked) asmWrap32bS (int tex, int start, int height, int mask, int line, int full, int count)
+extern "C" void  __declspec(naked) asmWrap32bS (int tex, int start, int height, int mask, int line, int full, int count)
 {
 	_asm {
 		align 4
@ -3213,7 +3213,7 @@ loop_x:
 	}
 }

-void  __declspec(naked) asmClamp32bS (int tex, int constant, int height,int line, int full, int count)
+extern "C" void  __declspec(naked) asmClamp32bS (int tex, int constant, int height,int line, int full, int count)
 {
 	_asm {
 		align 4
@ -3261,7 +3261,7 @@ x_loop:
 //
 //****************************************************************

-void  __declspec(naked) asmTexConv_ARGB1555_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int isize)
+extern "C" void  __declspec(naked) asmTexConv_ARGB1555_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int isize)
 {
 	_asm {
 		align 4
@ -3320,7 +3320,7 @@ tc1_loop:
 	}
 }

-void  __declspec(naked) asmTexConv_AI88_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int isize)
+extern "C" void  __declspec(naked) asmTexConv_AI88_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int isize)
 {
 	_asm {
 		align 4
@ -3367,7 +3367,7 @@ tc1_loop:
 	}
 }

-void  __declspec(naked) asmTexConv_AI44_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int isize)
+extern "C" void  __declspec(naked) asmTexConv_AI44_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int isize)
 {
 	_asm {
 		align 4
@ -3445,7 +3445,7 @@ tc1_loop:
 	}
 }

-void  __declspec(naked) asmTexConv_A8_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int isize)
+extern "C" void  __declspec(naked) asmTexConv_A8_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int isize)
 {
 	_asm {
 		align 4
@ -3541,7 +3541,7 @@ tc1_loop:
 // esi = base_addr (preserved)
 // edx = offset (preserved)
 //****************************************************************
-__declspec(naked) void CopyBlock ( void )
+void __declspec(naked) CopyBlock ( void )
 {
 	_asm {
 		align 4