diff --git a/desmume/src/matrix_sse2-x64.asm b/desmume/src/matrix_sse2-x64.asm
new file mode 100644
index 000000000..9071bbcba
--- /dev/null
+++ b/desmume/src/matrix_sse2-x64.asm
@@ -0,0 +1,178 @@
+;
+; Copyright (C) 2006 yopyop
+; Copyright (C) 2008 CrazyMax
+;
+; This file is part of DeSmuME
+;
+; DeSmuME is free software; you can redistribute it and/or modify
+; it under the terms of the GNU General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or
+; (at your option) any later version.
+;
+; DeSmuME is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with DeSmuME; if not, write to the Free Software
+; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ TITLE matrix_sse2-x64.asm
+ .code				; the procedures below are assembled into the code segment
+
+; MatrixMultVec4x4: rcx -> four 16-byte matrix rows, rdx -> 4-float vector.
+;   Both pointers must be 16-byte aligned (aligned SSE memory operands).
+;   [rdx] = x*row0 + y*row1 + z*row2 + row3; the vector's w lane is not used.
+;   Only xmm0-xmm2 are touched: xmm6/xmm7 are callee-saved in the Microsoft
+;   x64 calling convention and this frameless leaf cannot save them.
+MatrixMultVec4x4 PROC PUBLIC
+        movaps  xmm0, XMMWORD PTR [rdx]         ; x y z w
+        movaps  xmm1, xmm0
+        movaps  xmm2, xmm0
+        shufps  xmm0, xmm0, 00000000b           ; x x x x
+        shufps  xmm1, xmm1, 01010101b           ; y y y y
+        shufps  xmm2, xmm2, 10101010b           ; z z z z
+        mulps   xmm0, XMMWORD PTR [rcx]         ; x * row0
+        mulps   xmm1, XMMWORD PTR [rcx+16]      ; y * row1
+        mulps   xmm2, XMMWORD PTR [rcx+32]      ; z * row2
+        addps   xmm0, xmm1
+        addps   xmm0, xmm2
+        addps   xmm0, XMMWORD PTR [rcx+48]      ; + row3 (read before the store)
+        movaps  XMMWORD PTR [rdx], xmm0
+        ret     0
+MatrixMultVec4x4 ENDP
+
+; MatrixMultVec3x3: rcx -> matrix rows, rdx -> vector (both 16-byte aligned).
+;   [rdx] = x*row0 + y*row1 + z*row2; row3 and the vector's w lane are not
+;   read. Uses xmm0-xmm2 only (xmm6/xmm7 are callee-saved on Win64).
+MatrixMultVec3x3 PROC PUBLIC
+        movaps  xmm0, XMMWORD PTR [rdx]         ; x y z w
+        movaps  xmm1, xmm0
+        movaps  xmm2, xmm0
+        shufps  xmm0, xmm0, 00000000b           ; x x x x
+        shufps  xmm1, xmm1, 01010101b           ; y y y y
+        shufps  xmm2, xmm2, 10101010b           ; z z z z
+        mulps   xmm0, XMMWORD PTR [rcx]         ; x * row0
+        mulps   xmm1, XMMWORD PTR [rcx+16]      ; y * row1
+        mulps   xmm2, XMMWORD PTR [rcx+32]      ; z * row2
+        addps   xmm0, xmm1
+        addps   xmm0, xmm2
+        movaps  XMMWORD PTR [rdx], xmm0
+        ret     0                               ; required: else falls into MatrixMultiply
+MatrixMultVec3x3 ENDP
+
+; MatrixMultiply: rcx -> 4x4 float matrix A (overwritten with the result),
+;                 rdx -> 4x4 float matrix B; both must be 16-byte aligned.
+; For every 16-byte row i of B:
+;   A_out[i] = B[i].x*A[0] + B[i].y*A[1] + B[i].z*A[2] + B[i].w*A[3]
+; All four rows of A are loaded up front because the stores overwrite A.
+; Only the volatile xmm0-xmm5 are used: xmm6/xmm7 are callee-saved in the
+; Microsoft x64 calling convention, so each lane of B is broadcast with
+; PSHUFD directly from memory instead of keeping register copies of the row.
+MatrixMultiply PROC PUBLIC
+        movaps  xmm0, XMMWORD PTR [rcx]         ; A row 0
+        movaps  xmm1, XMMWORD PTR [rcx+16]      ; A row 1
+        movaps  xmm2, XMMWORD PTR [rcx+32]      ; A row 2
+        movaps  xmm3, XMMWORD PTR [rcx+48]      ; A row 3
+
+        ; result row 0 (r00, r01, r02, r03)
+        pshufd  xmm4, XMMWORD PTR [rdx], 00000000b
+        mulps   xmm4, xmm0
+        pshufd  xmm5, XMMWORD PTR [rdx], 01010101b
+        mulps   xmm5, xmm1
+        addps   xmm4, xmm5
+        pshufd  xmm5, XMMWORD PTR [rdx], 10101010b
+        mulps   xmm5, xmm2
+        addps   xmm4, xmm5
+        pshufd  xmm5, XMMWORD PTR [rdx], 11111111b
+        mulps   xmm5, xmm3
+        addps   xmm4, xmm5
+        movaps  XMMWORD PTR [rcx], xmm4
+
+        ; result row 1 (r04, r05, r06, r07)
+        pshufd  xmm4, XMMWORD PTR [rdx+16], 00000000b
+        mulps   xmm4, xmm0
+        pshufd  xmm5, XMMWORD PTR [rdx+16], 01010101b
+        mulps   xmm5, xmm1
+        addps   xmm4, xmm5
+        pshufd  xmm5, XMMWORD PTR [rdx+16], 10101010b
+        mulps   xmm5, xmm2
+        addps   xmm4, xmm5
+        pshufd  xmm5, XMMWORD PTR [rdx+16], 11111111b
+        mulps   xmm5, xmm3
+        addps   xmm4, xmm5
+        movaps  XMMWORD PTR [rcx+16], xmm4
+
+        ; result row 2 (r08, r09, r10, r11)
+        pshufd  xmm4, XMMWORD PTR [rdx+32], 00000000b
+        mulps   xmm4, xmm0
+        pshufd  xmm5, XMMWORD PTR [rdx+32], 01010101b
+        mulps   xmm5, xmm1
+        addps   xmm4, xmm5
+        pshufd  xmm5, XMMWORD PTR [rdx+32], 10101010b
+        mulps   xmm5, xmm2
+        addps   xmm4, xmm5
+        pshufd  xmm5, XMMWORD PTR [rdx+32], 11111111b
+        mulps   xmm5, xmm3
+        addps   xmm4, xmm5
+        movaps  XMMWORD PTR [rcx+32], xmm4
+
+        ; result row 3 (r12, r13, r14, r15)
+        pshufd  xmm4, XMMWORD PTR [rdx+48], 00000000b
+        mulps   xmm4, xmm0
+        pshufd  xmm5, XMMWORD PTR [rdx+48], 01010101b
+        mulps   xmm5, xmm1
+        addps   xmm4, xmm5
+        pshufd  xmm5, XMMWORD PTR [rdx+48], 10101010b
+        mulps   xmm5, xmm2
+        addps   xmm4, xmm5
+        pshufd  xmm5, XMMWORD PTR [rdx+48], 11111111b
+        mulps   xmm5, xmm3
+        addps   xmm4, xmm5
+        movaps  XMMWORD PTR [rcx+48], xmm4
+        ret     0
+MatrixMultiply ENDP
+
+; MatrixTranslate: rcx -> four 16-byte matrix rows, rdx -> 4-float vector.
+;   Both pointers must be 16-byte aligned (aligned SSE memory operands).
+;   row3 = x*row0 + y*row1 + z*row2 + row3; rows 0-2 are left unchanged.
+;   Only xmm0-xmm2 are touched: xmm6/xmm7 are callee-saved in the Microsoft
+;   x64 calling convention and this frameless leaf cannot save them.
+MatrixTranslate PROC PUBLIC
+        movaps  xmm0, XMMWORD PTR [rdx]         ; x y z w
+        movaps  xmm1, xmm0
+        movaps  xmm2, xmm0
+        shufps  xmm0, xmm0, 00000000b           ; x x x x
+        shufps  xmm1, xmm1, 01010101b           ; y y y y
+        shufps  xmm2, xmm2, 10101010b           ; z z z z
+        mulps   xmm0, XMMWORD PTR [rcx]         ; x * row0
+        mulps   xmm1, XMMWORD PTR [rcx+16]      ; y * row1
+        mulps   xmm2, XMMWORD PTR [rcx+32]      ; z * row2
+        addps   xmm0, xmm1
+        addps   xmm0, xmm2
+        addps   xmm0, XMMWORD PTR [rcx+48]      ; + old row3
+        movaps  XMMWORD PTR [rcx+48], xmm0      ; new row3
+        ret     0
+MatrixTranslate ENDP
+
+; MatrixScale: rcx -> matrix rows, rdx -> vector (both 16-byte aligned).
+;   row0 *= x, row1 *= y, row2 *= z; row3 is not touched. Uses xmm0-xmm2
+;   only, because xmm6 is callee-saved in the Microsoft x64 convention.
+MatrixScale PROC PUBLIC
+        movaps  xmm0, XMMWORD PTR [rdx]         ; x y z w
+        movaps  xmm1, xmm0
+        movaps  xmm2, xmm0
+        shufps  xmm0, xmm0, 00000000b           ; x x x x
+        shufps  xmm1, xmm1, 01010101b           ; y y y y
+        shufps  xmm2, xmm2, 10101010b           ; z z z z
+        mulps   xmm0, XMMWORD PTR [rcx]         ; all rows are read ...
+        mulps   xmm1, XMMWORD PTR [rcx+16]
+        mulps   xmm2, XMMWORD PTR [rcx+32]
+        movaps  XMMWORD PTR [rcx], xmm0         ; ... before any is written
+        movaps  XMMWORD PTR [rcx+16], xmm1
+        movaps  XMMWORD PTR [rcx+32], xmm2
+        ret     0
+MatrixScale ENDP
+
+end
diff --git a/desmume/src/matrix_sse2-x86.asm b/desmume/src/matrix_sse2-x86.asm
new file mode 100644
index 000000000..5d5d76865
--- /dev/null
+++ b/desmume/src/matrix_sse2-x86.asm
@@ -0,0 +1,215 @@
+;
+; Copyright (C) 2006 yopyop
+; Copyright (C) 2008 CrazyMax
+;
+; This file is part of DeSmuME
+;
+; DeSmuME is free software; you can redistribute it and/or modify
+; it under the terms of the GNU General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or
+; (at your option) any later version.
+;
+; DeSmuME is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with DeSmuME; if not, write to the Free Software
+; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ TITLE matrix_sse2-x86.asm
+ .686P				; accept Pentium Pro-class instructions, privileged ones included
+ .XMM				; accept SSE instructions that operate on the xmm registers
+ .model flat			; 32-bit flat memory model
+ .code				; the procedures below are assembled into the code segment
+
+; @MatrixMultVec4x4@8: both arguments arrive in registers.
+;   ecx -> four 16-byte matrix rows, edx -> 4-float vector; both must be
+;   16-byte aligned because aligned SSE memory operands are used.
+;   [edx] = x*row0 + y*row1 + z*row2 + row3; the vector's w lane is unused.
+;   Matrix rows are consumed as memory operands; no unused xmm7 copy is made.
+@MatrixMultVec4x4@8 PROC PUBLIC
+        push    ebp
+        mov     ebp, esp
+
+        movaps  xmm0, XMMWORD PTR [edx]         ; x y z w
+        movaps  xmm1, xmm0
+        movaps  xmm2, xmm0
+        shufps  xmm0, xmm0, 00000000b           ; x x x x
+        shufps  xmm1, xmm1, 01010101b           ; y y y y
+        shufps  xmm2, xmm2, 10101010b           ; z z z z
+        mulps   xmm0, XMMWORD PTR [ecx]         ; x * row0
+        mulps   xmm1, XMMWORD PTR [ecx+16]      ; y * row1
+        mulps   xmm2, XMMWORD PTR [ecx+32]      ; z * row2
+        addps   xmm0, xmm1
+        addps   xmm0, xmm2
+        addps   xmm0, XMMWORD PTR [ecx+48]      ; + row3 (read before the store)
+        movaps  XMMWORD PTR [edx], xmm0
+        mov     esp, ebp
+        pop     ebp
+        ret     0
+@MatrixMultVec4x4@8 ENDP
+
+; @MatrixMultVec3x3@8: both arguments arrive in registers.
+;   ecx -> matrix rows, edx -> 4-float vector; both must be 16-byte aligned
+;   because aligned SSE memory operands are used.
+;   [edx] = x*row0 + y*row1 + z*row2; row3 and the vector's w lane are unused.
+;   Matrix rows are consumed as memory operands; no unused xmm7 copy is made.
+@MatrixMultVec3x3@8 PROC PUBLIC
+        push    ebp
+        mov     ebp, esp
+
+        movaps  xmm0, XMMWORD PTR [edx]         ; x y z w
+        movaps  xmm1, xmm0
+        movaps  xmm2, xmm0
+        shufps  xmm0, xmm0, 00000000b           ; x x x x
+        shufps  xmm1, xmm1, 01010101b           ; y y y y
+        shufps  xmm2, xmm2, 10101010b           ; z z z z
+        mulps   xmm0, XMMWORD PTR [ecx]         ; x * row0
+        mulps   xmm1, XMMWORD PTR [ecx+16]      ; y * row1
+        mulps   xmm2, XMMWORD PTR [ecx+32]      ; z * row2
+        addps   xmm0, xmm1
+        addps   xmm0, xmm2
+        movaps  XMMWORD PTR [edx], xmm0
+        mov     esp, ebp
+        pop     ebp
+        ret     0
+@MatrixMultVec3x3@8 ENDP
+
+
+; @MatrixMultiply@8: ecx -> 4x4 float matrix A (overwritten with the result),
+; edx -> 4x4 float matrix B; both 16-byte aligned (MOVAPS is used throughout).
+; Each output row i is B[i].x*A0 + B[i].y*A1 + B[i].z*A2 + B[i].w*A3, with the
+; rows A0..A3 cached in xmm4..xmm7 before the first store into A.
+@MatrixMultiply@8 PROC PUBLIC
+        push    ebp
+        mov     ebp, esp
+        movaps  xmm4, XMMWORD PTR [ecx]         ; A0
+        movaps  xmm5, XMMWORD PTR [ecx+16]      ; A1
+        movaps  xmm6, XMMWORD PTR [ecx+32]      ; A2
+        movaps  xmm7, XMMWORD PTR [ecx+48]      ; A3
+        movaps  xmm0, XMMWORD PTR [edx]         ; B row 0: r00, r01, r02, r03
+        movaps  xmm1, xmm0
+        shufps  xmm1, xmm1, 55h                 ; y y y y
+        mulps   xmm1, xmm5
+        movaps  xmm2, xmm0
+        shufps  xmm2, xmm2, 0AAh                ; z z z z
+        mulps   xmm2, xmm6
+        movaps  xmm3, xmm0
+        shufps  xmm3, xmm3, 0FFh                ; w w w w
+        mulps   xmm3, xmm7
+        shufps  xmm0, xmm0, 00h                 ; x x x x
+        mulps   xmm0, xmm4
+        addps   xmm0, xmm1                      ; sum kept in x, y, z, w order
+        addps   xmm0, xmm2
+        addps   xmm0, xmm3
+        movaps  XMMWORD PTR [ecx], xmm0
+        movaps  xmm0, XMMWORD PTR [edx+16]      ; B row 1: r04, r05, r06, r07
+        movaps  xmm1, xmm0
+        shufps  xmm1, xmm1, 55h
+        mulps   xmm1, xmm5
+        movaps  xmm2, xmm0
+        shufps  xmm2, xmm2, 0AAh
+        mulps   xmm2, xmm6
+        movaps  xmm3, xmm0
+        shufps  xmm3, xmm3, 0FFh
+        mulps   xmm3, xmm7
+        shufps  xmm0, xmm0, 00h
+        mulps   xmm0, xmm4
+        addps   xmm0, xmm1
+        addps   xmm0, xmm2
+        addps   xmm0, xmm3
+        movaps  XMMWORD PTR [ecx+16], xmm0
+        movaps  xmm0, XMMWORD PTR [edx+32]      ; B row 2: r08, r09, r10, r11
+        movaps  xmm1, xmm0
+        shufps  xmm1, xmm1, 55h
+        mulps   xmm1, xmm5
+        movaps  xmm2, xmm0
+        shufps  xmm2, xmm2, 0AAh
+        mulps   xmm2, xmm6
+        movaps  xmm3, xmm0
+        shufps  xmm3, xmm3, 0FFh
+        mulps   xmm3, xmm7
+        shufps  xmm0, xmm0, 00h
+        mulps   xmm0, xmm4
+        addps   xmm0, xmm1
+        addps   xmm0, xmm2
+        addps   xmm0, xmm3
+        movaps  XMMWORD PTR [ecx+32], xmm0
+        movaps  xmm0, XMMWORD PTR [edx+48]      ; B row 3: r12, r13, r14, r15
+        movaps  xmm1, xmm0
+        shufps  xmm1, xmm1, 55h
+        mulps   xmm1, xmm5
+        movaps  xmm2, xmm0
+        shufps  xmm2, xmm2, 0AAh
+        mulps   xmm2, xmm6
+        movaps  xmm3, xmm0
+        shufps  xmm3, xmm3, 0FFh
+        mulps   xmm3, xmm7
+        shufps  xmm0, xmm0, 00h
+        mulps   xmm0, xmm4
+        addps   xmm0, xmm1
+        addps   xmm0, xmm2
+        addps   xmm0, xmm3
+        movaps  XMMWORD PTR [ecx+48], xmm0
+        leave                                   ; mov esp, ebp / pop ebp
+        ret     0
+@MatrixMultiply@8 ENDP
+
+; @MatrixTranslate@8: both arguments arrive in registers.
+;   ecx -> four 16-byte matrix rows, edx -> 4-float vector; both must be
+;   16-byte aligned because aligned SSE memory operands are used.
+;   row3 = x*row0 + y*row1 + z*row2 + row3; rows 0-2 are left unchanged and
+;   the vector's w lane is unused.
+;   Matrix rows are consumed as memory operands; no unused xmm7 copy is made.
+@MatrixTranslate@8 PROC PUBLIC
+        push    ebp
+        mov     ebp, esp
+
+        movaps  xmm0, XMMWORD PTR [edx]         ; x y z w
+        movaps  xmm1, xmm0
+        movaps  xmm2, xmm0
+        shufps  xmm0, xmm0, 00000000b           ; x x x x
+        shufps  xmm1, xmm1, 01010101b           ; y y y y
+        shufps  xmm2, xmm2, 10101010b           ; z z z z
+        mulps   xmm0, XMMWORD PTR [ecx]         ; x * row0
+        mulps   xmm1, XMMWORD PTR [ecx+16]      ; y * row1
+        mulps   xmm2, XMMWORD PTR [ecx+32]      ; z * row2
+        addps   xmm0, xmm1
+        addps   xmm0, xmm2
+        addps   xmm0, XMMWORD PTR [ecx+48]      ; + old row3
+        movaps  XMMWORD PTR [ecx+48], xmm0      ; new row3
+        mov     esp, ebp
+        pop     ebp
+        ret     0
+@MatrixTranslate@8 ENDP
+
+; @MatrixScale@8: both arguments arrive in registers.
+;   ecx -> matrix rows, edx -> 4-float vector; both must be 16-byte aligned
+;   because aligned SSE memory operands are used.
+;   row0 *= x, row1 *= y, row2 *= z; row3 and the vector's w lane are not
+;   touched. Every row is read before the first store back into the matrix.
+@MatrixScale@8 PROC PUBLIC
+        push    ebp
+        mov     ebp, esp
+
+        movaps  xmm0, XMMWORD PTR [edx]         ; x y z w
+        movaps  xmm1, xmm0
+        movaps  xmm2, xmm0
+        shufps  xmm0, xmm0, 00h                 ; x x x x
+        shufps  xmm1, xmm1, 55h                 ; y y y y
+        shufps  xmm2, xmm2, 0AAh                ; z z z z
+        mulps   xmm0, XMMWORD PTR [ecx]
+        mulps   xmm1, XMMWORD PTR [ecx+16]
+        mulps   xmm2, XMMWORD PTR [ecx+32]
+        movaps  XMMWORD PTR [ecx], xmm0
+        movaps  XMMWORD PTR [ecx+16], xmm1
+        movaps  XMMWORD PTR [ecx+32], xmm2
+
+        leave                                   ; mov esp, ebp / pop ebp
+        ret     0
+@MatrixScale@8 ENDP
+
+end
+
diff --git a/desmume/src/windows/DeSmuME_2005.vcproj b/desmume/src/windows/DeSmuME_2005.vcproj
index 8e2240270..c327f3a00 100644
--- a/desmume/src/windows/DeSmuME_2005.vcproj
+++ b/desmume/src/windows/DeSmuME_2005.vcproj
@@ -664,7 +664,7 @@
>