diff --git a/desmume/src/matrix_sse2-x64.asm b/desmume/src/matrix_sse2-x64.asm
new file mode 100644
index 000000000..9071bbcba
--- /dev/null
+++ b/desmume/src/matrix_sse2-x64.asm
@@ -0,0 +1,170 @@
+;
+; Copyright (C) 2006 yopyop
+; Copyright (C) 2008 CrazyMax
+;
+; This file is part of DeSmuME
+;
+; DeSmuME is free software; you can redistribute it and/or modify
+; it under the terms of the GNU General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or
+; (at your option) any later version.
+;
+; DeSmuME is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with DeSmuME; if not, write to the Free Software
+; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+	TITLE	matrix_sse2-x64.asm
+
+;-----------------------------------------------------------------------
+; 4x4 float matrix helpers (SSE), 64-bit MASM build.
+;
+; ABI: Microsoft x64 -- first pointer argument in rcx, second in rdx.
+; xmm6-xmm15 are callee-saved under that ABI, so every routine here
+; restricts itself to the volatile registers xmm0-xmm5.  All routines are
+; leaf functions with no stack usage, so no prologue or unwind data is
+; needed.
+;
+; rcx is treated as four 16-byte rows of packed floats at offsets 0, 16,
+; 32 and 48.  movaps / mulps / addps memory operands must be 16-byte
+; aligned or the instruction faults.
+;
+; In the Out: formulas below, x, y, z, w are lanes 0..3 of the source
+; vector or row, and rowN is the 16 bytes at [rcx+16*N] as read on entry.
+; Products and sums are per-lane packed-single operations.
+;-----------------------------------------------------------------------
+
+	.code
+
+; XYZ_TIMES_ROWS
+;   In:   rcx -> rows, rdx -> vector (x, y, z, w)
+;   Out:  xmm0 = x * row0, xmm1 = y * row1, xmm2 = z * row2
+;   The w lane of the vector is loaded but never broadcast.
+XYZ_TIMES_ROWS MACRO
+	movaps	xmm0, XMMWORD PTR [rdx]
+	movaps	xmm1, xmm0
+	movaps	xmm2, xmm0
+	shufps	xmm0, xmm0, 00000000b		; broadcast x
+	shufps	xmm1, xmm1, 01010101b		; broadcast y
+	shufps	xmm2, xmm2, 10101010b		; broadcast z
+	mulps	xmm0, XMMWORD PTR [rcx]
+	mulps	xmm1, XMMWORD PTR [rcx+16]
+	mulps	xmm2, XMMWORD PTR [rcx+32]
+	ENDM
+
+; MULT_ROW offs
+;   In:   xmm0-xmm3 = rows 0..3 read from [rcx] on entry,
+;         rdx -> right-hand operand
+;   Out:  [rcx+offs] = x*xmm0 + y*xmm1 + z*xmm2 + w*xmm3 for the row
+;         (x, y, z, w) at [rdx+offs]
+;   Clob: xmm4, xmm5
+;   Each lane is loaded with movss and then broadcast, so two scratch
+;   registers are enough and xmm6 / xmm7 stay untouched.
+MULT_ROW MACRO offs
+	movss	xmm4, DWORD PTR [rdx+offs]
+	shufps	xmm4, xmm4, 00000000b
+	mulps	xmm4, xmm0			; x * row0
+	movss	xmm5, DWORD PTR [rdx+offs+4]
+	shufps	xmm5, xmm5, 00000000b
+	mulps	xmm5, xmm1			; y * row1
+	addps	xmm4, xmm5
+	movss	xmm5, DWORD PTR [rdx+offs+8]
+	shufps	xmm5, xmm5, 00000000b
+	mulps	xmm5, xmm2			; z * row2
+	addps	xmm4, xmm5
+	movss	xmm5, DWORD PTR [rdx+offs+12]
+	shufps	xmm5, xmm5, 00000000b
+	mulps	xmm5, xmm3			; w * row3
+	addps	xmm4, xmm5
+	movaps	XMMWORD PTR [rcx+offs], xmm4
+	ENDM
+
+;-----------------------------------------------------------------------
+; MatrixMultVec4x4
+;   In:   rcx -> matrix rows, rdx -> vector
+;   Out:  [rdx] = x*row0 + y*row1 + z*row2 + row3
+;         (input w is not used)
+;   Clob: xmm0-xmm2
+;-----------------------------------------------------------------------
+MatrixMultVec4x4 PROC PUBLIC
+	XYZ_TIMES_ROWS
+	addps	xmm0, xmm1
+	addps	xmm0, xmm2
+	addps	xmm0, XMMWORD PTR [rcx+48]	; + row3
+	movaps	XMMWORD PTR [rdx], xmm0
+	ret	0
+MatrixMultVec4x4 ENDP
+
+;-----------------------------------------------------------------------
+; MatrixMultVec3x3
+;   In:   rcx -> matrix rows, rdx -> vector
+;   Out:  [rdx] = x*row0 + y*row1 + z*row2
+;         (input w is not used; all four lanes are stored)
+;   Clob: xmm0-xmm2
+;-----------------------------------------------------------------------
+MatrixMultVec3x3 PROC PUBLIC
+	XYZ_TIMES_ROWS
+	addps	xmm0, xmm1
+	addps	xmm0, xmm2
+	movaps	XMMWORD PTR [rdx], xmm0
+	ret	0				; required: ENDP alone would run on into the next PROC
+MatrixMultVec3x3 ENDP
+
+;-----------------------------------------------------------------------
+; MatrixMultiply
+;   In:   rcx -> matrix updated in place, rdx -> second matrix
+;   Out:  for each row i: [rcx+16*i] = x*row0 + y*row1 + z*row2 + w*row3,
+;         with (x, y, z, w) = the row at [rdx+16*i]
+;   Each source row at [rdx+16*i] is read before [rcx+16*i] is stored.
+;   Clob: xmm0-xmm5
+;-----------------------------------------------------------------------
+MatrixMultiply PROC PUBLIC
+	; Snapshot all four rows first: [rcx] is overwritten row by row below.
+	movaps	xmm0, XMMWORD PTR [rcx]
+	movaps	xmm1, XMMWORD PTR [rcx+16]
+	movaps	xmm2, XMMWORD PTR [rcx+32]
+	movaps	xmm3, XMMWORD PTR [rcx+48]
+
+	MULT_ROW 0				; r00, r01, r02, r03
+	MULT_ROW 16				; r04, r05, r06, r07
+	MULT_ROW 32				; r08, r09, r10, r11
+	MULT_ROW 48				; r12, r13, r14, r15
+	ret	0
+MatrixMultiply ENDP
+
+;-----------------------------------------------------------------------
+; MatrixTranslate
+;   In:   rcx -> matrix updated in place, rdx -> vector
+;   Out:  [rcx+48] = x*row0 + y*row1 + z*row2 + row3
+;         (input w is not used)
+;   Clob: xmm0-xmm2
+;-----------------------------------------------------------------------
+MatrixTranslate PROC PUBLIC
+	XYZ_TIMES_ROWS
+	addps	xmm0, xmm1
+	addps	xmm0, xmm2
+	addps	xmm0, XMMWORD PTR [rcx+48]	; + row3
+	movaps	XMMWORD PTR [rcx+48], xmm0
+	ret	0
+MatrixTranslate ENDP
+
+;-----------------------------------------------------------------------
+; MatrixScale
+;   In:   rcx -> matrix updated in place, rdx -> vector
+;   Out:  [rcx] = x*row0, [rcx+16] = y*row1, [rcx+32] = z*row2
+;         (the row at [rcx+48] is not touched)
+;   Clob: xmm0-xmm2
+;-----------------------------------------------------------------------
+MatrixScale PROC PUBLIC
+	XYZ_TIMES_ROWS
+	movaps	XMMWORD PTR [rcx], xmm0
+	movaps	XMMWORD PTR [rcx+16], xmm1
+	movaps	XMMWORD PTR [rcx+32], xmm2
+	ret	0
+MatrixScale ENDP
+
+end
diff --git a/desmume/src/matrix_sse2-x86.asm b/desmume/src/matrix_sse2-x86.asm
new file mode 100644
index 000000000..5d5d76865
--- /dev/null
+++ b/desmume/src/matrix_sse2-x86.asm
@@ -0,0 +1,215 @@
+;
+; Copyright (C) 2006 yopyop
+; Copyright (C) 2008 CrazyMax
+;
+; This file is part of DeSmuME
+;
+; DeSmuME is free software; you can redistribute it and/or modify
+; it under the terms of the GNU General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or
+; (at your option) any later version.
+;
+; DeSmuME is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with DeSmuME; if not, write to the Free Software
+; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+	TITLE	matrix_sse2-x86.asm
+	.686P
+	.XMM
+	.model	flat
+	.code
+
+;-----------------------------------------------------------------------
+; 4x4 float matrix helpers (SSE), 32-bit MASM build.
+;
+; The decorated names (@Name@8) and the use of ecx / edx for the two
+; pointer arguments match the MSVC __fastcall convention; nothing is
+; passed on the stack, hence "ret 0".  Presumed C-side prototype of every
+; routine (not visible from this file -- confirm against the header):
+;     void __fastcall Name(float *matrix, float *operand);
+;
+; ecx is treated as four 16-byte rows of packed floats at offsets 0, 16,
+; 32 and 48.  Every movaps / addps memory operand must be 16-byte aligned
+; or the instruction faults.
+;
+; Register use: XMM registers are used as scratch and not restored.  The
+; only general-purpose register written is ebp, which each procedure
+; saves on entry and restores on exit; the frame it sets up holds no
+; locals.
+;
+; In the Out: formulas below, x, y, z, w are lanes 0..3 of the source
+; vector or row, and rowN is the 16 bytes at [ecx+16*N] as read on entry.
+; Products and sums are per-lane packed-single operations.
+;-----------------------------------------------------------------------
+
+; XYZ_TIMES_ROWS
+;   In:   ecx -> rows, edx -> vector (x, y, z, w)
+;   Out:  xmm4 = x * row0, xmm5 = y * row1, xmm6 = z * row2
+;   Clob: xmm0-xmm2
+;   The w lane of the vector is loaded but never broadcast.
+XYZ_TIMES_ROWS MACRO
+	movaps	xmm0, XMMWORD PTR [ecx]
+	movaps	xmm1, XMMWORD PTR [ecx+16]
+	movaps	xmm2, XMMWORD PTR [ecx+32]
+	movaps	xmm4, XMMWORD PTR [edx]
+	movaps	xmm5, xmm4
+	movaps	xmm6, xmm4
+	shufps	xmm4, xmm4, 00000000b		; broadcast x
+	shufps	xmm5, xmm5, 01010101b		; broadcast y
+	shufps	xmm6, xmm6, 10101010b		; broadcast z
+	mulps	xmm4, xmm0
+	mulps	xmm5, xmm1
+	mulps	xmm6, xmm2
+	ENDM
+
+; MULT_ROW offs
+;   In:   xmm0-xmm3 = rows 0..3 read from [ecx] on entry,
+;         edx -> right-hand operand
+;   Out:  [ecx+offs] = x*xmm0 + y*xmm1 + z*xmm2 + w*xmm3 for the row
+;         (x, y, z, w) at [edx+offs]
+;   Clob: xmm4-xmm7
+MULT_ROW MACRO offs
+	movaps	xmm4, XMMWORD PTR [edx+offs]
+	movaps	xmm5, xmm4
+	movaps	xmm6, xmm4
+	movaps	xmm7, xmm4
+	shufps	xmm4, xmm4, 00000000b		; broadcast x
+	shufps	xmm5, xmm5, 01010101b		; broadcast y
+	shufps	xmm6, xmm6, 10101010b		; broadcast z
+	shufps	xmm7, xmm7, 11111111b		; broadcast w
+	mulps	xmm4, xmm0
+	mulps	xmm5, xmm1
+	mulps	xmm6, xmm2
+	mulps	xmm7, xmm3
+	addps	xmm4, xmm5
+	addps	xmm4, xmm6
+	addps	xmm4, xmm7
+	movaps	XMMWORD PTR [ecx+offs], xmm4
+	ENDM
+
+;-----------------------------------------------------------------------
+; @MatrixMultVec4x4@8
+;   In:   ecx -> matrix rows, edx -> vector
+;   Out:  [edx] = x*row0 + y*row1 + z*row2 + row3
+;         (input w is not used)
+;   Clob: xmm0-xmm2, xmm4-xmm6
+;-----------------------------------------------------------------------
+PUBLIC @MatrixMultVec4x4@8
+@MatrixMultVec4x4@8 PROC
+	push	ebp
+	mov	ebp, esp
+
+	XYZ_TIMES_ROWS
+	addps	xmm4, xmm5
+	addps	xmm4, xmm6
+	addps	xmm4, XMMWORD PTR [ecx+48]	; + row3
+	movaps	XMMWORD PTR [edx], xmm4
+
+	mov	esp, ebp
+	pop	ebp
+	ret	0
+@MatrixMultVec4x4@8 ENDP
+
+;-----------------------------------------------------------------------
+; @MatrixMultVec3x3@8
+;   In:   ecx -> matrix rows, edx -> vector
+;   Out:  [edx] = x*row0 + y*row1 + z*row2
+;         (input w is not used; all four lanes are stored)
+;   Clob: xmm0-xmm2, xmm4-xmm6
+;-----------------------------------------------------------------------
+PUBLIC @MatrixMultVec3x3@8
+@MatrixMultVec3x3@8 PROC
+	push	ebp
+	mov	ebp, esp
+
+	XYZ_TIMES_ROWS
+	addps	xmm4, xmm5
+	addps	xmm4, xmm6
+	movaps	XMMWORD PTR [edx], xmm4
+
+	mov	esp, ebp
+	pop	ebp
+	ret	0
+@MatrixMultVec3x3@8 ENDP
+
+;-----------------------------------------------------------------------
+; @MatrixMultiply@8
+;   In:   ecx -> matrix updated in place, edx -> second matrix
+;   Out:  for each row i: [ecx+16*i] = x*row0 + y*row1 + z*row2 + w*row3,
+;         with (x, y, z, w) = the row at [edx+16*i]
+;   Each source row at [edx+16*i] is read before [ecx+16*i] is stored.
+;   Clob: xmm0-xmm7
+;-----------------------------------------------------------------------
+PUBLIC @MatrixMultiply@8
+@MatrixMultiply@8 PROC
+	push	ebp
+	mov	ebp, esp
+
+	; Snapshot all four rows first: [ecx] is overwritten row by row below.
+	movaps	xmm0, XMMWORD PTR [ecx]
+	movaps	xmm1, XMMWORD PTR [ecx+16]
+	movaps	xmm2, XMMWORD PTR [ecx+32]
+	movaps	xmm3, XMMWORD PTR [ecx+48]
+
+	MULT_ROW 0				; r00, r01, r02, r03
+	MULT_ROW 16				; r04, r05, r06, r07
+	MULT_ROW 32				; r08, r09, r10, r11
+	MULT_ROW 48				; r12, r13, r14, r15
+
+	mov	esp, ebp
+	pop	ebp
+	ret	0
+@MatrixMultiply@8 ENDP
+
+;-----------------------------------------------------------------------
+; @MatrixTranslate@8
+;   In:   ecx -> matrix updated in place, edx -> vector
+;   Out:  [ecx+48] = x*row0 + y*row1 + z*row2 + row3
+;         (input w is not used)
+;   Clob: xmm0-xmm2, xmm4-xmm6
+;-----------------------------------------------------------------------
+PUBLIC @MatrixTranslate@8
+@MatrixTranslate@8 PROC
+	push	ebp
+	mov	ebp, esp
+
+	XYZ_TIMES_ROWS
+	addps	xmm4, xmm5
+	addps	xmm4, xmm6
+	addps	xmm4, XMMWORD PTR [ecx+48]	; + row3
+	movaps	XMMWORD PTR [ecx+48], xmm4
+
+	mov	esp, ebp
+	pop	ebp
+	ret	0
+@MatrixTranslate@8 ENDP
+
+;-----------------------------------------------------------------------
+; @MatrixScale@8
+;   In:   ecx -> matrix updated in place, edx -> vector
+;   Out:  [ecx] = x*row0, [ecx+16] = y*row1, [ecx+32] = z*row2
+;         (the row at [ecx+48] is not touched)
+;   Clob: xmm0-xmm2, xmm4-xmm6
+;-----------------------------------------------------------------------
+PUBLIC @MatrixScale@8
+@MatrixScale@8 PROC
+	push	ebp
+	mov	ebp, esp
+
+	XYZ_TIMES_ROWS
+	movaps	XMMWORD PTR [ecx], xmm4
+	movaps	XMMWORD PTR [ecx+16], xmm5
+	movaps	XMMWORD PTR [ecx+32], xmm6
+
+	mov	esp, ebp
+	pop	ebp
+	ret	0
+@MatrixScale@8 ENDP
+
+end
+
diff --git a/desmume/src/windows/DeSmuME_2005.vcproj b/desmume/src/windows/DeSmuME_2005.vcproj
index 8e2240270..c327f3a00 100644
--- a/desmume/src/windows/DeSmuME_2005.vcproj
+++ b/desmume/src/windows/DeSmuME_2005.vcproj
@@ -664,7 +664,7 @@ >