This commit is contained in:
mtabachenko 2009-06-16 18:01:56 +00:00
parent b7f41c4ff1
commit 08628592cf
2 changed files with 107 additions and 138 deletions

View File

@ -21,7 +21,7 @@
TITLE matrix_sse2-x64.asm TITLE matrix_sse2-x64.asm
.code .code
MatrixMultVec4x4 PROC PUBLIC _sse2_MatrixMultVec4x4 PROC PUBLIC
movaps xmm0, XMMWORD PTR [rcx] movaps xmm0, XMMWORD PTR [rcx]
movaps xmm1, XMMWORD PTR [rcx+16] movaps xmm1, XMMWORD PTR [rcx+16]
movaps xmm2, XMMWORD PTR [rcx+32] movaps xmm2, XMMWORD PTR [rcx+32]
@ -43,9 +43,9 @@ MatrixMultVec4x4 PROC PUBLIC
addps xmm4, xmm7 addps xmm4, xmm7
movaps XMMWORD PTR [rdx], xmm4 movaps XMMWORD PTR [rdx], xmm4
ret 0 ret 0
MatrixMultVec4x4 ENDP _sse2_MatrixMultVec4x4 ENDP
MatrixMultVec3x3 PROC PUBLIC _sse2_MatrixMultVec3x3 PROC PUBLIC
movaps xmm0, XMMWORD PTR [rcx] movaps xmm0, XMMWORD PTR [rcx]
movaps xmm1, XMMWORD PTR [rcx+16] movaps xmm1, XMMWORD PTR [rcx+16]
movaps xmm2, XMMWORD PTR [rcx+32] movaps xmm2, XMMWORD PTR [rcx+32]
@ -62,81 +62,83 @@ MatrixMultVec3x3 PROC PUBLIC
addps xmm4, xmm5 addps xmm4, xmm5
addps xmm4, xmm6 addps xmm4, xmm6
movaps XMMWORD PTR [rdx], xmm4 movaps XMMWORD PTR [rdx], xmm4
MatrixMultVec3x3 ENDP ret 0
_sse2_MatrixMultVec3x3 ENDP
MatrixMultiply PROC PUBLIC _sse2_MatrixMultiply PROC PUBLIC
movaps xmm0, XMMWORD PTR [rcx] movaps xmm0, XMMWORD PTR [rcx]
movaps xmm1, XMMWORD PTR [rcx+16] movaps xmm1, XMMWORD PTR [rcx+16]
movaps xmm2, XMMWORD PTR [rcx+32] movaps xmm2, XMMWORD PTR [rcx+32]
movaps xmm3, XMMWORD PTR [rcx+48] movaps xmm3, XMMWORD PTR [rcx+48]
movaps xmm4, XMMWORD PTR [rdx] ; r00, r01, r02, r03 movaps xmm4, XMMWORD PTR [rdx] ; r00, r01, r02, r03
movaps xmm8, XMMWORD PTR [rdx+16] ; r04, r05, r06, r07
movaps xmm5,xmm4 movaps xmm5,xmm4
movaps xmm6,xmm4 movaps xmm6,xmm4
movaps xmm7,xmm4 movaps xmm7,xmm4
movaps xmm9,xmm8 ;
movaps xmm10,xmm8
movaps xmm11,xmm8
shufps xmm4,xmm4,00000000b shufps xmm4,xmm4,00000000b
shufps xmm5,xmm5,01010101b shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b shufps xmm7,xmm7,11111111b
shufps xmm8, xmm8, 00000000b ;
shufps xmm9, xmm9, 01010101b
shufps xmm10,xmm10,10101010b
shufps xmm11,xmm11,11111111b
mulps xmm4,xmm0 mulps xmm4,xmm0
mulps xmm5,xmm1 mulps xmm5,xmm1
mulps xmm6,xmm2 mulps xmm6,xmm2
mulps xmm7,xmm3 mulps xmm7,xmm3
mulps xmm8, xmm0 ;
mulps xmm9, xmm1
mulps xmm10,xmm2
mulps xmm11,xmm3
addps xmm4,xmm5 addps xmm4,xmm5
addps xmm4,xmm6 addps xmm4,xmm6
addps xmm4,xmm7 addps xmm4,xmm7
addps xmm8,xmm9 ;
addps xmm8,xmm10
addps xmm8,xmm11
movaps XMMWORD PTR [rcx],xmm4 movaps XMMWORD PTR [rcx],xmm4
movaps xmm4, XMMWORD PTR [rdx+16] ; r04, r05, r06, r07 movaps XMMWORD PTR [rcx+16],xmm8
movaps xmm5,xmm4
movaps xmm6,xmm4
movaps xmm7,xmm4
shufps xmm4,xmm4,00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
movaps XMMWORD PTR [rcx+16],xmm4
movaps xmm4, XMMWORD PTR [rdx+32] ; r08, r09, r10, r11
movaps xmm5,xmm4
movaps xmm6,xmm4
movaps xmm7,xmm4
shufps xmm4,xmm4,00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
movaps XMMWORD PTR [rcx+32],xmm4
movaps xmm4, XMMWORD PTR [rdx+48] ; r12, r13, r14, r15
movaps xmm5,xmm4
movaps xmm6,xmm4
movaps xmm7,xmm4
shufps xmm4,xmm4,00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
movaps XMMWORD PTR [rcx+48],xmm4
ret 0
MatrixMultiply ENDP
MatrixTranslate PROC PUBLIC movaps xmm4, XMMWORD PTR [rdx+32] ; r08, r09, r10, r11
movaps xmm8, XMMWORD PTR [rdx+48] ; r12, r13, r14, r15
movaps xmm5,xmm4
movaps xmm6,xmm4
movaps xmm7,xmm4
movaps xmm9,xmm8 ;
movaps xmm10,xmm8
movaps xmm11,xmm8
shufps xmm4,xmm4,00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
shufps xmm8, xmm8, 00000000b ;
shufps xmm9, xmm9, 01010101b
shufps xmm10,xmm10,10101010b
shufps xmm11,xmm11,11111111b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
mulps xmm8, xmm0 ;
mulps xmm9, xmm1
mulps xmm10,xmm2
mulps xmm11,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
addps xmm8,xmm9 ;
addps xmm8,xmm10
addps xmm8,xmm11
movaps XMMWORD PTR [rcx+32],xmm4
movaps XMMWORD PTR [rcx+48],xmm8
ret 0
_sse2_MatrixMultiply ENDP
_sse2_MatrixTranslate PROC PUBLIC
movaps xmm0, XMMWORD PTR [rcx] movaps xmm0, XMMWORD PTR [rcx]
movaps xmm1, XMMWORD PTR [rcx+16] movaps xmm1, XMMWORD PTR [rcx+16]
movaps xmm2, XMMWORD PTR [rcx+32] movaps xmm2, XMMWORD PTR [rcx+32]
@ -156,9 +158,9 @@ MatrixTranslate PROC PUBLIC
addps xmm4, xmm3 addps xmm4, xmm3
movaps XMMWORD PTR [rcx+48], xmm4 movaps XMMWORD PTR [rcx+48], xmm4
ret 0 ret 0
MatrixTranslate ENDP _sse2_MatrixTranslate ENDP
MatrixScale PROC PUBLIC _sse2_MatrixScale PROC PUBLIC
movaps xmm0, XMMWORD PTR [rcx] movaps xmm0, XMMWORD PTR [rcx]
movaps xmm1, XMMWORD PTR [rcx+16] movaps xmm1, XMMWORD PTR [rcx+16]
movaps xmm2, XMMWORD PTR [rcx+32] movaps xmm2, XMMWORD PTR [rcx+32]
@ -175,6 +177,6 @@ MatrixScale PROC PUBLIC
movaps XMMWORD PTR [rcx+16],xmm5 movaps XMMWORD PTR [rcx+16],xmm5
movaps XMMWORD PTR [rcx+32],xmm6 movaps XMMWORD PTR [rcx+32],xmm6
ret 0 ret 0
MatrixScale ENDP _sse2_MatrixScale ENDP
end end

View File

@ -25,22 +25,15 @@
.code .code
@_sse2_MatrixMultVec4x4@8 PROC PUBLIC @_sse2_MatrixMultVec4x4@8 PROC PUBLIC
movaps xmm0, XMMWORD PTR [ecx]
movaps xmm1, XMMWORD PTR [ecx+16]
movaps xmm2, XMMWORD PTR [ecx+32]
movaps xmm3, XMMWORD PTR [ecx+48]
movaps xmm4, XMMWORD PTR [edx] movaps xmm4, XMMWORD PTR [edx]
movaps xmm5, xmm4 pshufd xmm5, xmm4, 01010101b
movaps xmm6, xmm4 pshufd xmm6, xmm4, 10101010b
movaps xmm7, xmm4 pshufd xmm7, xmm4, 11111111b
shufps xmm4, xmm4, 00000000b shufps xmm4, xmm4, 00000000b
shufps xmm5, xmm5, 01010101b mulps xmm4, XMMWORD PTR [ecx]
shufps xmm6, xmm6, 10101010b mulps xmm5, XMMWORD PTR [ecx+16]
shufps xmm7, xmm7, 11111111b mulps xmm6, XMMWORD PTR [ecx+32]
mulps xmm4, xmm0 mulps xmm7, XMMWORD PTR [ecx+48]
mulps xmm5, xmm1
mulps xmm6, xmm2
mulps xmm7, xmm3
addps xmm4, xmm5 addps xmm4, xmm5
addps xmm4, xmm6 addps xmm4, xmm6
addps xmm4, xmm7 addps xmm4, xmm7
@ -49,19 +42,13 @@
@_sse2_MatrixMultVec4x4@8 ENDP @_sse2_MatrixMultVec4x4@8 ENDP
@_sse2_MatrixMultVec3x3@8 PROC PUBLIC @_sse2_MatrixMultVec3x3@8 PROC PUBLIC
movaps xmm0, XMMWORD PTR [ecx]
movaps xmm1, XMMWORD PTR [ecx+16]
movaps xmm2, XMMWORD PTR [ecx+32]
movaps xmm4, XMMWORD PTR [edx] movaps xmm4, XMMWORD PTR [edx]
movaps xmm5, xmm4 pshufd xmm5, xmm4, 01010101b
movaps xmm6, xmm4 pshufd xmm6, xmm4, 10101010b
movaps xmm7, xmm4
shufps xmm4, xmm4, 00000000b shufps xmm4, xmm4, 00000000b
shufps xmm5, xmm5, 01010101b mulps xmm4, XMMWORD PTR [ecx]
shufps xmm6, xmm6, 10101010b mulps xmm5, XMMWORD PTR [ecx+16]
mulps xmm4, xmm0 mulps xmm6, XMMWORD PTR [ecx+32]
mulps xmm5, xmm1
mulps xmm6, xmm2
addps xmm4, xmm5 addps xmm4, xmm5
addps xmm4, xmm6 addps xmm4, xmm6
movaps XMMWORD PTR [edx], xmm4 movaps XMMWORD PTR [edx], xmm4
@ -74,13 +61,10 @@
movaps xmm2, XMMWORD PTR [ecx+32] movaps xmm2, XMMWORD PTR [ecx+32]
movaps xmm3, XMMWORD PTR [ecx+48] movaps xmm3, XMMWORD PTR [ecx+48]
movaps xmm4, XMMWORD PTR [edx] ; r00, r01, r02, r03 movaps xmm4, XMMWORD PTR [edx] ; r00, r01, r02, r03
movaps xmm5,xmm4 pshufd xmm5, xmm4, 01010101b
movaps xmm6,xmm4 pshufd xmm6, xmm4, 10101010b
movaps xmm7,xmm4 pshufd xmm7, xmm4, 11111111b
shufps xmm4,xmm4,00000000b shufps xmm4, xmm4, 00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
mulps xmm4,xmm0 mulps xmm4,xmm0
mulps xmm5,xmm1 mulps xmm5,xmm1
mulps xmm6,xmm2 mulps xmm6,xmm2
@ -89,14 +73,12 @@
addps xmm4,xmm6 addps xmm4,xmm6
addps xmm4,xmm7 addps xmm4,xmm7
movaps XMMWORD PTR [ecx],xmm4 movaps XMMWORD PTR [ecx],xmm4
movaps xmm4, XMMWORD PTR [edx+16] ; r04, r05, r06, r07 movaps xmm4, XMMWORD PTR [edx+16] ; r04, r05, r06, r07
movaps xmm5,xmm4 pshufd xmm5, xmm4, 01010101b
movaps xmm6,xmm4 pshufd xmm6, xmm4, 10101010b
movaps xmm7,xmm4 pshufd xmm7, xmm4, 11111111b
shufps xmm4,xmm4,00000000b shufps xmm4, xmm4, 00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
mulps xmm4,xmm0 mulps xmm4,xmm0
mulps xmm5,xmm1 mulps xmm5,xmm1
mulps xmm6,xmm2 mulps xmm6,xmm2
@ -105,14 +87,12 @@
addps xmm4,xmm6 addps xmm4,xmm6
addps xmm4,xmm7 addps xmm4,xmm7
movaps XMMWORD PTR [ecx+16],xmm4 movaps XMMWORD PTR [ecx+16],xmm4
movaps xmm4, XMMWORD PTR [edx+32] ; r08, r09, r10, r11 movaps xmm4, XMMWORD PTR [edx+32] ; r08, r09, r10, r11
movaps xmm5,xmm4 pshufd xmm5, xmm4, 01010101b
movaps xmm6,xmm4 pshufd xmm6, xmm4, 10101010b
movaps xmm7,xmm4 pshufd xmm7, xmm4, 11111111b
shufps xmm4,xmm4,00000000b shufps xmm4, xmm4, 00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
mulps xmm4,xmm0 mulps xmm4,xmm0
mulps xmm5,xmm1 mulps xmm5,xmm1
mulps xmm6,xmm2 mulps xmm6,xmm2
@ -121,14 +101,12 @@
addps xmm4,xmm6 addps xmm4,xmm6
addps xmm4,xmm7 addps xmm4,xmm7
movaps XMMWORD PTR [ecx+32],xmm4 movaps XMMWORD PTR [ecx+32],xmm4
movaps xmm4, XMMWORD PTR [edx+48] ; r12, r13, r14, r15 movaps xmm4, XMMWORD PTR [edx+48] ; r12, r13, r14, r15
movaps xmm5,xmm4 pshufd xmm5, xmm4, 01010101b
movaps xmm6,xmm4 pshufd xmm6, xmm4, 10101010b
movaps xmm7,xmm4 pshufd xmm7, xmm4, 11111111b
shufps xmm4,xmm4,00000000b shufps xmm4, xmm4, 00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
mulps xmm4,xmm0 mulps xmm4,xmm0
mulps xmm5,xmm1 mulps xmm5,xmm1
mulps xmm6,xmm2 mulps xmm6,xmm2
@ -137,47 +115,36 @@
addps xmm4,xmm6 addps xmm4,xmm6
addps xmm4,xmm7 addps xmm4,xmm7
movaps XMMWORD PTR [ecx+48],xmm4 movaps XMMWORD PTR [ecx+48],xmm4
ret 0 ret 0
@_sse2_MatrixMultiply@8 ENDP @_sse2_MatrixMultiply@8 ENDP
@_sse2_MatrixTranslate@8 PROC PUBLIC @_sse2_MatrixTranslate@8 PROC PUBLIC
movaps xmm0, XMMWORD PTR [ecx]
movaps xmm1, XMMWORD PTR [ecx+16]
movaps xmm2, XMMWORD PTR [ecx+32]
movaps xmm3, XMMWORD PTR [ecx+48]
movaps xmm4, XMMWORD PTR [edx] movaps xmm4, XMMWORD PTR [edx]
movaps xmm5, xmm4 pshufd xmm5, xmm4, 01010101b
movaps xmm6, xmm4 pshufd xmm6, xmm4, 10101010b
movaps xmm7, xmm4
shufps xmm4, xmm4, 00000000b shufps xmm4, xmm4, 00000000b
shufps xmm5, xmm5, 01010101b mulps xmm4, XMMWORD PTR [ecx]
shufps xmm6, xmm6, 10101010b mulps xmm5, XMMWORD PTR [ecx+16]
mulps xmm4, xmm0 mulps xmm6, XMMWORD PTR [ecx+32]
mulps xmm5, xmm1
mulps xmm6, xmm2
addps xmm4, xmm5 addps xmm4, xmm5
addps xmm4, xmm6 addps xmm4, xmm6
addps xmm4, xmm3 addps xmm4, XMMWORD PTR [ecx+48]
movaps XMMWORD PTR [ecx+48], xmm4 movaps XMMWORD PTR [ecx+48], xmm4
ret 0 ret 0
@_sse2_MatrixTranslate@8 ENDP @_sse2_MatrixTranslate@8 ENDP
@_sse2_MatrixScale@8 PROC PUBLIC @_sse2_MatrixScale@8 PROC PUBLIC
movaps xmm0, XMMWORD PTR [ecx]
movaps xmm1, XMMWORD PTR [ecx+16]
movaps xmm2, XMMWORD PTR [ecx+32]
movaps xmm4, XMMWORD PTR [edx] movaps xmm4, XMMWORD PTR [edx]
movaps xmm5, xmm4 pshufd xmm5, xmm4, 01010101b
movaps xmm6, xmm4 pshufd xmm6, xmm4, 10101010b
shufps xmm4, xmm4, 00000000b shufps xmm4, xmm4, 00000000b
shufps xmm5, xmm5, 01010101b mulps xmm4, XMMWORD PTR [ecx]
shufps xmm6, xmm6, 10101010b mulps xmm5, XMMWORD PTR [ecx+16]
mulps xmm4, xmm0 mulps xmm6, XMMWORD PTR [ecx+32]
mulps xmm5, xmm1 movaps XMMWORD PTR [ecx], xmm4
mulps xmm6, xmm2 movaps XMMWORD PTR [ecx+16], xmm5
movaps XMMWORD PTR [ecx],xmm4 movaps XMMWORD PTR [ecx+32], xmm6
movaps XMMWORD PTR [ecx+16],xmm5
movaps XMMWORD PTR [ecx+32],xmm6
ret 0 ret 0
@_sse2_MatrixScale@8 ENDP @_sse2_MatrixScale@8 ENDP