This commit is contained in:
mtabachenko 2009-06-16 18:01:56 +00:00
parent b7f41c4ff1
commit 08628592cf
2 changed files with 107 additions and 138 deletions

View File

@ -21,7 +21,7 @@
TITLE matrix_sse2-x64.asm
.code
MatrixMultVec4x4 PROC PUBLIC
_sse2_MatrixMultVec4x4 PROC PUBLIC
movaps xmm0, XMMWORD PTR [rcx]
movaps xmm1, XMMWORD PTR [rcx+16]
movaps xmm2, XMMWORD PTR [rcx+32]
@ -43,9 +43,9 @@ MatrixMultVec4x4 PROC PUBLIC
addps xmm4, xmm7
movaps XMMWORD PTR [rdx], xmm4
ret 0
MatrixMultVec4x4 ENDP
_sse2_MatrixMultVec4x4 ENDP
MatrixMultVec3x3 PROC PUBLIC
_sse2_MatrixMultVec3x3 PROC PUBLIC
movaps xmm0, XMMWORD PTR [rcx]
movaps xmm1, XMMWORD PTR [rcx+16]
movaps xmm2, XMMWORD PTR [rcx+32]
@ -62,81 +62,83 @@ MatrixMultVec3x3 PROC PUBLIC
addps xmm4, xmm5
addps xmm4, xmm6
movaps XMMWORD PTR [rdx], xmm4
MatrixMultVec3x3 ENDP
ret 0
_sse2_MatrixMultVec3x3 ENDP
; ---------------------------------------------------------------------------
; MatrixMultiply / _sse2_MatrixMultiply
;
; NOTE(review): this span reads like a rendered diff whose +/- markers were
; lost. Two PROC lines and two ENDP lines with different names are present,
; and a one-row-at-a-time sequence (xmm4-xmm7 only) appears alongside a
; two-rows-at-a-time sequence (xmm4-xmm7 plus xmm8-xmm11). Confirm which
; lines belong to the current revision before assembling.
;
; Visible data flow:
;   rcx -> four 16-byte vectors at +0/+16/+32/+48; loaded into xmm0-xmm3 up
;          front, then overwritten in place with the results
;   rdx -> four 16-byte vectors at +0/+16/+32/+48 (elements r00..r15)
;   result vector i = r[4i+0]*xmm0 + r[4i+1]*xmm1 + r[4i+2]*xmm2 + r[4i+3]*xmm3
; Every memory access uses movaps, which faults on addresses that are not
; 16-byte aligned.
;
; NOTE(review): xmm6-xmm11 are overwritten and never saved/restored here. If
; this is built for the Microsoft x64 calling convention (rcx/rdx carrying
; the two pointer arguments suggests so), xmm6-xmm15 are callee-saved -
; confirm whether callers rely on them.
; ---------------------------------------------------------------------------
MatrixMultiply PROC PUBLIC
_sse2_MatrixMultiply PROC PUBLIC
; Keep the four vectors at [rcx] in registers; the stores below overwrite
; [rcx..rcx+63] while these values are still needed.
movaps xmm0, XMMWORD PTR [rcx]
movaps xmm1, XMMWORD PTR [rcx+16]
movaps xmm2, XMMWORD PTR [rcx+32]
movaps xmm3, XMMWORD PTR [rcx+48]
; Rows at [rdx] (xmm4-xmm7) and [rdx+16] (xmm8-xmm11): make four copies of each.
movaps xmm4, XMMWORD PTR [rdx] ; r00, r01, r02, r03
movaps xmm8, XMMWORD PTR [rdx+16] ; r04, r05, r06, r07
movaps xmm5,xmm4
movaps xmm6,xmm4
movaps xmm7,xmm4
movaps xmm9,xmm8 ;
movaps xmm10,xmm8
movaps xmm11,xmm8
; Broadcast element 0/1/2/3 of the row across all four lanes of each copy.
shufps xmm4,xmm4,00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
shufps xmm8, xmm8, 00000000b ;
shufps xmm9, xmm9, 01010101b
shufps xmm10,xmm10,10101010b
shufps xmm11,xmm11,11111111b
; Scale the saved [rcx] vectors by the broadcast elements.
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
mulps xmm8, xmm0 ;
mulps xmm9, xmm1
mulps xmm10,xmm2
mulps xmm11,xmm3
; Sum the four partial products; xmm4 and xmm8 hold the two result vectors.
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
addps xmm8,xmm9 ;
addps xmm8,xmm10
addps xmm8,xmm11
movaps XMMWORD PTR [rcx],xmm4
; One-row sequence for [rdx+16] using xmm4-xmm7 only.
; NOTE(review): this repeats the work done in xmm8-xmm11 above; presumably
; one of the two variants is the superseded one - confirm.
movaps xmm4, XMMWORD PTR [rdx+16] ; r04, r05, r06, r07
movaps xmm5,xmm4
movaps xmm6,xmm4
movaps xmm7,xmm4
shufps xmm4,xmm4,00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
movaps XMMWORD PTR [rcx+16],xmm4
; One-row sequence for [rdx+32].
movaps xmm4, XMMWORD PTR [rdx+32] ; r08, r09, r10, r11
movaps xmm5,xmm4
movaps xmm6,xmm4
movaps xmm7,xmm4
shufps xmm4,xmm4,00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
movaps XMMWORD PTR [rcx+32],xmm4
; One-row sequence for [rdx+48].
movaps xmm4, XMMWORD PTR [rdx+48] ; r12, r13, r14, r15
movaps xmm5,xmm4
movaps xmm6,xmm4
movaps xmm7,xmm4
shufps xmm4,xmm4,00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
movaps XMMWORD PTR [rcx+48],xmm4
ret 0
MatrixMultiply ENDP
; NOTE(review): the lines from here to "_sse2_MatrixMultiply ENDP" follow an
; ENDP and a "ret 0"; they look like the remainder of the two-rows-at-a-time
; variant (store of xmm8, then the rows at [rdx+32] and [rdx+48]) - confirm.
movaps XMMWORD PTR [rcx+16],xmm8
; NOTE(review): a PROC line for MatrixTranslate sits inside the multiply
; sequence here; presumably residue of the old listing - confirm.
MatrixTranslate PROC PUBLIC
; Rows at [rdx+32] (xmm4-xmm7) and [rdx+48] (xmm8-xmm11), same steps as the
; first pair: copy, broadcast, multiply by xmm0-xmm3, sum, store.
movaps xmm4, XMMWORD PTR [rdx+32] ; r08, r09, r10, r11
movaps xmm8, XMMWORD PTR [rdx+48] ; r12, r13, r14, r15
movaps xmm5,xmm4
movaps xmm6,xmm4
movaps xmm7,xmm4
movaps xmm9,xmm8 ;
movaps xmm10,xmm8
movaps xmm11,xmm8
shufps xmm4,xmm4,00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
shufps xmm8, xmm8, 00000000b ;
shufps xmm9, xmm9, 01010101b
shufps xmm10,xmm10,10101010b
shufps xmm11,xmm11,11111111b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
mulps xmm8, xmm0 ;
mulps xmm9, xmm1
mulps xmm10,xmm2
mulps xmm11,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
addps xmm8,xmm9 ;
addps xmm8,xmm10
addps xmm8,xmm11
movaps XMMWORD PTR [rcx+32],xmm4
movaps XMMWORD PTR [rcx+48],xmm8
ret 0
_sse2_MatrixMultiply ENDP
_sse2_MatrixTranslate PROC PUBLIC
movaps xmm0, XMMWORD PTR [rcx]
movaps xmm1, XMMWORD PTR [rcx+16]
movaps xmm2, XMMWORD PTR [rcx+32]
@ -156,9 +158,9 @@ MatrixTranslate PROC PUBLIC
addps xmm4, xmm3
movaps XMMWORD PTR [rcx+48], xmm4
ret 0
MatrixTranslate ENDP
_sse2_MatrixTranslate ENDP
MatrixScale PROC PUBLIC
_sse2_MatrixScale PROC PUBLIC
movaps xmm0, XMMWORD PTR [rcx]
movaps xmm1, XMMWORD PTR [rcx+16]
movaps xmm2, XMMWORD PTR [rcx+32]
@ -175,6 +177,6 @@ MatrixScale PROC PUBLIC
movaps XMMWORD PTR [rcx+16],xmm5
movaps XMMWORD PTR [rcx+32],xmm6
ret 0
MatrixScale ENDP
_sse2_MatrixScale ENDP
end

View File

@ -25,22 +25,15 @@
.code
@_sse2_MatrixMultVec4x4@8 PROC PUBLIC
movaps xmm0, XMMWORD PTR [ecx]
movaps xmm1, XMMWORD PTR [ecx+16]
movaps xmm2, XMMWORD PTR [ecx+32]
movaps xmm3, XMMWORD PTR [ecx+48]
movaps xmm4, XMMWORD PTR [edx]
movaps xmm5, xmm4
movaps xmm6, xmm4
movaps xmm7, xmm4
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
pshufd xmm7, xmm4, 11111111b
shufps xmm4, xmm4, 00000000b
shufps xmm5, xmm5, 01010101b
shufps xmm6, xmm6, 10101010b
shufps xmm7, xmm7, 11111111b
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
mulps xmm7, xmm3
mulps xmm4, XMMWORD PTR [ecx]
mulps xmm5, XMMWORD PTR [ecx+16]
mulps xmm6, XMMWORD PTR [ecx+32]
mulps xmm7, XMMWORD PTR [ecx+48]
addps xmm4, xmm5
addps xmm4, xmm6
addps xmm4, xmm7
@ -49,19 +42,13 @@
@_sse2_MatrixMultVec4x4@8 ENDP
@_sse2_MatrixMultVec3x3@8 PROC PUBLIC
movaps xmm0, XMMWORD PTR [ecx]
movaps xmm1, XMMWORD PTR [ecx+16]
movaps xmm2, XMMWORD PTR [ecx+32]
movaps xmm4, XMMWORD PTR [edx]
movaps xmm5, xmm4
movaps xmm6, xmm4
movaps xmm7, xmm4
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
shufps xmm4, xmm4, 00000000b
shufps xmm5, xmm5, 01010101b
shufps xmm6, xmm6, 10101010b
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
mulps xmm4, XMMWORD PTR [ecx]
mulps xmm5, XMMWORD PTR [ecx+16]
mulps xmm6, XMMWORD PTR [ecx+32]
addps xmm4, xmm5
addps xmm4, xmm6
movaps XMMWORD PTR [edx], xmm4
@ -74,13 +61,10 @@
movaps xmm2, XMMWORD PTR [ecx+32]
movaps xmm3, XMMWORD PTR [ecx+48]
movaps xmm4, XMMWORD PTR [edx] ; r00, r01, r02, r03
movaps xmm5,xmm4
movaps xmm6,xmm4
movaps xmm7,xmm4
shufps xmm4,xmm4,00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
pshufd xmm7, xmm4, 11111111b
shufps xmm4, xmm4, 00000000b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
@ -89,14 +73,12 @@
addps xmm4,xmm6
addps xmm4,xmm7
movaps XMMWORD PTR [ecx],xmm4
movaps xmm4, XMMWORD PTR [edx+16] ; r04, r05, r06, r07
movaps xmm5,xmm4
movaps xmm6,xmm4
movaps xmm7,xmm4
shufps xmm4,xmm4,00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
pshufd xmm7, xmm4, 11111111b
shufps xmm4, xmm4, 00000000b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
@ -105,14 +87,12 @@
addps xmm4,xmm6
addps xmm4,xmm7
movaps XMMWORD PTR [ecx+16],xmm4
movaps xmm4, XMMWORD PTR [edx+32] ; r08, r09, r10, r11
movaps xmm5,xmm4
movaps xmm6,xmm4
movaps xmm7,xmm4
shufps xmm4,xmm4,00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
pshufd xmm7, xmm4, 11111111b
shufps xmm4, xmm4, 00000000b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
@ -121,14 +101,12 @@
addps xmm4,xmm6
addps xmm4,xmm7
movaps XMMWORD PTR [ecx+32],xmm4
movaps xmm4, XMMWORD PTR [edx+48] ; r12, r13, r14, r15
movaps xmm5,xmm4
movaps xmm6,xmm4
movaps xmm7,xmm4
shufps xmm4,xmm4,00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
pshufd xmm7, xmm4, 11111111b
shufps xmm4, xmm4, 00000000b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
@ -137,47 +115,36 @@
addps xmm4,xmm6
addps xmm4,xmm7
movaps XMMWORD PTR [ecx+48],xmm4
ret 0
@_sse2_MatrixMultiply@8 ENDP
; ---------------------------------------------------------------------------
; @_sse2_MatrixTranslate@8
;
; Visible data flow:
;   ecx -> four 16-byte vectors at +0/+16/+32/+48
;   edx -> one 16-byte vector v
;   [ecx+48] <- v[0]*[ecx] + v[1]*[ecx+16] + v[2]*[ecx+32] + [ecx+48]
; The @name@8 decoration together with ecx/edx as inputs matches the 32-bit
; __fastcall convention with two 4-byte arguments (presumably both pointers -
; confirm against the C prototype). movaps requires 16-byte alignment.
;
; NOTE(review): two variants of the same computation appear interleaved (this
; looks like a diff rendered without +/- markers): xmm5/xmm6 are set by
; movaps and immediately overwritten by pshufd, and each multiply and the
; final add appear once with a register operand and once with a memory
; operand. Executed literally, the products would be multiplied twice and
; [ecx+48] added twice. Confirm which lines are current before assembling.
; ---------------------------------------------------------------------------
@_sse2_MatrixTranslate@8 PROC PUBLIC
; Register-operand variant: preload the four vectors at [ecx].
movaps xmm0, XMMWORD PTR [ecx]
movaps xmm1, XMMWORD PTR [ecx+16]
movaps xmm2, XMMWORD PTR [ecx+32]
movaps xmm3, XMMWORD PTR [ecx+48]
movaps xmm4, XMMWORD PTR [edx]
movaps xmm5, xmm4
movaps xmm6, xmm4
movaps xmm7, xmm4
; pshufd copies and broadcasts in one instruction: xmm5 = v[1] in every lane,
; xmm6 = v[2] in every lane.
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
; xmm4 = v[0] in every lane.
shufps xmm4, xmm4, 00000000b
shufps xmm5, xmm5, 01010101b
shufps xmm6, xmm6, 10101010b
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
; Memory-operand variant of the same three multiplies.
mulps xmm4, XMMWORD PTR [ecx]
mulps xmm5, XMMWORD PTR [ecx+16]
mulps xmm6, XMMWORD PTR [ecx+32]
addps xmm4, xmm5
addps xmm4, xmm6
; Add the vector at [ecx+48] (register form, then memory form) and write the
; sum back to the same location.
addps xmm4, xmm3
addps xmm4, XMMWORD PTR [ecx+48]
movaps XMMWORD PTR [ecx+48], xmm4
ret 0
@_sse2_MatrixTranslate@8 ENDP
; ---------------------------------------------------------------------------
; @_sse2_MatrixScale@8
;
; Visible data flow:
;   ecx -> three 16-byte vectors at +0/+16/+32, scaled in place
;   edx -> one 16-byte vector v
;   [ecx]    <- v[0] * [ecx]
;   [ecx+16] <- v[1] * [ecx+16]
;   [ecx+32] <- v[2] * [ecx+32]
; The @name@8 decoration together with ecx/edx as inputs matches the 32-bit
; __fastcall convention with two 4-byte arguments (presumably both pointers -
; confirm against the C prototype). movaps requires 16-byte alignment.
;
; NOTE(review): two variants of the same computation appear interleaved (this
; looks like a diff rendered without +/- markers): xmm5/xmm6 are set by
; movaps and immediately overwritten by pshufd, and the multiply and store
; groups each appear twice. Executed literally, the second multiply group
; would re-multiply the already stored results. Confirm which lines are
; current before assembling.
; ---------------------------------------------------------------------------
@_sse2_MatrixScale@8 PROC PUBLIC
; Register-operand variant: preload the three vectors at [ecx].
movaps xmm0, XMMWORD PTR [ecx]
movaps xmm1, XMMWORD PTR [ecx+16]
movaps xmm2, XMMWORD PTR [ecx+32]
movaps xmm4, XMMWORD PTR [edx]
movaps xmm5, xmm4
movaps xmm6, xmm4
; pshufd copies and broadcasts in one instruction: xmm5 = v[1] in every lane,
; xmm6 = v[2] in every lane.
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
; xmm4 = v[0] in every lane.
shufps xmm4, xmm4, 00000000b
shufps xmm5, xmm5, 01010101b
shufps xmm6, xmm6, 10101010b
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
movaps XMMWORD PTR [ecx],xmm4
movaps XMMWORD PTR [ecx+16],xmm5
movaps XMMWORD PTR [ecx+32],xmm6
; Memory-operand variant of the same multiplies and stores.
mulps xmm4, XMMWORD PTR [ecx]
mulps xmm5, XMMWORD PTR [ecx+16]
mulps xmm6, XMMWORD PTR [ecx+32]
movaps XMMWORD PTR [ecx], xmm4
movaps XMMWORD PTR [ecx+16], xmm5
movaps XMMWORD PTR [ecx+32], xmm6
ret 0
@_sse2_MatrixScale@8 ENDP