- SSE2 optimizations (speed test: http://www.turboupload.com/uzmjnuzfbvf1/test_matrix_SSE2_09-06-16.ZIP)
This commit is contained in:
parent
b7f41c4ff1
commit
08628592cf
|
@ -21,7 +21,7 @@
|
|||
TITLE matrix_sse2-x64.asm
|
||||
.code
|
||||
|
||||
MatrixMultVec4x4 PROC PUBLIC
|
||||
_sse2_MatrixMultVec4x4 PROC PUBLIC
|
||||
movaps xmm0, XMMWORD PTR [rcx]
|
||||
movaps xmm1, XMMWORD PTR [rcx+16]
|
||||
movaps xmm2, XMMWORD PTR [rcx+32]
|
||||
|
@ -43,9 +43,9 @@ MatrixMultVec4x4 PROC PUBLIC
|
|||
addps xmm4, xmm7
|
||||
movaps XMMWORD PTR [rdx], xmm4
|
||||
ret 0
|
||||
MatrixMultVec4x4 ENDP
|
||||
_sse2_MatrixMultVec4x4 ENDP
|
||||
|
||||
MatrixMultVec3x3 PROC PUBLIC
|
||||
_sse2_MatrixMultVec3x3 PROC PUBLIC
|
||||
movaps xmm0, XMMWORD PTR [rcx]
|
||||
movaps xmm1, XMMWORD PTR [rcx+16]
|
||||
movaps xmm2, XMMWORD PTR [rcx+32]
|
||||
|
@ -62,81 +62,83 @@ MatrixMultVec3x3 PROC PUBLIC
|
|||
addps xmm4, xmm5
|
||||
addps xmm4, xmm6
|
||||
movaps XMMWORD PTR [rdx], xmm4
|
||||
MatrixMultVec3x3 ENDP
|
||||
ret 0
|
||||
_sse2_MatrixMultVec3x3 ENDP
|
||||
|
||||
MatrixMultiply PROC PUBLIC
|
||||
_sse2_MatrixMultiply PROC PUBLIC
|
||||
movaps xmm0, XMMWORD PTR [rcx]
|
||||
movaps xmm1, XMMWORD PTR [rcx+16]
|
||||
movaps xmm2, XMMWORD PTR [rcx+32]
|
||||
movaps xmm3, XMMWORD PTR [rcx+48]
|
||||
movaps xmm4, XMMWORD PTR [rdx] ; r00, r01, r02, r03
|
||||
movaps xmm8, XMMWORD PTR [rdx+16] ; r04, r05, r06, r07
|
||||
movaps xmm5,xmm4
|
||||
movaps xmm6,xmm4
|
||||
movaps xmm7,xmm4
|
||||
movaps xmm9,xmm8 ;
|
||||
movaps xmm10,xmm8
|
||||
movaps xmm11,xmm8
|
||||
shufps xmm4,xmm4,00000000b
|
||||
shufps xmm5,xmm5,01010101b
|
||||
shufps xmm6,xmm6,10101010b
|
||||
shufps xmm7,xmm7,11111111b
|
||||
shufps xmm8, xmm8, 00000000b ;
|
||||
shufps xmm9, xmm9, 01010101b
|
||||
shufps xmm10,xmm10,10101010b
|
||||
shufps xmm11,xmm11,11111111b
|
||||
mulps xmm4,xmm0
|
||||
mulps xmm5,xmm1
|
||||
mulps xmm6,xmm2
|
||||
mulps xmm7,xmm3
|
||||
mulps xmm8, xmm0 ;
|
||||
mulps xmm9, xmm1
|
||||
mulps xmm10,xmm2
|
||||
mulps xmm11,xmm3
|
||||
addps xmm4,xmm5
|
||||
addps xmm4,xmm6
|
||||
addps xmm4,xmm7
|
||||
addps xmm8,xmm9 ;
|
||||
addps xmm8,xmm10
|
||||
addps xmm8,xmm11
|
||||
movaps XMMWORD PTR [rcx],xmm4
|
||||
movaps xmm4, XMMWORD PTR [rdx+16] ; r04, r05, r06, r07
|
||||
movaps xmm5,xmm4
|
||||
movaps xmm6,xmm4
|
||||
movaps xmm7,xmm4
|
||||
shufps xmm4,xmm4,00000000b
|
||||
shufps xmm5,xmm5,01010101b
|
||||
shufps xmm6,xmm6,10101010b
|
||||
shufps xmm7,xmm7,11111111b
|
||||
mulps xmm4,xmm0
|
||||
mulps xmm5,xmm1
|
||||
mulps xmm6,xmm2
|
||||
mulps xmm7,xmm3
|
||||
addps xmm4,xmm5
|
||||
addps xmm4,xmm6
|
||||
addps xmm4,xmm7
|
||||
movaps XMMWORD PTR [rcx+16],xmm4
|
||||
movaps xmm4, XMMWORD PTR [rdx+32] ; r08, r09, r10, r11
|
||||
movaps xmm5,xmm4
|
||||
movaps xmm6,xmm4
|
||||
movaps xmm7,xmm4
|
||||
shufps xmm4,xmm4,00000000b
|
||||
shufps xmm5,xmm5,01010101b
|
||||
shufps xmm6,xmm6,10101010b
|
||||
shufps xmm7,xmm7,11111111b
|
||||
mulps xmm4,xmm0
|
||||
mulps xmm5,xmm1
|
||||
mulps xmm6,xmm2
|
||||
mulps xmm7,xmm3
|
||||
addps xmm4,xmm5
|
||||
addps xmm4,xmm6
|
||||
addps xmm4,xmm7
|
||||
movaps XMMWORD PTR [rcx+32],xmm4
|
||||
movaps xmm4, XMMWORD PTR [rdx+48] ; r12, r13, r14, r15
|
||||
movaps xmm5,xmm4
|
||||
movaps xmm6,xmm4
|
||||
movaps xmm7,xmm4
|
||||
shufps xmm4,xmm4,00000000b
|
||||
shufps xmm5,xmm5,01010101b
|
||||
shufps xmm6,xmm6,10101010b
|
||||
shufps xmm7,xmm7,11111111b
|
||||
mulps xmm4,xmm0
|
||||
mulps xmm5,xmm1
|
||||
mulps xmm6,xmm2
|
||||
mulps xmm7,xmm3
|
||||
addps xmm4,xmm5
|
||||
addps xmm4,xmm6
|
||||
addps xmm4,xmm7
|
||||
movaps XMMWORD PTR [rcx+48],xmm4
|
||||
ret 0
|
||||
MatrixMultiply ENDP
|
||||
movaps XMMWORD PTR [rcx+16],xmm8
|
||||
|
||||
MatrixTranslate PROC PUBLIC
|
||||
movaps xmm4, XMMWORD PTR [rdx+32] ; r00, r01, r02, r03
|
||||
movaps xmm8, XMMWORD PTR [rdx+48] ; r04, r05, r06, r07
|
||||
movaps xmm5,xmm4
|
||||
movaps xmm6,xmm4
|
||||
movaps xmm7,xmm4
|
||||
movaps xmm9,xmm8 ;
|
||||
movaps xmm10,xmm8
|
||||
movaps xmm11,xmm8
|
||||
shufps xmm4,xmm4,00000000b
|
||||
shufps xmm5,xmm5,01010101b
|
||||
shufps xmm6,xmm6,10101010b
|
||||
shufps xmm7,xmm7,11111111b
|
||||
shufps xmm8, xmm8, 00000000b ;
|
||||
shufps xmm9, xmm9, 01010101b
|
||||
shufps xmm10,xmm10,10101010b
|
||||
shufps xmm11,xmm11,11111111b
|
||||
mulps xmm4,xmm0
|
||||
mulps xmm5,xmm1
|
||||
mulps xmm6,xmm2
|
||||
mulps xmm7,xmm3
|
||||
mulps xmm8, xmm0 ;
|
||||
mulps xmm9, xmm1
|
||||
mulps xmm10,xmm2
|
||||
mulps xmm11,xmm3
|
||||
addps xmm4,xmm5
|
||||
addps xmm4,xmm6
|
||||
addps xmm4,xmm7
|
||||
addps xmm8,xmm9 ;
|
||||
addps xmm8,xmm10
|
||||
addps xmm8,xmm11
|
||||
movaps XMMWORD PTR [rcx+32],xmm4
|
||||
movaps XMMWORD PTR [rcx+48],xmm8
|
||||
ret 0
|
||||
_sse2_MatrixMultiply ENDP
|
||||
|
||||
_sse2_MatrixTranslate PROC PUBLIC
|
||||
movaps xmm0, XMMWORD PTR [rcx]
|
||||
movaps xmm1, XMMWORD PTR [rcx+16]
|
||||
movaps xmm2, XMMWORD PTR [rcx+32]
|
||||
|
@ -156,9 +158,9 @@ MatrixTranslate PROC PUBLIC
|
|||
addps xmm4, xmm3
|
||||
movaps XMMWORD PTR [rcx+48], xmm4
|
||||
ret 0
|
||||
MatrixTranslate ENDP
|
||||
_sse2_MatrixTranslate ENDP
|
||||
|
||||
MatrixScale PROC PUBLIC
|
||||
_sse2_MatrixScale PROC PUBLIC
|
||||
movaps xmm0, XMMWORD PTR [rcx]
|
||||
movaps xmm1, XMMWORD PTR [rcx+16]
|
||||
movaps xmm2, XMMWORD PTR [rcx+32]
|
||||
|
@ -175,6 +177,6 @@ MatrixScale PROC PUBLIC
|
|||
movaps XMMWORD PTR [rcx+16],xmm5
|
||||
movaps XMMWORD PTR [rcx+32],xmm6
|
||||
ret 0
|
||||
MatrixScale ENDP
|
||||
_sse2_MatrixScale ENDP
|
||||
|
||||
end
|
||||
|
|
|
@ -25,22 +25,15 @@
|
|||
.code
|
||||
|
||||
@_sse2_MatrixMultVec4x4@8 PROC PUBLIC
|
||||
movaps xmm0, XMMWORD PTR [ecx]
|
||||
movaps xmm1, XMMWORD PTR [ecx+16]
|
||||
movaps xmm2, XMMWORD PTR [ecx+32]
|
||||
movaps xmm3, XMMWORD PTR [ecx+48]
|
||||
movaps xmm4, XMMWORD PTR [edx]
|
||||
movaps xmm5, xmm4
|
||||
movaps xmm6, xmm4
|
||||
movaps xmm7, xmm4
|
||||
pshufd xmm5, xmm4, 01010101b
|
||||
pshufd xmm6, xmm4, 10101010b
|
||||
pshufd xmm7, xmm4, 11111111b
|
||||
shufps xmm4, xmm4, 00000000b
|
||||
shufps xmm5, xmm5, 01010101b
|
||||
shufps xmm6, xmm6, 10101010b
|
||||
shufps xmm7, xmm7, 11111111b
|
||||
mulps xmm4, xmm0
|
||||
mulps xmm5, xmm1
|
||||
mulps xmm6, xmm2
|
||||
mulps xmm7, xmm3
|
||||
mulps xmm4, XMMWORD PTR [ecx]
|
||||
mulps xmm5, XMMWORD PTR [ecx+16]
|
||||
mulps xmm6, XMMWORD PTR [ecx+32]
|
||||
mulps xmm7, XMMWORD PTR [ecx+48]
|
||||
addps xmm4, xmm5
|
||||
addps xmm4, xmm6
|
||||
addps xmm4, xmm7
|
||||
|
@ -49,19 +42,13 @@
|
|||
@_sse2_MatrixMultVec4x4@8 ENDP
|
||||
|
||||
@_sse2_MatrixMultVec3x3@8 PROC PUBLIC
|
||||
movaps xmm0, XMMWORD PTR [ecx]
|
||||
movaps xmm1, XMMWORD PTR [ecx+16]
|
||||
movaps xmm2, XMMWORD PTR [ecx+32]
|
||||
movaps xmm4, XMMWORD PTR [edx]
|
||||
movaps xmm5, xmm4
|
||||
movaps xmm6, xmm4
|
||||
movaps xmm7, xmm4
|
||||
pshufd xmm5, xmm4, 01010101b
|
||||
pshufd xmm6, xmm4, 10101010b
|
||||
shufps xmm4, xmm4, 00000000b
|
||||
shufps xmm5, xmm5, 01010101b
|
||||
shufps xmm6, xmm6, 10101010b
|
||||
mulps xmm4, xmm0
|
||||
mulps xmm5, xmm1
|
||||
mulps xmm6, xmm2
|
||||
mulps xmm4, XMMWORD PTR [ecx]
|
||||
mulps xmm5, XMMWORD PTR [ecx+16]
|
||||
mulps xmm6, XMMWORD PTR [ecx+32]
|
||||
addps xmm4, xmm5
|
||||
addps xmm4, xmm6
|
||||
movaps XMMWORD PTR [edx], xmm4
|
||||
|
@ -74,13 +61,10 @@
|
|||
movaps xmm2, XMMWORD PTR [ecx+32]
|
||||
movaps xmm3, XMMWORD PTR [ecx+48]
|
||||
movaps xmm4, XMMWORD PTR [edx] ; r00, r01, r02, r03
|
||||
movaps xmm5,xmm4
|
||||
movaps xmm6,xmm4
|
||||
movaps xmm7,xmm4
|
||||
pshufd xmm5, xmm4, 01010101b
|
||||
pshufd xmm6, xmm4, 10101010b
|
||||
pshufd xmm7, xmm4, 11111111b
|
||||
shufps xmm4, xmm4, 00000000b
|
||||
shufps xmm5,xmm5,01010101b
|
||||
shufps xmm6,xmm6,10101010b
|
||||
shufps xmm7,xmm7,11111111b
|
||||
mulps xmm4,xmm0
|
||||
mulps xmm5,xmm1
|
||||
mulps xmm6,xmm2
|
||||
|
@ -89,14 +73,12 @@
|
|||
addps xmm4,xmm6
|
||||
addps xmm4,xmm7
|
||||
movaps XMMWORD PTR [ecx],xmm4
|
||||
|
||||
movaps xmm4, XMMWORD PTR [edx+16] ; r04, r05, r06, r07
|
||||
movaps xmm5,xmm4
|
||||
movaps xmm6,xmm4
|
||||
movaps xmm7,xmm4
|
||||
pshufd xmm5, xmm4, 01010101b
|
||||
pshufd xmm6, xmm4, 10101010b
|
||||
pshufd xmm7, xmm4, 11111111b
|
||||
shufps xmm4, xmm4, 00000000b
|
||||
shufps xmm5,xmm5,01010101b
|
||||
shufps xmm6,xmm6,10101010b
|
||||
shufps xmm7,xmm7,11111111b
|
||||
mulps xmm4,xmm0
|
||||
mulps xmm5,xmm1
|
||||
mulps xmm6,xmm2
|
||||
|
@ -105,14 +87,12 @@
|
|||
addps xmm4,xmm6
|
||||
addps xmm4,xmm7
|
||||
movaps XMMWORD PTR [ecx+16],xmm4
|
||||
|
||||
movaps xmm4, XMMWORD PTR [edx+32] ; r08, r09, r10, r11
|
||||
movaps xmm5,xmm4
|
||||
movaps xmm6,xmm4
|
||||
movaps xmm7,xmm4
|
||||
pshufd xmm5, xmm4, 01010101b
|
||||
pshufd xmm6, xmm4, 10101010b
|
||||
pshufd xmm7, xmm4, 11111111b
|
||||
shufps xmm4, xmm4, 00000000b
|
||||
shufps xmm5,xmm5,01010101b
|
||||
shufps xmm6,xmm6,10101010b
|
||||
shufps xmm7,xmm7,11111111b
|
||||
mulps xmm4,xmm0
|
||||
mulps xmm5,xmm1
|
||||
mulps xmm6,xmm2
|
||||
|
@ -121,14 +101,12 @@
|
|||
addps xmm4,xmm6
|
||||
addps xmm4,xmm7
|
||||
movaps XMMWORD PTR [ecx+32],xmm4
|
||||
|
||||
movaps xmm4, XMMWORD PTR [edx+48] ; r12, r13, r14, r15
|
||||
movaps xmm5,xmm4
|
||||
movaps xmm6,xmm4
|
||||
movaps xmm7,xmm4
|
||||
pshufd xmm5, xmm4, 01010101b
|
||||
pshufd xmm6, xmm4, 10101010b
|
||||
pshufd xmm7, xmm4, 11111111b
|
||||
shufps xmm4, xmm4, 00000000b
|
||||
shufps xmm5,xmm5,01010101b
|
||||
shufps xmm6,xmm6,10101010b
|
||||
shufps xmm7,xmm7,11111111b
|
||||
mulps xmm4,xmm0
|
||||
mulps xmm5,xmm1
|
||||
mulps xmm6,xmm2
|
||||
|
@ -137,44 +115,33 @@
|
|||
addps xmm4,xmm6
|
||||
addps xmm4,xmm7
|
||||
movaps XMMWORD PTR [ecx+48],xmm4
|
||||
|
||||
ret 0
|
||||
@_sse2_MatrixMultiply@8 ENDP
|
||||
|
||||
@_sse2_MatrixTranslate@8 PROC PUBLIC
|
||||
movaps xmm0, XMMWORD PTR [ecx]
|
||||
movaps xmm1, XMMWORD PTR [ecx+16]
|
||||
movaps xmm2, XMMWORD PTR [ecx+32]
|
||||
movaps xmm3, XMMWORD PTR [ecx+48]
|
||||
movaps xmm4, XMMWORD PTR [edx]
|
||||
movaps xmm5, xmm4
|
||||
movaps xmm6, xmm4
|
||||
movaps xmm7, xmm4
|
||||
pshufd xmm5, xmm4, 01010101b
|
||||
pshufd xmm6, xmm4, 10101010b
|
||||
shufps xmm4, xmm4, 00000000b
|
||||
shufps xmm5, xmm5, 01010101b
|
||||
shufps xmm6, xmm6, 10101010b
|
||||
mulps xmm4, xmm0
|
||||
mulps xmm5, xmm1
|
||||
mulps xmm6, xmm2
|
||||
mulps xmm4, XMMWORD PTR [ecx]
|
||||
mulps xmm5, XMMWORD PTR [ecx+16]
|
||||
mulps xmm6, XMMWORD PTR [ecx+32]
|
||||
addps xmm4, xmm5
|
||||
addps xmm4, xmm6
|
||||
addps xmm4, xmm3
|
||||
addps xmm4, XMMWORD PTR [ecx+48]
|
||||
movaps XMMWORD PTR [ecx+48], xmm4
|
||||
ret 0
|
||||
@_sse2_MatrixTranslate@8 ENDP
|
||||
|
||||
@_sse2_MatrixScale@8 PROC PUBLIC
|
||||
movaps xmm0, XMMWORD PTR [ecx]
|
||||
movaps xmm1, XMMWORD PTR [ecx+16]
|
||||
movaps xmm2, XMMWORD PTR [ecx+32]
|
||||
movaps xmm4, XMMWORD PTR [edx]
|
||||
movaps xmm5, xmm4
|
||||
movaps xmm6, xmm4
|
||||
pshufd xmm5, xmm4, 01010101b
|
||||
pshufd xmm6, xmm4, 10101010b
|
||||
shufps xmm4, xmm4, 00000000b
|
||||
shufps xmm5, xmm5, 01010101b
|
||||
shufps xmm6, xmm6, 10101010b
|
||||
mulps xmm4, xmm0
|
||||
mulps xmm5, xmm1
|
||||
mulps xmm6, xmm2
|
||||
mulps xmm4, XMMWORD PTR [ecx]
|
||||
mulps xmm5, XMMWORD PTR [ecx+16]
|
||||
mulps xmm6, XMMWORD PTR [ecx+32]
|
||||
movaps XMMWORD PTR [ecx], xmm4
|
||||
movaps XMMWORD PTR [ecx+16], xmm5
|
||||
movaps XMMWORD PTR [ecx+32], xmm6
|
||||
|
|
Loading…
Reference in New Issue