2012-04-19 19:38:29 +00:00
|
|
|
; Copyright (C) 2003-2005 Gabest
|
|
|
|
; http://www.gabest.org
|
|
|
|
;
|
|
|
|
; This Program is free software; you can redistribute it and/or modify
|
|
|
|
; it under the terms of the GNU General Public License as published by
|
|
|
|
; the Free Software Foundation; either version 2, or (at your option)
|
|
|
|
; any later version.
|
|
|
|
;
|
|
|
|
; This Program is distributed in the hope that it will be useful,
|
|
|
|
; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
; GNU General Public License for more details.
|
|
|
|
;
|
|
|
|
; You should have received a copy of the GNU General Public License
|
|
|
|
; along with GNU Make; see the file COPYING. If not, write to
|
2012-10-19 20:27:50 +00:00
|
|
|
; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
2012-04-19 19:38:29 +00:00
|
|
|
; http://www.gnu.org/copyleft/gpl.html
|
|
|
|
;
|
|
|
|
;
|
|
|
|
; Assembler setup: MASM syntax, 32-bit flat memory model, with the MMX and
; SSE/SSE2 (xmm) instruction sets enabled for the assembler.

        .686
        .model flat
        .mmx
        .xmm

        .const

; Single-precision constants written as MASM hexadecimal reals (trailing 'r'
; means the digits are the raw bit pattern of the value).
; Neither symbol is referenced by the routines visible in this file.

__uvmin DD 0d01502f9r ; -1e+010
__uvmax DD 0501502f9r ; +1e+010

        .code
|
|
|
|
|
|
|
|
;
|
|
|
|
; swizzling
|
|
|
|
;
|
|
|
|
|
|
|
|
; punpck op, sd0, sd2, s1, s3, d1, d3
;
; Interleaves two register pairs with punpckl<op> / punpckh<op>.
; Every argument except op is an XMM register NUMBER (0-7); the macro builds
; the register name with @CatStr(xmm, %n).
;
;   op   element suffix appended to punpckl/punpckh (bw, wd and qdq are the
;        values used in this file)
;   sd0  in/out: replaced by punpckl<op>(sd0, s1)
;   sd2  in/out: replaced by punpckl<op>(sd2, s3)
;   s1   in:     second operand for the sd0 pair (not modified)
;   s3   in:     second operand for the sd2 pair (not modified)
;   d1   out:    punpckh<op>(original sd0, s1)
;   d3   out:    punpckh<op>(original sd2, s3)
;
; d1 and d3 are written before any source is consumed, so they must be
; distinct from each other and from sd0, sd2, s1 and s3.

punpck macro op, sd0, sd2, s1, s3, d1, d3

        ; Keep copies of the first operands for the "high" halves.
        ; pshufd with 0e4h selects dwords 3,2,1,0 in order, i.e. a plain copy.
        movdqa  @CatStr(xmm, %d1), @CatStr(xmm, %sd0)
        pshufd  @CatStr(xmm, %d3), @CatStr(xmm, %sd2), 0e4h

        @CatStr(punpckl, op)    @CatStr(xmm, %sd0), @CatStr(xmm, %s1)
        @CatStr(punpckh, op)    @CatStr(xmm, %d1), @CatStr(xmm, %s1)
        @CatStr(punpckl, op)    @CatStr(xmm, %sd2), @CatStr(xmm, %s3)
        @CatStr(punpckh, op)    @CatStr(xmm, %d3), @CatStr(xmm, %s3)

        endm
|
|
|
|
|
|
|
|
; punpcknb
;
; Nibble-level interleave helper for the 4-bit swizzlers.
;
; Precondition: xmm7 holds the low-nibble mask (the callers in this file load
;               0f0f0f0fh into every dword of xmm7).
; Inputs:       xmm0, xmm1, xmm2, xmm3
; Scratch:      xmm4, xmm5, xmm6
;
; Step 1 (xmm0/xmm1 pair, then identically for xmm2/xmm3), per byte:
;   xmm0 <- (xmm0 & 0fh) | ((xmm1 & 0fh) << 4)      low nibbles of both
;   xmm1 <- ((xmm0 >> 4) & 0fh) | (xmm1 & 0f0h)     high nibbles of both
; The 64-bit shifts leak nibbles across byte boundaries, but the following
; pand/pandn with the mask discards the leaked bits.
;
; Step 2: punpck bw, 0, 2, 1, 3, 4, 6 byte-interleaves the results:
;   xmm0 <- punpcklbw(xmm0, xmm1)    xmm4 <- punpckhbw(xmm0, xmm1)
;   xmm2 <- punpcklbw(xmm2, xmm3)    xmm6 <- punpckhbw(xmm2, xmm3)
;
; On exit the results are in xmm0, xmm4, xmm2, xmm6; xmm1 and xmm3 still hold
; the step-1 high-nibble values; xmm5 is clobbered; xmm7 is unchanged.

punpcknb macro

        ; ---- xmm0 / xmm1 pair ----
        movdqa  xmm4, xmm0              ; copy of xmm0 for the high nibbles
        pshufd  xmm5, xmm1, 0e4h        ; copy of xmm1 (identity shuffle)

        psllq   xmm1, 4                 ; low nibbles of xmm1 -> high positions
        psrlq   xmm4, 4                 ; high nibbles of xmm0 -> low positions

        movdqa  xmm6, xmm7
        pand    xmm0, xmm7              ; low nibbles of xmm0
        pandn   xmm6, xmm1              ; ~mask & (xmm1 << 4)
        por     xmm0, xmm6

        movdqa  xmm6, xmm7
        pand    xmm4, xmm7              ; mask & (xmm0 >> 4)
        pandn   xmm6, xmm5              ; high nibbles of xmm1, in place
        por     xmm4, xmm6

        movdqa  xmm1, xmm4

        ; ---- xmm2 / xmm3 pair (same sequence) ----
        movdqa  xmm4, xmm2
        pshufd  xmm5, xmm3, 0e4h

        psllq   xmm3, 4
        psrlq   xmm4, 4

        movdqa  xmm6, xmm7
        pand    xmm2, xmm7
        pandn   xmm6, xmm3
        por     xmm2, xmm6

        movdqa  xmm6, xmm7
        pand    xmm4, xmm7
        pandn   xmm6, xmm5
        por     xmm4, xmm6

        movdqa  xmm3, xmm4

        ; ---- byte interleave of the merged registers ----
        punpck  bw, 0, 2, 1, 3, 4, 6

        endm
|
|
|
|
|
|
|
|
|
|
|
|
;
|
|
|
|
; swizzling
|
|
|
|
;
|
|
|
|
|
|
|
|
;
|
|
|
|
; SwizzleBlock32
|
|
|
|
;
|
|
|
|
|
|
|
|
; @SwizzleBlock32_sse2@16
;
; Reads 8 source rows of 32 bytes each and writes them as one contiguous
; 256-byte block, two rows (64 bytes) per loop iteration.
;
; The decorated name, register arguments and "ret 8" match the Microsoft
; 32-bit __fastcall convention with 16 bytes of arguments.
;
; In:    ecx       destination base (written with movntps, and read through
;                  pandn memory operands on the masked path: must be 16-byte
;                  aligned)
;        edx       source base (read with movdqa: must be 16-byte aligned)
;        [esp+4]   byte distance between consecutive source rows (must keep
;                  every row 16-byte aligned)
;        [esp+8]   32-bit write mask, replicated to every dword; 0ffffffffh
;                  selects the unmasked path
; Out:   none in registers
; Saved: esi, edi
; Clobb: eax, ecx, edx, flags, xmm0-xmm6 (plus xmm7 on the masked path)
;
; Presumably declared in C as
;   void __fastcall SwizzleBlock32_sse2(BYTE* dst, BYTE* src, int srcpitch, DWORD WriteMask)
; -- TODO confirm against the caller's prototype.

@SwizzleBlock32_sse2@16 proc public

        push    esi
        push    edi

        mov     edi, ecx                ; edi = destination cursor
        mov     esi, edx                ; esi = source cursor
        mov     edx, [esp+4+8]          ; row stride (+8 skips the two pushes)
        mov     ecx, 4                  ; 4 iterations x 2 rows

        mov     eax, [esp+8+8]          ; write mask
        cmp     eax, 0ffffffffh
        jne     SwizzleBlock32_sse2@WM

        ; ---- unmasked path ----
        align 16
@@:
        movdqa  xmm0, [esi]             ; row 0, bytes  0..15
        movdqa  xmm4, [esi+16]          ; row 0, bytes 16..31
        movdqa  xmm1, [esi+edx]         ; row 1, bytes  0..15
        movdqa  xmm5, [esi+edx+16]      ; row 1, bytes 16..31

        ; xmm0/xmm2 = low/high qword interleave of xmm0,xmm1
        ; xmm4/xmm6 = low/high qword interleave of xmm4,xmm5
        punpck  qdq, 0, 4, 1, 5, 2, 6

        movntps [edi+16*0], xmm0
        movntps [edi+16*1], xmm2
        movntps [edi+16*2], xmm4
        movntps [edi+16*3], xmm6

        lea     esi, [esi+edx*2]        ; advance two source rows
        add     edi, 64

        dec     ecx
        jnz     @B

        pop     edi
        pop     esi

        ret     8

        ; ---- masked path: dst = (new & mask) | (dst & ~mask) ----
SwizzleBlock32_sse2@WM:

        movd    xmm7, eax
        pshufd  xmm7, xmm7, 0           ; broadcast the mask to all four dwords

        align 16
@@:
        movdqa  xmm0, [esi]
        movdqa  xmm4, [esi+16]
        movdqa  xmm1, [esi+edx]
        movdqa  xmm5, [esi+edx+16]

        punpck  qdq, 0, 4, 1, 5, 2, 6

        ; xmm1/xmm3/xmm5 are free after the interleave; xmm3 and xmm5 are
        ; reused as copies of the mask for pandn.
        movdqa  xmm3, xmm7
        pshufd  xmm5, xmm7, 0e4h        ; identity shuffle = copy of the mask

        ; NOTE(review): the masked path of @SwizzleBlock32u_sse2@16 stores with
        ; movdqa here, while this routine uses movntps on a line it has just
        ; read. Confirm which is intended; measure before changing.
        pandn   xmm3, [edi+16*0]        ; old destination bits outside the mask
        pand    xmm0, xmm7              ; new bits inside the mask
        por     xmm0, xmm3
        movntps [edi+16*0], xmm0

        pandn   xmm5, [edi+16*1]
        pand    xmm2, xmm7
        por     xmm2, xmm5
        movntps [edi+16*1], xmm2

        movdqa  xmm3, xmm7
        pshufd  xmm5, xmm7, 0e4h

        pandn   xmm3, [edi+16*2]
        pand    xmm4, xmm7
        por     xmm4, xmm3
        movntps [edi+16*2], xmm4

        pandn   xmm5, [edi+16*3]
        pand    xmm6, xmm7
        por     xmm6, xmm5
        movntps [edi+16*3], xmm6

        lea     esi, [esi+edx*2]
        add     edi, 64

        dec     ecx
        jnz     @B

        pop     edi
        pop     esi

        ret     8

@SwizzleBlock32_sse2@16 endp
|
|
|
|
|
|
|
|
;
|
|
|
|
; SwizzleBlock16
|
|
|
|
;
|
|
|
|
|
|
|
|
; @SwizzleBlock16_sse2@12
;
; Reads 8 source rows of 32 bytes each and writes one contiguous 256-byte
; block. Each iteration takes two rows, interleaves them at 16-bit and then
; 64-bit granularity, and emits 64 bytes.
;
; The decorated name, register arguments and "ret 4" match the Microsoft
; 32-bit __fastcall convention with 12 bytes of arguments.
;
; In:    ecx       destination base (movntps: must be 16-byte aligned)
;        edx       source base (movdqa: must be 16-byte aligned)
;        [esp+4]   byte distance between consecutive source rows (must keep
;                  every row 16-byte aligned)
; Saved: ebx
; Clobb: eax, ecx, edx, flags, xmm0-xmm6

@SwizzleBlock16_sse2@12 proc public

        push    ebx

        mov     ebx, [esp+4+4]          ; row stride (+4 skips the pushed ebx)
        mov     eax, 4                  ; 4 iterations x 2 rows

        align 16
@@:
        movdqa  xmm0, [edx]             ; row 0, bytes  0..15
        movdqa  xmm1, [edx+16]          ; row 0, bytes 16..31
        movdqa  xmm2, [edx+ebx]         ; row 1, bytes  0..15
        movdqa  xmm3, [edx+ebx+16]      ; row 1, bytes 16..31

        punpck  wd, 0, 2, 1, 3, 4, 6    ; word interleave  -> xmm0, xmm4, xmm2, xmm6
        punpck  qdq, 0, 4, 2, 6, 1, 5   ; qword interleave -> xmm0, xmm1, xmm4, xmm5

        movntps [ecx+16*0], xmm0
        movntps [ecx+16*1], xmm1
        movntps [ecx+16*2], xmm4
        movntps [ecx+16*3], xmm5

        lea     edx, [edx+ebx*2]        ; advance two source rows
        add     ecx, 64

        dec     eax
        jnz     @B

        pop     ebx

        ret     4

@SwizzleBlock16_sse2@12 endp
|
|
|
|
|
|
|
|
;
|
|
|
|
; SwizzleBlock8
|
|
|
|
;
|
|
|
|
|
|
|
|
; @SwizzleBlock8_sse2@12
;
; Reads 16 source rows of 16 bytes each and writes one contiguous 256-byte
; block. Each of the two loop iterations consumes 8 rows and emits 128 bytes
; in two 64-byte halves (the "col 0, 2" and "col 1, 3" sections below).
;
; The decorated name, register arguments and "ret 4" match the Microsoft
; 32-bit __fastcall convention with 12 bytes of arguments.
;
; In:    ecx       destination base (movntps: must be 16-byte aligned)
;        edx       source base (movdqa and pshufd memory operands: must be
;                  16-byte aligned)
;        [esp+4]   byte distance between consecutive source rows (must keep
;                  every row 16-byte aligned)
; Saved: ebx
; Clobb: eax, ecx, edx, flags, xmm0-xmm6
;
; pshufd with 0b1h swaps the two dwords inside each 64-bit half.

@SwizzleBlock8_sse2@12 proc public

        push    ebx

        mov     ebx, [esp+4+4]          ; row stride (+4 skips the pushed ebx)
        mov     eax, 2                  ; 2 iterations x 8 rows

        align 16
@@:
        ; col 0, 2

        movdqa  xmm0, [edx]
        movdqa  xmm2, [edx+ebx]
        lea     edx, [edx+ebx*2]

        pshufd  xmm1, [edx], 0b1h       ; second row pair is loaded dword-swapped
        pshufd  xmm3, [edx+ebx], 0b1h
        lea     edx, [edx+ebx*2]

        punpck  bw, 0, 2, 1, 3, 4, 6
        punpck  wd, 0, 2, 4, 6, 1, 3
        punpck  qdq, 0, 1, 2, 3, 4, 5

        movntps [ecx+16*0], xmm0
        movntps [ecx+16*1], xmm4
        movntps [ecx+16*2], xmm1
        movntps [ecx+16*3], xmm5

        ; col 1, 3

        pshufd  xmm0, [edx], 0b1h       ; here the first row pair is dword-swapped
        pshufd  xmm2, [edx+ebx], 0b1h
        lea     edx, [edx+ebx*2]

        movdqa  xmm1, [edx]
        movdqa  xmm3, [edx+ebx]
        lea     edx, [edx+ebx*2]

        punpck  bw, 0, 2, 1, 3, 4, 6
        punpck  wd, 0, 2, 4, 6, 1, 3
        punpck  qdq, 0, 1, 2, 3, 4, 5

        movntps [ecx+16*4], xmm0
        movntps [ecx+16*5], xmm4
        movntps [ecx+16*6], xmm1
        movntps [ecx+16*7], xmm5

        add     ecx, 128

        dec     eax
        jnz     @B

        pop     ebx

        ret     4

@SwizzleBlock8_sse2@12 endp
|
|
|
|
|
|
|
|
;
|
|
|
|
; SwizzleBlock4
|
|
|
|
;
|
|
|
|
|
|
|
|
; @SwizzleBlock4_sse2@12
;
; Reads 16 source rows of 16 bytes each and writes one contiguous 256-byte
; block, regrouping the data at nibble (4-bit) granularity via punpcknb.
; Each of the two loop iterations consumes 8 rows and emits 128 bytes in two
; 64-byte halves (the "col 0, 2" and "col 1, 3" sections below).
;
; The decorated name, register arguments and "ret 4" match the Microsoft
; 32-bit __fastcall convention with 12 bytes of arguments.
;
; In:    ecx       destination base (movntps: must be 16-byte aligned)
;        edx       source base (movdqa: must be 16-byte aligned)
;        [esp+4]   byte distance between consecutive source rows (must keep
;                  every row 16-byte aligned)
; Saved: ebx
; Clobb: eax, ecx, edx, flags, xmm0-xmm7
;
; pshuflw / pshufhw with 0b1h swap adjacent 16-bit words inside the low and
; high 64-bit halves respectively.

@SwizzleBlock4_sse2@12 proc public

        push    ebx

        mov     eax, 0f0f0f0fh
        movd    xmm7, eax
        pshufd  xmm7, xmm7, 0           ; xmm7 = low-nibble mask, needed by punpcknb

        mov     ebx, [esp+4+4]          ; row stride (+4 skips the pushed ebx)
        mov     eax, 2                  ; 2 iterations x 8 rows

        align 16
@@:
        ; col 0, 2

        movdqa  xmm0, [edx]
        movdqa  xmm2, [edx+ebx]
        lea     edx, [edx+ebx*2]

        movdqa  xmm1, [edx]
        movdqa  xmm3, [edx+ebx]
        lea     edx, [edx+ebx*2]

        pshuflw xmm1, xmm1, 0b1h        ; second row pair is word-swapped
        pshuflw xmm3, xmm3, 0b1h
        pshufhw xmm1, xmm1, 0b1h
        pshufhw xmm3, xmm3, 0b1h

        punpcknb                        ; results in xmm0, xmm4, xmm2, xmm6
        punpck  bw, 0, 2, 4, 6, 1, 3
        punpck  bw, 0, 2, 1, 3, 4, 6
        punpck  qdq, 0, 4, 2, 6, 1, 3

        movntps [ecx+16*0], xmm0
        movntps [ecx+16*1], xmm1
        movntps [ecx+16*2], xmm4
        movntps [ecx+16*3], xmm3

        ; col 1, 3

        movdqa  xmm0, [edx]
        movdqa  xmm2, [edx+ebx]
        lea     edx, [edx+ebx*2]

        movdqa  xmm1, [edx]
        movdqa  xmm3, [edx+ebx]
        lea     edx, [edx+ebx*2]

        pshuflw xmm0, xmm0, 0b1h        ; here the first row pair is word-swapped
        pshuflw xmm2, xmm2, 0b1h
        pshufhw xmm0, xmm0, 0b1h
        pshufhw xmm2, xmm2, 0b1h

        punpcknb
        punpck  bw, 0, 2, 4, 6, 1, 3
        punpck  bw, 0, 2, 1, 3, 4, 6
        punpck  qdq, 0, 4, 2, 6, 1, 3

        movntps [ecx+16*4], xmm0
        movntps [ecx+16*5], xmm1
        movntps [ecx+16*6], xmm4
        movntps [ecx+16*7], xmm3

        add     ecx, 128

        dec     eax
        jnz     @B

        pop     ebx

        ret     4

@SwizzleBlock4_sse2@12 endp
|
|
|
|
|
|
|
|
;
|
|
|
|
; swizzling with unaligned reads
|
|
|
|
;
|
|
|
|
|
|
|
|
;
|
|
|
|
; SwizzleBlock32u
|
|
|
|
;
|
|
|
|
|
|
|
|
; @SwizzleBlock32u_sse2@16
;
; Same data movement as @SwizzleBlock32_sse2@16 (8 source rows of 32 bytes
; into one contiguous 256-byte block), but the source is read with movdqu,
; so neither the source pointer nor the row stride needs 16-byte alignment.
;
; The decorated name, register arguments and "ret 8" match the Microsoft
; 32-bit __fastcall convention with 16 bytes of arguments.
;
; In:    ecx       destination base (movntps / movdqa stores and pandn memory
;                  operands: must still be 16-byte aligned)
;        edx       source base (any alignment)
;        [esp+4]   byte distance between consecutive source rows
;        [esp+8]   32-bit write mask, replicated to every dword; 0ffffffffh
;                  selects the unmasked path
; Saved: esi, edi
; Clobb: eax, ecx, edx, flags, xmm0-xmm6 (plus xmm7 on the masked path)

@SwizzleBlock32u_sse2@16 proc public

        push    esi
        push    edi

        mov     edi, ecx                ; edi = destination cursor
        mov     esi, edx                ; esi = source cursor
        mov     edx, [esp+4+8]          ; row stride (+8 skips the two pushes)
        mov     ecx, 4                  ; 4 iterations x 2 rows

        mov     eax, [esp+8+8]          ; write mask
        cmp     eax, 0ffffffffh
        jne     SwizzleBlock32u_sse2@WM

        ; ---- unmasked path ----
        align 16
@@:
        movdqu  xmm0, [esi]             ; row 0, bytes  0..15
        movdqu  xmm4, [esi+16]          ; row 0, bytes 16..31
        movdqu  xmm1, [esi+edx]         ; row 1, bytes  0..15
        movdqu  xmm5, [esi+edx+16]      ; row 1, bytes 16..31

        punpck  qdq, 0, 4, 1, 5, 2, 6   ; -> xmm0, xmm2, xmm4, xmm6

        movntps [edi+16*0], xmm0
        movntps [edi+16*1], xmm2
        movntps [edi+16*2], xmm4
        movntps [edi+16*3], xmm6

        lea     esi, [esi+edx*2]        ; advance two source rows
        add     edi, 64

        dec     ecx
        jnz     @B

        pop     edi
        pop     esi

        ret     8

        ; ---- masked path: dst = (new & mask) | (dst & ~mask) ----
SwizzleBlock32u_sse2@WM:

        movd    xmm7, eax
        pshufd  xmm7, xmm7, 0           ; broadcast the mask to all four dwords

        align 16
@@:
        movdqu  xmm0, [esi]
        movdqu  xmm4, [esi+16]
        movdqu  xmm1, [esi+edx]
        movdqu  xmm5, [esi+edx+16]

        punpck  qdq, 0, 4, 1, 5, 2, 6

        ; xmm3 and xmm5 are reused as copies of the mask for pandn.
        movdqa  xmm3, xmm7
        pshufd  xmm5, xmm7, 0e4h        ; identity shuffle = copy of the mask

        ; NOTE(review): this path stores with movdqa, whereas the masked path
        ; of @SwizzleBlock32_sse2@16 uses movntps. Confirm the difference is
        ; intentional.
        pandn   xmm3, [edi+16*0]        ; old destination bits outside the mask
        pand    xmm0, xmm7              ; new bits inside the mask
        por     xmm0, xmm3
        movdqa  [edi+16*0], xmm0

        pandn   xmm5, [edi+16*1]
        pand    xmm2, xmm7
        por     xmm2, xmm5
        movdqa  [edi+16*1], xmm2

        movdqa  xmm3, xmm7
        pshufd  xmm5, xmm7, 0e4h

        pandn   xmm3, [edi+16*2]
        pand    xmm4, xmm7
        por     xmm4, xmm3
        movdqa  [edi+16*2], xmm4

        pandn   xmm5, [edi+16*3]
        pand    xmm6, xmm7
        por     xmm6, xmm5
        movdqa  [edi+16*3], xmm6

        lea     esi, [esi+edx*2]
        add     edi, 64

        dec     ecx
        jnz     @B

        pop     edi
        pop     esi

        ret     8

@SwizzleBlock32u_sse2@16 endp
|
|
|
|
|
|
|
|
;
|
|
|
|
; SwizzleBlock16u
|
|
|
|
;
|
|
|
|
|
|
|
|
; @SwizzleBlock16u_sse2@12
;
; Same data movement as @SwizzleBlock16_sse2@12 (8 source rows of 32 bytes
; into one contiguous 256-byte block), but the source is read with movdqu,
; so neither the source pointer nor the row stride needs 16-byte alignment.
;
; The decorated name, register arguments and "ret 4" match the Microsoft
; 32-bit __fastcall convention with 12 bytes of arguments.
;
; In:    ecx       destination base (movntps: must be 16-byte aligned)
;        edx       source base (any alignment)
;        [esp+4]   byte distance between consecutive source rows
; Saved: ebx
; Clobb: eax, ecx, edx, flags, xmm0-xmm6

@SwizzleBlock16u_sse2@12 proc public

        push    ebx

        mov     ebx, [esp+4+4]          ; row stride (+4 skips the pushed ebx)
        mov     eax, 4                  ; 4 iterations x 2 rows

        align 16
@@:
        movdqu  xmm0, [edx]             ; row 0, bytes  0..15
        movdqu  xmm1, [edx+16]          ; row 0, bytes 16..31
        movdqu  xmm2, [edx+ebx]         ; row 1, bytes  0..15
        movdqu  xmm3, [edx+ebx+16]      ; row 1, bytes 16..31

        punpck  wd, 0, 2, 1, 3, 4, 6    ; word interleave  -> xmm0, xmm4, xmm2, xmm6
        punpck  qdq, 0, 4, 2, 6, 1, 5   ; qword interleave -> xmm0, xmm1, xmm4, xmm5

        movntps [ecx+16*0], xmm0
        movntps [ecx+16*1], xmm1
        movntps [ecx+16*2], xmm4
        movntps [ecx+16*3], xmm5

        lea     edx, [edx+ebx*2]        ; advance two source rows
        add     ecx, 64

        dec     eax
        jnz     @B

        pop     ebx

        ret     4

@SwizzleBlock16u_sse2@12 endp
|
|
|
|
|
|
|
|
;
|
|
|
|
; SwizzleBlock8u
|
|
|
|
;
|
|
|
|
|
|
|
|
; @SwizzleBlock8u_sse2@12
;
; Same data movement as @SwizzleBlock8_sse2@12 (16 source rows of 16 bytes
; into one contiguous 256-byte block), but every source row is first loaded
; with movdqu and shuffled register-to-register, so neither the source
; pointer nor the row stride needs 16-byte alignment.
;
; The decorated name, register arguments and "ret 4" match the Microsoft
; 32-bit __fastcall convention with 12 bytes of arguments.
;
; In:    ecx       destination base (movntps: must be 16-byte aligned)
;        edx       source base (any alignment)
;        [esp+4]   byte distance between consecutive source rows
; Saved: ebx
; Clobb: eax, ecx, edx, flags, xmm0-xmm6
;
; pshufd with 0b1h swaps the two dwords inside each 64-bit half.

@SwizzleBlock8u_sse2@12 proc public

        push    ebx

        mov     ebx, [esp+4+4]          ; row stride (+4 skips the pushed ebx)
        mov     eax, 2                  ; 2 iterations x 8 rows

        align 16
@@:
        ; col 0, 2

        movdqu  xmm0, [edx]
        movdqu  xmm2, [edx+ebx]
        lea     edx, [edx+ebx*2]

        movdqu  xmm1, [edx]
        movdqu  xmm3, [edx+ebx]
        pshufd  xmm1, xmm1, 0b1h        ; second row pair is dword-swapped
        pshufd  xmm3, xmm3, 0b1h
        lea     edx, [edx+ebx*2]

        punpck  bw, 0, 2, 1, 3, 4, 6
        punpck  wd, 0, 2, 4, 6, 1, 3
        punpck  qdq, 0, 1, 2, 3, 4, 5

        movntps [ecx+16*0], xmm0
        movntps [ecx+16*1], xmm4
        movntps [ecx+16*2], xmm1
        movntps [ecx+16*3], xmm5

        ; col 1, 3

        movdqu  xmm0, [edx]
        movdqu  xmm2, [edx+ebx]
        pshufd  xmm0, xmm0, 0b1h        ; here the first row pair is dword-swapped
        pshufd  xmm2, xmm2, 0b1h
        lea     edx, [edx+ebx*2]

        movdqu  xmm1, [edx]
        movdqu  xmm3, [edx+ebx]
        lea     edx, [edx+ebx*2]

        punpck  bw, 0, 2, 1, 3, 4, 6
        punpck  wd, 0, 2, 4, 6, 1, 3
        punpck  qdq, 0, 1, 2, 3, 4, 5

        movntps [ecx+16*4], xmm0
        movntps [ecx+16*5], xmm4
        movntps [ecx+16*6], xmm1
        movntps [ecx+16*7], xmm5

        add     ecx, 128

        dec     eax
        jnz     @B

        pop     ebx

        ret     4

@SwizzleBlock8u_sse2@12 endp
|
|
|
|
|
|
|
|
;
|
|
|
|
; SwizzleBlock4u
|
|
|
|
;
|
|
|
|
|
|
|
|
; @SwizzleBlock4u_sse2@12
;
; Same data movement as @SwizzleBlock4_sse2@12 (16 source rows of 16 bytes
; into one contiguous 256-byte block, regrouped at nibble granularity), but
; the source is read with movdqu, so neither the source pointer nor the row
; stride needs 16-byte alignment.
;
; The decorated name, register arguments and "ret 4" match the Microsoft
; 32-bit __fastcall convention with 12 bytes of arguments.
;
; In:    ecx       destination base (movntps: must be 16-byte aligned)
;        edx       source base (any alignment)
;        [esp+4]   byte distance between consecutive source rows
; Saved: ebx
; Clobb: eax, ecx, edx, flags, xmm0-xmm7
;
; pshuflw / pshufhw with 0b1h swap adjacent 16-bit words inside the low and
; high 64-bit halves respectively.

@SwizzleBlock4u_sse2@12 proc public

        push    ebx

        mov     eax, 0f0f0f0fh
        movd    xmm7, eax
        pshufd  xmm7, xmm7, 0           ; xmm7 = low-nibble mask, needed by punpcknb

        mov     ebx, [esp+4+4]          ; row stride (+4 skips the pushed ebx)
        mov     eax, 2                  ; 2 iterations x 8 rows

        align 16
@@:
        ; col 0, 2

        movdqu  xmm0, [edx]
        movdqu  xmm2, [edx+ebx]
        lea     edx, [edx+ebx*2]

        movdqu  xmm1, [edx]
        movdqu  xmm3, [edx+ebx]
        lea     edx, [edx+ebx*2]

        pshuflw xmm1, xmm1, 0b1h        ; second row pair is word-swapped
        pshuflw xmm3, xmm3, 0b1h
        pshufhw xmm1, xmm1, 0b1h
        pshufhw xmm3, xmm3, 0b1h

        punpcknb                        ; results in xmm0, xmm4, xmm2, xmm6
        punpck  bw, 0, 2, 4, 6, 1, 3
        punpck  bw, 0, 2, 1, 3, 4, 6
        punpck  qdq, 0, 4, 2, 6, 1, 3

        movntps [ecx+16*0], xmm0
        movntps [ecx+16*1], xmm1
        movntps [ecx+16*2], xmm4
        movntps [ecx+16*3], xmm3

        ; col 1, 3

        movdqu  xmm0, [edx]
        movdqu  xmm2, [edx+ebx]
        lea     edx, [edx+ebx*2]

        movdqu  xmm1, [edx]
        movdqu  xmm3, [edx+ebx]
        lea     edx, [edx+ebx*2]

        pshuflw xmm0, xmm0, 0b1h        ; here the first row pair is word-swapped
        pshuflw xmm2, xmm2, 0b1h
        pshufhw xmm0, xmm0, 0b1h
        pshufhw xmm2, xmm2, 0b1h

        punpcknb
        punpck  bw, 0, 2, 4, 6, 1, 3
        punpck  bw, 0, 2, 1, 3, 4, 6
        punpck  qdq, 0, 4, 2, 6, 1, 3

        movntps [ecx+16*4], xmm0
        movntps [ecx+16*5], xmm1
        movntps [ecx+16*6], xmm4
        movntps [ecx+16*7], xmm3

        add     ecx, 128

        dec     eax
        jnz     @B

        pop     ebx

        ret     4

@SwizzleBlock4u_sse2@12 endp
|
|
|
|
|
|
|
|
end
|