mirror of https://github.com/PCSX2/pcsx2.git
1353 lines
20 KiB
NASM
1353 lines
20 KiB
NASM
; Copyright (C) 2003-2005 Gabest
|
|
; http://www.gabest.org
|
|
;
|
|
; This Program is free software; you can redistribute it and/or modify
|
|
; it under the terms of the GNU General Public License as published by
|
|
; the Free Software Foundation; either version 2, or (at your option)
|
|
; any later version.
|
|
;
|
|
; This Program is distributed in the hope that it will be useful,
|
|
; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
; GNU General Public License for more details.
|
|
;
|
|
; You should have received a copy of the GNU General Public License
|
|
; along with GNU Make; see the file COPYING. If not, write to
|
|
; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
; http://www.gnu.org/copyleft/gpl.html
|
|
;
|
|
;
|
|
.686
|
|
.model flat
|
|
.mmx
|
|
.xmm
|
|
|
|
.const
|
|
|
|
; Two 32-bit constants written as MASM encoded-real literals: the trailing
; "r" marks the hexadecimal digits as the raw bit pattern of the value.
; Neither symbol is referenced anywhere else in this file.
__uvmin DD 0d01502f9r ; -1e+010 (bit pattern D01502F9h)
|
|
__uvmax DD 0501502f9r ; +1e+010 (bit pattern 501502F9h)
|
|
|
|
.code
|
|
|
|
;
|
|
; memsetd
|
|
;
|
|
|
|
;
; @memsetd@12 - store one 32-bit value into a run of consecutive dwords.
;
; The decorated name, the register arguments and the "ret 4" epilogue follow
; the fastcall pattern: two arguments in ecx/edx, the third on the stack,
; stack argument removed by the callee.
;
; In:    ecx     = address of the first dword to write
;        edx     = value to store
;        [esp+4] = number of dwords to store
; Out:   nothing is deliberately returned (eax is left holding the value)
; Uses:  eax, ecx; edi is saved and restored; the direction flag is cleared
;
@memsetd@12 proc public

    push edi                ; edi is the stosd destination register

    cld                     ; rep stosd must walk upwards
    mov eax, edx            ; stosd writes eax
    mov edi, ecx            ; destination
    mov ecx, [esp+8]        ; count: above the saved edi and the return address
    rep stosd

    pop edi

    ret 4                   ; drop the one stack argument

@memsetd@12 endp
|
|
|
|
;
|
|
; SaturateColor
|
|
;
|
|
|
|
;
; @SaturateColor_sse2@4 - clamp four signed dwords to 0..255, in place.
;
; In:    ecx = address of four consecutive 32-bit integers; accessed with
;              movdqa, so the address has to be 16-byte aligned
; Out:   the same four dwords, each limited to the range 0..255
; Uses:  xmm0, xmm1
;
@SaturateColor_sse2@4 proc public

    movdqa xmm1, [ecx]      ; the four input values
    pxor xmm0, xmm0         ; zero: filler for the packs, extension for the unpacks

    packssdw xmm1, xmm0     ; dword -> word, signed saturation
    packuswb xmm1, xmm0     ; word -> byte, unsigned saturation (now 0..255)
    punpcklbw xmm1, xmm0    ; byte -> word, zero extended
    punpcklwd xmm1, xmm0    ; word -> dword, zero extended

    movdqa [ecx], xmm1

    ret

@SaturateColor_sse2@4 endp
|
|
|
|
;
; @SaturateColor_asm@4 - clamp four signed dwords to 0..255, in place,
; using only general purpose registers and cmov.
;
; In:    ecx = address of four consecutive 32-bit integers
; Out:   the same four dwords, each limited to the range 0..255
; Uses:  eax, ecx, edx, flags; esi is saved and restored
;
@SaturateColor_asm@4 proc public

    push esi

    mov esi, ecx            ; esi = base address, ecx becomes the work register
    mov edx, 000000ffh      ; upper bound
    xor eax, eax            ; lower bound

    ; The same clamp is emitted once per dword, at byte offsets 0, 4, 8 and 12.
    for lane, <0,4,8,12>
    mov ecx, [esi+lane]
    cmp ecx, eax
    cmovl ecx, eax          ;; below 0 (signed)   -> 0
    cmp ecx, edx
    cmovg ecx, edx          ;; above 255 (signed) -> 255
    mov [esi+lane], ecx
    endm

    pop esi

    ret

@SaturateColor_asm@4 endp
|
|
|
|
;
|
|
; helper macros shared by the swizzling and unswizzling routines, followed by the unswizzling routines
|
|
;
|
|
|
|
;
; punpck op, sd0, sd2, s1, s3, d1, d3
;
; Interleaves two register pairs with punpckl<op> / punpckh<op>.
; op is the instruction suffix (bw, wd, dq or qdq in this file); the other
; six arguments are xmm register numbers.
;
;   xmm<sd0> = punpckl<op>(xmm<sd0>, xmm<s1>)
;   xmm<d1>  = punpckh<op>(old xmm<sd0>, xmm<s1>)
;   xmm<sd2> = punpckl<op>(xmm<sd2>, xmm<s3>)
;   xmm<d3>  = punpckh<op>(old xmm<sd2>, xmm<s3>)
;
; xmm<s1> and xmm<s3> are only read. The six register numbers have to be
; pairwise different; every invocation in this file satisfies that.
;
punpck macro op, sd0, sd2, s1, s3, d1, d3

    ;; first pair: keep a copy of sd0 for the high half
    movdqa @CatStr(xmm, %d1), @CatStr(xmm, %sd0)
    @CatStr(punpckl, op) @CatStr(xmm, %sd0), @CatStr(xmm, %s1)
    @CatStr(punpckh, op) @CatStr(xmm, %d1), @CatStr(xmm, %s1)

    ;; second pair: pshufd with 0e4h is the identity shuffle, i.e. a plain copy
    pshufd @CatStr(xmm, %d3), @CatStr(xmm, %sd2), 0e4h
    @CatStr(punpckl, op) @CatStr(xmm, %sd2), @CatStr(xmm, %s3)
    @CatStr(punpckh, op) @CatStr(xmm, %d3), @CatStr(xmm, %s3)

    endm
|
|
|
|
;
; punpcknb_pair ra, rb  (helper for punpcknb)
;
; Regroups the bits of xmm<ra> and xmm<rb> byte by byte, using the mask in xmm7:
;
;   xmm<ra> = (xmm<ra> AND xmm7)          OR ((xmm<rb> shl 4) AND NOT xmm7)
;   xmm<rb> = ((old xmm<ra> shr 4) AND xmm7) OR (old xmm<rb> AND NOT xmm7)
;
; The shifts are 64-bit shifts by 4 bits. With the 0fh-per-byte mask that the
; callers in this file load into xmm7, xmm<ra> ends up holding the low nibbles
; of both inputs and xmm<rb> the high nibbles of both inputs.
; Scratch registers: xmm4, xmm5, xmm6.
;
punpcknb_pair macro ra, rb

    movdqa xmm4, @CatStr(xmm, %ra)              ;; xmm4 = copy of ra
    pshufd xmm5, @CatStr(xmm, %rb), 0e4h        ;; xmm5 = copy of rb (identity shuffle)

    psllq @CatStr(xmm, %rb), 4
    psrlq xmm4, 4

    movdqa xmm6, xmm7
    pand @CatStr(xmm, %ra), xmm7
    pandn xmm6, @CatStr(xmm, %rb)
    por @CatStr(xmm, %ra), xmm6

    movdqa xmm6, xmm7
    pand xmm4, xmm7
    pandn xmm6, xmm5
    por xmm4, xmm6

    movdqa @CatStr(xmm, %rb), xmm4

    endm

;
; punpcknb
;
; Applies punpcknb_pair to (xmm0, xmm1) and to (xmm2, xmm3), then interleaves
; the results bytewise with "punpck bw, 0, 2, 1, 3, 4, 6".
;
; In:    xmm0..xmm3 = data, xmm7 = mask
; Out:   xmm0, xmm4 = low / high byte interleave of the regrouped xmm0, xmm1
;        xmm2, xmm6 = low / high byte interleave of the regrouped xmm2, xmm3
; Uses:  xmm1, xmm3, xmm5 are overwritten as well; xmm7 is preserved
;
punpcknb macro

    punpcknb_pair 0, 1
    punpcknb_pair 2, 3

    punpck bw, 0, 2, 1, 3, 4, 6

    endm
|
|
|
|
;
|
|
; unSwizzleBlock32
|
|
;
|
|
|
|
;
; unswz32_quad first, rowA, rowB  (helper for @unSwizzleBlock32_sse2@12)
;
; Loads the four 16-byte source vectors first .. first+3 from ecx, splits
; them into low and high quadwords and stores two 32-byte destination rows:
; rowA receives the four low quadwords, rowB the four high quadwords.
; rowA / rowB are address expressions without the brackets.
;
unswz32_quad macro first, rowA, rowB

    movdqa xmm0, [ecx+16*(first+0)]
    movdqa xmm1, [ecx+16*(first+1)]
    movdqa xmm2, [ecx+16*(first+2)]
    movdqa xmm3, [ecx+16*(first+3)]

    punpck qdq, 0, 2, 1, 3, 4, 6

    movdqa [rowA], xmm0
    movdqa [rowA+16], xmm2
    movdqa [rowB], xmm4
    movdqa [rowB+16], xmm6

    endm

;
; @unSwizzleBlock32_sse2@12
;
; Reads 256 contiguous bytes at ecx and writes them as eight rows of 32
; bytes starting at edx.
;
; In:    ecx     = source, 16-byte aligned (movdqa)
;        edx     = first destination row, 16-byte aligned
;        [esp+4] = distance in bytes between destination rows; every row is
;                  stored with movdqa, so it has to keep rows 16-byte aligned
; Uses:  eax, edx, xmm0-xmm4, xmm6; ebx is saved and restored
;
@unSwizzleBlock32_sse2@12 proc public

    push ebx

    mov ebx, [esp+8]            ; row pitch (above saved ebx and return address)
    lea eax, [ebx+ebx*2]        ; eax = pitch * 3

    unswz32_quad 0, edx, edx+ebx            ; rows 0, 1
    unswz32_quad 4, edx+ebx*2, edx+eax      ; rows 2, 3

    lea edx, [edx+ebx*4]                    ; advance four rows

    unswz32_quad 8, edx, edx+ebx            ; rows 4, 5
    unswz32_quad 12, edx+ebx*2, edx+eax     ; rows 6, 7

    pop ebx

    ret 4

@unSwizzleBlock32_sse2@12 endp
|
|
|
|
;
|
|
; unSwizzleBlock16
|
|
;
|
|
|
|
;
; @unSwizzleBlock16_sse2@12
;
; Reads 256 contiguous bytes at ecx and writes them, regrouped at 16-bit
; granularity, as eight rows of 32 bytes starting at edx. Each of the four
; loop passes consumes 64 source bytes and produces two rows.
;
; In:    ecx     = source, 16-byte aligned (movdqa)
;        edx     = first destination row, 16-byte aligned
;        [esp+4] = distance in bytes between destination rows
; Uses:  eax, ecx, edx, xmm0-xmm4, xmm6; ebx is saved and restored
;
@unSwizzleBlock16_sse2@12 proc public

    push ebx

    mov eax, 4                  ; passes remaining
    mov ebx, [esp+8]            ; row pitch

    align 16
unSwizzleBlock16_sse2@pass:
    movdqa xmm0, [ecx]
    movdqa xmm1, [ecx+16]
    movdqa xmm2, [ecx+32]
    movdqa xmm3, [ecx+48]

    punpck wd, 0, 2, 1, 3, 4, 6
    punpck dq, 0, 4, 2, 6, 1, 3
    punpck wd, 0, 4, 1, 3, 2, 6

    movdqa [edx], xmm0          ; first row of this pass
    movdqa [edx+16], xmm2
    movdqa [edx+ebx], xmm4      ; second row of this pass
    movdqa [edx+ebx+16], xmm6

    lea edx, [edx+ebx*2]        ; two rows further down
    add ecx, 64                 ; next 64 source bytes

    dec eax
    jnz unSwizzleBlock16_sse2@pass

    pop ebx

    ret 4

@unSwizzleBlock16_sse2@12 endp
|
|
|
|
;
|
|
; unSwizzleBlock8
|
|
;
|
|
|
|
;
; unswz8_gather first  (helper for @unSwizzleBlock8_sse2@12)
;
; Loads source vectors first .. first+3 from ecx into xmm0, xmm1, xmm4, xmm5
; and runs the bw / wd / bw / qdq interleave sequence. The four result
; vectors are left in xmm0, xmm2, xmm1, xmm3 (in the order they are stored).
;
unswz8_gather macro first

    movdqa xmm0, [ecx+16*(first+0)]
    movdqa xmm1, [ecx+16*(first+1)]
    movdqa xmm4, [ecx+16*(first+2)]
    movdqa xmm5, [ecx+16*(first+3)]

    punpck bw, 0, 4, 1, 5, 2, 6
    punpck wd, 0, 2, 4, 6, 1, 3
    punpck bw, 0, 2, 1, 3, 4, 6
    punpck qdq, 0, 2, 4, 6, 1, 3

    endm

;
; unswz8_emit  (helper for @unSwizzleBlock8_sse2@12)
;
; Stores xmm0, xmm2, xmm1, xmm3 as four consecutive 16-byte rows and leaves
; edx pointing at the row after them.
;
unswz8_emit macro

    movdqa [edx], xmm0
    movdqa [edx+ebx], xmm2
    lea edx, [edx+ebx*2]

    movdqa [edx], xmm1
    movdqa [edx+ebx], xmm3
    lea edx, [edx+ebx*2]

    endm

;
; @unSwizzleBlock8_sse2@12
;
; Reads 256 contiguous bytes at ecx and writes them, regrouped at byte
; granularity, as sixteen rows of 16 bytes starting at edx. Each of the two
; loop passes handles 128 source bytes in two halves; the halves differ only
; in which pair of result registers gets its dwords swapped.
;
; In:    ecx     = source, 16-byte aligned (movdqa)
;        edx     = first destination row, 16-byte aligned
;        [esp+4] = distance in bytes between destination rows
; Uses:  eax, ecx, edx, xmm0-xmm6; ebx is saved and restored
;
@unSwizzleBlock8_sse2@12 proc public

    push ebx

    mov eax, 2                  ; passes remaining
    mov ebx, [esp+8]            ; row pitch

    align 16
unSwizzleBlock8_sse2@pass:

    ; col 0, 2

    unswz8_gather 0

    pshufd xmm1, xmm1, 0b1h     ; swap the two dwords of every qword
    pshufd xmm3, xmm3, 0b1h

    unswz8_emit

    ; col 1, 3

    unswz8_gather 4

    pshufd xmm0, xmm0, 0b1h     ; here the other register pair is swapped
    pshufd xmm2, xmm2, 0b1h

    unswz8_emit

    add ecx, 128

    dec eax
    jnz unSwizzleBlock8_sse2@pass

    pop ebx

    ret 4

@unSwizzleBlock8_sse2@12 endp
|
|
|
|
;
|
|
; unSwizzleBlock4
|
|
;
|
|
|
|
;
; unswz4_gather first  (helper for @unSwizzleBlock4_sse2@12)
;
; Loads source vectors first .. first+3 from ecx into xmm0, xmm1, xmm4, xmm3
; and runs the complete interleave sequence, including the nibble regrouping
; done by punpcknb (which needs the mask in xmm7). The four result vectors
; are left in xmm0, xmm2, xmm1, xmm3 (in the order they are stored).
;
unswz4_gather macro first

    movdqa xmm0, [ecx+16*(first+0)]
    movdqa xmm1, [ecx+16*(first+1)]
    movdqa xmm4, [ecx+16*(first+2)]
    movdqa xmm3, [ecx+16*(first+3)]

    punpck dq, 0, 4, 1, 3, 2, 6
    punpck dq, 0, 2, 4, 6, 1, 3
    punpcknb
    punpck bw, 0, 2, 4, 6, 1, 3
    punpck wd, 0, 2, 1, 3, 4, 6

    pshufd xmm0, xmm0, 0d8h     ;; 0d8h exchanges the two middle dwords
    pshufd xmm2, xmm2, 0d8h
    pshufd xmm4, xmm4, 0d8h
    pshufd xmm6, xmm6, 0d8h

    punpck qdq, 0, 2, 4, 6, 1, 3

    endm

;
; unswz4_flipwords ra, rb  (helper for @unSwizzleBlock4_sse2@12)
;
; Swaps the two 16-bit words inside every dword of xmm<ra> and xmm<rb>.
;
unswz4_flipwords macro ra, rb

    pshuflw @CatStr(xmm, %ra), @CatStr(xmm, %ra), 0b1h
    pshuflw @CatStr(xmm, %rb), @CatStr(xmm, %rb), 0b1h
    pshufhw @CatStr(xmm, %ra), @CatStr(xmm, %ra), 0b1h
    pshufhw @CatStr(xmm, %rb), @CatStr(xmm, %rb), 0b1h

    endm

;
; unswz4_emit  (helper for @unSwizzleBlock4_sse2@12)
;
; Stores xmm0, xmm2, xmm1, xmm3 as four consecutive 16-byte rows and leaves
; edx pointing at the row after them.
;
unswz4_emit macro

    movdqa [edx], xmm0
    movdqa [edx+ebx], xmm2
    lea edx, [edx+ebx*2]

    movdqa [edx], xmm1
    movdqa [edx+ebx], xmm3
    lea edx, [edx+ebx*2]

    endm

;
; @unSwizzleBlock4_sse2@12
;
; Reads 256 contiguous bytes at ecx and writes them, regrouped at nibble
; granularity, as sixteen rows of 16 bytes starting at edx. Each of the two
; loop passes handles 128 source bytes in two halves; the halves differ only
; in which pair of result registers gets its words swapped.
;
; In:    ecx     = source, 16-byte aligned (movdqa)
;        edx     = first destination row, 16-byte aligned
;        [esp+4] = distance in bytes between destination rows
; Uses:  eax, ecx, edx, xmm0-xmm7; ebx is saved and restored
;
@unSwizzleBlock4_sse2@12 proc public

    push ebx

    mov ebx, [esp+8]            ; row pitch

    mov eax, 0f0f0f0fh          ; xmm7 = 0fh in every byte (low-nibble mask)
    movd xmm7, eax
    pshufd xmm7, xmm7, 0

    mov eax, 2                  ; passes remaining

    align 16
unSwizzleBlock4_sse2@pass:

    ; col 0, 2

    unswz4_gather 0
    unswz4_flipwords 1, 3
    unswz4_emit

    ; col 1, 3

    unswz4_gather 4
    unswz4_flipwords 0, 2
    unswz4_emit

    add ecx, 128

    dec eax
    jnz unSwizzleBlock4_sse2@pass

    pop ebx

    ret 4

@unSwizzleBlock4_sse2@12 endp
|
|
|
|
;
|
|
; unSwizzleBlock8HP
|
|
;
|
|
|
|
;
; @unSwizzleBlock8HP_sse2@12
;
; Reads 256 contiguous bytes at ecx as 32-bit elements, keeps only the top
; byte (bits 24..31) of every element and writes the result as eight rows of
; 8 bytes starting at edx. Each of the four loop passes consumes 64 source
; bytes and produces two rows.
;
; In:    ecx     = source, 16-byte aligned (movdqa)
;        edx     = first destination row (written with 8-byte movlps/movhps)
;        [esp+4] = distance in bytes between destination rows
; Uses:  eax, ecx, edx, xmm0-xmm4, xmm6; ebx is saved and restored
;
@unSwizzleBlock8HP_sse2@12 proc public

    push ebx

    mov eax, 4                  ; passes remaining
    mov ebx, [esp+8]            ; row pitch

    align 16
unSwizzleBlock8HP_sse2@pass:
    movdqa xmm0, [ecx]
    movdqa xmm1, [ecx+16]
    movdqa xmm2, [ecx+32]
    movdqa xmm3, [ecx+48]

    punpck qdq, 0, 2, 1, 3, 4, 6

    ; first row: shift the top byte of each dword down, then narrow to words
    psrld xmm0, 24
    psrld xmm2, 24
    packssdw xmm0, xmm2

    ; second row, same treatment
    psrld xmm4, 24
    psrld xmm6, 24
    packssdw xmm4, xmm6

    packuswb xmm0, xmm4         ; low 8 bytes = first row, high 8 bytes = second row

    movlps qword ptr [edx], xmm0
    movhps qword ptr [edx+ebx], xmm0

    lea edx, [edx+ebx*2]        ; two rows further down
    add ecx, 64                 ; next 64 source bytes

    dec eax
    jnz unSwizzleBlock8HP_sse2@pass

    pop ebx

    ret 4

@unSwizzleBlock8HP_sse2@12 endp
|
|
|
|
;
|
|
; unSwizzleBlock4HLP
|
|
;
|
|
|
|
;
; @unSwizzleBlock4HLP_sse2@12
;
; Reads 256 contiguous bytes at ecx as 32-bit elements, keeps only bits
; 24..27 of every element (the low nibble of the top byte) and writes the
; result, one byte per element, as eight rows of 8 bytes starting at edx.
; Each of the four loop passes consumes 64 source bytes and produces two rows.
;
; In:    ecx     = source, 16-byte aligned (movdqa)
;        edx     = first destination row (written with 8-byte movlps/movhps)
;        [esp+4] = distance in bytes between destination rows
; Uses:  eax, ecx, edx, xmm0-xmm4, xmm6, xmm7; ebx is saved and restored
;
@unSwizzleBlock4HLP_sse2@12 proc public

    push ebx

    mov ebx, [esp+8]            ; row pitch

    mov eax, 0f0f0f0fh          ; xmm7 = 0fh in every byte
    movd xmm7, eax
    pshufd xmm7, xmm7, 0

    mov eax, 4                  ; passes remaining

    align 16
unSwizzleBlock4HLP_sse2@pass:
    movdqa xmm0, [ecx]
    movdqa xmm1, [ecx+16]
    movdqa xmm2, [ecx+32]
    movdqa xmm3, [ecx+48]

    punpck qdq, 0, 2, 1, 3, 4, 6

    ; first row: top byte of each dword, narrowed to words
    psrld xmm0, 24
    psrld xmm2, 24
    packssdw xmm0, xmm2

    ; second row, same treatment
    psrld xmm4, 24
    psrld xmm6, 24
    packssdw xmm4, xmm6

    packuswb xmm0, xmm4         ; low 8 bytes = first row, high 8 bytes = second row

    pand xmm0, xmm7             ; keep only the low nibble of each byte

    movlps qword ptr [edx], xmm0
    movhps qword ptr [edx+ebx], xmm0

    lea edx, [edx+ebx*2]        ; two rows further down
    add ecx, 64                 ; next 64 source bytes

    dec eax
    jnz unSwizzleBlock4HLP_sse2@pass

    pop ebx

    ret 4

@unSwizzleBlock4HLP_sse2@12 endp
|
|
|
|
;
|
|
; unSwizzleBlock4HHP
|
|
;
|
|
|
|
;
; @unSwizzleBlock4HHP_sse2@12
;
; Reads 256 contiguous bytes at ecx as 32-bit elements, keeps only bits
; 28..31 of every element (the top nibble) and writes the result, one byte
; per element, as eight rows of 8 bytes starting at edx. Each of the four
; loop passes consumes 64 source bytes and produces two rows.
;
; In:    ecx     = source, 16-byte aligned (movdqa)
;        edx     = first destination row (written with 8-byte movlps/movhps)
;        [esp+4] = distance in bytes between destination rows
; Uses:  eax, ecx, edx, xmm0-xmm4, xmm6; ebx is saved and restored
;
@unSwizzleBlock4HHP_sse2@12 proc public

    push ebx

    mov eax, 4                  ; passes remaining
    mov ebx, [esp+8]            ; row pitch

    align 16
unSwizzleBlock4HHP_sse2@pass:
    movdqa xmm0, [ecx]
    movdqa xmm1, [ecx+16]
    movdqa xmm2, [ecx+32]
    movdqa xmm3, [ecx+48]

    punpck qdq, 0, 2, 1, 3, 4, 6

    ; first row: top nibble of each dword, narrowed to words
    psrld xmm0, 28
    psrld xmm2, 28
    packssdw xmm0, xmm2

    ; second row, same treatment
    psrld xmm4, 28
    psrld xmm6, 28
    packssdw xmm4, xmm6

    packuswb xmm0, xmm4         ; low 8 bytes = first row, high 8 bytes = second row

    movlps qword ptr [edx], xmm0
    movhps qword ptr [edx+ebx], xmm0

    lea edx, [edx+ebx*2]        ; two rows further down
    add ecx, 64                 ; next 64 source bytes

    dec eax
    jnz unSwizzleBlock4HHP_sse2@pass

    pop ebx

    ret 4

@unSwizzleBlock4HHP_sse2@12 endp
|
|
|
|
;
|
|
; unSwizzleBlock4P
|
|
;
|
|
|
|
;
; unswz4p_split rlo, rhi, rswap  (helper for @unSwizzleBlock4P_sse2@12)
;
; Splits the bytes of xmm<rlo> into nibbles using the 0fh mask in xmm7:
;   xmm<rhi> = high nibble of every byte, moved down into bits 0..3
;   xmm<rlo> = low nibble of every byte
; In addition the register named by rswap (either rlo or rhi) gets the two
; dwords of each qword exchanged.
;
unswz4p_split macro rlo, rhi, rswap

    movdqa @CatStr(xmm, %rhi), xmm7
    pandn @CatStr(xmm, %rhi), @CatStr(xmm, %rlo)
    pand @CatStr(xmm, %rlo), xmm7
    pshufd @CatStr(xmm, %rswap), @CatStr(xmm, %rswap), 0b1h
    psrlq @CatStr(xmm, %rhi), 4

    endm

;
; unswz4p_column first, swapLow  (helper for @unSwizzleBlock4P_sse2@12)
;
; Converts source vectors first .. first+3 (64 bytes at ecx) into four
; destination rows of 32 bytes at edx, one nibble per output byte.
; Rows 0 and 1 receive the low nibbles, rows 2 and 3 the high nibbles.
; swapLow = 0: the dword swap is applied to the high-nibble registers,
; swapLow = 1: the dword swap is applied to the low-nibble registers.
; Expects esi = row pitch, edi = row pitch * 3, xmm7 = 0fh in every byte.
;
unswz4p_column macro first, swapLow

    movdqa xmm0, [ecx+16*(first+0)]
    movdqa xmm1, [ecx+16*(first+1)]
    movdqa xmm2, [ecx+16*(first+2)]
    movdqa xmm3, [ecx+16*(first+3)]

    punpck bw, 0, 2, 1, 3, 4, 6
    punpck wd, 0, 4, 2, 6, 1, 3
    punpck bw, 0, 4, 1, 3, 2, 6

    if swapLow
    unswz4p_split 0, 1, 0
    unswz4p_split 2, 3, 2
    else
    unswz4p_split 0, 1, 1
    unswz4p_split 2, 3, 3
    endif

    movdqa [edx], xmm0              ;; row 0
    movdqa [edx+16], xmm2
    movdqa [edx+esi*2], xmm1        ;; row 2
    movdqa [edx+esi*2+16], xmm3

    if swapLow
    unswz4p_split 4, 1, 4
    unswz4p_split 6, 3, 6
    else
    unswz4p_split 4, 1, 1
    unswz4p_split 6, 3, 3
    endif

    movdqa [edx+esi], xmm4          ;; row 1
    movdqa [edx+esi+16], xmm6
    movdqa [edx+edi], xmm1          ;; row 3
    movdqa [edx+edi+16], xmm3

    endm

;
; @unSwizzleBlock4P_sse2@12
;
; Reads 256 contiguous bytes at ecx and expands every nibble into a byte of
; its own, writing sixteen rows of 32 bytes starting at edx. The work is done
; in four columns of 64 source bytes / four destination rows each.
;
; In:    ecx     = source, 16-byte aligned (movdqa)
;        edx     = first destination row, 16-byte aligned
;        [esp+4] = distance in bytes between destination rows
; Uses:  eax, edx, xmm0-xmm4, xmm6, xmm7; esi and edi are saved and restored
;
@unSwizzleBlock4P_sse2@12 proc public

    push esi
    push edi

    mov esi, [esp+12]           ; row pitch (above two saved registers and return address)
    lea edi, [esi+esi*2]        ; edi = pitch * 3

    mov eax, 0f0f0f0fh          ; xmm7 = 0fh in every byte
    movd xmm7, eax
    pshufd xmm7, xmm7, 0

    ; col 0

    unswz4p_column 0, 0
    lea edx, [edx+esi*4]

    ; col 1

    unswz4p_column 4, 1
    lea edx, [edx+esi*4]

    ; col 2

    unswz4p_column 8, 0
    lea edx, [edx+esi*4]

    ; col 3 (edx is not advanced after the last column)

    unswz4p_column 12, 1

    pop edi
    pop esi

    ret 4

@unSwizzleBlock4P_sse2@12 endp
|
|
|
|
;
|
|
; swizzling
|
|
;
|
|
|
|
;
|
|
; SwizzleBlock32
|
|
;
|
|
|
|
;
; @SwizzleBlock32_sse2@16
;
; Reads eight rows of 32 bytes starting at edx and writes them as 256
; contiguous bytes at ecx. Each of the four loop passes consumes two source
; rows and produces 64 destination bytes.
;
; In:    ecx     = destination, 16-byte aligned
;        edx     = first source row, 16-byte aligned (movdqa)
;        [esp+4] = distance in bytes between source rows
;        [esp+8] = 32-bit write mask applied to every dword:
;                  0ffffffffh -> the destination is simply overwritten,
;                  otherwise  -> dst = (src AND mask) OR (dst AND NOT mask)
; Uses:  eax, ecx, edx, xmm0-xmm7; esi and edi are saved and restored
;
; The unmasked path stores with non-temporal movntps. The masked path has to
; read each destination line before writing it, so it stores with movdqa,
; exactly as the masked path of @SwizzleBlock32u_sse2@16 does; a
; non-temporal store to a line that was just loaded brings no benefit.
; The bytes written are the same either way.
;
@SwizzleBlock32_sse2@16 proc public

    push esi
    push edi

    mov edi, ecx                ; edi = destination
    mov esi, edx                ; esi = source row
    mov edx, [esp+4+8]          ; edx = source pitch
    mov ecx, 4                  ; passes remaining

    mov eax, [esp+8+8]          ; write mask
    cmp eax, 0ffffffffh
    jnz SwizzleBlock32_sse2@WM

    ; ---- unmasked: plain overwrite ----

    align 16
@@:
    movdqa xmm0, [esi]
    movdqa xmm4, [esi+16]
    movdqa xmm1, [esi+edx]
    movdqa xmm5, [esi+edx+16]

    punpck qdq, 0, 4, 1, 5, 2, 6

    movntps [edi+16*0], xmm0
    movntps [edi+16*1], xmm2
    movntps [edi+16*2], xmm4
    movntps [edi+16*3], xmm6

    lea esi, [esi+edx*2]
    add edi, 64

    dec ecx
    jnz @B

    pop edi
    pop esi

    ret 8

    ; ---- masked: merge with the existing destination ----

SwizzleBlock32_sse2@WM:

    movd xmm7, eax              ; xmm7 = mask replicated into all four dwords
    pshufd xmm7, xmm7, 0

    align 16
@@:
    movdqa xmm0, [esi]
    movdqa xmm4, [esi+16]
    movdqa xmm1, [esi+edx]
    movdqa xmm5, [esi+edx+16]

    punpck qdq, 0, 4, 1, 5, 2, 6

    movdqa xmm3, xmm7           ; two copies of the mask for the pandn below
    pshufd xmm5, xmm7, 0e4h     ; (identity shuffle = copy)

    pandn xmm3, [edi+16*0]      ; destination bits the mask protects
    pand xmm0, xmm7             ; source bits the mask lets through
    por xmm0, xmm3
    movdqa [edi+16*0], xmm0

    pandn xmm5, [edi+16*1]
    pand xmm2, xmm7
    por xmm2, xmm5
    movdqa [edi+16*1], xmm2

    movdqa xmm3, xmm7
    pshufd xmm5, xmm7, 0e4h

    pandn xmm3, [edi+16*2]
    pand xmm4, xmm7
    por xmm4, xmm3
    movdqa [edi+16*2], xmm4

    pandn xmm5, [edi+16*3]
    pand xmm6, xmm7
    por xmm6, xmm5
    movdqa [edi+16*3], xmm6

    lea esi, [esi+edx*2]
    add edi, 64

    dec ecx
    jnz @B

    pop edi
    pop esi

    ret 8

@SwizzleBlock32_sse2@16 endp
|
|
|
|
;
|
|
; SwizzleBlock16
|
|
;
|
|
|
|
;
; @SwizzleBlock16_sse2@12
;
; Reads eight rows of 32 bytes starting at edx and writes them, regrouped at
; 16-bit granularity, as 256 contiguous bytes at ecx. Each of the four loop
; passes consumes two source rows and produces 64 destination bytes.
;
; In:    ecx     = destination, 16-byte aligned (movntps)
;        edx     = first source row, 16-byte aligned (movdqa)
;        [esp+4] = distance in bytes between source rows
; Uses:  eax, ecx, edx, xmm0-xmm6; ebx is saved and restored
;
@SwizzleBlock16_sse2@12 proc public

    push ebx

    mov eax, 4                  ; passes remaining
    mov ebx, [esp+8]            ; source pitch

    align 16
SwizzleBlock16_sse2@pass:
    movdqa xmm0, [edx]          ; first row of this pass
    movdqa xmm1, [edx+16]
    movdqa xmm2, [edx+ebx]      ; second row of this pass
    movdqa xmm3, [edx+ebx+16]

    punpck wd, 0, 2, 1, 3, 4, 6
    punpck qdq, 0, 4, 2, 6, 1, 5

    movntps [ecx], xmm0
    movntps [ecx+16], xmm1
    movntps [ecx+32], xmm4
    movntps [ecx+48], xmm5

    add ecx, 64                 ; next 64 destination bytes
    lea edx, [edx+ebx*2]        ; two source rows further down

    dec eax
    jnz SwizzleBlock16_sse2@pass

    pop ebx

    ret 4

@SwizzleBlock16_sse2@12 endp
|
|
|
|
;
|
|
; SwizzleBlock8
|
|
;
|
|
|
|
;
; swz8_pack off  (helper for @SwizzleBlock8_sse2@12)
;
; Interleaves the four rows held in xmm0..xmm3 (bw, wd, qdq) and stores the
; resulting 64 bytes at ecx+off with non-temporal stores.
;
swz8_pack macro off

    punpck bw, 0, 2, 1, 3, 4, 6
    punpck wd, 0, 2, 4, 6, 1, 3
    punpck qdq, 0, 1, 2, 3, 4, 5

    movntps [ecx+off+16*0], xmm0
    movntps [ecx+off+16*1], xmm4
    movntps [ecx+off+16*2], xmm1
    movntps [ecx+off+16*3], xmm5

    endm

;
; @SwizzleBlock8_sse2@12
;
; Reads sixteen rows of 16 bytes starting at edx and writes them, regrouped
; at byte granularity, as 256 contiguous bytes at ecx. Each of the two loop
; passes consumes eight source rows in two groups of four; the groups differ
; only in which two rows are loaded with their dwords swapped.
;
; In:    ecx     = destination, 16-byte aligned (movntps)
;        edx     = first source row, 16-byte aligned (movdqa / pshufd from memory)
;        [esp+4] = distance in bytes between source rows
; Uses:  eax, ecx, edx, xmm0-xmm6; ebx is saved and restored
;
@SwizzleBlock8_sse2@12 proc public

    push ebx

    mov eax, 2                  ; passes remaining
    mov ebx, [esp+8]            ; source pitch

    align 16
SwizzleBlock8_sse2@pass:

    ; col 0, 2

    movdqa xmm0, [edx]
    movdqa xmm2, [edx+ebx]
    lea edx, [edx+ebx*2]

    pshufd xmm1, [edx], 0b1h        ; rows 2 and 3: dwords of each qword swapped
    pshufd xmm3, [edx+ebx], 0b1h
    lea edx, [edx+ebx*2]

    swz8_pack 0

    ; col 1, 3

    pshufd xmm0, [edx], 0b1h        ; here rows 0 and 1 are the swapped ones
    pshufd xmm2, [edx+ebx], 0b1h
    lea edx, [edx+ebx*2]

    movdqa xmm1, [edx]
    movdqa xmm3, [edx+ebx]
    lea edx, [edx+ebx*2]

    swz8_pack 64

    add ecx, 128

    dec eax
    jnz SwizzleBlock8_sse2@pass

    pop ebx

    ret 4

@SwizzleBlock8_sse2@12 endp
|
|
|
|
;
|
|
; SwizzleBlock4
|
|
;
|
|
|
|
;
; swz4_load  (helper for @SwizzleBlock4_sse2@12)
;
; Loads four consecutive 16-byte source rows into xmm0, xmm2, xmm1, xmm3
; (in that order) and leaves edx pointing at the row after them.
;
swz4_load macro

    movdqa xmm0, [edx]
    movdqa xmm2, [edx+ebx]
    lea edx, [edx+ebx*2]

    movdqa xmm1, [edx]
    movdqa xmm3, [edx+ebx]
    lea edx, [edx+ebx*2]

    endm

;
; swz4_flipwords ra, rb  (helper for @SwizzleBlock4_sse2@12)
;
; Swaps the two 16-bit words inside every dword of xmm<ra> and xmm<rb>.
;
swz4_flipwords macro ra, rb

    pshuflw @CatStr(xmm, %ra), @CatStr(xmm, %ra), 0b1h
    pshuflw @CatStr(xmm, %rb), @CatStr(xmm, %rb), 0b1h
    pshufhw @CatStr(xmm, %ra), @CatStr(xmm, %ra), 0b1h
    pshufhw @CatStr(xmm, %rb), @CatStr(xmm, %rb), 0b1h

    endm

;
; swz4_pack off  (helper for @SwizzleBlock4_sse2@12)
;
; Regroups the nibbles of xmm0..xmm3 (punpcknb, needs the mask in xmm7),
; interleaves the result and stores 64 bytes at ecx+off with non-temporal
; stores.
;
swz4_pack macro off

    punpcknb
    punpck bw, 0, 2, 4, 6, 1, 3
    punpck bw, 0, 2, 1, 3, 4, 6
    punpck qdq, 0, 4, 2, 6, 1, 3

    movntps [ecx+off+16*0], xmm0
    movntps [ecx+off+16*1], xmm1
    movntps [ecx+off+16*2], xmm4
    movntps [ecx+off+16*3], xmm3

    endm

;
; @SwizzleBlock4_sse2@12
;
; Reads sixteen rows of 16 bytes starting at edx and writes them, regrouped
; at nibble granularity, as 256 contiguous bytes at ecx. Each of the two loop
; passes consumes eight source rows in two groups of four; the groups differ
; only in which two rows get their words swapped.
;
; In:    ecx     = destination, 16-byte aligned (movntps)
;        edx     = first source row, 16-byte aligned (movdqa)
;        [esp+4] = distance in bytes between source rows
; Uses:  eax, ecx, edx, xmm0-xmm7; ebx is saved and restored
;
@SwizzleBlock4_sse2@12 proc public

    push ebx

    mov ebx, [esp+8]            ; source pitch

    mov eax, 0f0f0f0fh          ; xmm7 = 0fh in every byte (low-nibble mask)
    movd xmm7, eax
    pshufd xmm7, xmm7, 0

    mov eax, 2                  ; passes remaining

    align 16
SwizzleBlock4_sse2@pass:

    ; col 0, 2

    swz4_load
    swz4_flipwords 1, 3
    swz4_pack 0

    ; col 1, 3

    swz4_load
    swz4_flipwords 0, 2
    swz4_pack 64

    add ecx, 128

    dec eax
    jnz SwizzleBlock4_sse2@pass

    pop ebx

    ret 4

@SwizzleBlock4_sse2@12 endp
|
|
|
|
;
|
|
; swizzling with unaligned reads
|
|
;
|
|
|
|
;
|
|
; SwizzleBlock32u
|
|
;
|
|
|
|
;
; swz32u_load  (helper for @SwizzleBlock32u_sse2@16)
;
; Loads two 32-byte source rows with unaligned loads and splits them into
; low / high quadwords; the four result vectors end up in xmm0, xmm2, xmm4,
; xmm6 (in destination order).
;
swz32u_load macro

    movdqu xmm0, [esi]
    movdqu xmm4, [esi+16]
    movdqu xmm1, [esi+edx]
    movdqu xmm5, [esi+edx+16]

    punpck qdq, 0, 4, 1, 5, 2, 6

    endm

;
; swz32u_blend off, rval, rtmp  (helper for @SwizzleBlock32u_sse2@16)
;
; xmm<rtmp> must hold a copy of the mask in xmm7. Stores
; (xmm<rval> AND mask) OR ([edi+off] AND NOT mask) to [edi+off].
;
swz32u_blend macro off, rval, rtmp

    pandn @CatStr(xmm, %rtmp), [edi+off]
    pand @CatStr(xmm, %rval), xmm7
    por @CatStr(xmm, %rval), @CatStr(xmm, %rtmp)
    movdqa [edi+off], @CatStr(xmm, %rval)

    endm

;
; @SwizzleBlock32u_sse2@16
;
; Same job as @SwizzleBlock32_sse2@16, but the source rows are read with
; movdqu and therefore need no particular alignment. Reads eight rows of 32
; bytes starting at edx and writes 256 contiguous bytes at ecx; each of the
; four loop passes consumes two rows and produces 64 bytes.
;
; In:    ecx     = destination, 16-byte aligned
;        edx     = first source row
;        [esp+4] = distance in bytes between source rows
;        [esp+8] = 32-bit write mask applied to every dword:
;                  0ffffffffh -> the destination is simply overwritten,
;                  otherwise  -> dst = (src AND mask) OR (dst AND NOT mask)
; Uses:  eax, ecx, edx, xmm0-xmm7; esi and edi are saved and restored
;
@SwizzleBlock32u_sse2@16 proc public

    push esi
    push edi

    mov esi, edx                ; esi = source row
    mov edi, ecx                ; edi = destination
    mov edx, [esp+12]           ; source pitch
    mov eax, [esp+16]           ; write mask
    mov ecx, 4                  ; passes remaining

    cmp eax, 0ffffffffh
    jnz SwizzleBlock32u_sse2@WM

    ; ---- unmasked: plain overwrite with non-temporal stores ----

    align 16
SwizzleBlock32u_sse2@copy:
    swz32u_load

    movntps [edi], xmm0
    movntps [edi+16], xmm2
    movntps [edi+32], xmm4
    movntps [edi+48], xmm6

    lea esi, [esi+edx*2]
    add edi, 64

    dec ecx
    jnz SwizzleBlock32u_sse2@copy

    pop edi
    pop esi

    ret 8

    ; ---- masked: merge with the existing destination ----

SwizzleBlock32u_sse2@WM:

    movd xmm7, eax              ; xmm7 = mask replicated into all four dwords
    pshufd xmm7, xmm7, 0

    align 16
SwizzleBlock32u_sse2@merge:
    swz32u_load

    movdqa xmm3, xmm7           ; mask copies consumed by the two blends below
    pshufd xmm5, xmm7, 0e4h

    swz32u_blend 0, 0, 3
    swz32u_blend 16, 2, 5

    movdqa xmm3, xmm7
    pshufd xmm5, xmm7, 0e4h

    swz32u_blend 32, 4, 3
    swz32u_blend 48, 6, 5

    lea esi, [esi+edx*2]
    add edi, 64

    dec ecx
    jnz SwizzleBlock32u_sse2@merge

    pop edi
    pop esi

    ret 8

@SwizzleBlock32u_sse2@16 endp
|
|
|
|
;
|
|
; SwizzleBlock16u
|
|
;
|
|
|
|
;
; @SwizzleBlock16u_sse2@12
;
; Same job as @SwizzleBlock16_sse2@12, but the source rows are read with
; movdqu and therefore need no particular alignment. Reads eight rows of 32
; bytes starting at edx and writes them, regrouped at 16-bit granularity, as
; 256 contiguous bytes at ecx; each of the four loop passes consumes two rows.
;
; In:    ecx     = destination, 16-byte aligned (movntps)
;        edx     = first source row
;        [esp+4] = distance in bytes between source rows
; Uses:  eax, ecx, edx, xmm0-xmm6; ebx is saved and restored
;
@SwizzleBlock16u_sse2@12 proc public

    push ebx

    mov eax, 4                  ; passes remaining
    mov ebx, [esp+8]            ; source pitch

    align 16
SwizzleBlock16u_sse2@pass:
    movdqu xmm0, [edx]          ; first row of this pass
    movdqu xmm1, [edx+16]
    movdqu xmm2, [edx+ebx]      ; second row of this pass
    movdqu xmm3, [edx+ebx+16]

    punpck wd, 0, 2, 1, 3, 4, 6
    punpck qdq, 0, 4, 2, 6, 1, 5

    movntps [ecx], xmm0
    movntps [ecx+16], xmm1
    movntps [ecx+32], xmm4
    movntps [ecx+48], xmm5

    add ecx, 64                 ; next 64 destination bytes
    lea edx, [edx+ebx*2]        ; two source rows further down

    dec eax
    jnz SwizzleBlock16u_sse2@pass

    pop ebx

    ret 4

@SwizzleBlock16u_sse2@12 endp
|
|
|
|
;
|
|
; SwizzleBlock8u
|
|
;
|
|
|
|
;
; swz8u_pack off  (helper for @SwizzleBlock8u_sse2@12)
;
; Interleaves the four rows held in xmm0..xmm3 (bw, wd, qdq) and stores the
; resulting 64 bytes at ecx+off with non-temporal stores.
;
swz8u_pack macro off

    punpck bw, 0, 2, 1, 3, 4, 6
    punpck wd, 0, 2, 4, 6, 1, 3
    punpck qdq, 0, 1, 2, 3, 4, 5

    movntps [ecx+off+16*0], xmm0
    movntps [ecx+off+16*1], xmm4
    movntps [ecx+off+16*2], xmm1
    movntps [ecx+off+16*3], xmm5

    endm

;
; @SwizzleBlock8u_sse2@12
;
; Same job as @SwizzleBlock8_sse2@12, but the source rows are read with
; movdqu and therefore need no particular alignment. Reads sixteen rows of 16
; bytes starting at edx and writes them, regrouped at byte granularity, as
; 256 contiguous bytes at ecx. Each of the two loop passes consumes eight
; rows in two groups of four; the groups differ only in which two rows get
; their dwords swapped.
;
; In:    ecx     = destination, 16-byte aligned (movntps)
;        edx     = first source row
;        [esp+4] = distance in bytes between source rows
; Uses:  eax, ecx, edx, xmm0-xmm6; ebx is saved and restored
;
@SwizzleBlock8u_sse2@12 proc public

    push ebx

    mov eax, 2                  ; passes remaining
    mov ebx, [esp+8]            ; source pitch

    align 16
SwizzleBlock8u_sse2@pass:

    ; col 0, 2

    movdqu xmm0, [edx]
    movdqu xmm2, [edx+ebx]
    lea edx, [edx+ebx*2]

    movdqu xmm1, [edx]
    movdqu xmm3, [edx+ebx]
    lea edx, [edx+ebx*2]
    pshufd xmm1, xmm1, 0b1h     ; rows 2 and 3: dwords of each qword swapped
    pshufd xmm3, xmm3, 0b1h

    swz8u_pack 0

    ; col 1, 3

    movdqu xmm0, [edx]
    movdqu xmm2, [edx+ebx]
    lea edx, [edx+ebx*2]
    pshufd xmm0, xmm0, 0b1h     ; here rows 0 and 1 are the swapped ones
    pshufd xmm2, xmm2, 0b1h

    movdqu xmm1, [edx]
    movdqu xmm3, [edx+ebx]
    lea edx, [edx+ebx*2]

    swz8u_pack 64

    add ecx, 128

    dec eax
    jnz SwizzleBlock8u_sse2@pass

    pop ebx

    ret 4

@SwizzleBlock8u_sse2@12 endp
|
|
|
|
;
|
|
; SwizzleBlock4u
|
|
;
|
|
|
|
;
; swz4u_load  (helper for @SwizzleBlock4u_sse2@12)
;
; Loads four consecutive 16-byte source rows with unaligned loads into xmm0,
; xmm2, xmm1, xmm3 (in that order) and leaves edx pointing at the row after
; them.
;
swz4u_load macro

    movdqu xmm0, [edx]
    movdqu xmm2, [edx+ebx]
    lea edx, [edx+ebx*2]

    movdqu xmm1, [edx]
    movdqu xmm3, [edx+ebx]
    lea edx, [edx+ebx*2]

    endm

;
; swz4u_flipwords ra, rb  (helper for @SwizzleBlock4u_sse2@12)
;
; Swaps the two 16-bit words inside every dword of xmm<ra> and xmm<rb>.
;
swz4u_flipwords macro ra, rb

    pshuflw @CatStr(xmm, %ra), @CatStr(xmm, %ra), 0b1h
    pshuflw @CatStr(xmm, %rb), @CatStr(xmm, %rb), 0b1h
    pshufhw @CatStr(xmm, %ra), @CatStr(xmm, %ra), 0b1h
    pshufhw @CatStr(xmm, %rb), @CatStr(xmm, %rb), 0b1h

    endm

;
; swz4u_pack off  (helper for @SwizzleBlock4u_sse2@12)
;
; Regroups the nibbles of xmm0..xmm3 (punpcknb, needs the mask in xmm7),
; interleaves the result and stores 64 bytes at ecx+off with non-temporal
; stores.
;
swz4u_pack macro off

    punpcknb
    punpck bw, 0, 2, 4, 6, 1, 3
    punpck bw, 0, 2, 1, 3, 4, 6
    punpck qdq, 0, 4, 2, 6, 1, 3

    movntps [ecx+off+16*0], xmm0
    movntps [ecx+off+16*1], xmm1
    movntps [ecx+off+16*2], xmm4
    movntps [ecx+off+16*3], xmm3

    endm

;
; @SwizzleBlock4u_sse2@12
;
; Same job as @SwizzleBlock4_sse2@12, but the source rows are read with
; movdqu and therefore need no particular alignment. Reads sixteen rows of 16
; bytes starting at edx and writes them, regrouped at nibble granularity, as
; 256 contiguous bytes at ecx. Each of the two loop passes consumes eight
; rows in two groups of four; the groups differ only in which two rows get
; their words swapped.
;
; In:    ecx     = destination, 16-byte aligned (movntps)
;        edx     = first source row
;        [esp+4] = distance in bytes between source rows
; Uses:  eax, ecx, edx, xmm0-xmm7; ebx is saved and restored
;
@SwizzleBlock4u_sse2@12 proc public

    push ebx

    mov ebx, [esp+8]            ; source pitch

    mov eax, 0f0f0f0fh          ; xmm7 = 0fh in every byte (low-nibble mask)
    movd xmm7, eax
    pshufd xmm7, xmm7, 0

    mov eax, 2                  ; passes remaining

    align 16
SwizzleBlock4u_sse2@pass:

    ; col 0, 2

    swz4u_load
    swz4u_flipwords 1, 3
    swz4u_pack 0

    ; col 1, 3

    swz4u_load
    swz4u_flipwords 0, 2
    swz4u_pack 64

    add ecx, 128

    dec eax
    jnz SwizzleBlock4u_sse2@pass

    pop ebx

    ret 4

@SwizzleBlock4u_sse2@12 endp
|
|
|
|
end |