pcsx2/plugins/gs/gsdx9/x86-64.asm

1418 lines
22 KiB
NASM

; Read-only single-precision constants, written as MASM hexadecimal real literals
; (the trailing 'r' marks the digits as the raw IEEE-754 encoding).
; NOTE(review): none of the routines visible in this file reference these two symbols -
; confirm they are still needed (they are not declared public here either).
.const
__uvmin DD 0d01502f9r ; -1e+010
__uvmax DD 0501502f9r ; +1e+010
.code
;
; memsetd
;
; memsetd - fill consecutive dwords with one 32-bit value using REP STOSD.
;   rcx = destination address
;   edx = 32-bit value to store
;   r8  = number of dwords to store
; Clobbers rax, rcx and the direction flag (left cleared); rdi is saved and restored.
memsetd proc public
        push    rdi                     ; rdi is needed as the STOSD destination pointer
        cld                             ; STOSD must walk upwards
        mov     eax, edx                ; eax = fill value
        mov     rdi, rcx                ; rdi = destination (read rcx before it becomes the count)
        mov     rcx, r8                 ; rcx = repeat count
        rep     stosd
        pop     rdi
        ret
memsetd endp
;
; SaturateColor
;
; SaturateColor_amd64 - clamp four signed 32-bit values in place to the range 0..255.
;   rcx = address of the four dwords (16-byte aligned: accessed with movdqa)
; Clobbers xmm0 and xmm1.
SaturateColor_amd64 proc public
        movdqa  xmm1, [rcx]             ; xmm1 = the four input dwords
        pxor    xmm0, xmm0              ; xmm0 = 0, used as filler for pack/unpack
        packssdw xmm1, xmm0             ; dwords -> words with signed saturation
        packuswb xmm1, xmm0             ; words -> bytes with unsigned saturation (0..255)
        punpcklbw xmm1, xmm0            ; bytes -> words, zero-extended
        punpcklwd xmm1, xmm0            ; words -> dwords, zero-extended
        movdqa  [rcx], xmm1
        ret
SaturateColor_amd64 endp
;
; swizzling
;
; punpck op, sd0, sd2, s1, s3, d1, d3
; Interleaves two register pairs with PUNPCKL<op> / PUNPCKH<op>.
; Every argument after 'op' is an XMM register number; 'op' is the unpack suffix (bw, wd, dq, qdq).
;   xmm<sd0> = punpckl<op>(xmm<sd0>, xmm<s1>)     xmm<d1> = punpckh<op>(old xmm<sd0>, xmm<s1>)
;   xmm<sd2> = punpckl<op>(xmm<sd2>, xmm<s3>)     xmm<d3> = punpckh<op>(old xmm<sd2>, xmm<s3>)
; d1 and d3 are written first (as copies of sd0/sd2), so they must not name any of the
; other four registers. pshufd with selector 0e4h is the identity shuffle, i.e. a plain copy.
punpck macro op, sd0, sd2, s1, s3, d1, d3
movdqa @CatStr(xmm, %d1), @CatStr(xmm, %sd0)
pshufd @CatStr(xmm, %d3), @CatStr(xmm, %sd2), 0e4h
@CatStr(punpckl, op) @CatStr(xmm, %sd0), @CatStr(xmm, %s1)
@CatStr(punpckh, op) @CatStr(xmm, %d1), @CatStr(xmm, %s1)
@CatStr(punpckl, op) @CatStr(xmm, %sd2), @CatStr(xmm, %s3)
@CatStr(punpckh, op) @CatStr(xmm, %d3), @CatStr(xmm, %s3)
endm
; punpck2 op, sd0, sd2, sd4, sd6, s1, s3, s5, s7, d1, d3, d5, d7
; Four-pair version of the punpck macro: for each pair (sdN, sN+1, dN+1)
;   xmm<sdN>  = punpckl<op>(xmm<sdN>, xmm<sN+1>)
;   xmm<dN+1> = punpckh<op>(old xmm<sdN>, xmm<sN+1>)
; All four destination copies are made before any unpack runs, so the d* registers
; must not name any sd* or s* register. pshufd with 0e4h is a plain register copy.
punpck2 macro op, sd0, sd2, sd4, sd6, s1, s3, s5, s7, d1, d3, d5, d7
movdqa @CatStr(xmm, %d1), @CatStr(xmm, %sd0)
pshufd @CatStr(xmm, %d3), @CatStr(xmm, %sd2), 0e4h
movdqa @CatStr(xmm, %d5), @CatStr(xmm, %sd4)
pshufd @CatStr(xmm, %d7), @CatStr(xmm, %sd6), 0e4h
@CatStr(punpckl, op) @CatStr(xmm, %sd0), @CatStr(xmm, %s1)
@CatStr(punpckh, op) @CatStr(xmm, %d1), @CatStr(xmm, %s1)
@CatStr(punpckl, op) @CatStr(xmm, %sd2), @CatStr(xmm, %s3)
@CatStr(punpckh, op) @CatStr(xmm, %d3), @CatStr(xmm, %s3)
@CatStr(punpckl, op) @CatStr(xmm, %sd4), @CatStr(xmm, %s5)
@CatStr(punpckh, op) @CatStr(xmm, %d5), @CatStr(xmm, %s5)
@CatStr(punpckl, op) @CatStr(xmm, %sd6), @CatStr(xmm, %s7)
@CatStr(punpckh, op) @CatStr(xmm, %d7), @CatStr(xmm, %s7)
endm
; punpcknbl - nibble interleave on the fixed register set xmm0-xmm7.
; In:  xmm0/xmm1 and xmm2/xmm3 = two register pairs, xmm7 = nibble mask
;      (every caller in this file loads xmm7 with 0f0f0f0fh in each dword).
; Step 1, per byte, for the pair (xmm0, xmm1) and likewise for (xmm2, xmm3):
;      xmm0 = (low nibble of xmm1 << 4) | low nibble of xmm0
;      xmm1 =  high nibble of xmm1      | (high nibble of xmm0 >> 4)
;      The 64-bit shifts leak bits across byte boundaries; the mask removes them again.
; Step 2: byte interleave via "punpck bw, 0, 2, 1, 3, 4, 6".
; Out: xmm0, xmm4 (low/high interleave of pair 0/1) and xmm2, xmm6 (pair 2/3).
; Scratch: xmm5 (xmm1 and xmm3 hold intermediate values on exit); xmm7 is only read.
punpcknbl macro
; --- pair xmm0 / xmm1 ---
movdqa xmm4, xmm0
pshufd xmm5, xmm1, 0e4h
psllq xmm1, 4
psrlq xmm4, 4
movdqa xmm6, xmm7
pand xmm0, xmm7
pandn xmm6, xmm1
por xmm0, xmm6
movdqa xmm6, xmm7
pand xmm4, xmm7
pandn xmm6, xmm5
por xmm4, xmm6
movdqa xmm1, xmm4
; --- pair xmm2 / xmm3 ---
movdqa xmm4, xmm2
pshufd xmm5, xmm3, 0e4h
psllq xmm3, 4
psrlq xmm4, 4
movdqa xmm6, xmm7
pand xmm2, xmm7
pandn xmm6, xmm3
por xmm2, xmm6
movdqa xmm6, xmm7
pand xmm4, xmm7
pandn xmm6, xmm5
por xmm4, xmm6
movdqa xmm3, xmm4
; --- byte interleave of the regrouped pairs ---
punpck bw, 0, 2, 1, 3, 4, 6
endm
; punpcknbh - the same nibble interleave as punpcknbl, on the register set xmm8-xmm15.
; In:  xmm8/xmm9 and xmm10/xmm11 = two register pairs, xmm15 = nibble mask
;      (the caller in this file copies the 0f0f0f0fh mask from xmm7 into xmm15).
; Step 1, per byte, for the pair (xmm8, xmm9) and likewise for (xmm10, xmm11):
;      xmm8 = (low nibble of xmm9 << 4) | low nibble of xmm8
;      xmm9 =  high nibble of xmm9      | (high nibble of xmm8 >> 4)
; Step 2: byte interleave via "punpck bw, 8, 10, 9, 11, 12, 14".
; Out: xmm8, xmm12 and xmm10, xmm14.  Scratch: xmm13; xmm15 is only read.
punpcknbh macro
; --- pair xmm8 / xmm9 ---
movdqa xmm12, xmm8
pshufd xmm13, xmm9, 0e4h
psllq xmm9, 4
psrlq xmm12, 4
movdqa xmm14, xmm15
pand xmm8, xmm15
pandn xmm14, xmm9
por xmm8, xmm14
movdqa xmm14, xmm15
pand xmm12, xmm15
pandn xmm14, xmm13
por xmm12, xmm14
movdqa xmm9, xmm12
; --- pair xmm10 / xmm11 ---
movdqa xmm12, xmm10
pshufd xmm13, xmm11, 0e4h
psllq xmm11, 4
psrlq xmm12, 4
movdqa xmm14, xmm15
pand xmm10, xmm15
pandn xmm14, xmm11
por xmm10, xmm14
movdqa xmm14, xmm15
pand xmm12, xmm15
pandn xmm14, xmm13
por xmm12, xmm14
movdqa xmm11, xmm12
; --- byte interleave of the regrouped pairs ---
punpck bw, 8, 10, 9, 11, 12, 14
endm
;
; unSwizzleBlock32
;
; unSwizzleBlock32_amd64 (Microsoft x64 register arguments)
;   rcx = source, 256 bytes read as sixteen 16-byte vectors (16-byte aligned)
;   rdx = first destination row (16-byte aligned)
;   r8  = byte distance between destination rows (multiple of 16: rows are written with movdqa)
; Each of the four passes converts 64 source bytes into two 32-byte destination rows.
; Only the volatile registers xmm0-xmm5 are used; the previous version overwrote xmm6,
; which the Microsoft x64 convention requires a callee to preserve.
unSwizzleBlock32_amd64 proc public
        push    rsi
        push    rdi
        mov     rsi, rcx                ; rsi = source cursor
        mov     rdi, rdx                ; rdi = destination cursor
        mov     rcx, 4                  ; four passes
        align   16
@@:
        movdqa  xmm0, [rsi+16*0]
        movdqa  xmm1, [rsi+16*1]
        movdqa  xmm2, [rsi+16*2]
        movdqa  xmm3, [rsi+16*3]
        punpck  qdq, 0, 2, 1, 3, 4, 5   ; xmm0/xmm2 = low qwords, xmm4/xmm5 = high qwords
        movdqa  [rdi], xmm0
        movdqa  [rdi+16], xmm2
        movdqa  [rdi+r8], xmm4
        movdqa  [rdi+r8+16], xmm5
        add     rsi, 64
        lea     rdi, [rdi+r8*2]         ; advance two rows
        dec     rcx
        jnz     @B
        pop     rdi
        pop     rsi
        ret
unSwizzleBlock32_amd64 endp
;
; unSwizzleBlock32_2 (TODO: test me)
;
; unSwizzleBlock32_2_amd64 - variant that handles 128 source bytes (four rows) per pass.
;   rcx = source, 256 bytes (16-byte aligned)
;   rdx = first destination row (16-byte aligned)
;   r8  = byte distance between destination rows (multiple of 16)
; The routine needs xmm0-xmm14; the callee-saved registers among them (xmm6, xmm7, xmm8,
; xmm10, xmm12, xmm14) are now spilled to the stack and restored before returning.
; NOTE(review): within each pass the rows are written as low(0,1)/low(2,3), low(4,5)/low(6,7),
; high(0,1)/high(2,3), high(4,5)/high(6,7). unSwizzleBlock32_amd64 would produce the second and
; third of these rows in the opposite order - confirm which layout is intended (the routine is
; still marked "TODO: test me").
; NOTE(review): like the original, this routine moves rsp without unwind directives.
unSwizzleBlock32_2_amd64 proc public
        push    rsi
        push    rdi
        sub     rsp, 16*6               ; spill area for the callee-saved XMM registers
        movdqu  [rsp+16*0], xmm6
        movdqu  [rsp+16*1], xmm7
        movdqu  [rsp+16*2], xmm8
        movdqu  [rsp+16*3], xmm10
        movdqu  [rsp+16*4], xmm12
        movdqu  [rsp+16*5], xmm14
        mov     rsi, rcx                ; rsi = source cursor
        mov     rdi, rdx                ; rdi = destination cursor
        mov     rcx, 2                  ; two passes of 128 bytes
        align   16
@@:
        movdqa  xmm0, [rsi+16*0]
        movdqa  xmm1, [rsi+16*1]
        movdqa  xmm2, [rsi+16*2]
        movdqa  xmm3, [rsi+16*3]
        movdqa  xmm4, [rsi+16*4]
        movdqa  xmm5, [rsi+16*5]
        movdqa  xmm6, [rsi+16*6]
        movdqa  xmm7, [rsi+16*7]
        punpck2 qdq, 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14
        movdqa  [rdi], xmm0
        movdqa  [rdi+16], xmm2
        movdqa  [rdi+r8], xmm4
        movdqa  [rdi+r8+16], xmm6
        lea     rdi, [rdi+r8*2]
        movdqa  [rdi], xmm8
        movdqa  [rdi+16], xmm10
        movdqa  [rdi+r8], xmm12
        movdqa  [rdi+r8+16], xmm14
        lea     rdi, [rdi+r8*2]
        add     rsi, 128
        dec     rcx
        jnz     @B
        movdqu  xmm6, [rsp+16*0]
        movdqu  xmm7, [rsp+16*1]
        movdqu  xmm8, [rsp+16*2]
        movdqu  xmm10, [rsp+16*3]
        movdqu  xmm12, [rsp+16*4]
        movdqu  xmm14, [rsp+16*5]
        add     rsp, 16*6
        pop     rdi
        pop     rsi
        ret
unSwizzleBlock32_2_amd64 endp
;
; unSwizzleBlock16
;
; unSwizzleBlock16_amd64
;   rcx = source, 256 bytes (16-byte aligned)
;   rdx = first destination row (16-byte aligned)
;   r8  = byte distance between destination rows (multiple of 16)
; Four passes; each shuffles 64 source bytes (word, dword, then word interleave) into two
; 32-byte destination rows.
; Only the volatile registers xmm0-xmm5 are used; the previous version overwrote the
; callee-saved xmm6.
unSwizzleBlock16_amd64 proc public
        push    rsi
        push    rdi
        mov     rsi, rcx                ; rsi = source cursor
        mov     rdi, rdx                ; rdi = destination cursor
        mov     rcx, 4
        align   16
@@:
        movdqa  xmm0, [rsi+16*0]
        movdqa  xmm1, [rsi+16*1]
        movdqa  xmm2, [rsi+16*2]
        movdqa  xmm3, [rsi+16*3]
        punpck  wd, 0, 2, 1, 3, 4, 5
        punpck  dq, 0, 4, 2, 5, 1, 3
        punpck  wd, 0, 4, 1, 3, 2, 5
        movdqa  [rdi], xmm0
        movdqa  [rdi+16], xmm2
        movdqa  [rdi+r8], xmm4
        movdqa  [rdi+r8+16], xmm5
        add     rsi, 64
        lea     rdi, [rdi+r8*2]
        dec     rcx
        jnz     @B
        pop     rdi
        pop     rsi
        ret
unSwizzleBlock16_amd64 endp
;
; unSwizzleBlock8
;
; unSwizzleBlock8_amd64
;   rcx = source, 256 bytes (16-byte aligned)
;   rdx = first destination row (16-byte aligned)
;   r8  = byte distance between destination rows (multiple of 16)
; Two passes; each reads 128 source bytes and writes eight 16-byte destination rows.
; The first 64 bytes are shuffled in xmm0-xmm6, the second 64 bytes in xmm8-xmm14.
; xmm6 and xmm8-xmm14 are callee-saved under the Microsoft x64 convention, so they are now
; spilled to the stack on entry and restored on exit.
; NOTE(review): like the original, this routine moves rsp without unwind directives.
unSwizzleBlock8_amd64 proc public
        push    rsi
        push    rdi
        sub     rsp, 16*8               ; spill area for the callee-saved XMM registers
        movdqu  [rsp+16*0], xmm6
        movdqu  [rsp+16*1], xmm8
        movdqu  [rsp+16*2], xmm9
        movdqu  [rsp+16*3], xmm10
        movdqu  [rsp+16*4], xmm11
        movdqu  [rsp+16*5], xmm12
        movdqu  [rsp+16*6], xmm13
        movdqu  [rsp+16*7], xmm14
        mov     rsi, rcx                ; rsi = source cursor
        mov     rdi, rdx                ; rdi = destination cursor
        mov     rcx, 2
        ; r9 = r8*3
        lea     r9, [r8*2]
        add     r9, r8
        align   16
@@:
        ; col 0, 2
        movdqa  xmm0, [rsi+16*0]
        movdqa  xmm1, [rsi+16*1]
        movdqa  xmm4, [rsi+16*2]
        movdqa  xmm5, [rsi+16*3]
        ; col 1, 3
        movdqa  xmm8, [rsi+16*4]
        movdqa  xmm9, [rsi+16*5]
        movdqa  xmm12, [rsi+16*6]
        movdqa  xmm13, [rsi+16*7]
        ; col 0, 2
        punpck  bw, 0, 4, 1, 5, 2, 6
        punpck  wd, 0, 2, 4, 6, 1, 3
        punpck  bw, 0, 2, 1, 3, 4, 6
        punpck  qdq, 0, 2, 4, 6, 1, 3
        pshufd  xmm1, xmm1, 0b1h        ; swap the dwords inside each qword
        pshufd  xmm3, xmm3, 0b1h
        ; col 1, 3
        punpck  bw, 8, 12, 9, 13, 10, 14
        punpck  wd, 8, 10, 12, 14, 9, 11
        punpck  bw, 8, 10, 9, 11, 12, 14
        punpck  qdq, 8, 10, 12, 14, 9, 11
        pshufd  xmm8, xmm8, 0b1h
        pshufd  xmm10, xmm10, 0b1h
        ; col 0, 2
        movdqa  [rdi], xmm0
        movdqa  [rdi+r8], xmm2
        movdqa  [rdi+r8*2], xmm1
        movdqa  [rdi+r9], xmm3
        lea     rdi, [rdi+r8*4]
        ; col 1, 3
        movdqa  [rdi], xmm8
        movdqa  [rdi+r8], xmm10
        movdqa  [rdi+r8*2], xmm9
        movdqa  [rdi+r9], xmm11
        lea     rdi, [rdi+r8*4]
        add     rsi, 128
        dec     rcx
        jnz     @B
        movdqu  xmm6, [rsp+16*0]
        movdqu  xmm8, [rsp+16*1]
        movdqu  xmm9, [rsp+16*2]
        movdqu  xmm10, [rsp+16*3]
        movdqu  xmm11, [rsp+16*4]
        movdqu  xmm12, [rsp+16*5]
        movdqu  xmm13, [rsp+16*6]
        movdqu  xmm14, [rsp+16*7]
        add     rsp, 16*8
        pop     rdi
        pop     rsi
        ret
unSwizzleBlock8_amd64 endp
;
; unSwizzleBlock4
;
; unSwizzleBlock4_amd64
;   rcx = source, 256 bytes (16-byte aligned)
;   rdx = first destination row (16-byte aligned)
;   r8  = byte distance between destination rows (multiple of 16)
; Two passes; each reads 128 source bytes and writes eight 16-byte destination rows.
; The punpcknbl / punpcknbh macros work on fixed registers (xmm0-xmm7 and xmm8-xmm15) and
; expect the 0f0f0f0fh nibble mask in xmm7 and xmm15.
; xmm6-xmm15 are callee-saved under the Microsoft x64 convention, so all ten are now spilled
; to the stack on entry and restored on exit.
; NOTE(review): like the original, this routine moves rsp without unwind directives.
unSwizzleBlock4_amd64 proc public
        push    rsi
        push    rdi
        sub     rsp, 16*10              ; spill area for xmm6-xmm15
        movdqu  [rsp+16*0], xmm6
        movdqu  [rsp+16*1], xmm7
        movdqu  [rsp+16*2], xmm8
        movdqu  [rsp+16*3], xmm9
        movdqu  [rsp+16*4], xmm10
        movdqu  [rsp+16*5], xmm11
        movdqu  [rsp+16*6], xmm12
        movdqu  [rsp+16*7], xmm13
        movdqu  [rsp+16*8], xmm14
        movdqu  [rsp+16*9], xmm15
        mov     rsi, rcx                ; rsi = source cursor
        mov     rdi, rdx                ; rdi = destination cursor
        mov     rcx, 2
        ; r9 = r8*3
        lea     r9, [r8*2]
        add     r9, r8
        mov     eax, 0f0f0f0fh
        movd    xmm7, eax               ; same 32-bit form as the other routines in this file
        pshufd  xmm7, xmm7, 0           ; xmm7  = nibble mask for punpcknbl
        movdqa  xmm15, xmm7             ; xmm15 = nibble mask for punpcknbh
        align   16
@@:
        ; col 0, 2
        movdqa  xmm0, [rsi+16*0]
        movdqa  xmm1, [rsi+16*1]
        movdqa  xmm4, [rsi+16*2]
        movdqa  xmm3, [rsi+16*3]
        ; col 1, 3
        movdqa  xmm8, [rsi+16*4]
        movdqa  xmm9, [rsi+16*5]
        movdqa  xmm12, [rsi+16*6]
        movdqa  xmm11, [rsi+16*7]
        ; col 0, 2
        punpck  dq, 0, 4, 1, 3, 2, 6
        punpck  dq, 0, 2, 4, 6, 1, 3
        punpcknbl
        punpck  bw, 0, 2, 4, 6, 1, 3
        punpck  wd, 0, 2, 1, 3, 4, 6
        ; col 1, 3
        punpck  dq, 8, 12, 9, 11, 10, 14
        punpck  dq, 8, 10, 12, 14, 9, 11
        punpcknbh
        punpck  bw, 8, 10, 12, 14, 9, 11
        punpck  wd, 8, 10, 9, 11, 12, 14
        ; col 0, 2
        pshufd  xmm0, xmm0, 0d8h
        pshufd  xmm2, xmm2, 0d8h
        pshufd  xmm4, xmm4, 0d8h
        pshufd  xmm6, xmm6, 0d8h
        ; col 1, 3
        pshufd  xmm8, xmm8, 0d8h
        pshufd  xmm10, xmm10, 0d8h
        pshufd  xmm12, xmm12, 0d8h
        pshufd  xmm14, xmm14, 0d8h
        ; col 0, 2
        punpck  qdq, 0, 2, 4, 6, 1, 3
        ; col 1, 3
        punpck  qdq, 8, 10, 12, 14, 9, 11
        ; col 0, 2
        pshuflw xmm1, xmm1, 0b1h
        pshuflw xmm3, xmm3, 0b1h
        pshufhw xmm1, xmm1, 0b1h
        pshufhw xmm3, xmm3, 0b1h
        ; col 1, 3
        pshuflw xmm8, xmm8, 0b1h
        pshuflw xmm10, xmm10, 0b1h
        pshufhw xmm8, xmm8, 0b1h
        pshufhw xmm10, xmm10, 0b1h
        ; col 0, 2
        movdqa  [rdi], xmm0
        movdqa  [rdi+r8], xmm2
        movdqa  [rdi+r8*2], xmm1
        movdqa  [rdi+r9], xmm3
        lea     rdi, [rdi+r8*4]
        ; col 1, 3
        movdqa  [rdi], xmm8
        movdqa  [rdi+r8], xmm10
        movdqa  [rdi+r8*2], xmm9
        movdqa  [rdi+r9], xmm11
        lea     rdi, [rdi+r8*4]
        add     rsi, 128
        dec     rcx
        jnz     @B
        movdqu  xmm6, [rsp+16*0]
        movdqu  xmm7, [rsp+16*1]
        movdqu  xmm8, [rsp+16*2]
        movdqu  xmm9, [rsp+16*3]
        movdqu  xmm10, [rsp+16*4]
        movdqu  xmm11, [rsp+16*5]
        movdqu  xmm12, [rsp+16*6]
        movdqu  xmm13, [rsp+16*7]
        movdqu  xmm14, [rsp+16*8]
        movdqu  xmm15, [rsp+16*9]
        add     rsp, 16*10
        pop     rdi
        pop     rsi
        ret
unSwizzleBlock4_amd64 endp
;
; unSwizzleBlock8HP
;
; unSwizzleBlock8HP_amd64 - extract the top byte (bits 31..24) of every source dword.
;   rcx = source, 256 bytes (16-byte aligned)
;   rdx = first destination row (written with 8-byte stores)
;   r8  = byte distance between destination rows
; Four passes; each turns 64 source bytes (sixteen dwords) into two 8-byte destination rows.
; Only the volatile registers xmm0-xmm5 are used; the previous version overwrote the
; callee-saved xmm6.
unSwizzleBlock8HP_amd64 proc public
        push    rsi
        push    rdi
        mov     rsi, rcx                ; rsi = source cursor
        mov     rdi, rdx                ; rdi = destination cursor
        mov     rcx, 4
        align   16
@@:
        movdqa  xmm0, [rsi+16*0]
        movdqa  xmm1, [rsi+16*1]
        movdqa  xmm2, [rsi+16*2]
        movdqa  xmm3, [rsi+16*3]
        punpck  qdq, 0, 2, 1, 3, 4, 5
        psrld   xmm0, 24                ; keep bits 31..24 of each dword
        psrld   xmm2, 24
        psrld   xmm4, 24
        psrld   xmm5, 24
        packssdw xmm0, xmm2             ; values are 0..255, so neither pack saturates
        packssdw xmm4, xmm5
        packuswb xmm0, xmm4
        movlps  qword ptr [rdi], xmm0
        movhps  qword ptr [rdi+r8], xmm0
        add     rsi, 64
        lea     rdi, [rdi+r8*2]
        dec     rcx
        jnz     @B
        pop     rdi
        pop     rsi
        ret
unSwizzleBlock8HP_amd64 endp
;
; unSwizzleBlock4HLP
;
; unSwizzleBlock4HLP_amd64 - extract bits 27..24 of every source dword as one byte (0..15).
;   rcx = source, 256 bytes (16-byte aligned)
;   rdx = first destination row (written with 8-byte stores)
;   r8  = byte distance between destination rows
; Four passes; each turns 64 source bytes (sixteen dwords) into two 8-byte destination rows.
; The previous version shifted right by 24 and masked the packed bytes with 0fh held in
; xmm7, and it also overwrote xmm6; both registers are callee-saved under the Microsoft x64
; convention. Shifting left by 4 and then right by 28 isolates the same four bits without a
; mask register, so only the volatile xmm0-xmm5 are used and the stored bytes are unchanged.
unSwizzleBlock4HLP_amd64 proc public
        push    rsi
        push    rdi
        mov     rsi, rcx                ; rsi = source cursor
        mov     rdi, rdx                ; rdi = destination cursor
        mov     rcx, 4
        align   16
@@:
        movdqa  xmm0, [rsi+16*0]
        movdqa  xmm1, [rsi+16*1]
        movdqa  xmm2, [rsi+16*2]
        movdqa  xmm3, [rsi+16*3]
        punpck  qdq, 0, 2, 1, 3, 4, 5
        pslld   xmm0, 4                 ; drop bits 31..28 ...
        pslld   xmm2, 4
        pslld   xmm4, 4
        pslld   xmm5, 4
        psrld   xmm0, 28                ; ... and keep what used to be bits 27..24
        psrld   xmm2, 28
        psrld   xmm4, 28
        psrld   xmm5, 28
        packssdw xmm0, xmm2             ; values are 0..15, so neither pack saturates
        packssdw xmm4, xmm5
        packuswb xmm0, xmm4
        movlps  qword ptr [rdi], xmm0
        movhps  qword ptr [rdi+r8], xmm0
        add     rsi, 64
        lea     rdi, [rdi+r8*2]
        dec     rcx
        jnz     @B
        pop     rdi
        pop     rsi
        ret
unSwizzleBlock4HLP_amd64 endp
;
; unSwizzleBlock4HHP
;
; unSwizzleBlock4HHP_amd64 - extract bits 31..28 of every source dword as one byte (0..15).
;   rcx = source, 256 bytes (16-byte aligned)
;   rdx = first destination row (written with 8-byte stores)
;   r8  = byte distance between destination rows
; Four passes; each turns 64 source bytes (sixteen dwords) into two 8-byte destination rows.
; Only the volatile registers xmm0-xmm5 are used; the previous version overwrote the
; callee-saved xmm6.
unSwizzleBlock4HHP_amd64 proc public
        push    rsi
        push    rdi
        mov     rsi, rcx                ; rsi = source cursor
        mov     rdi, rdx                ; rdi = destination cursor
        mov     rcx, 4
        align   16
@@:
        movdqa  xmm0, [rsi+16*0]
        movdqa  xmm1, [rsi+16*1]
        movdqa  xmm2, [rsi+16*2]
        movdqa  xmm3, [rsi+16*3]
        punpck  qdq, 0, 2, 1, 3, 4, 5
        psrld   xmm0, 28                ; keep bits 31..28 of each dword
        psrld   xmm2, 28
        psrld   xmm4, 28
        psrld   xmm5, 28
        packssdw xmm0, xmm2             ; values are 0..15, so neither pack saturates
        packssdw xmm4, xmm5
        packuswb xmm0, xmm4
        movlps  qword ptr [rdi], xmm0
        movhps  qword ptr [rdi+r8], xmm0
        add     rsi, 64
        lea     rdi, [rdi+r8*2]
        dec     rcx
        jnz     @B
        pop     rdi
        pop     rsi
        ret
unSwizzleBlock4HHP_amd64 endp
;
; unSwizzleBlock4P
;
; unSwizzleBlock4P_amd64 - expand packed 4-bit values to one byte each (0..15).
;   rcx = source, 256 bytes (16-byte aligned)
;   rdx = first destination row (16-byte aligned)
;   r8  = byte distance between destination rows (multiple of 16)
; The block is handled as four fully unrolled 64-byte columns. Each column is byte/word/byte
; interleaved into xmm0, xmm2, xmm4, xmm6 and then split with the 0fh mask in xmm8:
;   low nibbles  stay in xmm0/xmm2/xmm4/xmm6  -> rows 0 and 1 of the column
;   high nibbles go to   xmm1/xmm3/xmm5/xmm7  -> rows 2 and 3 (shifted down by 4)
; Columns 0 and 2 swap the dwords inside each qword (pshufd 0b1h) of the high-nibble rows,
; columns 1 and 3 swap them in the low-nibble rows instead.
; xmm6, xmm7 and xmm8 are callee-saved under the Microsoft x64 convention, so they are now
; spilled to the stack on entry and restored on exit.
; NOTE(review): this adds an rsp adjustment without unwind directives to what was a pure leaf.
unSwizzleBlock4P_amd64 proc public
        sub     rsp, 16*3               ; spill area for the callee-saved XMM registers
        movdqu  [rsp+16*0], xmm6
        movdqu  [rsp+16*1], xmm7
        movdqu  [rsp+16*2], xmm8
        mov     eax, 0f0f0f0fh
        movd    xmm8, eax
        pshufd  xmm8, xmm8, 0           ; xmm8 = 0fh in every byte
        ; r9 = r8*3
        lea     r9, [r8*2]
        add     r9, r8
        ; col 0
        movdqa  xmm0, [rcx+16*0]
        movdqa  xmm1, [rcx+16*1]
        movdqa  xmm2, [rcx+16*2]
        movdqa  xmm3, [rcx+16*3]
        punpck  bw, 0, 2, 1, 3, 4, 6
        punpck  wd, 0, 4, 2, 6, 1, 3
        punpck  bw, 0, 4, 1, 3, 2, 6
        movdqa  xmm1, xmm8
        pandn   xmm1, xmm0
        pand    xmm0, xmm8
        pshufd  xmm1, xmm1, 0b1h
        psrlq   xmm1, 4
        movdqa  xmm3, xmm8
        pandn   xmm3, xmm2
        pand    xmm2, xmm8
        pshufd  xmm3, xmm3, 0b1h
        psrlq   xmm3, 4
        movdqa  xmm5, xmm8
        pandn   xmm5, xmm4
        pand    xmm4, xmm8
        pshufd  xmm5, xmm5, 0b1h
        psrlq   xmm5, 4
        movdqa  xmm7, xmm8
        pandn   xmm7, xmm6
        pand    xmm6, xmm8
        pshufd  xmm7, xmm7, 0b1h
        psrlq   xmm7, 4
        movdqa  [rdx], xmm0
        movdqa  [rdx+16], xmm2
        movdqa  [rdx+r8], xmm4
        movdqa  [rdx+r8+16], xmm6
        movdqa  [rdx+r8*2], xmm1
        movdqa  [rdx+r8*2+16], xmm3
        movdqa  [rdx+r9], xmm5
        movdqa  [rdx+r9+16], xmm7
        lea     rdx, [rdx+r8*4]
        ; col 1
        movdqa  xmm0, [rcx+16*4]
        movdqa  xmm1, [rcx+16*5]
        movdqa  xmm2, [rcx+16*6]
        movdqa  xmm3, [rcx+16*7]
        punpck  bw, 0, 2, 1, 3, 4, 6
        punpck  wd, 0, 4, 2, 6, 1, 3
        punpck  bw, 0, 4, 1, 3, 2, 6
        movdqa  xmm1, xmm8
        pandn   xmm1, xmm0
        pand    xmm0, xmm8
        pshufd  xmm0, xmm0, 0b1h
        psrlq   xmm1, 4
        movdqa  xmm3, xmm8
        pandn   xmm3, xmm2
        pand    xmm2, xmm8
        pshufd  xmm2, xmm2, 0b1h
        psrlq   xmm3, 4
        movdqa  xmm5, xmm8
        pandn   xmm5, xmm4
        pand    xmm4, xmm8
        pshufd  xmm4, xmm4, 0b1h
        psrlq   xmm5, 4
        movdqa  xmm7, xmm8
        pandn   xmm7, xmm6
        pand    xmm6, xmm8
        pshufd  xmm6, xmm6, 0b1h
        psrlq   xmm7, 4
        movdqa  [rdx], xmm0
        movdqa  [rdx+16], xmm2
        movdqa  [rdx+r8], xmm4
        movdqa  [rdx+r8+16], xmm6
        movdqa  [rdx+r8*2], xmm1
        movdqa  [rdx+r8*2+16], xmm3
        movdqa  [rdx+r9], xmm5
        movdqa  [rdx+r9+16], xmm7
        lea     rdx, [rdx+r8*4]
        ; col 2
        movdqa  xmm0, [rcx+16*8]
        movdqa  xmm1, [rcx+16*9]
        movdqa  xmm2, [rcx+16*10]
        movdqa  xmm3, [rcx+16*11]
        punpck  bw, 0, 2, 1, 3, 4, 6
        punpck  wd, 0, 4, 2, 6, 1, 3
        punpck  bw, 0, 4, 1, 3, 2, 6
        movdqa  xmm1, xmm8
        pandn   xmm1, xmm0
        pand    xmm0, xmm8
        pshufd  xmm1, xmm1, 0b1h
        psrlq   xmm1, 4
        movdqa  xmm3, xmm8
        pandn   xmm3, xmm2
        pand    xmm2, xmm8
        pshufd  xmm3, xmm3, 0b1h
        psrlq   xmm3, 4
        movdqa  xmm5, xmm8
        pandn   xmm5, xmm4
        pand    xmm4, xmm8
        pshufd  xmm5, xmm5, 0b1h
        psrlq   xmm5, 4
        movdqa  xmm7, xmm8
        pandn   xmm7, xmm6
        pand    xmm6, xmm8
        pshufd  xmm7, xmm7, 0b1h
        psrlq   xmm7, 4
        movdqa  [rdx], xmm0
        movdqa  [rdx+16], xmm2
        movdqa  [rdx+r8], xmm4
        movdqa  [rdx+r8+16], xmm6
        movdqa  [rdx+r8*2], xmm1
        movdqa  [rdx+r8*2+16], xmm3
        movdqa  [rdx+r9], xmm5
        movdqa  [rdx+r9+16], xmm7
        lea     rdx, [rdx+r8*4]
        ; col 3
        movdqa  xmm0, [rcx+16*12]
        movdqa  xmm1, [rcx+16*13]
        movdqa  xmm2, [rcx+16*14]
        movdqa  xmm3, [rcx+16*15]
        punpck  bw, 0, 2, 1, 3, 4, 6
        punpck  wd, 0, 4, 2, 6, 1, 3
        punpck  bw, 0, 4, 1, 3, 2, 6
        movdqa  xmm1, xmm8
        pandn   xmm1, xmm0
        pand    xmm0, xmm8
        pshufd  xmm0, xmm0, 0b1h
        psrlq   xmm1, 4
        movdqa  xmm3, xmm8
        pandn   xmm3, xmm2
        pand    xmm2, xmm8
        pshufd  xmm2, xmm2, 0b1h
        psrlq   xmm3, 4
        movdqa  xmm5, xmm8
        pandn   xmm5, xmm4
        pand    xmm4, xmm8
        pshufd  xmm4, xmm4, 0b1h
        psrlq   xmm5, 4
        movdqa  xmm7, xmm8
        pandn   xmm7, xmm6
        pand    xmm6, xmm8
        pshufd  xmm6, xmm6, 0b1h
        psrlq   xmm7, 4
        movdqa  [rdx], xmm0
        movdqa  [rdx+16], xmm2
        movdqa  [rdx+r8], xmm4
        movdqa  [rdx+r8+16], xmm6
        movdqa  [rdx+r8*2], xmm1
        movdqa  [rdx+r8*2+16], xmm3
        movdqa  [rdx+r9], xmm5
        movdqa  [rdx+r9+16], xmm7
        ; lea rdx, [rdx+r8*4]
        movdqu  xmm6, [rsp+16*0]
        movdqu  xmm7, [rsp+16*1]
        movdqu  xmm8, [rsp+16*2]
        add     rsp, 16*3
        ret
unSwizzleBlock4P_amd64 endp
;
; swizzling
;
;
; SwizzleBlock32_amd64
;
; SwizzleBlock32_amd64
;   rcx = destination block, 256 bytes (16-byte aligned)
;   rdx = first source row (16-byte aligned)
;   r8  = byte distance between source rows (multiple of 16: rows are read with movdqa)
;   r9d = 32-bit write mask applied to every destination dword; 0ffffffffh selects the
;         plain-store path, any other value the read-modify-write path
; Four passes; each consumes two 32-byte source rows and produces 64 destination bytes.
; Fixes relative to the previous version:
;   * the last masked store addressed the destination through [edi+...], which drops the
;     upper 32 bits of the pointer; it now uses rdi like every other store
;   * xmm6, xmm7, xmm9 and xmm11 (callee-saved under the Microsoft x64 convention) were
;     overwritten; the routine now stays within the volatile xmm0-xmm5, rebuilding the
;     broadcast mask in a dead register on every pass of the masked loop
SwizzleBlock32_amd64 proc public
        push    rsi
        push    rdi
        mov     rdi, rcx                ; rdi = destination cursor
        mov     rsi, rdx                ; rsi = source cursor
        mov     rcx, 4
        cmp     r9d, 0ffffffffh
        jnz     SwizzleBlock32_amd64@WM
        align   16
@@:
        movdqa  xmm0, [rsi]
        movdqa  xmm2, [rsi+16]
        movdqa  xmm1, [rsi+r8]
        movdqa  xmm3, [rsi+r8+16]
        punpck  qdq, 0, 2, 1, 3, 4, 5
        movdqa  [rdi+16*0], xmm0
        movdqa  [rdi+16*1], xmm4
        movdqa  [rdi+16*2], xmm2
        movdqa  [rdi+16*3], xmm5
        lea     rsi, [rsi+r8*2]
        add     rdi, 64
        dec     rcx
        jnz     @B
        pop     rdi
        pop     rsi
        ret
SwizzleBlock32_amd64@WM:
        align   16
@@:
        movdqa  xmm0, [rsi]
        movdqa  xmm2, [rsi+16]
        movdqa  xmm1, [rsi+r8]
        movdqa  xmm3, [rsi+r8+16]
        punpck  qdq, 0, 2, 1, 3, 4, 5
        movd    xmm1, r9d               ; xmm1/xmm3 are dead after the interleave
        pshufd  xmm1, xmm1, 0           ; xmm1 = write mask in all four dwords
        ; result = (new & mask) | (old & ~mask)
        movdqa  xmm3, xmm1
        pandn   xmm3, [rdi+16*0]
        pand    xmm0, xmm1
        por     xmm0, xmm3
        movdqa  [rdi+16*0], xmm0
        movdqa  xmm3, xmm1
        pandn   xmm3, [rdi+16*1]
        pand    xmm4, xmm1
        por     xmm4, xmm3
        movdqa  [rdi+16*1], xmm4
        movdqa  xmm3, xmm1
        pandn   xmm3, [rdi+16*2]
        pand    xmm2, xmm1
        por     xmm2, xmm3
        movdqa  [rdi+16*2], xmm2
        movdqa  xmm3, xmm1
        pandn   xmm3, [rdi+16*3]
        pand    xmm5, xmm1
        por     xmm5, xmm3
        movdqa  [rdi+16*3], xmm5
        lea     rsi, [rsi+r8*2]
        add     rdi, 64
        dec     rcx
        jnz     @B
        pop     rdi
        pop     rsi
        ret
SwizzleBlock32_amd64 endp
;
; SwizzleBlock16_amd64
;
; SwizzleBlock16_amd64
;   rcx = destination block, 256 bytes (16-byte aligned)
;   rdx = first source row (16-byte aligned)
;   r8  = byte distance between source rows (multiple of 16)
; Four passes; each consumes two 32-byte source rows and produces 64 destination bytes.
; Only the volatile registers xmm0-xmm5 are used; the previous version overwrote the
; callee-saved xmm6.
SwizzleBlock16_amd64 proc public
        push    rsi
        push    rdi
        mov     rdi, rcx                ; rdi = destination cursor
        mov     rsi, rdx                ; rsi = source cursor
        mov     rcx, 4
        align   16
@@:
        movdqa  xmm0, [rsi]
        movdqa  xmm1, [rsi+16]
        movdqa  xmm2, [rsi+r8]
        movdqa  xmm3, [rsi+r8+16]
        punpck  wd, 0, 2, 1, 3, 4, 5
        punpck  qdq, 0, 4, 2, 5, 1, 3   ; xmm1/xmm3 are dead after the word interleave
        movdqa  [rdi+16*0], xmm0
        movdqa  [rdi+16*1], xmm1
        movdqa  [rdi+16*2], xmm4
        movdqa  [rdi+16*3], xmm3
        lea     rsi, [rsi+r8*2]
        add     rdi, 64
        dec     rcx
        jnz     @B
        pop     rdi
        pop     rsi
        ret
SwizzleBlock16_amd64 endp
;
; SwizzleBlock8
;
; SwizzleBlock8_amd64
;   rcx = destination block, 256 bytes (16-byte aligned)
;   rdx = first source row (16-byte aligned)
;   r8  = byte distance between source rows (multiple of 16)
; Two passes; each consumes eight 16-byte source rows and produces 128 destination bytes.
; Fixes relative to the previous version:
;   * the destination was advanced with "add edi, 128"; a 32-bit register write clears the
;     upper half of rdi, so the second pass stored through a truncated pointer whenever the
;     block lived above 4 GiB. The full 64-bit register is now advanced.
;   * xmm6 (callee-saved under the Microsoft x64 convention) was overwritten; the routine
;     now stays within the volatile xmm0-xmm5.
SwizzleBlock8_amd64 proc public
        push    rsi
        push    rdi
        mov     rdi, rcx                ; rdi = destination cursor
        mov     rsi, rdx                ; rsi = source cursor
        mov     ecx, 2
        align   16
@@:
        ; col 0, 2
        movdqa  xmm0, [rsi]
        movdqa  xmm2, [rsi+r8]
        lea     rsi, [rsi+r8*2]
        pshufd  xmm1, [rsi], 0b1h       ; load with the dwords of each qword swapped
        pshufd  xmm3, [rsi+r8], 0b1h
        lea     rsi, [rsi+r8*2]
        punpck  bw, 0, 2, 1, 3, 4, 5
        punpck  wd, 0, 2, 4, 5, 1, 3
        punpck  qdq, 0, 1, 2, 3, 4, 5
        movdqa  [rdi+16*0], xmm0
        movdqa  [rdi+16*1], xmm4
        movdqa  [rdi+16*2], xmm1
        movdqa  [rdi+16*3], xmm5
        ; col 1, 3
        pshufd  xmm0, [rsi], 0b1h
        pshufd  xmm2, [rsi+r8], 0b1h
        lea     rsi, [rsi+r8*2]
        movdqa  xmm1, [rsi]
        movdqa  xmm3, [rsi+r8]
        lea     rsi, [rsi+r8*2]
        punpck  bw, 0, 2, 1, 3, 4, 5
        punpck  wd, 0, 2, 4, 5, 1, 3
        punpck  qdq, 0, 1, 2, 3, 4, 5
        movdqa  [rdi+16*4], xmm0
        movdqa  [rdi+16*5], xmm4
        movdqa  [rdi+16*6], xmm1
        movdqa  [rdi+16*7], xmm5
        add     rdi, 128
        dec     rcx
        jnz     @B
        pop     rdi
        pop     rsi
        ret
SwizzleBlock8_amd64 endp
;
; SwizzleBlock4
;
; SwizzleBlock4_amd64
;   rcx = destination block, 256 bytes (16-byte aligned)
;   rdx = first source row (16-byte aligned)
;   r8  = byte distance between source rows (multiple of 16)
; Two passes; each consumes eight 16-byte source rows and produces 128 destination bytes.
; The punpcknbl macro works on the fixed registers xmm0-xmm7 and expects the 0f0f0f0fh
; nibble mask in xmm7.
; Fixes relative to the previous version:
;   * one source advance was written as "lea esi, [...]"; a 32-bit register write clears
;     the upper half of rsi, so later rows were read through a truncated pointer whenever
;     the source lived above 4 GiB. All advances now write the full rsi.
;   * xmm6 and xmm7 are callee-saved under the Microsoft x64 convention; they are now
;     spilled to the stack on entry and restored on exit.
; NOTE(review): like the original, this routine moves rsp without unwind directives.
SwizzleBlock4_amd64 proc public
        push    rsi
        push    rdi
        sub     rsp, 16*2               ; spill area for xmm6/xmm7
        movdqu  [rsp+16*0], xmm6
        movdqu  [rsp+16*1], xmm7
        mov     rdi, rcx                ; rdi = destination cursor
        mov     rsi, rdx                ; rsi = source cursor
        mov     rcx, 2
        mov     eax, 0f0f0f0fh
        movd    xmm7, eax
        pshufd  xmm7, xmm7, 0           ; xmm7 = nibble mask for punpcknbl
        align   16
@@:
        ; col 0, 2
        movdqa  xmm0, [rsi]
        movdqa  xmm2, [rsi+r8]
        lea     rsi, [rsi+r8*2]
        movdqa  xmm1, [rsi]
        movdqa  xmm3, [rsi+r8]
        lea     rsi, [rsi+r8*2]
        pshuflw xmm1, xmm1, 0b1h
        pshuflw xmm3, xmm3, 0b1h
        pshufhw xmm1, xmm1, 0b1h
        pshufhw xmm3, xmm3, 0b1h
        punpcknbl
        punpck  bw, 0, 2, 4, 6, 1, 3
        punpck  bw, 0, 2, 1, 3, 4, 6
        punpck  qdq, 0, 4, 2, 6, 1, 3
        movdqa  [rdi+16*0], xmm0
        movdqa  [rdi+16*1], xmm1
        movdqa  [rdi+16*2], xmm4
        movdqa  [rdi+16*3], xmm3
        ; col 1, 3
        movdqa  xmm0, [rsi]
        movdqa  xmm2, [rsi+r8]
        lea     rsi, [rsi+r8*2]
        movdqa  xmm1, [rsi]
        movdqa  xmm3, [rsi+r8]
        lea     rsi, [rsi+r8*2]
        pshuflw xmm0, xmm0, 0b1h
        pshuflw xmm2, xmm2, 0b1h
        pshufhw xmm0, xmm0, 0b1h
        pshufhw xmm2, xmm2, 0b1h
        punpcknbl
        punpck  bw, 0, 2, 4, 6, 1, 3
        punpck  bw, 0, 2, 1, 3, 4, 6
        punpck  qdq, 0, 4, 2, 6, 1, 3
        movdqa  [rdi+16*4], xmm0
        movdqa  [rdi+16*5], xmm1
        movdqa  [rdi+16*6], xmm4
        movdqa  [rdi+16*7], xmm3
        add     rdi, 128
        dec     rcx
        jnz     @B
        movdqu  xmm6, [rsp+16*0]
        movdqu  xmm7, [rsp+16*1]
        add     rsp, 16*2
        pop     rdi
        pop     rsi
        ret
SwizzleBlock4_amd64 endp
;
; swizzling with unaligned reads
;
;
; SwizzleBlock32u_amd64
;
; SwizzleBlock32u_amd64 - SwizzleBlock32 for source rows of any alignment.
;   rcx = destination block, 256 bytes (16-byte aligned)
;   rdx = first source row (no alignment requirement: read with movdqu)
;   r8  = byte distance between source rows
;   r9d = 32-bit write mask applied to every destination dword; 0ffffffffh selects the
;         plain-store path, any other value the read-modify-write path
; Four passes; each consumes two 32-byte source rows and produces 64 destination bytes.
; Fixes relative to the previous version:
;   * the last masked store addressed the destination through [edi+...], which drops the
;     upper 32 bits of the pointer; it now uses rdi like every other store
;   * xmm6, xmm7, xmm9 and xmm11 (callee-saved under the Microsoft x64 convention) were
;     overwritten; the routine now stays within the volatile xmm0-xmm5, rebuilding the
;     broadcast mask in a dead register on every pass of the masked loop
SwizzleBlock32u_amd64 proc public
        push    rsi
        push    rdi
        mov     rdi, rcx                ; rdi = destination cursor
        mov     rsi, rdx                ; rsi = source cursor
        mov     rcx, 4
        cmp     r9d, 0ffffffffh
        jnz     SwizzleBlock32u_amd64@WM
        align   16
@@:
        movdqu  xmm0, [rsi]
        movdqu  xmm2, [rsi+16]
        movdqu  xmm1, [rsi+r8]
        movdqu  xmm3, [rsi+r8+16]
        punpck  qdq, 0, 2, 1, 3, 4, 5
        movdqa  [rdi+16*0], xmm0
        movdqa  [rdi+16*1], xmm4
        movdqa  [rdi+16*2], xmm2
        movdqa  [rdi+16*3], xmm5
        lea     rsi, [rsi+r8*2]
        add     rdi, 64
        dec     rcx
        jnz     @B
        pop     rdi
        pop     rsi
        ret
SwizzleBlock32u_amd64@WM:
        align   16
@@:
        movdqu  xmm0, [rsi]
        movdqu  xmm2, [rsi+16]
        movdqu  xmm1, [rsi+r8]
        movdqu  xmm3, [rsi+r8+16]
        punpck  qdq, 0, 2, 1, 3, 4, 5
        movd    xmm1, r9d               ; xmm1/xmm3 are dead after the interleave
        pshufd  xmm1, xmm1, 0           ; xmm1 = write mask in all four dwords
        ; result = (new & mask) | (old & ~mask)
        movdqa  xmm3, xmm1
        pandn   xmm3, [rdi+16*0]
        pand    xmm0, xmm1
        por     xmm0, xmm3
        movdqa  [rdi+16*0], xmm0
        movdqa  xmm3, xmm1
        pandn   xmm3, [rdi+16*1]
        pand    xmm4, xmm1
        por     xmm4, xmm3
        movdqa  [rdi+16*1], xmm4
        movdqa  xmm3, xmm1
        pandn   xmm3, [rdi+16*2]
        pand    xmm2, xmm1
        por     xmm2, xmm3
        movdqa  [rdi+16*2], xmm2
        movdqa  xmm3, xmm1
        pandn   xmm3, [rdi+16*3]
        pand    xmm5, xmm1
        por     xmm5, xmm3
        movdqa  [rdi+16*3], xmm5
        lea     rsi, [rsi+r8*2]
        add     rdi, 64
        dec     rcx
        jnz     @B
        pop     rdi
        pop     rsi
        ret
SwizzleBlock32u_amd64 endp
;
; SwizzleBlock16u_amd64
;
; SwizzleBlock16u_amd64 - SwizzleBlock16 for source rows of any alignment.
;   rcx = destination block, 256 bytes (16-byte aligned)
;   rdx = first source row (no alignment requirement: read with movdqu)
;   r8  = byte distance between source rows
; Four passes; each consumes two 32-byte source rows and produces 64 destination bytes.
; Only the volatile registers xmm0-xmm5 are used; the previous version overwrote the
; callee-saved xmm6.
SwizzleBlock16u_amd64 proc public
        push    rsi
        push    rdi
        mov     rdi, rcx                ; rdi = destination cursor
        mov     rsi, rdx                ; rsi = source cursor
        mov     rcx, 4
        align   16
@@:
        movdqu  xmm0, [rsi]
        movdqu  xmm1, [rsi+16]
        movdqu  xmm2, [rsi+r8]
        movdqu  xmm3, [rsi+r8+16]
        punpck  wd, 0, 2, 1, 3, 4, 5
        punpck  qdq, 0, 4, 2, 5, 1, 3   ; xmm1/xmm3 are dead after the word interleave
        movdqa  [rdi+16*0], xmm0
        movdqa  [rdi+16*1], xmm1
        movdqa  [rdi+16*2], xmm4
        movdqa  [rdi+16*3], xmm3
        lea     rsi, [rsi+r8*2]
        add     rdi, 64
        dec     rcx
        jnz     @B
        pop     rdi
        pop     rsi
        ret
SwizzleBlock16u_amd64 endp
;
; SwizzleBlock8u
;
; SwizzleBlock8u_amd64 - SwizzleBlock8 for source rows of any alignment.
;   rcx = destination block, 256 bytes (16-byte aligned)
;   rdx = first source row (no alignment requirement)
;   r8  = byte distance between source rows
; Two passes; each consumes eight 16-byte source rows and produces 128 destination bytes.
; Fixes relative to the previous version:
;   * half of the rows were fetched with "pshufd xmm, [mem], imm"; a legacy-SSE memory
;     operand must be 16-byte aligned, so the routine faulted on exactly the unaligned
;     input it exists for. Those rows are now loaded with movdqu and shuffled in-register.
;   * the destination was advanced with "add edi, 128", which clears the upper half of
;     rdi; the full 64-bit register is now advanced.
;   * xmm6 (callee-saved under the Microsoft x64 convention) was overwritten; the routine
;     now stays within the volatile xmm0-xmm5.
SwizzleBlock8u_amd64 proc public
        push    rsi
        push    rdi
        mov     rdi, rcx                ; rdi = destination cursor
        mov     rsi, rdx                ; rsi = source cursor
        mov     ecx, 2
        align   16
@@:
        ; col 0, 2
        movdqu  xmm0, [rsi]
        movdqu  xmm2, [rsi+r8]
        lea     rsi, [rsi+r8*2]
        movdqu  xmm1, [rsi]
        movdqu  xmm3, [rsi+r8]
        pshufd  xmm1, xmm1, 0b1h        ; swap the dwords inside each qword
        pshufd  xmm3, xmm3, 0b1h
        lea     rsi, [rsi+r8*2]
        punpck  bw, 0, 2, 1, 3, 4, 5
        punpck  wd, 0, 2, 4, 5, 1, 3
        punpck  qdq, 0, 1, 2, 3, 4, 5
        movdqa  [rdi+16*0], xmm0
        movdqa  [rdi+16*1], xmm4
        movdqa  [rdi+16*2], xmm1
        movdqa  [rdi+16*3], xmm5
        ; col 1, 3
        movdqu  xmm0, [rsi]
        movdqu  xmm2, [rsi+r8]
        pshufd  xmm0, xmm0, 0b1h
        pshufd  xmm2, xmm2, 0b1h
        lea     rsi, [rsi+r8*2]
        movdqu  xmm1, [rsi]
        movdqu  xmm3, [rsi+r8]
        lea     rsi, [rsi+r8*2]
        punpck  bw, 0, 2, 1, 3, 4, 5
        punpck  wd, 0, 2, 4, 5, 1, 3
        punpck  qdq, 0, 1, 2, 3, 4, 5
        movdqa  [rdi+16*4], xmm0
        movdqa  [rdi+16*5], xmm4
        movdqa  [rdi+16*6], xmm1
        movdqa  [rdi+16*7], xmm5
        add     rdi, 128
        dec     rcx
        jnz     @B
        pop     rdi
        pop     rsi
        ret
SwizzleBlock8u_amd64 endp
;
; SwizzleBlock4u
;
; SwizzleBlock4u_amd64 - SwizzleBlock4 for source rows of any alignment.
;   rcx = destination block, 256 bytes (16-byte aligned)
;   rdx = first source row (no alignment requirement: read with movdqu)
;   r8  = byte distance between source rows
; Two passes; each consumes eight 16-byte source rows and produces 128 destination bytes.
; The punpcknbl macro works on the fixed registers xmm0-xmm7 and expects the 0f0f0f0fh
; nibble mask in xmm7.
; Fixes relative to the previous version:
;   * one source advance was written as "lea esi, [...]"; a 32-bit register write clears
;     the upper half of rsi, so later rows were read through a truncated pointer whenever
;     the source lived above 4 GiB. All advances now write the full rsi.
;   * xmm6 and xmm7 are callee-saved under the Microsoft x64 convention; they are now
;     spilled to the stack on entry and restored on exit.
; NOTE(review): like the original, this routine moves rsp without unwind directives.
SwizzleBlock4u_amd64 proc public
        push    rsi
        push    rdi
        sub     rsp, 16*2               ; spill area for xmm6/xmm7
        movdqu  [rsp+16*0], xmm6
        movdqu  [rsp+16*1], xmm7
        mov     rdi, rcx                ; rdi = destination cursor
        mov     rsi, rdx                ; rsi = source cursor
        mov     rcx, 2
        mov     eax, 0f0f0f0fh
        movd    xmm7, eax
        pshufd  xmm7, xmm7, 0           ; xmm7 = nibble mask for punpcknbl
        align   16
@@:
        ; col 0, 2
        movdqu  xmm0, [rsi]
        movdqu  xmm2, [rsi+r8]
        lea     rsi, [rsi+r8*2]
        movdqu  xmm1, [rsi]
        movdqu  xmm3, [rsi+r8]
        lea     rsi, [rsi+r8*2]
        pshuflw xmm1, xmm1, 0b1h
        pshuflw xmm3, xmm3, 0b1h
        pshufhw xmm1, xmm1, 0b1h
        pshufhw xmm3, xmm3, 0b1h
        punpcknbl
        punpck  bw, 0, 2, 4, 6, 1, 3
        punpck  bw, 0, 2, 1, 3, 4, 6
        punpck  qdq, 0, 4, 2, 6, 1, 3
        movdqa  [rdi+16*0], xmm0
        movdqa  [rdi+16*1], xmm1
        movdqa  [rdi+16*2], xmm4
        movdqa  [rdi+16*3], xmm3
        ; col 1, 3
        movdqu  xmm0, [rsi]
        movdqu  xmm2, [rsi+r8]
        lea     rsi, [rsi+r8*2]
        movdqu  xmm1, [rsi]
        movdqu  xmm3, [rsi+r8]
        lea     rsi, [rsi+r8*2]
        pshuflw xmm0, xmm0, 0b1h
        pshuflw xmm2, xmm2, 0b1h
        pshufhw xmm0, xmm0, 0b1h
        pshufhw xmm2, xmm2, 0b1h
        punpcknbl
        punpck  bw, 0, 2, 4, 6, 1, 3
        punpck  bw, 0, 2, 1, 3, 4, 6
        punpck  qdq, 0, 4, 2, 6, 1, 3
        movdqa  [rdi+16*4], xmm0
        movdqa  [rdi+16*5], xmm1
        movdqa  [rdi+16*6], xmm4
        movdqa  [rdi+16*7], xmm3
        add     rdi, 128
        dec     rcx
        jnz     @B
        movdqu  xmm6, [rsp+16*0]
        movdqu  xmm7, [rsp+16*1]
        add     rsp, 16*2
        pop     rdi
        pop     rsi
        ret
SwizzleBlock4u_amd64 endp
end