pcsx2/plugins/zerogs/dx/x86-64.asm

; Copyright (C) 2003-2005 Gabest/zerofrog
; http://www.gabest.org
;
; This Program is free software; you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation; either version 2, or (at your option)
; any later version.
;
; This Program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this Program; see the file COPYING. If not, write to
; the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
; http://www.gnu.org/copyleft/gpl.html
;
;
extern s_clut16mask:ptr
.code
; MMX memcmp implementation; cmpsize has to be a multiple of 8
; returns 0 if equal, a nonzero value if not equal
; ~10 times faster than standard memcmp
; (zerofrog)
; u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize)
; rcx - src1
; rdx - src2
; r8d - cmpsize
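;
; reference sketch in C of what this routine computes (not part of the
; original source; assumes cmpsize is a multiple of 8, as noted above):
;
;   u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize)
;   {
;       return memcmp(src1, src2, cmpsize) != 0 ? 1 : 0;
;   }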
memcmp_mmx proc public
cmp r8d, 32
jl Done4
; test the first 32 bytes up front to make sure things are ok
movq mm0, [rdx]
movq mm1, [rdx+8]
pcmpeqd mm0, [rcx]
pcmpeqd mm1, [rcx+8]
pand mm0, mm1
movq mm2, [rdx+16]
pmovmskb eax, mm0
movq mm3, [rdx+24]
; check if eq
cmp eax, 0ffh
je NextComp
mov eax, 1
jmp Finish
NextComp:
pcmpeqd mm2, [rcx+16]
pcmpeqd mm3, [rcx+24]
pand mm2, mm3
pmovmskb eax, mm2
sub r8d, 32
add rdx, 32
add rcx, 32
; check if eq
cmp eax, 0ffh
je ContinueTest
mov eax, 1
jmp Finish
Cmp8:
movq mm0, [rdx]
movq mm1, [rdx+8]
movq mm2, [rdx+16]
movq mm3, [rdx+24]
movq mm4, [rdx+32]
movq mm5, [rdx+40]
movq mm6, [rdx+48]
movq mm7, [rdx+56]
pcmpeqd mm0, [rcx]
pcmpeqd mm1, [rcx+8]
pcmpeqd mm2, [rcx+16]
pcmpeqd mm3, [rcx+24]
pand mm0, mm1
pcmpeqd mm4, [rcx+32]
pand mm0, mm2
pcmpeqd mm5, [rcx+40]
pand mm0, mm3
pcmpeqd mm6, [rcx+48]
pand mm0, mm4
pcmpeqd mm7, [rcx+56]
pand mm0, mm5
pand mm0, mm6
pand mm0, mm7
pmovmskb eax, mm0
; check if eq
cmp eax, 0ffh
je Continue
mov eax, 1
jmp Finish
Continue:
sub r8d, 64
add rdx, 64
add rcx, 64
ContinueTest:
cmp r8d, 64
jge Cmp8
Done8:
test r8d, 020h
jz Done4
movq mm0, [rdx]
movq mm1, [rdx+8]
movq mm2, [rdx+16]
movq mm3, [rdx+24]
pcmpeqd mm0, [rcx]
pcmpeqd mm1, [rcx+8]
pcmpeqd mm2, [rcx+16]
pcmpeqd mm3, [rcx+24]
pand mm0, mm1
pand mm0, mm2
pand mm0, mm3
pmovmskb eax, mm0
sub r8d, 32
add rdx, 32
add rcx, 32
; check if eq
cmp eax, 0ffh
je Done4
mov eax, 1
jmp Finish
Done4:
cmp r8d, 24
jne Done2
movq mm0, [rdx]
movq mm1, [rdx+8]
movq mm2, [rdx+16]
pcmpeqd mm0, [rcx]
pcmpeqd mm1, [rcx+8]
pcmpeqd mm2, [rcx+16]
pand mm0, mm1
pand mm0, mm2
pmovmskb eax, mm0
; check if eq
cmp eax, 0ffh
je Done
mov eax, 1
jmp Finish
Done2:
cmp r8d, 16
jne Done1
movq mm0, [rdx]
movq mm1, [rdx+8]
pcmpeqd mm0, [rcx]
pcmpeqd mm1, [rcx+8]
pand mm0, mm1
pmovmskb eax, mm0
; check if eq
cmp eax, 0ffh
je Done
mov eax, 1
jmp Finish
Done1:
cmp r8d, 8
jne Done
mov eax, [rdx]
mov edx, [rdx+4] ; dword, not qword - a qword load would read past the 8-byte tail
cmp eax, [rcx]
je Next
mov eax, 1
jmp Finish
Next:
cmp edx, [rcx+4]
je Done
mov eax, 1
jmp Finish
Done:
xor eax, eax
Finish:
emms
ret
memcmp_mmx endp
; TestClutChangeMMX
; rcx - src
; rdx - dst
; r8d - entries
; r9  - pointer to a byte flag, set to 1 if the CLUT data differs
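;
; rough sketch of the loop below (not in the original source): each iteration
; compares 64 bytes at rdx (linear) against the same 16 entries at rcx, which
; are stored interleaved (note the offset pattern in the pcmpeqd lines); on
; any mismatch it sets byte ptr [r9] = 1 and returns, otherwise it advances
; by 16 entries and repeats until r8d entries are checked.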
TestClutChangeMMX proc public
Start:
movq mm0, [rdx]
movq mm1, [rdx+8]
pcmpeqd mm0, [rcx]
pcmpeqd mm1, [rcx+16]
movq mm2, [rdx+16]
movq mm3, [rdx+24]
pcmpeqd mm2, [rcx+32]
pcmpeqd mm3, [rcx+48]
pand mm0, mm1
pand mm2, mm3
movq mm4, [rdx+32]
movq mm5, [rdx+40]
pcmpeqd mm4, [rcx+8]
pcmpeqd mm5, [rcx+24]
pand mm0, mm2
pand mm4, mm5
movq mm6, [rdx+48]
movq mm7, [rdx+56]
pcmpeqd mm6, [rcx+40]
pcmpeqd mm7, [rcx+56]
pand mm0, mm4
pand mm6, mm7
pand mm0, mm6
pmovmskb eax, mm0
cmp eax, 0ffh
je Continue
mov byte ptr [r9], 1
jmp Return
Continue:
cmp r8d, 16
jle Return
test r8d, 010h
jz AddRcx
sub rcx, 448 ; go back and down one column
AddRcx:
add rcx, 256 ; go to the right block
jne Continue1
add rcx, 256 ; skip whole block
Continue1:
add rdx, 64
sub r8d, 16
jmp Start
Return:
emms
ret
TestClutChangeMMX endp
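;
; UnswizzleZ16Target
; register usage inferred from the code below (the original is undocumented):
; rcx - dst (u32 values), rdx - src (u16 values), r9d - iteration count
; each iteration zero-extends 32 u16 values (64 bytes) into u32 (128 bytes),
; i.e. roughly: for (i = 0; i < r9d*32; i++) dst32[i] = (u32)src16[i];
;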
UnswizzleZ16Target proc public
pxor xmm7, xmm7
Z16Loop:
;; unpack 64 bytes at a time
movdqa xmm0, [rdx]
movdqa xmm2, [rdx+16]
movdqa xmm4, [rdx+32]
movdqa xmm6, [rdx+48]
movdqa xmm1, xmm0
movdqa xmm3, xmm2
movdqa xmm5, xmm4
punpcklwd xmm0, xmm7
punpckhwd xmm1, xmm7
punpcklwd xmm2, xmm7
punpckhwd xmm3, xmm7
;; start saving
movdqa [rcx], xmm0
movdqa [rcx+16], xmm1
punpcklwd xmm4, xmm7
punpckhwd xmm5, xmm7
movdqa [rcx+32], xmm2
movdqa [rcx+48], xmm3
movdqa xmm0, xmm6
punpcklwd xmm6, xmm7
movdqa [rcx+64], xmm4
movdqa [rcx+80], xmm5
punpckhwd xmm0, xmm7
movdqa [rcx+96], xmm6
movdqa [rcx+112], xmm0
add rdx, 64
add rcx, 128
sub r9d, 1
jne Z16Loop
ret
UnswizzleZ16Target endp
;
; swizzling
;
punpck macro op, sd0, sd2, s1, s3, d1, d3
movdqa @CatStr(xmm, %d1), @CatStr(xmm, %sd0)
pshufd @CatStr(xmm, %d3), @CatStr(xmm, %sd2), 0e4h
@CatStr(punpckl, op) @CatStr(xmm, %sd0), @CatStr(xmm, %s1)
@CatStr(punpckh, op) @CatStr(xmm, %d1), @CatStr(xmm, %s1)
@CatStr(punpckl, op) @CatStr(xmm, %sd2), @CatStr(xmm, %s3)
@CatStr(punpckh, op) @CatStr(xmm, %d3), @CatStr(xmm, %s3)
endm
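; example expansion for reference: "punpck qdq, 0, 4, 1, 5, 2, 6" emits
;   movdqa     xmm2, xmm0
;   pshufd     xmm6, xmm4, 0e4h   ; 0e4h is the identity shuffle (a copy)
;   punpcklqdq xmm0, xmm1
;   punpckhqdq xmm2, xmm1
;   punpcklqdq xmm4, xmm5
;   punpckhqdq xmm6, xmm5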
punpcknbl macro
movdqa xmm4, xmm0
pshufd xmm5, xmm1, 0e4h
psllq xmm1, 4
psrlq xmm4, 4
movdqa xmm6, xmm7
pand xmm0, xmm7
pandn xmm6, xmm1
por xmm0, xmm6
movdqa xmm6, xmm7
pand xmm4, xmm7
pandn xmm6, xmm5
por xmm4, xmm6
movdqa xmm1, xmm4
movdqa xmm4, xmm2
pshufd xmm5, xmm3, 0e4h
psllq xmm3, 4
psrlq xmm4, 4
movdqa xmm6, xmm7
pand xmm2, xmm7
pandn xmm6, xmm3
por xmm2, xmm6
movdqa xmm6, xmm7
pand xmm4, xmm7
pandn xmm6, xmm5
por xmm4, xmm6
movdqa xmm3, xmm4
punpck bw, 0, 2, 1, 3, 4, 6
endm
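; punpcknbl packs the nibbles of the xmm0/xmm1 and xmm2/xmm3 pairs into bytes
; and byte-interleaves the results; it expects xmm7 to hold the 0f0f0f0fh
; nibble mask (SwizzleBlock4_sse2 below loads it before use)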
punpcknbh macro
movdqa xmm12, xmm8
pshufd xmm13, xmm9, 0e4h
psllq xmm9, 4
psrlq xmm12, 4
movdqa xmm14, xmm15
pand xmm8, xmm15
pandn xmm14, xmm9
por xmm8, xmm14
movdqa xmm14, xmm15
pand xmm12, xmm15
pandn xmm14, xmm13
por xmm12, xmm14
movdqa xmm9, xmm12
movdqa xmm12, xmm10
pshufd xmm13, xmm11, 0e4h
psllq xmm11, 4
psrlq xmm12, 4
movdqa xmm14, xmm15
pand xmm10, xmm15
pandn xmm14, xmm11
por xmm10, xmm14
movdqa xmm14, xmm15
pand xmm12, xmm15
pandn xmm14, xmm13
por xmm12, xmm14
movdqa xmm11, xmm12
punpck bw, 8, 10, 9, 11, 12, 14
endm
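; punpcknbh is punpcknbl mirrored onto the upper register file: it operates
; on xmm8-xmm11 and expects the nibble mask in xmm15 instead of xmm7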
;
; SwizzleBlock32_sse2
;
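; register usage, inferred from the code (shared by the SwizzleBlock* family):
; rcx - dst block, rdx - src, r8 - src pitch in bytes,
; r9d - write mask, 32/32u variants only (0ffffffffh takes the maskless path)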
SwizzleBlock32_sse2 proc public
push rsi
push rdi
mov rdi, rcx
mov rsi, rdx
mov rcx, 4
cmp r9d, 0ffffffffh
jne SwizzleBlock32_sse2@WM
align 16
@@:
movdqa xmm0, [rsi]
movdqa xmm4, [rsi+16]
movdqa xmm1, [rsi+r8]
movdqa xmm5, [rsi+r8+16]
punpck qdq, 0, 4, 1, 5, 2, 6
movdqa [rdi+16*0], xmm0
movdqa [rdi+16*1], xmm2
movdqa [rdi+16*2], xmm4
movdqa [rdi+16*3], xmm6
lea rsi, [rsi+r8*2]
add rdi, 64
dec rcx
jnz @B
pop rdi
pop rsi
ret
SwizzleBlock32_sse2@WM:
movd xmm7, r9d
pshufd xmm7, xmm7, 0
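; with xmm7 holding the broadcast write mask, every store below computes
; dst = (src & mask) | (dst & ~mask) via the pand/pandn/por sequence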
align 16
@@:
movdqa xmm0, [rsi]
movdqa xmm4, [rsi+16]
movdqa xmm1, [rsi+r8]
movdqa xmm5, [rsi+r8+16]
punpck qdq, 0, 4, 1, 5, 2, 6
movdqa xmm3, xmm7
pshufd xmm5, xmm7, 0e4h
movdqa xmm9, xmm7
pshufd xmm11, xmm7, 0e4h
pandn xmm3, [rdi+16*0]
pand xmm0, xmm7
por xmm0, xmm3
movdqa [rdi+16*0], xmm0
pandn xmm5, [rdi+16*1]
pand xmm2, xmm7
por xmm2, xmm5
movdqa [rdi+16*1], xmm2
pandn xmm9, [rdi+16*2]
pand xmm4, xmm7
por xmm4, xmm9
movdqa [rdi+16*2], xmm4
pandn xmm11, [rdi+16*3]
pand xmm6, xmm7
por xmm6, xmm11
movdqa [rdi+16*3], xmm6
lea rsi, [rsi+r8*2]
add rdi, 64
dec rcx
jnz @B
pop rdi
pop rsi
ret
SwizzleBlock32_sse2 endp
;
; SwizzleBlock16_sse2
;
SwizzleBlock16_sse2 proc public
push rsi
push rdi
mov rdi, rcx
mov rsi, rdx
mov rcx, 4
align 16
@@:
movdqa xmm0, [rsi]
movdqa xmm1, [rsi+16]
movdqa xmm2, [rsi+r8]
movdqa xmm3, [rsi+r8+16]
punpck wd, 0, 2, 1, 3, 4, 6
punpck qdq, 0, 4, 2, 6, 1, 5
movdqa [rdi+16*0], xmm0
movdqa [rdi+16*1], xmm1
movdqa [rdi+16*2], xmm4
movdqa [rdi+16*3], xmm5
lea rsi, [rsi+r8*2]
add rdi, 64
dec rcx
jnz @B
pop rdi
pop rsi
ret
SwizzleBlock16_sse2 endp
;
; SwizzleBlock8
;
SwizzleBlock8_sse2 proc public
push rsi
push rdi
mov rdi, rcx
mov rsi, rdx
mov rcx, 2
align 16
@@:
; col 0, 2
movdqa xmm0, [rsi]
movdqa xmm2, [rsi+r8]
lea rsi, [rsi+r8*2]
pshufd xmm1, [rsi], 0b1h
pshufd xmm3, [rsi+r8], 0b1h
lea rsi, [rsi+r8*2]
punpck bw, 0, 2, 1, 3, 4, 6
punpck wd, 0, 2, 4, 6, 1, 3
punpck qdq, 0, 1, 2, 3, 4, 5
movdqa [rdi+16*0], xmm0
movdqa [rdi+16*1], xmm4
movdqa [rdi+16*2], xmm1
movdqa [rdi+16*3], xmm5
; col 1, 3
pshufd xmm0, [rsi], 0b1h
pshufd xmm2, [rsi+r8], 0b1h
lea rsi, [rsi+r8*2]
movdqa xmm1, [rsi]
movdqa xmm3, [rsi+r8]
lea rsi, [rsi+r8*2]
punpck bw, 0, 2, 1, 3, 4, 6
punpck wd, 0, 2, 4, 6, 1, 3
punpck qdq, 0, 1, 2, 3, 4, 5
movdqa [rdi+16*4], xmm0
movdqa [rdi+16*5], xmm4
movdqa [rdi+16*6], xmm1
movdqa [rdi+16*7], xmm5
add rdi, 128
dec rcx
jnz @B
pop rdi
pop rsi
ret
SwizzleBlock8_sse2 endp
;
; SwizzleBlock4
;
SwizzleBlock4_sse2 proc public
push rsi
push rdi
mov rdi, rcx
mov rsi, rdx
mov rcx, 2
mov eax, 0f0f0f0fh
movd xmm7, eax
pshufd xmm7, xmm7, 0
align 16
@@:
; col 0, 2
movdqa xmm0, [rsi]
movdqa xmm2, [rsi+r8]
lea rsi, [rsi+r8*2]
movdqa xmm1, [rsi]
movdqa xmm3, [rsi+r8]
lea rsi, [rsi+r8*2]
pshuflw xmm1, xmm1, 0b1h
pshuflw xmm3, xmm3, 0b1h
pshufhw xmm1, xmm1, 0b1h
pshufhw xmm3, xmm3, 0b1h
punpcknbl
punpck bw, 0, 2, 4, 6, 1, 3
punpck bw, 0, 2, 1, 3, 4, 6
punpck qdq, 0, 4, 2, 6, 1, 3
movdqa [rdi+16*0], xmm0
movdqa [rdi+16*1], xmm1
movdqa [rdi+16*2], xmm4
movdqa [rdi+16*3], xmm3
; col 1, 3
movdqa xmm0, [rsi]
movdqa xmm2, [rsi+r8]
lea rsi, [rsi+r8*2]
movdqa xmm1, [rsi]
movdqa xmm3, [rsi+r8]
lea rsi, [rsi+r8*2]
pshuflw xmm0, xmm0, 0b1h
pshuflw xmm2, xmm2, 0b1h
pshufhw xmm0, xmm0, 0b1h
pshufhw xmm2, xmm2, 0b1h
punpcknbl
punpck bw, 0, 2, 4, 6, 1, 3
punpck bw, 0, 2, 1, 3, 4, 6
punpck qdq, 0, 4, 2, 6, 1, 3
movdqa [rdi+16*4], xmm0
movdqa [rdi+16*5], xmm1
movdqa [rdi+16*6], xmm4
movdqa [rdi+16*7], xmm3
add rdi, 128
dec rcx
jnz @B
pop rdi
pop rsi
ret
SwizzleBlock4_sse2 endp
;
; swizzling with unaligned reads
;
;
; SwizzleBlock32u_sse2
;
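; identical to SwizzleBlock32_sse2 except that source rows are fetched with
; movdqu, so the source pointer and pitch need not be 16-byte aligned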
SwizzleBlock32u_sse2 proc public
push rsi
push rdi
mov rdi, rcx
mov rsi, rdx
mov rcx, 4
cmp r9d, 0ffffffffh
jne SwizzleBlock32u_sse2@WM
align 16
@@:
movdqu xmm0, [rsi]
movdqu xmm4, [rsi+16]
movdqu xmm1, [rsi+r8]
movdqu xmm5, [rsi+r8+16]
punpck qdq, 0, 4, 1, 5, 2, 6
movdqa [rdi+16*0], xmm0
movdqa [rdi+16*1], xmm2
movdqa [rdi+16*2], xmm4
movdqa [rdi+16*3], xmm6
lea rsi, [rsi+r8*2]
add rdi, 64
dec rcx
jnz @B
pop rdi
pop rsi
ret
SwizzleBlock32u_sse2@WM:
movd xmm7, r9d
pshufd xmm7, xmm7, 0
align 16
@@:
movdqu xmm0, [rsi]
movdqu xmm4, [rsi+16]
movdqu xmm1, [rsi+r8]
movdqu xmm5, [rsi+r8+16]
punpck qdq, 0, 4, 1, 5, 2, 6
movdqa xmm3, xmm7
pshufd xmm5, xmm7, 0e4h
movdqa xmm9, xmm7
pshufd xmm11, xmm7, 0e4h
pandn xmm3, [rdi+16*0]
pand xmm0, xmm7
por xmm0, xmm3
movdqa [rdi+16*0], xmm0
pandn xmm5, [rdi+16*1]
pand xmm2, xmm7
por xmm2, xmm5
movdqa [rdi+16*1], xmm2
pandn xmm9, [rdi+16*2]
pand xmm4, xmm7
por xmm4, xmm9
movdqa [rdi+16*2], xmm4
pandn xmm11, [rdi+16*3]
pand xmm6, xmm7
por xmm6, xmm11
movdqa [rdi+16*3], xmm6
lea rsi, [rsi+r8*2]
add rdi, 64
dec rcx
jnz @B
pop rdi
pop rsi
ret
SwizzleBlock32u_sse2 endp
;
; SwizzleBlock16u_sse2
;
SwizzleBlock16u_sse2 proc public
push rsi
push rdi
mov rdi, rcx
mov rsi, rdx
mov rcx, 4
align 16
@@:
movdqu xmm0, [rsi]
movdqu xmm1, [rsi+16]
movdqu xmm2, [rsi+r8]
movdqu xmm3, [rsi+r8+16]
punpck wd, 0, 2, 1, 3, 4, 6
punpck qdq, 0, 4, 2, 6, 1, 5
movdqa [rdi+16*0], xmm0
movdqa [rdi+16*1], xmm1
movdqa [rdi+16*2], xmm4
movdqa [rdi+16*3], xmm5
lea rsi, [rsi+r8*2]
add rdi, 64
dec rcx
jnz @B
pop rdi
pop rsi
ret
SwizzleBlock16u_sse2 endp
;
; SwizzleBlock8u
;
SwizzleBlock8u_sse2 proc public
push rsi
push rdi
mov rdi, rcx
mov rsi, rdx
mov rcx, 2
align 16
@@:
; col 0, 2
movdqu xmm0, [rsi]
movdqu xmm2, [rsi+r8]
lea rsi, [rsi+r8*2]
movdqu xmm1, [rsi]
movdqu xmm3, [rsi+r8]
; pshufd from memory would need 16-byte alignment, so load then shuffle
pshufd xmm1, xmm1, 0b1h
pshufd xmm3, xmm3, 0b1h
lea rsi, [rsi+r8*2]
punpck bw, 0, 2, 1, 3, 4, 6
punpck wd, 0, 2, 4, 6, 1, 3
punpck qdq, 0, 1, 2, 3, 4, 5
movdqa [rdi+16*0], xmm0
movdqa [rdi+16*1], xmm4
movdqa [rdi+16*2], xmm1
movdqa [rdi+16*3], xmm5
; col 1, 3
movdqu xmm0, [rsi]
movdqu xmm2, [rsi+r8]
pshufd xmm0, xmm0, 0b1h
pshufd xmm2, xmm2, 0b1h
lea rsi, [rsi+r8*2]
movdqu xmm1, [rsi]
movdqu xmm3, [rsi+r8]
lea rsi, [rsi+r8*2]
punpck bw, 0, 2, 1, 3, 4, 6
punpck wd, 0, 2, 4, 6, 1, 3
punpck qdq, 0, 1, 2, 3, 4, 5
movdqa [rdi+16*4], xmm0
movdqa [rdi+16*5], xmm4
movdqa [rdi+16*6], xmm1
movdqa [rdi+16*7], xmm5
add rdi, 128
dec rcx
jnz @B
pop rdi
pop rsi
ret
SwizzleBlock8u_sse2 endp
;
; SwizzleBlock4u
;
SwizzleBlock4u_sse2 proc public
push rsi
push rdi
mov rdi, rcx
mov rsi, rdx
mov rcx, 2
mov eax, 0f0f0f0fh
movd xmm7, eax
pshufd xmm7, xmm7, 0
align 16
@@:
; col 0, 2
movdqu xmm0, [rsi]
movdqu xmm2, [rsi+r8]
lea rsi, [rsi+r8*2]
movdqu xmm1, [rsi]
movdqu xmm3, [rsi+r8]
lea rsi, [rsi+r8*2]
pshuflw xmm1, xmm1, 0b1h
pshuflw xmm3, xmm3, 0b1h
pshufhw xmm1, xmm1, 0b1h
pshufhw xmm3, xmm3, 0b1h
punpcknbl
punpck bw, 0, 2, 4, 6, 1, 3
punpck bw, 0, 2, 1, 3, 4, 6
punpck qdq, 0, 4, 2, 6, 1, 3
movdqa [rdi+16*0], xmm0
movdqa [rdi+16*1], xmm1
movdqa [rdi+16*2], xmm4
movdqa [rdi+16*3], xmm3
; col 1, 3
movdqu xmm0, [rsi]
movdqu xmm2, [rsi+r8]
lea rsi, [rsi+r8*2]
movdqu xmm1, [rsi]
movdqu xmm3, [rsi+r8]
lea rsi, [rsi+r8*2]
pshuflw xmm0, xmm0, 0b1h
pshuflw xmm2, xmm2, 0b1h
pshufhw xmm0, xmm0, 0b1h
pshufhw xmm2, xmm2, 0b1h
punpcknbl
punpck bw, 0, 2, 4, 6, 1, 3
punpck bw, 0, 2, 1, 3, 4, 6
punpck qdq, 0, 4, 2, 6, 1, 3
movdqa [rdi+16*4], xmm0
movdqa [rdi+16*5], xmm1
movdqa [rdi+16*6], xmm4
movdqa [rdi+16*7], xmm3
add rdi, 128
dec rcx
jnz @B
pop rdi
pop rsi
ret
SwizzleBlock4u_sse2 endp
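;
; WriteCLUT_T16_I4_CSM1
; register usage inferred from the code below:
; rcx - src (16 CLUT entries read as dwords), rdx - dst CLUT buffer
; gathers the 16-bit entries and writes them interleaved with the existing
; CLUT data, using the s_clut16mask constants to preserve the other 16-bit
; half of each destination dword (aligned and rdx-offset-by-2 cases differ)
;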
WriteCLUT_T16_I4_CSM1_sse2 proc public
movdqa xmm0, XMMWORD PTR [rcx]
movdqa xmm1, XMMWORD PTR [rcx+16]
movdqa xmm2, XMMWORD PTR [rcx+32]
movdqa xmm3, XMMWORD PTR [rcx+48]
;; rearrange
pshuflw xmm0, xmm0, 088h
pshufhw xmm0, xmm0, 088h
pshuflw xmm1, xmm1, 088h
pshufhw xmm1, xmm1, 088h
pshuflw xmm2, xmm2, 088h
pshufhw xmm2, xmm2, 088h
pshuflw xmm3, xmm3, 088h
pshufhw xmm3, xmm3, 088h
shufps xmm0, xmm1, 088h
shufps xmm2, xmm3, 088h
pshufd xmm0, xmm0, 0d8h
pshufd xmm2, xmm2, 0d8h
pxor xmm6, xmm6
mov rax, offset s_clut16mask
test rdx, 15
jnz WriteUnaligned
movdqa xmm7, XMMWORD PTR [rax] ;; saves upper 16 bits
;; have to save interlaced with the old data
movdqa xmm4, [rdx]
movdqa xmm5, [rdx+32]
movhlps xmm1, xmm0
movlhps xmm0, xmm2 ;; lower 8 colors
pand xmm4, xmm7
pand xmm5, xmm7
shufps xmm1, xmm2, 0e4h ;; upper 8 colors
movdqa xmm2, xmm0
movdqa xmm3, xmm1
punpcklwd xmm0, xmm6
punpcklwd xmm1, xmm6
por xmm0, xmm4
por xmm1, xmm5
punpckhwd xmm2, xmm6
punpckhwd xmm3, xmm6
movdqa [rdx], xmm0
movdqa [rdx+32], xmm1
movdqa xmm5, xmm7
pand xmm7, [rdx+16]
pand xmm5, [rdx+48]
por xmm2, xmm7
por xmm3, xmm5
movdqa [rdx+16], xmm2
movdqa [rdx+48], xmm3
jmp WriteCLUT_T16_I4_CSM1_End
WriteUnaligned:
;; rdx is offset by 2
sub rdx, 2
movdqa xmm7, XMMWORD PTR [rax+16] ;; saves lower 16 bits
;; have to save interlaced with the old data
movdqa xmm4, [rdx]
movdqa xmm5, [rdx+32]
movhlps xmm1, xmm0
movlhps xmm0, xmm2 ;; lower 8 colors
pand xmm4, xmm7
pand xmm5, xmm7
shufps xmm1, xmm2, 0e4h ;; upper 8 colors
movdqa xmm2, xmm0
movdqa xmm3, xmm1
punpcklwd xmm0, xmm6
punpcklwd xmm1, xmm6
pslld xmm0, 16
pslld xmm1, 16
por xmm0, xmm4
por xmm1, xmm5
punpckhwd xmm2, xmm6
punpckhwd xmm3, xmm6
pslld xmm2, 16
pslld xmm3, 16
movdqa [rdx], xmm0
movdqa [rdx+32], xmm1
movdqa xmm5, xmm7
pand xmm7, [rdx+16]
pand xmm5, [rdx+48]
por xmm2, xmm7
por xmm3, xmm5
movdqa [rdx+16], xmm2
movdqa [rdx+48], xmm3
WriteCLUT_T16_I4_CSM1_End:
ret
WriteCLUT_T16_I4_CSM1_sse2 endp
end