zzogl-pg: Move WriteCLUT_T32_I4_CSM1_sse2 to inline assembly.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2803 96395faa-99c1-11dd-bbfe-3dabce05a288
arcum42 2010-04-01 22:52:59 +00:00
parent c604bf89d3
commit 9ea97041fb
2 changed files with 114 additions and 373 deletions
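The function named in the commit message ends up as SSE2 intrinsics in the .cpp file (its tail is visible in the second file below). A minimal sketch of that intrinsics form, reconstructed here from the removed assembly further down and from the surviving _mm_store_si128 line, so treat it as an approximation rather than the committed code (the typedef is a stand-in for PCSX2's own):

    #include <emmintrin.h>

    typedef unsigned int u32; // stand-in for PCSX2's u32 typedef

    // 16 32-bit CLUT entries: interleave the low/high quadwords of each
    // source pair, matching the punpcklqdq/punpckhqdq pattern in the asm.
    void WriteCLUT_T32_I4_CSM1_sketch(u32* vm, u32* clut)
    {
        const __m128i* src = (const __m128i*)vm; // must be 16-byte aligned
        __m128i* dst = (__m128i*)clut;

        __m128i r0 = _mm_load_si128(&src[0]);
        __m128i r1 = _mm_load_si128(&src[1]);
        __m128i r2 = _mm_load_si128(&src[2]);
        __m128i r3 = _mm_load_si128(&src[3]);

        _mm_store_si128(&dst[0], _mm_unpacklo_epi64(r0, r1));
        _mm_store_si128(&dst[2], _mm_unpackhi_epi64(r0, r1));
        _mm_store_si128(&dst[1], _mm_unpacklo_epi64(r2, r3));
        _mm_store_si128(&dst[3], _mm_unpackhi_epi64(r2, r3));
    }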


@ -18,181 +18,6 @@
#
.intel_syntax
## mmx memcmp implementation, size has to be a multiple of 8
## returns 0 if equal, nonzero value if not equal
## ~10 times faster than standard memcmp
## (zerofrog)
#u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize)
#.globl memcmp_mmx
# .type memcmp_mmx, @function
#memcmp_mmx:
# push %esi
# mov %ecx, dword ptr [%esp+16]
# mov %edx, dword ptr [%esp+8]
# mov %esi, dword ptr [%esp+12]
#
# cmp %ecx, 32
# jl Done4
#
# // custom test first 8 to make sure things are ok
# movq %mm0, [%esi]
# movq %mm1, [%esi+8]
# pcmpeqd %mm0, [%edx]
# pcmpeqd %mm1, [%edx+8]
# pand %mm0, %mm1
# movq %mm2, [%esi+16]
# pmovmskb %eax, %mm0
# movq %mm3, [%esi+24]
#
# // check if eq
# cmp %eax, 0xff
# je NextComp
# mov %eax, 1
# jmp End
#
#NextComp:
# pcmpeqd %mm2, [%edx+16]
# pcmpeqd %mm3, [%edx+24]
# pand %mm2, %mm3
# pmovmskb %eax, %mm2
#
# sub %ecx, 32
# add %esi, 32
# add %edx, 32
#
# // check if eq
# cmp %eax, 0xff
# je ContinueTest
# mov %eax, 1
# jmp End
#
# cmp %ecx, 64
# jl Done8
#
#Cmp8:
# movq %mm0, [%esi]
# movq %mm1, [%esi+8]
# movq %mm2, [%esi+16]
# movq %mm3, [%esi+24]
# movq %mm4, [%esi+32]
# movq %mm5, [%esi+40]
# movq %mm6, [%esi+48]
# movq %mm7, [%esi+56]
# pcmpeqd %mm0, [%edx]
# pcmpeqd %mm1, [%edx+8]
# pcmpeqd %mm2, [%edx+16]
# pcmpeqd %mm3, [%edx+24]
# pand %mm0, %mm1
# pcmpeqd %mm4, [%edx+32]
# pand %mm0, %mm2
# pcmpeqd %mm5, [%edx+40]
# pand %mm0, %mm3
# pcmpeqd %mm6, [%edx+48]
# pand %mm0, %mm4
# pcmpeqd %mm7, [%edx+56]
# pand %mm0, %mm5
# pand %mm0, %mm6
# pand %mm0, %mm7
# pmovmskb %eax, %mm0
#
# // check if eq
# cmp %eax, 0xff
# je Continue
# mov %eax, 1
# jmp End
#
#Continue:
# sub %ecx, 64
# add %esi, 64
# add %edx, 64
#ContinueTest:
# cmp %ecx, 64
# jge Cmp8
#
#Done8:
# test %ecx, 0x20
# jz Done4
# movq %mm0, [%esi]
# movq %mm1, [%esi+8]
# movq %mm2, [%esi+16]
# movq %mm3, [%esi+24]
# pcmpeqd %mm0, [%edx]
# pcmpeqd %mm1, [%edx+8]
# pcmpeqd %mm2, [%edx+16]
# pcmpeqd %mm3, [%edx+24]
# pand %mm0, %mm1
# pand %mm0, %mm2
# pand %mm0, %mm3
# pmovmskb %eax, %mm0
# sub %ecx, 32
# add %esi, 32
# add %edx, 32
#
# // check if eq
# cmp %eax, 0xff
# je Done4
# mov %eax, 1
# jmp End
#
#Done4:
# cmp %ecx, 24
# jne Done2
# movq %mm0, [%esi]
# movq %mm1, [%esi+8]
# movq %mm2, [%esi+16]
# pcmpeqd %mm0, [%edx]
# pcmpeqd %mm1, [%edx+8]
# pcmpeqd %mm2, [%edx+16]
# pand %mm0, %mm1
# pand %mm0, %mm2
# pmovmskb %eax, %mm0
#
# // check if eq
# cmp %eax, 0xff
# setne %al
# jmp End
#
#Done2:
# cmp %ecx, 16
# jne Done1
#
# movq %mm0, [%esi]
# movq %mm1, [%esi+8]
# pcmpeqd %mm0, [%edx]
# pcmpeqd %mm1, [%edx+8]
# pand %mm0, %mm1
# pmovmskb %eax, %mm0
#
# // check if eq
# cmp %eax, 0xff
# setne %al
# jmp End
#
#Done1:
# cmp %ecx, 8
# jne Done
#
# mov %eax, [%esi]
# mov %esi, [%esi+4]
# cmp %eax, [%edx]
# je Next
# mov %eax, 1
# jmp End
#
#Next:
# cmp %esi, [%edx+4]
# setne %al
# jmp End
#
#Done:
# xor %eax, %eax
#
#End:
# pop %esi
# emms
# ret
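For context, a hedged usage sketch of the contract the comments above describe; memcmp_mmx is the deleted function above, while the wrapper name and the typedef are illustrative, not from the source:

    typedef unsigned char u8; // stand-in for PCSX2's u8 typedef

    extern "C" u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);

    // cmpsize must be a multiple of 8; memcmp_mmx returns 0 when equal.
    bool blocks_equal_64(const void* a, const void* b)
    {
        return memcmp_mmx(a, b, 64) == 0;
    }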
#ifdef ZEROGS_SSE2
// SSE2 extensions
#define punpck(op, sd0, sd2, s1, s3, d1, d3) \
@ -804,199 +629,4 @@ SwizzleBlock4u_sse2_1:
ret 4
.align 16
s_clut16mask:
.long 0xffff0000
.long 0xffff0000
.long 0xffff0000
.long 0xffff0000
.align 16
s_clut16mask2:
.long 0x0000ffff
.long 0x0000ffff
.long 0x0000ffff
.long 0x0000ffff
.globl WriteCLUT_T16_I4_CSM1_sse2
.type WriteCLUT_T16_I4_CSM1_sse2, @function
WriteCLUT_T16_I4_CSM1_sse2:
movdqa %xmm0, xmmword ptr [%ecx]
movdqa %xmm1, xmmword ptr [%ecx+16]
movdqa %xmm2, xmmword ptr [%ecx+32]
movdqa %xmm3, xmmword ptr [%ecx+48]
// rearrange
pshuflw %xmm0, %xmm0, 0x88
pshufhw %xmm0, %xmm0, 0x88
pshuflw %xmm1, %xmm1, 0x88
pshufhw %xmm1, %xmm1, 0x88
pshuflw %xmm2, %xmm2, 0x88
pshufhw %xmm2, %xmm2, 0x88
pshuflw %xmm3, %xmm3, 0x88
pshufhw %xmm3, %xmm3, 0x88
shufps %xmm0, %xmm1, 0x88
shufps %xmm2, %xmm3, 0x88
pshufd %xmm0, %xmm0, 0xd8
pshufd %xmm2, %xmm2, 0xd8
pxor %xmm6, %xmm6
test %edx, 15
jnz WriteUnaligned
movdqa %xmm7, [s_clut16mask] // saves upper 16 bits
// have to save interlaced with the old data
movdqa %xmm4, [%edx]
movdqa %xmm5, [%edx+32]
movhlps %xmm1, %xmm0
movlhps %xmm0, %xmm2 // lower 8 colors
pand %xmm4, %xmm7
pand %xmm5, %xmm7
shufps %xmm1, %xmm2, 0xe4 // upper 8 colors
movdqa %xmm2, %xmm0
movdqa %xmm3, %xmm1
punpcklwd %xmm0, %xmm6
punpcklwd %xmm1, %xmm6
por %xmm0, %xmm4
por %xmm1, %xmm5
punpckhwd %xmm2, %xmm6
punpckhwd %xmm3, %xmm6
movdqa [%edx], %xmm0
movdqa [%edx+32], %xmm1
movdqa %xmm5, %xmm7
pand %xmm7, [%edx+16]
pand %xmm5, [%edx+48]
por %xmm2, %xmm7
por %xmm3, %xmm5
movdqa [%edx+16], %xmm2
movdqa [%edx+48], %xmm3
jmp WriteCLUT_T16_I4_CSM1_End
WriteUnaligned:
// %edx is offset by 2
sub %edx, 2
movdqa %xmm7, [s_clut16mask2] // saves lower 16 bits
// have to save interlaced with the old data
movdqa %xmm4, [%edx]
movdqa %xmm5, [%edx+32]
movhlps %xmm1, %xmm0
movlhps %xmm0, %xmm2 // lower 8 colors
pand %xmm4, %xmm7
pand %xmm5, %xmm7
shufps %xmm1, %xmm2, 0xe4 // upper 8 colors
movdqa %xmm2, %xmm0
movdqa %xmm3, %xmm1
punpcklwd %xmm0, %xmm6
punpcklwd %xmm1, %xmm6
pslld %xmm0, 16
pslld %xmm1, 16
por %xmm0, %xmm4
por %xmm1, %xmm5
punpckhwd %xmm2, %xmm6
punpckhwd %xmm3, %xmm6
pslld %xmm2, 16
pslld %xmm3, 16
movdqa [%edx], %xmm0
movdqa [%edx+32], %xmm1
movdqa %xmm5, %xmm7
pand %xmm7, [%edx+16]
pand %xmm5, [%edx+48]
por %xmm2, %xmm7
por %xmm3, %xmm5
movdqa [%edx+16], %xmm2
movdqa [%edx+48], %xmm3
WriteCLUT_T16_I4_CSM1_End:
ret
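The pshuflw/pshufhw 0x88 pairs followed by shufps 0x88 in the function above compact the even-indexed 16-bit lanes of two registers into one. An intrinsics sketch of that step in isolation (the helper name is illustrative, not from the source):

    #include <emmintrin.h>

    // Compact the even-indexed u16 lanes of a and b into one vector:
    // result = { a0,a2,a4,a6, b0,b2,b4,b6 }
    static __m128i gather_even_words(__m128i a, __m128i b)
    {
        a = _mm_shufflelo_epi16(a, 0x88); // low half  -> a0,a2,a0,a2
        a = _mm_shufflehi_epi16(a, 0x88); // high half -> a4,a6,a4,a6
        b = _mm_shufflelo_epi16(b, 0x88);
        b = _mm_shufflehi_epi16(b, 0x88);
        // shufps 0x88 keeps dwords 0 and 2 of each source register
        return _mm_castps_si128(_mm_shuffle_ps(
            _mm_castsi128_ps(a), _mm_castsi128_ps(b), 0x88));
    }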
#.globl WriteCLUT_T32_I8_CSM1_sse2
# .type WriteCLUT_T32_I8_CSM1_sse2, @function
#WriteCLUT_T32_I8_CSM1_sse2:
# push %ebx
# xor %ebx, %ebx
#.L231:
# xor %eax, %eax
# .align 16
#.L232:
# movdqa %xmm3, XMMWORD PTR [%eax+16+%ecx]
# movdqa %xmm4, XMMWORD PTR [%eax+48+%ecx]
# movdqa %xmm1, XMMWORD PTR [%eax+%ecx]
# movdqa %xmm2, XMMWORD PTR [%eax+32+%ecx]
# movdqa %xmm0, %xmm1
# punpckhqdq %xmm1, %xmm3
# punpcklqdq %xmm0, %xmm3
# movdqa XMMWORD PTR [%edx+32+%eax*2], %xmm1
# movdqa XMMWORD PTR [%edx+%eax*2], %xmm0
# movdqa %xmm0, %xmm2
# punpckhqdq %xmm2, %xmm4
# punpcklqdq %xmm0, %xmm4
# movdqa XMMWORD PTR [%edx+48+%eax*2], %xmm2
# movdqa XMMWORD PTR [%edx+16+%eax*2], %xmm0
# movdqa %xmm1, XMMWORD PTR [%eax+256+%ecx]
# movdqa %xmm3, XMMWORD PTR [%eax+272+%ecx]
# movdqa %xmm2, XMMWORD PTR [%eax+288+%ecx]
# movdqa %xmm4, XMMWORD PTR [%eax+304+%ecx]
# movdqa %xmm0, %xmm1
# punpckhqdq %xmm1, %xmm3
# punpcklqdq %xmm0, %xmm3
# movdqa XMMWORD PTR [%edx+96+%eax*2], %xmm1
# movdqa XMMWORD PTR [%edx+64+%eax*2], %xmm0
# movdqa %xmm0, %xmm2
# punpckhqdq %xmm2, %xmm4
# punpcklqdq %xmm0, %xmm4
# movdqa XMMWORD PTR [%edx+112+%eax*2], %xmm2
# movdqa XMMWORD PTR [%edx+80+%eax*2], %xmm0
# add %eax, 64
# cmp %eax, 256
# jne .L232
# add %edx, 512
# add %ecx, 512
# add %ebx, 512
# cmp %ebx, 1024
# jne .L231
# pop %ebx
# ret
#.globl WriteCLUT_T32_I4_CSM1_sse2
# .type WriteCLUT_T32_I4_CSM1_sse2, @function
#WriteCLUT_T32_I4_CSM1_sse2:
# movdqa %xmm1, XMMWORD PTR [%ecx]
# movdqa %xmm3, XMMWORD PTR [%ecx+16]
# movdqa %xmm2, XMMWORD PTR [%ecx+32]
# movdqa %xmm4, XMMWORD PTR [%ecx+48]
# movdqa %xmm0, %xmm1
# punpckhqdq %xmm1, %xmm3
# punpcklqdq %xmm0, %xmm3
# movdqa XMMWORD PTR [%edx+32], %xmm1
# movdqa XMMWORD PTR [%edx], %xmm0
# movdqa %xmm0, %xmm2
# punpckhqdq %xmm2, %xmm4
# punpcklqdq %xmm0, %xmm4
# movdqa XMMWORD PTR [%edx+48], %xmm2
# movdqa XMMWORD PTR [%edx+16], %xmm0
# ret
#endif


@ -309,7 +309,6 @@ extern "C" void __fastcall WriteCLUT_T32_I4_CSM1_sse2(u32* vm, u32* clut)
_mm_store_si128(&dst[3], _mm_unpackhi_epi64(r2, r3));
}
#if defined(_MSC_VER)
extern "C" {
PCSX2_ALIGNED16(int s_clut16mask2[4]) = { 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff };
@ -319,6 +318,7 @@ PCSX2_ALIGNED16(int s_clut16mask[8]) = { 0xffff0000, 0xffff0000, 0xffff0000, 0xf
extern "C" void __fastcall WriteCLUT_T16_I4_CSM1_sse2(u32* vm, u32* clut)
{
#if defined(_MSC_VER)
__asm {
mov eax, vm
mov ecx, clut
@ -430,9 +430,120 @@ WriteUnaligned:
movdqa [ecx+48], xmm3
End:
}
}
#endif // _MSC_VER
#else
__asm__(".intel_syntax noprefix\n"
"movdqa xmm0, xmmword ptr [ecx]\n"
"movdqa xmm1, xmmword ptr [ecx+16]\n"
"movdqa xmm2, xmmword ptr [ecx+32]\n"
"movdqa xmm3, xmmword ptr [ecx+48]\n"
// rearrange
"pshuflw xmm0, xmm0, 0x88\n"
"pshufhw xmm0, xmm0, 0x88\n"
"pshuflw xmm1, xmm1, 0x88\n"
"pshufhw xmm1, xmm1, 0x88\n"
"pshuflw xmm2, xmm2, 0x88\n"
"pshufhw xmm2, xmm2, 0x88\n"
"pshuflw xmm3, xmm3, 0x88\n"
"pshufhw xmm3, xmm3, 0x88\n"
"shufps xmm0, xmm1, 0x88\n"
"shufps xmm2, xmm3, 0x88\n"
"pshufd xmm0, xmm0, 0xd8\n"
"pshufd xmm2, xmm2, 0xd8\n"
"pxor xmm6, xmm6\n"
"test edx, 15\n"
"jnz WriteUnaligned\n"
"movdqa xmm7, [%[s_clut16mask]]\n" // saves upper 16 bits
// have to save interlaced with the old data
"movdqa xmm4, [edx]\n"
"movdqa xmm5, [edx+32]\n"
"movhlps xmm1, xmm0\n"
"movlhps xmm0, xmm2\n"// lower 8 colors
"pand xmm4, xmm7\n"
"pand xmm5, xmm7\n"
"shufps xmm1, xmm2, 0xe4\n" // upper 8 colors
"movdqa xmm2, xmm0\n"
"movdqa xmm3, xmm1\n"
"punpcklwd xmm0, xmm6\n"
"punpcklwd xmm1, xmm6\n"
"por xmm0, xmm4\n"
"por xmm1, xmm5\n"
"punpckhwd xmm2, xmm6\n"
"punpckhwd xmm3, xmm6\n"
"movdqa [edx], xmm0\n"
"movdqa [edx+32], xmm1\n"
"movdqa xmm5, xmm7\n"
"pand xmm7, [edx+16]\n"
"pand xmm5, [edx+48]\n"
"por xmm2, xmm7\n"
"por xmm3, xmm5\n"
"movdqa [edx+16], xmm2\n"
"movdqa [edx+48], xmm3\n"
"jmp WriteCLUT_T16_I4_CSM1_End\n"
"WriteUnaligned:\n"
// edx is offset by 2
"sub edx, 2\n"
"movdqa xmm7, [%[s_clut16mask2]]\n" // saves lower 16 bits
// have to save interlaced with the old data
"movdqa xmm4, [edx]\n"
"movdqa xmm5, [edx+32]\n"
"movhlps xmm1, xmm0\n"
"movlhps xmm0, xmm2\n" // lower 8 colors
"pand xmm4, xmm7\n"
"pand xmm5, xmm7\n"
"shufps xmm1, xmm2, 0xe4\n" // upper 8 colors
"movdqa xmm2, xmm0\n"
"movdqa xmm3, xmm1\n"
"punpcklwd xmm0, xmm6\n"
"punpcklwd xmm1, xmm6\n"
"pslld xmm0, 16\n"
"pslld xmm1, 16\n"
"por xmm0, xmm4\n"
"por xmm1, xmm5\n"
"punpckhwd xmm2, xmm6\n"
"punpckhwd xmm3, xmm6\n"
"pslld xmm2, 16\n"
"pslld xmm3, 16\n"
"movdqa [edx], xmm0\n"
"movdqa [edx+32], xmm1\n"
"movdqa xmm5, xmm7\n"
"pand xmm7, [edx+16]\n"
"pand xmm5, [edx+48]\n"
"por xmm2, xmm7\n"
"por xmm3, xmm5\n"
"movdqa [edx+16], xmm2\n"
"movdqa [edx+48], xmm3\n"
"WriteCLUT_T16_I4_CSM1_End:\n"
".att_syntax\n"
: [s_clut16mask]"=m"(s_clut16mask), [s_clut16mask2]"=m"(s_clut16mask2)
);
#endif // _MSC_VER
}
#endif // ZEROGS_SSE2
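One pitfall with the GCC branch above: a `.intel_syntax` directive inside the template does not change how GCC substitutes `%[...]` operands, which are still printed in AT&T form. That coincides with Intel spelling for plain global symbols like the masks here, but breaks for stack locals or PIC builds. A minimal sketch of the safer pattern, staying in AT&T syntax with a read-only memory input and an explicit clobber (names are illustrative, not from the source):

    #include <emmintrin.h>

    void load_mask_sketch()
    {
        const __m128i mask = _mm_set1_epi32(0xffff0000);
        __asm__ volatile (
            "movdqa %[m], %%xmm7\n" // AT&T form: operand substitution just works
            :                       // no outputs
            : [m] "m" (mask)        // read-only input; GCC picks the addressing mode
            : "xmm7");              // declare the register we overwrite
    }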
void __fastcall WriteCLUT_T16_I8_CSM1_c(u32* _vm, u32* _clut)