From 9ea97041fb553dde3d5b70fdac57d57cb1f3a9ad Mon Sep 17 00:00:00 2001 From: arcum42 Date: Thu, 1 Apr 2010 22:52:59 +0000 Subject: [PATCH] zzogl-pg: Move WriteCLUT_T32_I4_CSM1_sse2 to inline assembly. git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2803 96395faa-99c1-11dd-bbfe-3dabce05a288 --- plugins/zzogl-pg/opengl/x86-32.S | 370 ------------------------------- plugins/zzogl-pg/opengl/x86.cpp | 117 +++++++++- 2 files changed, 114 insertions(+), 373 deletions(-) diff --git a/plugins/zzogl-pg/opengl/x86-32.S b/plugins/zzogl-pg/opengl/x86-32.S index cb57d36c9a..2d49c2de2d 100644 --- a/plugins/zzogl-pg/opengl/x86-32.S +++ b/plugins/zzogl-pg/opengl/x86-32.S @@ -18,181 +18,6 @@ # .intel_syntax -## mmx memcpy implementation, size has to be a multiple of 8 -## returns 0 is equal, nonzero value if not equal -## ~10 times faster than standard memcmp -## (zerofrog) -#u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize) -#.globl memcmp_mmx -# .type memcmp_mmx, @function -#memcmp_mmx: -# push %esi -# mov %ecx, dword ptr [%esp+16] -# mov %edx, dword ptr [%esp+8] -# mov %esi, dword ptr [%esp+12] -# -# cmp %ecx, 32 -# jl Done4 -# -# // custom test first 8 to make sure things are ok -# movq %mm0, [%esi] -# movq %mm1, [%esi+8] -# pcmpeqd %mm0, [%edx] -# pcmpeqd %mm1, [%edx+8] -# pand %mm0, %mm1 -# movq %mm2, [%esi+16] -# pmovmskb %eax, %mm0 -# movq %mm3, [%esi+24] -# -# // check if eq -# cmp %eax, 0xff -# je NextComp -# mov %eax, 1 -# jmp End -# -#NextComp: -# pcmpeqd %mm2, [%edx+16] -# pcmpeqd %mm3, [%edx+24] -# pand %mm2, %mm3 -# pmovmskb %eax, %mm2 -# -# sub %ecx, 32 -# add %esi, 32 -# add %edx, 32 -# -# // check if eq -# cmp %eax, 0xff -# je ContinueTest -# mov %eax, 1 -# jmp End -# -# cmp %ecx, 64 -# jl Done8 -# -#Cmp8: -# movq %mm0, [%esi] -# movq %mm1, [%esi+8] -# movq %mm2, [%esi+16] -# movq %mm3, [%esi+24] -# movq %mm4, [%esi+32] -# movq %mm5, [%esi+40] -# movq %mm6, [%esi+48] -# movq %mm7, [%esi+56] -# pcmpeqd %mm0, [%edx] -# pcmpeqd %mm1, [%edx+8] -# pcmpeqd %mm2, [%edx+16] -# pcmpeqd %mm3, [%edx+24] -# pand %mm0, %mm1 -# pcmpeqd %mm4, [%edx+32] -# pand %mm0, %mm2 -# pcmpeqd %mm5, [%edx+40] -# pand %mm0, %mm3 -# pcmpeqd %mm6, [%edx+48] -# pand %mm0, %mm4 -# pcmpeqd %mm7, [%edx+56] -# pand %mm0, %mm5 -# pand %mm0, %mm6 -# pand %mm0, %mm7 -# pmovmskb %eax, %mm0 -# -# // check if eq -# cmp %eax, 0xff -# je Continue -# mov %eax, 1 -# jmp End -# -#Continue: -# sub %ecx, 64 -# add %esi, 64 -# add %edx, 64 -#ContinueTest: -# cmp %ecx, 64 -# jge Cmp8 -# -#Done8: -# test %ecx, 0x20 -# jz Done4 -# movq %mm0, [%esi] -# movq %mm1, [%esi+8] -# movq %mm2, [%esi+16] -# movq %mm3, [%esi+24] -# pcmpeqd %mm0, [%edx] -# pcmpeqd %mm1, [%edx+8] -# pcmpeqd %mm2, [%edx+16] -# pcmpeqd %mm3, [%edx+24] -# pand %mm0, %mm1 -# pand %mm0, %mm2 -# pand %mm0, %mm3 -# pmovmskb %eax, %mm0 -# sub %ecx, 32 -# add %esi, 32 -# add %edx, 32 -# -# // check if eq -# cmp %eax, 0xff -# je Done4 -# mov %eax, 1 -# jmp End -# -#Done4: -# cmp %ecx, 24 -# jne Done2 -# movq %mm0, [%esi] -# movq %mm1, [%esi+8] -# movq %mm2, [%esi+16] -# pcmpeqd %mm0, [%edx] -# pcmpeqd %mm1, [%edx+8] -# pcmpeqd %mm2, [%edx+16] -# pand %mm0, %mm1 -# pand %mm0, %mm2 -# pmovmskb %eax, %mm0 -# -# // check if eq -# cmp %eax, 0xff -# setne %al -# jmp End -# -#Done2: -# cmp %ecx, 16 -# jne Done1 -# -# movq %mm0, [%esi] -# movq %mm1, [%esi+8] -# pcmpeqd %mm0, [%edx] -# pcmpeqd %mm1, [%edx+8] -# pand %mm0, %mm1 -# pmovmskb %eax, %mm0 -# -# // check if eq -# cmp %eax, 0xff -# setne %al -# jmp End -# -#Done1: -# cmp %ecx, 8 -# jne Done -# -# mov %eax, [%esi] 
-# mov %esi, [%esi+4] -# cmp %eax, [%edx] -# je Next -# mov %eax, 1 -# jmp End -# -#Next: -# cmp %esi, [%edx+4] -# setne %al -# jmp End -# -#Done: -# xor %eax, %eax -# -#End: -# pop %esi -# emms -# ret - - #ifdef ZEROGS_SSE2 // SSE2 extensions #define punpck(op, sd0, sd2, s1, s3, d1, d3) \ @@ -803,200 +628,5 @@ SwizzleBlock4u_sse2_1: pop %ebx ret 4 - - - .align 16 -s_clut16mask: - .long 0xffff0000 - .long 0xffff0000 - .long 0xffff0000 - .long 0xffff0000 - - .align 16 -s_clut16mask2: - .long 0x0000ffff - .long 0x0000ffff - .long 0x0000ffff - .long 0x0000ffff - -.globl WriteCLUT_T16_I4_CSM1_sse2 - .type WriteCLUT_T16_I4_CSM1_sse2, @function -WriteCLUT_T16_I4_CSM1_sse2: - movdqa %xmm0, xmmword ptr [%ecx] - movdqa %xmm1, xmmword ptr [%ecx+16] - movdqa %xmm2, xmmword ptr [%ecx+32] - movdqa %xmm3, xmmword ptr [%ecx+48] - - // rearrange - pshuflw %xmm0, %xmm0, 0x88 - pshufhw %xmm0, %xmm0, 0x88 - pshuflw %xmm1, %xmm1, 0x88 - pshufhw %xmm1, %xmm1, 0x88 - pshuflw %xmm2, %xmm2, 0x88 - pshufhw %xmm2, %xmm2, 0x88 - pshuflw %xmm3, %xmm3, 0x88 - pshufhw %xmm3, %xmm3, 0x88 - - shufps %xmm0, %xmm1, 0x88 - shufps %xmm2, %xmm3, 0x88 - - pshufd %xmm0, %xmm0, 0xd8 - pshufd %xmm2, %xmm2, 0xd8 - - pxor %xmm6, %xmm6 - - test %edx, 15 - jnz WriteUnaligned - - movdqa %xmm7, [s_clut16mask] // saves upper 16 bits - - // have to save interlaced with the old data - movdqa %xmm4, [%edx] - movdqa %xmm5, [%edx+32] - movhlps %xmm1, %xmm0 - movlhps %xmm0, %xmm2 // lower 8 colors - - pand %xmm4, %xmm7 - pand %xmm5, %xmm7 - - shufps %xmm1, %xmm2, 0xe4 // upper 8 colors - movdqa %xmm2, %xmm0 - movdqa %xmm3, %xmm1 - - punpcklwd %xmm0, %xmm6 - punpcklwd %xmm1, %xmm6 - por %xmm0, %xmm4 - por %xmm1, %xmm5 - - punpckhwd %xmm2, %xmm6 - punpckhwd %xmm3, %xmm6 - - movdqa [%edx], %xmm0 - movdqa [%edx+32], %xmm1 - - movdqa %xmm5, %xmm7 - pand %xmm7, [%edx+16] - pand %xmm5, [%edx+48] - - por %xmm2, %xmm7 - por %xmm3, %xmm5 - - movdqa [%edx+16], %xmm2 - movdqa [%edx+48], %xmm3 - jmp WriteCLUT_T16_I4_CSM1_End - -WriteUnaligned: - // %edx is offset by 2 - sub %edx, 2 - - movdqa %xmm7, [s_clut16mask2] // saves lower 16 bits - - // have to save interlaced with the old data - movdqa %xmm4, [%edx] - movdqa %xmm5, [%edx+32] - movhlps %xmm1, %xmm0 - movlhps %xmm0, %xmm2 // lower 8 colors - - pand %xmm4, %xmm7 - pand %xmm5, %xmm7 - - shufps %xmm1, %xmm2, 0xe4 // upper 8 colors - movdqa %xmm2, %xmm0 - movdqa %xmm3, %xmm1 - - punpcklwd %xmm0, %xmm6 - punpcklwd %xmm1, %xmm6 - pslld %xmm0, 16 - pslld %xmm1, 16 - por %xmm0, %xmm4 - por %xmm1, %xmm5 - - punpckhwd %xmm2, %xmm6 - punpckhwd %xmm3, %xmm6 - pslld %xmm2, 16 - pslld %xmm3, 16 - - movdqa [%edx], %xmm0 - movdqa [%edx+32], %xmm1 - - movdqa %xmm5, %xmm7 - pand %xmm7, [%edx+16] - pand %xmm5, [%edx+48] - - por %xmm2, %xmm7 - por %xmm3, %xmm5 - - movdqa [%edx+16], %xmm2 - movdqa [%edx+48], %xmm3 -WriteCLUT_T16_I4_CSM1_End: - ret - - -#.globl WriteCLUT_T32_I8_CSM1_sse2 -# .type WriteCLUT_T32_I8_CSM1_sse2, @function -#WriteCLUT_T32_I8_CSM1_sse2: -# push %ebx -# xor %ebx, %ebx -#.L231: -# xor %eax, %eax -# .align 16 -#.L232: -# movdqa %xmm3, XMMWORD PTR [%eax+16+%ecx] -# movdqa %xmm4, XMMWORD PTR [%eax+48+%ecx] -# movdqa %xmm1, XMMWORD PTR [%eax+%ecx] -# movdqa %xmm2, XMMWORD PTR [%eax+32+%ecx] -# movdqa %xmm0, %xmm1 -# punpckhqdq %xmm1, %xmm3 -# punpcklqdq %xmm0, %xmm3 -# movdqa XMMWORD PTR [%edx+32+%eax*2], %xmm1 -# movdqa XMMWORD PTR [%edx+%eax*2], %xmm0 -# movdqa %xmm0, %xmm2 -# punpckhqdq %xmm2, %xmm4 -# punpcklqdq %xmm0, %xmm4 -# movdqa XMMWORD PTR [%edx+48+%eax*2], %xmm2 -# movdqa XMMWORD PTR 
[%edx+16+%eax*2], %xmm0 -# movdqa %xmm1, XMMWORD PTR [%eax+256+%ecx] -# movdqa %xmm3, XMMWORD PTR [%eax+272+%ecx] -# movdqa %xmm2, XMMWORD PTR [%eax+288+%ecx] -# movdqa %xmm4, XMMWORD PTR [%eax+304+%ecx] -# movdqa %xmm0, %xmm1 -# punpckhqdq %xmm1, %xmm3 -# punpcklqdq %xmm0, %xmm3 -# movdqa XMMWORD PTR [%edx+96+%eax*2], %xmm1 -# movdqa XMMWORD PTR [%edx+64+%eax*2], %xmm0 -# movdqa %xmm0, %xmm2 -# punpckhqdq %xmm2, %xmm4 -# punpcklqdq %xmm0, %xmm4 -# movdqa XMMWORD PTR [%edx+112+%eax*2], %xmm2 -# movdqa XMMWORD PTR [%edx+80+%eax*2], %xmm0 -# add %eax, 64 -# cmp %eax, 256 -# jne .L232 -# add %edx, 512 -# add %ecx, 512 -# add %ebx, 512 -# cmp %ebx, 1024 -# jne .L231 -# pop %ebx -# ret - -#.globl WriteCLUT_T32_I4_CSM1_sse2 -# .type WriteCLUT_T32_I4_CSM1_sse2, @function -#WriteCLUT_T32_I4_CSM1_sse2: -# movdqa %xmm1, XMMWORD PTR [%ecx] -# movdqa %xmm3, XMMWORD PTR [%ecx+16] -# movdqa %xmm2, XMMWORD PTR [%ecx+32] -# movdqa %xmm4, XMMWORD PTR [%ecx+48] -# movdqa %xmm0, %xmm1 -# punpckhqdq %xmm1, %xmm3 -# punpcklqdq %xmm0, %xmm3 -# movdqa XMMWORD PTR [%edx+32], %xmm1 -# movdqa XMMWORD PTR [%edx], %xmm0 -# movdqa %xmm0, %xmm2 -# punpckhqdq %xmm2, %xmm4 -# punpcklqdq %xmm0, %xmm4 -# movdqa XMMWORD PTR [%edx+48], %xmm2 -# movdqa XMMWORD PTR [%edx+16], %xmm0 -# ret #endif diff --git a/plugins/zzogl-pg/opengl/x86.cpp b/plugins/zzogl-pg/opengl/x86.cpp index 330163deb3..dc263de1e9 100644 --- a/plugins/zzogl-pg/opengl/x86.cpp +++ b/plugins/zzogl-pg/opengl/x86.cpp @@ -309,7 +309,6 @@ extern "C" void __fastcall WriteCLUT_T32_I4_CSM1_sse2(u32* vm, u32* clut) _mm_store_si128(&dst[3], _mm_unpackhi_epi64(r2, r3)); } -#if defined(_MSC_VER) extern "C" { PCSX2_ALIGNED16(int s_clut16mask2[4]) = { 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff }; @@ -319,6 +318,7 @@ PCSX2_ALIGNED16(int s_clut16mask[8]) = { 0xffff0000, 0xffff0000, 0xffff0000, 0xf extern "C" void __fastcall WriteCLUT_T16_I4_CSM1_sse2(u32* vm, u32* clut) { +#if defined(_MSC_VER) __asm { mov eax, vm mov ecx, clut @@ -430,9 +430,120 @@ WriteUnaligned: movdqa [ecx+48], xmm3 End: } -} -#endif // _MSC_VER +#else + __asm__(".intel_syntax noprefix\n" + "movdqa xmm0, xmmword ptr [ecx]\n" + "movdqa xmm1, xmmword ptr [ecx+16]\n" + "movdqa xmm2, xmmword ptr [ecx+32]\n" + "movdqa xmm3, xmmword ptr [ecx+48]\n" + // rearrange + "pshuflw xmm0, xmm0, 0x88\n" + "pshufhw xmm0, xmm0, 0x88\n" + "pshuflw xmm1, xmm1, 0x88\n" + "pshufhw xmm1, xmm1, 0x88\n" + "pshuflw xmm2, xmm2, 0x88\n" + "pshufhw xmm2, xmm2, 0x88\n" + "pshuflw xmm3, xmm3, 0x88\n" + "pshufhw xmm3, xmm3, 0x88\n" + + "shufps xmm0, xmm1, 0x88\n" + "shufps xmm2, xmm3, 0x88\n" + + "pshufd xmm0, xmm0, 0xd8\n" + "pshufd xmm2, xmm2, 0xd8\n" + + "pxor xmm6, xmm6\n" + + "test edx, 15\n" + "jnz WriteUnaligned\n" + + "movdqa xmm7, [%[s_clut16mask]]\n" // saves upper 16 bits + + // have to save interlaced with the old data + "movdqa xmm4, [edx]\n" + "movdqa xmm5, [edx+32]\n" + "movhlps xmm1, xmm0\n" + "movlhps xmm0, xmm2\n"// lower 8 colors + + "pand xmm4, xmm7\n" + "pand xmm5, xmm7\n" + + "shufps xmm1, xmm2, 0xe4\n" // upper 8 colors + "movdqa xmm2, xmm0\n" + "movdqa xmm3, xmm1\n" + + "punpcklwd xmm0, xmm6\n" + "punpcklwd xmm1, xmm6\n" + "por xmm0, xmm4\n" + "por xmm1, xmm5\n" + + "punpckhwd xmm2, xmm6\n" + "punpckhwd xmm3, xmm6\n" + + "movdqa [edx], xmm0\n" + "movdqa [edx+32], xmm1\n" + + "movdqa xmm5, xmm7\n" + "pand xmm7, [edx+16]\n" + "pand xmm5, [edx+48]\n" + + "por xmm2, xmm7\n" + "por xmm3, xmm5\n" + + "movdqa [edx+16], xmm2\n" + "movdqa [edx+48], xmm3\n" + "jmp WriteCLUT_T16_I4_CSM1_End\n" + +"WriteUnaligned:\n" + 
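+	// Unaligned path: the CLUT pointer sits two bytes into a 32-bit word,
+	// so the new 16-bit entries belong in the upper halves of each dword;
+	// they are shifted left by 16 (pslld) and merged with the lower halves
+	// of the old data, which s_clut16mask2 preserves - the mirror image of
+	// the aligned path above.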
	// %edx is offset by 2
+	"sub edx, 2\n"
+
+	"movdqa xmm7, [%[s_clut16mask2]]\n" // saves lower 16 bits
+
+	// have to save interlaced with the old data
+	"movdqa xmm4, [edx]\n"
+	"movdqa xmm5, [edx+32]\n"
+	"movhlps xmm1, xmm0\n"
+	"movlhps xmm0, xmm2\n" // lower 8 colors
+
+	"pand xmm4, xmm7\n"
+	"pand xmm5, xmm7\n"
+
+	"shufps xmm1, xmm2, 0xe4\n" // upper 8 colors
+	"movdqa xmm2, xmm0\n"
+	"movdqa xmm3, xmm1\n"
+
+	"punpcklwd xmm0, xmm6\n"
+	"punpcklwd xmm1, xmm6\n"
+	"pslld xmm0, 16\n"
+	"pslld xmm1, 16\n"
+	"por xmm0, xmm4\n"
+	"por xmm1, xmm5\n"
+
+	"punpckhwd xmm2, xmm6\n"
+	"punpckhwd xmm3, xmm6\n"
+	"pslld xmm2, 16\n"
+	"pslld xmm3, 16\n"
+
+	"movdqa [edx], xmm0\n"
+	"movdqa [edx+32], xmm1\n"
+
+	"movdqa xmm5, xmm7\n"
+	"pand xmm7, [edx+16]\n"
+	"pand xmm5, [edx+48]\n"
+
+	"por xmm2, xmm7\n"
+	"por xmm3, xmm5\n"
+
+	"movdqa [edx+16], xmm2\n"
+	"movdqa [edx+48], xmm3\n"
+"WriteCLUT_T16_I4_CSM1_End:\n"
+	".att_syntax\n"
+	// the masks are read-only inputs, not outputs; edx is modified on the
+	// unaligned path, and the CLUT memory plus all eight xmm registers are
+	// touched, so the constraints must say so:
+	: "+d"(clut)
+	: "c"(vm), [s_clut16mask]"m"(*s_clut16mask), [s_clut16mask2]"m"(*s_clut16mask2)
+	: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+	);
+#endif // _MSC_VER
+}
 #endif // ZEROGS_SSE2
 
 void __fastcall WriteCLUT_T16_I8_CSM1_c(u32* _vm, u32* _clut)
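Note on the SSE2 logic: the "merge with old data" step performed by the aligned branch of WriteCLUT_T16_I4_CSM1_sse2 can equivalently be written with intrinsics, in the same style as the WriteCLUT_T32_I4_CSM1_sse2 function earlier in x86.cpp. The sketch below is illustrative only and is not part of the patch: merge_clut16_aligned is a hypothetical helper name, and it assumes the sixteen 16-bit entries have already been gathered into lo8/hi8 (eight entries each) by the shuffle prologue.

    #include <emmintrin.h>

    typedef unsigned int u32;   // local stand-in for the plugin's u32 typedef

    static void merge_clut16_aligned(__m128i lo8, __m128i hi8, u32* clut)
    {
        __m128i* dst = (__m128i*)clut;                         // 16-byte aligned CLUT
        const __m128i keep = _mm_set1_epi32((int)0xffff0000);  // old upper 16 bits
        const __m128i zero = _mm_setzero_si128();

        // Widen each 16-bit entry into the low half of a 32-bit slot
        // (the punpcklwd/punpckhwd-against-zero steps in the asm version).
        __m128i n0 = _mm_unpacklo_epi16(lo8, zero);   // entries 0-3
        __m128i n1 = _mm_unpackhi_epi16(lo8, zero);   // entries 4-7
        __m128i n2 = _mm_unpacklo_epi16(hi8, zero);   // entries 8-11
        __m128i n3 = _mm_unpackhi_epi16(hi8, zero);   // entries 12-15

        // Keep the upper halves of the existing CLUT words and OR in the
        // new entries (the pand + por pairs in the asm version).
        dst[0] = _mm_or_si128(_mm_and_si128(dst[0], keep), n0);
        dst[1] = _mm_or_si128(_mm_and_si128(dst[1], keep), n1);
        dst[2] = _mm_or_si128(_mm_and_si128(dst[2], keep), n2);
        dst[3] = _mm_or_si128(_mm_and_si128(dst[3], keep), n3);
    }

The unaligned branch is the mirror case: shift the widened entries left by 16 and mask the old words with 0x0000ffff (s_clut16mask2) instead.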