From 4f6cbe476d9bd3770f356edede1f42df13f1d933 Mon Sep 17 00:00:00 2001 From: arcum42 Date: Fri, 19 Mar 2010 01:17:42 +0000 Subject: [PATCH] zzogl-pg: Remove old 64 bit code. git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2742 96395faa-99c1-11dd-bbfe-3dabce05a288 --- plugins/zzogl-pg/opengl/Linux/Makefile.am | 6 - plugins/zzogl-pg/opengl/Makefile.am | 10 - plugins/zzogl-pg/opengl/configure.ac | 28 - plugins/zzogl-pg/opengl/x86-64.S | 906 ----------------- plugins/zzogl-pg/opengl/x86-64.asm | 1091 --------------------- plugins/zzogl-pg/opengl/zerogs.cpp | 63 -- 6 files changed, 2104 deletions(-) delete mode 100644 plugins/zzogl-pg/opengl/x86-64.S delete mode 100644 plugins/zzogl-pg/opengl/x86-64.asm diff --git a/plugins/zzogl-pg/opengl/Linux/Makefile.am b/plugins/zzogl-pg/opengl/Linux/Makefile.am index 8200d1f6d3..7d3af6140e 100644 --- a/plugins/zzogl-pg/opengl/Linux/Makefile.am +++ b/plugins/zzogl-pg/opengl/Linux/Makefile.am @@ -1,10 +1,4 @@ noinst_LIBRARIES = libZeroGSLinux.a -if X86_64 -libZeroGSLinux_a_CPPFLAGS = -fPIC -libZeroGSLinux_a_CXXFLAGS = -fPIC -libZeroGSLinux_a_CFLAGS = -fPIC -endif - INCLUDES = $(shell pkg-config --cflags gtk+-2.0) -I@srcdir@/../ -I@srcdir@/../../../../common/include libZeroGSLinux_a_SOURCES = callbacks.c Conf.cpp interface.c Linux.cpp support.c diff --git a/plugins/zzogl-pg/opengl/Makefile.am b/plugins/zzogl-pg/opengl/Makefile.am index 200328ed7c..07f428ccd3 100644 --- a/plugins/zzogl-pg/opengl/Makefile.am +++ b/plugins/zzogl-pg/opengl/Makefile.am @@ -6,12 +6,6 @@ INCLUDES = -I@srcdir@/../../../common/include -I@srcdir@/common libzzoglpg_a_CPPFLAGS = $(shell pkg-config --cflags gtk+-2.0) libzzoglpg_a_CXXFLAGS = $(shell pkg-config --cflags gtk+-2.0) -if X86_64 -libzzoglpg_a_CPPFLAGS += -fPIC -libzzoglpg_a_CXXFLAGS += -fPIC -CCASFLAGS += -fPIC -endif - # Create a shared object by faking an exe (thanks to ODE makefiles) traplibdir=$(prefix) @@ -34,11 +28,7 @@ GSmain.cpp memcpy_amd.cpp Regs.cpp x86.cpp zpipe.cpp Mem.cpp \ rasterfont.cpp targets.cpp zerogs.cpp ZZoglVB.cpp ZZoglShoots.cpp ZZoglCreate.cpp \ ZZoglShaders.cpp ZZoglCRTC.cpp ZZoglSave.cpp ZZoglFlush.cpp -if X86_64 -libzzoglpg_a_SOURCES += x86-64.S -else libzzoglpg_a_SOURCES += x86-32.S -endif if SSE2 CCASFLAGS+= -DZEROGS_SSE2 diff --git a/plugins/zzogl-pg/opengl/configure.ac b/plugins/zzogl-pg/opengl/configure.ac index 8aaafdcca5..8d77c714f5 100644 --- a/plugins/zzogl-pg/opengl/configure.ac +++ b/plugins/zzogl-pg/opengl/configure.ac @@ -71,33 +71,6 @@ fi AC_MSG_RESULT($sse2) AM_CONDITIONAL(SSE2, test x$sse2 = xyes) -dnl Check for 64bit CPU -AC_MSG_CHECKING(for a x86-64 CPU) -dnl if test "$build_os" == "target_os" -dnl then -AC_TRY_RUN([ -int main() -{ -int a = 0; -int*pa = &a; -asm(".intel_syntax\n" - "mov %%rax, %0\n" - "mov %%eax, [%%rax]\n" - ".att_syntax\n" - : : "r"(pa) : "%rax"); -return 0; -} -],cpu64=yes,cpu64=no,) -dnl else -dnl cpu64=no -dnl fi -if test "x$cpu64" == xyes -then -AC_DEFINE(__x86_64__,1,[__x86_64__]) -fi -AC_MSG_RESULT($cpu64) -AM_CONDITIONAL(X86_64, test x$cpu64 = xyes) - dnl gtk AC_MSG_CHECKING(gtk2+) AC_CHECK_PROG(GTK_CONFIG, pkg-config, pkg-config) @@ -145,7 +118,6 @@ AC_OUTPUT([ echo "Configuration:" echo " Target system type: $target" -echo " x86-64 build? $cpu64" echo " Debug build? $debug" echo " Dev build? $devbuild" echo " SSE2 enabled? 
$sse2" diff --git a/plugins/zzogl-pg/opengl/x86-64.S b/plugins/zzogl-pg/opengl/x86-64.S deleted file mode 100644 index 6f221e7b33..0000000000 --- a/plugins/zzogl-pg/opengl/x86-64.S +++ /dev/null @@ -1,906 +0,0 @@ -## Copyright (C) 2005-2006 zerofrog(@gmail.com) -# -# This Program is free software you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation either ve%rsion 2, or (at your option) -# any later ve%rsion. -# -# This Program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GNU Make see the file COPYING. If not, write to -# the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. -# http://www.gnu.org/copyleft/gpl.html -# -# -.intel_syntax - -## mmx memcpy implementation, size has to be a multiple of 8 -## returns 0 is equal, nonzero value if not equal -## ~10 times faster than standard memcmp -## (zerofrog) -## u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize) -## %rdi - src1 -## %rsi - src2 -## edx - cmpsize -.globl memcmp_mmx - .type memcmp_mmx, @function -memcmp_mmx: - cmp %edx, 32 - jl Done4 - - ## custom test first 8 to make sure things are ok - movq %mm0, [%rsi] - movq %mm1, [%rsi+8] - pcmpeqd %mm0, [%rdi] - pcmpeqd %mm1, [%rdi+8] - pand %mm0, %mm1 - movq %mm2, [%rsi+16] - pmovmskb %eax, %mm0 - movq %mm3, [%rsi+24] - - // check if eq - cmp %eax, 0xff - je NextComp - mov %eax, 1 - jmp End - -NextComp: - pcmpeqd %mm2, [%rdi+16] - pcmpeqd %mm3, [%rdi+24] - pand %mm2, %mm3 - pmovmskb %eax, %mm2 - - sub %edx, 32 - add %rsi, 32 - add %rdi, 32 - - // check if eq - cmp %eax, 0xff - je ContinueTest - mov %eax, 1 - jmp End - - cmp %edx, 64 - jl Done8 - -Cmp8: - movq %mm0, [%rsi] - movq %mm1, [%rsi+8] - movq %mm2, [%rsi+16] - movq %mm3, [%rsi+24] - movq %mm4, [%rsi+32] - movq %mm5, [%rsi+40] - movq %mm6, [%rsi+48] - movq %mm7, [%rsi+56] - pcmpeqd %mm0, [%rdi] - pcmpeqd %mm1, [%rdi+8] - pcmpeqd %mm2, [%rdi+16] - pcmpeqd %mm3, [%rdi+24] - pand %mm0, %mm1 - pcmpeqd %mm4, [%rdi+32] - pand %mm0, %mm2 - pcmpeqd %mm5, [%rdi+40] - pand %mm0, %mm3 - pcmpeqd %mm6, [%rdi+48] - pand %mm0, %mm4 - pcmpeqd %mm7, [%rdi+56] - pand %mm0, %mm5 - pand %mm0, %mm6 - pand %mm0, %mm7 - pmovmskb %eax, %mm0 - - // check if eq - cmp %eax, 0xff - je Continue - mov %eax, 1 - jmp End - -Continue: - sub %edx, 64 - add %rsi, 64 - add %rdi, 64 -ContinueTest: - cmp %edx, 64 - jge Cmp8 - -Done8: - test %edx, 0x20 - jz Done4 - movq %mm0, [%rsi] - movq %mm1, [%rsi+8] - movq %mm2, [%rsi+16] - movq %mm3, [%rsi+24] - pcmpeqd %mm0, [%rdi] - pcmpeqd %mm1, [%rdi+8] - pcmpeqd %mm2, [%rdi+16] - pcmpeqd %mm3, [%rdi+24] - pand %mm0, %mm1 - pand %mm0, %mm2 - pand %mm0, %mm3 - pmovmskb %eax, %mm0 - sub %edx, 32 - add %rsi, 32 - add %rdi, 32 - - // check if eq - cmp %eax, 0xff - je Done4 - mov %eax, 1 - jmp End - -Done4: - cmp %edx, 24 - jne Done2 - movq %mm0, [%rsi] - movq %mm1, [%rsi+8] - movq %mm2, [%rsi+16] - pcmpeqd %mm0, [%rdi] - pcmpeqd %mm1, [%rdi+8] - pcmpeqd %mm2, [%rdi+16] - pand %mm0, %mm1 - pand %mm0, %mm2 - pmovmskb %eax, %mm0 - - // check if eq - cmp %eax, 0xff - je Done - mov %eax, 1 - jmp End - -Done2: - cmp %edx, 16 - jne Done1 - - movq %mm0, [%rsi] - movq %mm1, [%rsi+8] - pcmpeqd %mm0, [%rdi] - pcmpeqd %mm1, [%rdi+8] - pand %mm0, %mm1 - pmovmskb 
%eax, %mm0 - - // check if eq - cmp %eax, 0xff - je Done - mov %eax, 1 - jmp End - -Done1: - cmp %edx, 8 - jne Done - - mov %eax, [%rsi] - mov %rsi, [%rsi+4] - cmp %eax, [%rdi] - je Next - mov %eax, 1 - jmp End - -Next: - cmp %rsi, [%rdi+4] - je Done - mov %eax, 1 - jmp End - -Done: - xor %eax, %eax - -End: - emms - ret - -#ifdef ZEROGS_SSE2 -// SSE2 extensions - -#define punpck(op, sd0, sd2, s1, s3, d1, d3) \ - movdqa %xmm##d1, %xmm##sd0; \ - pshufd %xmm##d3, %xmm##sd2, 0xe4; \ - punpckl##op %xmm##sd0, %xmm##s1; \ - punpckh##op %xmm##d1, %xmm##s1; \ - punpckl##op %xmm##sd2, %xmm##s3; \ - punpckh##op %xmm##d3, %xmm##s3; \ - -#define punpcknbl \ - movdqa %xmm4, %xmm0; \ - pshufd %xmm5, %xmm1, 0xe4; \ - \ - psllq %xmm1, 4; \ - psrlq %xmm4, 4; \ - \ - movdqa %xmm6, %xmm7; \ - pand %xmm0, %xmm7; \ - pandn %xmm6, %xmm1; \ - por %xmm0, %xmm6; \ - \ - movdqa %xmm6, %xmm7; \ - pand %xmm4, %xmm7; \ - pandn %xmm6, %xmm5; \ - por %xmm4, %xmm6; \ - \ - movdqa %xmm1, %xmm4; \ - \ - movdqa %xmm4, %xmm2; \ - pshufd %xmm5, %xmm3, 0xe4; \ - \ - psllq %xmm3, 4; \ - psrlq %xmm4, 4; \ - \ - movdqa %xmm6, %xmm7; \ - pand %xmm2, %xmm7; \ - pandn %xmm6, %xmm3; \ - por %xmm2, %xmm6; \ - \ - movdqa %xmm6, %xmm7; \ - pand %xmm4, %xmm7; \ - pandn %xmm6, %xmm5; \ - por %xmm4, %xmm6; \ - \ - movdqa %xmm3, %xmm4; \ - \ - punpck(bw, 0, 2, 1, 3, 4, 6); \ - -#define punpcknbh \ - movdqa %xmm12, %xmm8; \ - pshufd %xmm13, %xmm9, 0xe4; \ - \ - psllq %xmm9, 4; \ - psrlq %xmm12, 4; \ - \ - movdqa %xmm14, %xmm15; \ - pand %xmm8, %xmm15; \ - pandn %xmm14, %xmm9; \ - por %xmm8, %xmm14; \ - \ - movdqa %xmm14, %xmm15; \ - pand %xmm12, %xmm15; \ - pandn %xmm14, %xmm13; \ - por %xmm12, %xmm14; \ - \ - movdqa %xmm9, %xmm12; \ - \ - movdqa %xmm12, %xmm10; \ - pshufd %xmm13, %xmm11, 0xe4; \ - \ - psllq %xmm11, 4; \ - psrlq %xmm12, 4; \ - \ - movdqa %xmm14, %xmm15; \ - pand %xmm10, %xmm15; \ - pandn %xmm14, %xmm11; \ - por %xmm10, %xmm14; \ - \ - movdqa %xmm14, %xmm15; \ - pand %xmm12, %xmm15; \ - pandn %xmm14, %xmm13; \ - por %xmm12, %xmm14; \ - \ - movdqa %xmm11, %xmm12; \ - \ - punpck(bw, 8, 10, 9, 11, 12, 14); \ - -// -// SwizzleBlock32_sse2 -// - -.globl SwizzleBlock32_sse2 - .type SwizzleBlock32_sse2, @function -SwizzleBlock32_sse2: - - mov %eax, 4 - - cmp %ecx, 0xffffffff - jne SwizzleBlock32_sse2_2 - - .align 16 -SwizzleBlock32_sse2_1: - movdqa %xmm0, [%rsi] - movdqa %xmm4, [%rsi+16] - movdqa %xmm1, [%rsi+%rdx] - movdqa %xmm5, [%rsi+%rdx+16] - - punpck(qdq, 0, 4, 1, 5, 2, 6) - - movdqa [%rdi+16*0], %xmm0 - movdqa [%rdi+16*1], %xmm2 - movdqa [%rdi+16*2], %xmm4 - movdqa [%rdi+16*3], %xmm6 - - lea %rsi, [%rsi+%rdx*2] - add %rdi, 64 - - dec %eax - jnz SwizzleBlock32_sse2_1 - - ret - -SwizzleBlock32_sse2_2: - - movd %xmm7, %rcx - pshufd %xmm7, %xmm7, 0 - - .align 16 -SwizzleBlock32_sse2_3: - movdqa %xmm0, [%rsi] - movdqa %xmm4, [%rsi+16] - movdqa %xmm1, [%rsi+%rdx] - movdqa %xmm5, [%rsi+%rdx+16] - - punpck(qdq, 0, 4, 1, 5, 2, 6) - - movdqa %xmm3, %xmm7 - pshufd %xmm5, %xmm7, 0xe4 - movdqa %xmm9, %xmm7 - pshufd %xmm11, %xmm7, 0xe4 - - pandn %xmm3, [%rdi+16*0] - pand %xmm0, %xmm7 - por %xmm0, %xmm3 - movdqa [%rdi+16*0], %xmm0 - - pandn %xmm5, [%rdi+16*1] - pand %xmm2, %xmm7 - por %xmm2, %xmm5 - movdqa [%rdi+16*1], %xmm2 - - pandn %xmm9, [%rdi+16*2] - pand %xmm4, %xmm7 - por %xmm4, %xmm9 - movdqa [%rdi+16*2], %xmm4 - - pandn %xmm11, [%rdi+16*3] - pand %xmm6, %xmm7 - por %xmm6, %xmm11 - movdqa [%rdi+16*3], %xmm6 - - lea %rsi, [%rsi+%rdx*2] - add %rdi, 64 - - dec %eax - jnz SwizzleBlock32_sse2_3 - - ret - -// -// SwizzleBlock16_sse2 -// - 
-.globl SwizzleBlock16_sse2 - .type SwizzleBlock16_sse2, @function -SwizzleBlock16_sse2: - - mov %eax, 4 - - .align 16 -SwizzleBlock16_sse2_1: - movdqa %xmm0, [%rsi] - movdqa %xmm1, [%rsi+16] - movdqa %xmm2, [%rsi+%rdx] - movdqa %xmm3, [%rsi+%rdx+16] - - punpck(wd, 0, 2, 1, 3, 4, 6) - punpck(qdq, 0, 4, 2, 6, 1, 5) - - movdqa [%rdi+16*0], %xmm0 - movdqa [%rdi+16*1], %xmm1 - movdqa [%rdi+16*2], %xmm4 - movdqa [%rdi+16*3], %xmm5 - - lea %rsi, [%rsi+%rdx*2] - add %rdi, 64 - - dec %eax - jnz SwizzleBlock16_sse2_1 - - ret - -// -// SwizzleBlock8 -// - -.globl SwizzleBlock8_sse2 - .type SwizzleBlock8_sse2, @function -SwizzleBlock8_sse2: - - mov %ecx, 2 - - .align 16 -SwizzleBlock8_sse2_1: - // col 0, 2 - - movdqa %xmm0, [%rsi] - movdqa %xmm2, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - pshufd %xmm1, [%rsi], 0xb1 - pshufd %xmm3, [%rsi+%rdx], 0xb1 - lea %rsi, [%rsi+%rdx*2] - - punpck(bw, 0, 2, 1, 3, 4, 6) - punpck(wd, 0, 2, 4, 6, 1, 3) - punpck(qdq, 0, 1, 2, 3, 4, 5) - - movdqa [%rdi+16*0], %xmm0 - movdqa [%rdi+16*1], %xmm4 - movdqa [%rdi+16*2], %xmm1 - movdqa [%rdi+16*3], %xmm5 - - // col 1, 3 - - pshufd %xmm0, [%rsi], 0xb1 - pshufd %xmm2, [%rsi+%rdx], 0xb1 - lea %rsi, [%rsi+%rdx*2] - - movdqa %xmm1, [%rsi] - movdqa %xmm3, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - punpck(bw, 0, 2, 1, 3, 4, 6) - punpck(wd, 0, 2, 4, 6, 1, 3) - punpck(qdq, 0, 1, 2, 3, 4, 5) - - movdqa [%rdi+16*4], %xmm0 - movdqa [%rdi+16*5], %xmm4 - movdqa [%rdi+16*6], %xmm1 - movdqa [%rdi+16*7], %xmm5 - - add %rdi, 128 - - dec %ecx - jnz SwizzleBlock8_sse2_1 - - ret - -// -// SwizzleBlock4 -// - -.globl SwizzleBlock4_sse2 - .type SwizzleBlock4_sse2, @function -SwizzleBlock4_sse2: - - mov %ecx, 2 - - mov %eax, 0x0f0f0f0f - movd %xmm7, %eax - pshufd %xmm7, %xmm7, 0 - - .align 16 -SwizzleBlock4_sse2_1: - // col 0, 2 - - movdqa %xmm0, [%rsi] - movdqa %xmm2, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - movdqa %xmm1, [%rsi] - movdqa %xmm3, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - pshuflw %xmm1, %xmm1, 0xb1 - pshuflw %xmm3, %xmm3, 0xb1 - pshufhw %xmm1, %xmm1, 0xb1 - pshufhw %xmm3, %xmm3, 0xb1 - - punpcknbl - punpck(bw, 0, 2, 4, 6, 1, 3) - punpck(bw, 0, 2, 1, 3, 4, 6) - punpck(qdq, 0, 4, 2, 6, 1, 3) - - movdqa [%rdi+16*0], %xmm0 - movdqa [%rdi+16*1], %xmm1 - movdqa [%rdi+16*2], %xmm4 - movdqa [%rdi+16*3], %xmm3 - - // col 1, 3 - - movdqa %xmm0, [%rsi] - movdqa %xmm2, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - movdqa %xmm1, [%rsi] - movdqa %xmm3, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - pshuflw %xmm0, %xmm0, 0xb1 - pshuflw %xmm2, %xmm2, 0xb1 - pshufhw %xmm0, %xmm0, 0xb1 - pshufhw %xmm2, %xmm2, 0xb1 - - punpcknbl - punpck(bw, 0, 2, 4, 6, 1, 3) - punpck(bw, 0, 2, 1, 3, 4, 6) - punpck(qdq, 0, 4, 2, 6, 1, 3) - - movdqa [%rdi+16*4], %xmm0 - movdqa [%rdi+16*5], %xmm1 - movdqa [%rdi+16*6], %xmm4 - movdqa [%rdi+16*7], %xmm3 - - add %rdi, 128 - - dec %ecx - jnz SwizzleBlock4_sse2_1 - - ret - -// -// swizzling with unaligned reads -// - -// -// SwizzleBlock32u_sse2 -// - -.globl SwizzleBlock32u_sse2 - .type SwizzleBlock32u_sse2, @function -SwizzleBlock32u_sse2: - - mov %eax, 4 - - cmp %ecx, 0xffffffff - jne SwizzleBlock32u_sse2_2 - - .align 16 -SwizzleBlock32u_sse2_1: - movdqu %xmm0, [%rsi] - movdqu %xmm4, [%rsi+16] - movdqu %xmm1, [%rsi+%rdx] - movdqu %xmm5, [%rsi+%rdx+16] - - punpck(qdq, 0, 4, 1, 5, 2, 6) - - movdqa [%rdi+16*0], %xmm0 - movdqa [%rdi+16*1], %xmm2 - movdqa [%rdi+16*2], %xmm4 - movdqa [%rdi+16*3], %xmm6 - - lea %rsi, [%rsi+%rdx*2] - add %rdi, 64 - - dec %eax - jnz SwizzleBlock32u_sse2_1 - - ret - -SwizzleBlock32u_sse2_2: - - movd %xmm7, 
%rcx - pshufd %xmm7, %xmm7, 0 - - .align 16 -SwizzleBlock32u_sse2_3: - movdqu %xmm0, [%rsi] - movdqu %xmm4, [%rsi+16] - movdqu %xmm1, [%rsi+%rdx] - movdqu %xmm5, [%rsi+%rdx+16] - - punpck(qdq, 0, 4, 1, 5, 2, 6) - - movdqa %xmm3, %xmm7 - pshufd %xmm5, %xmm7, 0xe4 - movdqa %xmm9, %xmm7 - pshufd %xmm11, %xmm7, 0xe4 - - pandn %xmm3, [%rdi+16*0] - pand %xmm0, %xmm7 - por %xmm0, %xmm3 - movdqa [%rdi+16*0], %xmm0 - - pandn %xmm5, [%rdi+16*1] - pand %xmm2, %xmm7 - por %xmm2, %xmm5 - movdqa [%rdi+16*1], %xmm2 - - pandn %xmm9, [%rdi+16*2] - pand %xmm4, %xmm7 - por %xmm4, %xmm9 - movdqa [%rdi+16*2], %xmm4 - - pandn %xmm11, [%rdi+16*3] - pand %xmm6, %xmm7 - por %xmm6, %xmm11 - movdqa [%rdi+16*3], %xmm6 - - lea %rsi, [%rsi+%rdx*2] - add %rdi, 64 - - dec %eax - jnz SwizzleBlock32u_sse2_3 - - ret - -// -// SwizzleBlock16u_sse2 -// - -.globl SwizzleBlock16u_sse2 - .type SwizzleBlock16u_sse2, @function -SwizzleBlock16u_sse2: - mov %eax, 4 - - .align 16 -SwizzleBlock16u_sse2_1: - movdqu %xmm0, [%rsi] - movdqu %xmm1, [%rsi+16] - movdqu %xmm2, [%rsi+%rdx] - movdqu %xmm3, [%rsi+%rdx+16] - - punpck(wd, 0, 2, 1, 3, 4, 6) - punpck(qdq, 0, 4, 2, 6, 1, 5) - - movdqa [%rdi+16*0], %xmm0 - movdqa [%rdi+16*1], %xmm1 - movdqa [%rdi+16*2], %xmm4 - movdqa [%rdi+16*3], %xmm5 - - lea %rsi, [%rsi+%rdx*2] - add %rdi, 64 - - dec %eax - jnz SwizzleBlock16u_sse2_1 - - ret - -// -// SwizzleBlock8u -// - -.globl SwizzleBlock8u_sse2 - .type SwizzleBlock8u_sse2, @function -SwizzleBlock8u_sse2: - mov %ecx, 2 - - .align 16 -SwizzleBlock8u_sse2_1: - // col 0, 2 - - movdqu %xmm0, [%rsi] - movdqu %xmm2, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - pshufd %xmm1, %xmm0, 0xb1 - pshufd %xmm3, %xmm2, 0xb1 - lea %rsi, [%rsi+%rdx*2] - - punpck(bw, 0, 2, 1, 3, 4, 6) - punpck(wd, 0, 2, 4, 6, 1, 3) - punpck(qdq, 0, 1, 2, 3, 4, 5) - - movdqa [%rdi+16*0], %xmm0 - movdqa [%rdi+16*1], %xmm4 - movdqa [%rdi+16*2], %xmm1 - movdqa [%rdi+16*3], %xmm5 - - // col 1, 3 - - movdqu %xmm0, [%rsi] - movdqu %xmm2, [%rsi+%rdx] - pshufd %xmm0, %xmm0, 0xb1 - pshufd %xmm2, %xmm2, 0xb1 - lea %rsi, [%rsi+%rdx*2] - - movdqu %xmm1, [%rsi] - movdqu %xmm3, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - punpck(bw, 0, 2, 1, 3, 4, 6) - punpck(wd, 0, 2, 4, 6, 1, 3) - punpck(qdq, 0, 1, 2, 3, 4, 5) - - movdqa [%rdi+16*4], %xmm0 - movdqa [%rdi+16*5], %xmm4 - movdqa [%rdi+16*6], %xmm1 - movdqa [%rdi+16*7], %xmm5 - - add %rdi, 128 - - dec %ecx - jnz SwizzleBlock8u_sse2_1 - - ret - -// -// SwizzleBlock4u -// - -.globl SwizzleBlock4u_sse2 - .type SwizzleBlock4u_sse2, @function -SwizzleBlock4u_sse2: - - mov %ecx, 2 - - mov %eax, 0xf0f0f0f - movd %xmm7, %eax - pshufd %xmm7, %xmm7, 0 - - .align 16 -SwizzleBlock4u_sse2_1: - // col 0, 2 - - movdqu %xmm0, [%rsi] - movdqu %xmm2, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - movdqu %xmm1, [%rsi] - movdqu %xmm3, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - pshuflw %xmm1, %xmm1, 0xb1 - pshuflw %xmm3, %xmm3, 0xb1 - pshufhw %xmm1, %xmm1, 0xb1 - pshufhw %xmm3, %xmm3, 0xb1 - - punpcknbl - punpck(bw, 0, 2, 4, 6, 1, 3) - punpck(bw, 0, 2, 1, 3, 4, 6) - punpck(qdq, 0, 4, 2, 6, 1, 3) - - movdqa [%rdi+16*0], %xmm0 - movdqa [%rdi+16*1], %xmm1 - movdqa [%rdi+16*2], %xmm4 - movdqa [%rdi+16*3], %xmm3 - - // col 1, 3 - - movdqu %xmm0, [%rsi] - movdqu %xmm2, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - movdqu %xmm1, [%rsi] - movdqu %xmm3, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - pshuflw %xmm0, %xmm0, 0xb1 - pshuflw %xmm2, %xmm2, 0xb1 - pshufhw %xmm0, %xmm0, 0xb1 - pshufhw %xmm2, %xmm2, 0xb1 - - punpcknbl - punpck(bw, 0, 2, 4, 6, 1, 3) - punpck(bw, 0, 2, 1, 3, 4, 6) - 
punpck(qdq, 0, 4, 2, 6, 1, 3) - - movdqa [%rdi+16*4], %xmm0 - movdqa [%rdi+16*5], %xmm1 - movdqa [%rdi+16*6], %xmm4 - movdqa [%rdi+16*7], %xmm3 - - add %rdi, 128 - - dec %ecx - jnz SwizzleBlock4u_sse2_1 - - ret - - - .align 16 -s_clut16mask: - .long 0xffff0000 - .long 0xffff0000 - .long 0xffff0000 - .long 0xffff0000 - - .align 16 -s_clut16mask2: - - .long 0x0000ffff - .long 0x0000ffff - .long 0x0000ffff - .long 0x0000ffff - -.globl WriteCLUT_T16_I4_CSM1_sse2 - .type WriteCLUT_T16_I4_CSM1_sse2, @function -WriteCLUT_T16_I4_CSM1_sse2: - movdqa %xmm0, xmmword ptr [%rdi] - movdqa %xmm1, xmmword ptr [%rdi+16] - movdqa %xmm2, xmmword ptr [%rdi+32] - movdqa %xmm3, xmmword ptr [%rdi+48] - - // rearrange - pshuflw %xmm0, %xmm0, 0x88 - pshufhw %xmm0, %xmm0, 0x88 - pshuflw %xmm1, %xmm1, 0x88 - pshufhw %xmm1, %xmm1, 0x88 - pshuflw %xmm2, %xmm2, 0x88 - pshufhw %xmm2, %xmm2, 0x88 - pshuflw %xmm3, %xmm3, 0x88 - pshufhw %xmm3, %xmm3, 0x88 - - shufps %xmm0, %xmm1, 0x88 - shufps %xmm2, %xmm3, 0x88 - - pshufd %xmm0, %xmm0, 0xd8 - pshufd %xmm2, %xmm2, 0xd8 - - pxor %xmm6, %xmm6 - - test %rsi, 15 - jnz WriteUnaligned - - movdqa %xmm7, [%rip+s_clut16mask] // saves upper 16 bits - - // have to save interlaced with the old data - movdqa %xmm4, [%rsi] - movdqa %xmm5, [%rsi+32] - movhlps %xmm1, %xmm0 - movlhps %xmm0, %xmm2 // lower 8 colors - - pand %xmm4, %xmm7 - pand %xmm5, %xmm7 - - shufps %xmm1, %xmm2, 0xe4 // upper 8 colors - movdqa %xmm2, %xmm0 - movdqa %xmm3, %xmm1 - - punpcklwd %xmm0, %xmm6 - punpcklwd %xmm1, %xmm6 - por %xmm0, %xmm4 - por %xmm1, %xmm5 - - punpckhwd %xmm2, %xmm6 - punpckhwd %xmm3, %xmm6 - - movdqa [%rsi], %xmm0 - movdqa [%rsi+32], %xmm1 - - movdqa %xmm5, %xmm7 - pand %xmm7, [%rsi+16] - pand %xmm5, [%rsi+48] - - por %xmm2, %xmm7 - por %xmm3, %xmm5 - - movdqa [%rsi+16], %xmm2 - movdqa [%rsi+48], %xmm3 - jmp WriteCLUT_T16_I4_CSM1_End - -WriteUnaligned: - // %rsi is offset by 2 - sub %rsi, 2 - - movdqa %xmm7, [%rip+s_clut16mask2] // saves lower 16 bits - - // have to save interlaced with the old data - movdqa %xmm4, [%rsi] - movdqa %xmm5, [%rsi+32] - movhlps %xmm1, %xmm0 - movlhps %xmm0, %xmm2 // lower 8 colors - - pand %xmm4, %xmm7 - pand %xmm5, %xmm7 - - shufps %xmm1, %xmm2, 0xe4 // upper 8 colors - movdqa %xmm2, %xmm0 - movdqa %xmm3, %xmm1 - - punpcklwd %xmm0, %xmm6 - punpcklwd %xmm1, %xmm6 - pslld %xmm0, 16 - pslld %xmm1, 16 - por %xmm0, %xmm4 - por %xmm1, %xmm5 - - punpckhwd %xmm2, %xmm6 - punpckhwd %xmm3, %xmm6 - pslld %xmm2, 16 - pslld %xmm3, 16 - - movdqa [%rsi], %xmm0 - movdqa [%rsi+32], %xmm1 - - movdqa %xmm5, %xmm7 - pand %xmm7, [%rsi+16] - pand %xmm5, [%rsi+48] - - por %xmm2, %xmm7 - por %xmm3, %xmm5 - - movdqa [%rsi+16], %xmm2 - movdqa [%rsi+48], %xmm3 -WriteCLUT_T16_I4_CSM1_End: - ret - -#endif diff --git a/plugins/zzogl-pg/opengl/x86-64.asm b/plugins/zzogl-pg/opengl/x86-64.asm deleted file mode 100644 index 6d476dfc6d..0000000000 --- a/plugins/zzogl-pg/opengl/x86-64.asm +++ /dev/null @@ -1,1091 +0,0 @@ -; Copyright (C) 2003-2005 Gabest/zerofrog -; http:;;www.gabest.org -; -; This Program is free software; you can redistribute it and/or modify -; it under the terms of the GNU General Public License as published by -; the Free Software Foundation; either version 2, or (at your option) -; any later version. -; -; This Program is distributed in the hope that it will be useful, -; but WITHOUT ANY WARRANTY; without even the implied warranty of -; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -; GNU General Public License for more details. 
-; -; You should have received a copy of the GNU General Public License -; along with GNU Make; see the file COPYING. If not, write to -; the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. -; http:;;www.gnu.org/copyleft/gpl.html -; -; - -extern s_clut16mask:ptr - - .code - -; mmx memcpy implementation, size has to be a multiple of 8 -; returns 0 is equal, nonzero value if not equal -; ~10 times faster than standard memcmp -; (zerofrog) -; u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize) -; rcx - src1 -; rdx - src2 -; r8d - cmpsize -memcmp_mmx proc public - cmp r8d, 32 - jl Done4 - - ; custom test first 8 to make sure things are ok - movq mm0, [rdx] - movq mm1, [rdx+8] - pcmpeqd mm0, [rcx] - pcmpeqd mm1, [rcx+8] - pand mm0, mm1 - movq mm2, [rdx+16] - pmovmskb eax, mm0 - movq mm3, [rdx+24] - - ; check if eq - cmp eax, 0ffh - je NextComp - mov eax, 1 - jmp Finish - -NextComp: - pcmpeqd mm2, [rcx+16] - pcmpeqd mm3, [rcx+24] - pand mm2, mm3 - pmovmskb eax, mm2 - - sub r8d, 32 - add rdx, 32 - add rcx, 32 - - ; check if eq - cmp eax, 0ffh - je ContinueTest - mov eax, 1 - jmp Finish - - cmp r8d, 64 - jl Done8 - -Cmp8: - movq mm0, [rdx] - movq mm1, [rdx+8] - movq mm2, [rdx+16] - movq mm3, [rdx+24] - movq mm4, [rdx+32] - movq mm5, [rdx+40] - movq mm6, [rdx+48] - movq mm7, [rdx+56] - pcmpeqd mm0, [rcx] - pcmpeqd mm1, [rcx+8] - pcmpeqd mm2, [rcx+16] - pcmpeqd mm3, [rcx+24] - pand mm0, mm1 - pcmpeqd mm4, [rcx+32] - pand mm0, mm2 - pcmpeqd mm5, [rcx+40] - pand mm0, mm3 - pcmpeqd mm6, [rcx+48] - pand mm0, mm4 - pcmpeqd mm7, [rcx+56] - pand mm0, mm5 - pand mm0, mm6 - pand mm0, mm7 - pmovmskb eax, mm0 - - ; check if eq - cmp eax, 0ffh - je Continue - mov eax, 1 - jmp Finish - -Continue: - sub r8d, 64 - add rdx, 64 - add rcx, 64 -ContinueTest: - cmp r8d, 64 - jge Cmp8 - -Done8: - test r8d, 020h - jz Done4 - movq mm0, [rdx] - movq mm1, [rdx+8] - movq mm2, [rdx+16] - movq mm3, [rdx+24] - pcmpeqd mm0, [rcx] - pcmpeqd mm1, [rcx+8] - pcmpeqd mm2, [rcx+16] - pcmpeqd mm3, [rcx+24] - pand mm0, mm1 - pand mm0, mm2 - pand mm0, mm3 - pmovmskb eax, mm0 - sub r8d, 32 - add rdx, 32 - add rcx, 32 - - ; check if eq - cmp eax, 0ffh - je Done4 - mov eax, 1 - jmp Finish - -Done4: - cmp r8d, 24 - jne Done2 - movq mm0, [rdx] - movq mm1, [rdx+8] - movq mm2, [rdx+16] - pcmpeqd mm0, [rcx] - pcmpeqd mm1, [rcx+8] - pcmpeqd mm2, [rcx+16] - pand mm0, mm1 - pand mm0, mm2 - pmovmskb eax, mm0 - - ; check if eq - cmp eax, 0ffh - je Done - mov eax, 1 - jmp Finish - -Done2: - cmp r8d, 16 - jne Done1 - - movq mm0, [rdx] - movq mm1, [rdx+8] - pcmpeqd mm0, [rcx] - pcmpeqd mm1, [rcx+8] - pand mm0, mm1 - pmovmskb eax, mm0 - - ; check if eq - cmp eax, 0ffh - je Done - mov eax, 1 - jmp Finish - -Done1: - cmp r8d, 8 - jne Done - - mov eax, [rdx] - mov rdx, [rdx+4] - cmp eax, [rcx] - je Next - mov eax, 1 - jmp Finish - -Next: - cmp rdx, [rcx+4] - je Done - mov eax, 1 - jmp Finish - -Done: - xor eax, eax - -Finish: - emms - ret - -memcmp_mmx endp - -; TestClutChangeMMX -; mov rdx, dst -; mov rcx, src -; mov r8d, entries -TestClutChangeMMX proc public - -Start: - movq mm0, [rdx] - movq mm1, [rdx+8] - pcmpeqd mm0, [rcx] - pcmpeqd mm1, [rcx+16] - - movq mm2, [rdx+16] - movq mm3, [rdx+24] - pcmpeqd mm2, [rcx+32] - pcmpeqd mm3, [rcx+48] - - pand mm0, mm1 - pand mm2, mm3 - movq mm4, [rdx+32] - movq mm5, [rdx+40] - pcmpeqd mm4, [rcx+8] - pcmpeqd mm5, [rcx+24] - - pand mm0, mm2 - pand mm4, mm5 - movq mm6, [rdx+48] - movq mm7, [rdx+56] - pcmpeqd mm6, [rcx+40] - pcmpeqd mm7, [rcx+56] - - pand mm0, mm4 - pand mm6, mm7 - pand 
mm0, mm6 - - pmovmskb eax, mm0 - cmp eax, 0ffh - je Continue - mov byte ptr [r9], 1 - jmp Return - -Continue: - cmp r8d, 16 - jle Return - - test r8d, 010h - jz AddRcx - sub rcx, 448 ; go back and down one column, -AddRcx: - add rcx, 256 ; go to the right block - - - jne Continue1 - add rcx, 256 ; skip whole block -Continue1: - add rdx, 64 - sub r8d, 16 - jmp Start - -Return: - emms - ret - -TestClutChangeMMX endp - -UnswizzleZ16Target proc public - pxor xmm7, xmm7 - -Z16Loop: - ;; unpack 64 bytes at a time - movdqa xmm0, [rdx] - movdqa xmm2, [rdx+16] - movdqa xmm4, [rdx+32] - movdqa xmm6, [rdx+48] - - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - movdqa xmm5, xmm4 - - punpcklwd xmm0, xmm7 - punpckhwd xmm1, xmm7 - punpcklwd xmm2, xmm7 - punpckhwd xmm3, xmm7 - - ;; start saving - movdqa [rcx], xmm0 - movdqa [rcx+16], xmm1 - - punpcklwd xmm4, xmm7 - punpckhwd xmm5, xmm7 - - movdqa [rcx+32], xmm2 - movdqa [rcx+48], xmm3 - - movdqa xmm0, xmm6 - punpcklwd xmm6, xmm7 - - movdqa [rcx+64], xmm4 - movdqa [rcx+80], xmm5 - - punpckhwd xmm0, xmm7 - - movdqa [rcx+96], xmm6 - movdqa [rcx+112], xmm0 - - add rdx, 64 - add rcx, 128 - sub r9d, 1 - jne Z16Loop - - ret -UnswizzleZ16Target endp - -; -; swizzling -; - -punpck macro op, sd0, sd2, s1, s3, d1, d3 - - movdqa @CatStr(xmm, %d1), @CatStr(xmm, %sd0) - pshufd @CatStr(xmm, %d3), @CatStr(xmm, %sd2), 0e4h - - @CatStr(punpckl, op) @CatStr(xmm, %sd0), @CatStr(xmm, %s1) - @CatStr(punpckh, op) @CatStr(xmm, %d1), @CatStr(xmm, %s1) - @CatStr(punpckl, op) @CatStr(xmm, %sd2), @CatStr(xmm, %s3) - @CatStr(punpckh, op) @CatStr(xmm, %d3), @CatStr(xmm, %s3) - - endm - -punpcknbl macro - - movdqa xmm4, xmm0 - pshufd xmm5, xmm1, 0e4h - - psllq xmm1, 4 - psrlq xmm4, 4 - - movdqa xmm6, xmm7 - pand xmm0, xmm7 - pandn xmm6, xmm1 - por xmm0, xmm6 - - movdqa xmm6, xmm7 - pand xmm4, xmm7 - pandn xmm6, xmm5 - por xmm4, xmm6 - - movdqa xmm1, xmm4 - - movdqa xmm4, xmm2 - pshufd xmm5, xmm3, 0e4h - - psllq xmm3, 4 - psrlq xmm4, 4 - - movdqa xmm6, xmm7 - pand xmm2, xmm7 - pandn xmm6, xmm3 - por xmm2, xmm6 - - movdqa xmm6, xmm7 - pand xmm4, xmm7 - pandn xmm6, xmm5 - por xmm4, xmm6 - - movdqa xmm3, xmm4 - - punpck bw, 0, 2, 1, 3, 4, 6 - - endm - -punpcknbh macro - - movdqa xmm12, xmm8 - pshufd xmm13, xmm9, 0e4h - - psllq xmm9, 4 - psrlq xmm12, 4 - - movdqa xmm14, xmm15 - pand xmm8, xmm15 - pandn xmm14, xmm9 - por xmm8, xmm14 - - movdqa xmm14, xmm15 - pand xmm12, xmm15 - pandn xmm14, xmm13 - por xmm12, xmm14 - - movdqa xmm9, xmm12 - - movdqa xmm12, xmm10 - pshufd xmm13, xmm11, 0e4h - - psllq xmm11, 4 - psrlq xmm12, 4 - - movdqa xmm14, xmm15 - pand xmm10, xmm15 - pandn xmm14, xmm11 - por xmm10, xmm14 - - movdqa xmm14, xmm15 - pand xmm12, xmm15 - pandn xmm14, xmm13 - por xmm12, xmm14 - - movdqa xmm11, xmm12 - - punpck bw, 8, 10, 9, 11, 12, 14 - - endm - -; -; SwizzleBlock32_sse2 -; - -SwizzleBlock32_sse2 proc public - - push rsi - push rdi - - mov rdi, rcx - mov rsi, rdx - mov rcx, 4 - - cmp r9d, 0ffffffffh - jne SwizzleBlock32_sse2@WM - - align 16 -@@: - movdqa xmm0, [rsi] - movdqa xmm4, [rsi+16] - movdqa xmm1, [rsi+r8] - movdqa xmm5, [rsi+r8+16] - - punpck qdq, 0, 4, 1, 5, 2, 6 - - movdqa [rdi+16*0], xmm0 - movdqa [rdi+16*1], xmm2 - movdqa [rdi+16*2], xmm4 - movdqa [rdi+16*3], xmm6 - - lea rsi, [rsi+r8*2] - add rdi, 64 - - dec rcx - jnz @B - - pop rdi - pop rsi - - ret - -SwizzleBlock32_sse2@WM: - - movd xmm7, r9d - pshufd xmm7, xmm7, 0 - - align 16 -@@: - movdqa xmm0, [rsi] - movdqa xmm4, [rsi+16] - movdqa xmm1, [rsi+r8] - movdqa xmm5, [rsi+r8+16] - - punpck qdq, 0, 4, 1, 5, 2, 6 - - movdqa 
xmm3, xmm7 - pshufd xmm5, xmm7, 0e4h - movdqa xmm9, xmm7 - pshufd xmm11, xmm7, 0e4h - - pandn xmm3, [rdi+16*0] - pand xmm0, xmm7 - por xmm0, xmm3 - movdqa [rdi+16*0], xmm0 - - pandn xmm5, [rdi+16*1] - pand xmm2, xmm7 - por xmm2, xmm5 - movdqa [rdi+16*1], xmm2 - - pandn xmm9, [rdi+16*2] - pand xmm4, xmm7 - por xmm4, xmm9 - movdqa [rdi+16*2], xmm4 - - pandn xmm11, [rdi+16*3] - pand xmm6, xmm7 - por xmm6, xmm11 - movdqa [edi+16*3], xmm6 - - lea rsi, [rsi+r8*2] - add rdi, 64 - - dec rcx - jnz @B - - pop rdi - pop rsi - - ret - -SwizzleBlock32_sse2 endp - -; -; SwizzleBlock16_sse2 -; - -SwizzleBlock16_sse2 proc public - - push rsi - push rdi - - mov rdi, rcx - mov rsi, rdx - mov rcx, 4 - - align 16 -@@: - movdqa xmm0, [rsi] - movdqa xmm1, [rsi+16] - movdqa xmm2, [rsi+r8] - movdqa xmm3, [rsi+r8+16] - - punpck wd, 0, 2, 1, 3, 4, 6 - punpck qdq, 0, 4, 2, 6, 1, 5 - - movdqa [rdi+16*0], xmm0 - movdqa [rdi+16*1], xmm1 - movdqa [rdi+16*2], xmm4 - movdqa [rdi+16*3], xmm5 - - lea rsi, [rsi+r8*2] - add rdi, 64 - - dec rcx - jnz @B - - pop rdi - pop rsi - - ret - -SwizzleBlock16_sse2 endp - -; -; SwizzleBlock8 -; - -SwizzleBlock8_sse2 proc public - - push rsi - push rdi - - mov rdi, rcx - mov rsi, rdx - mov ecx, 2 - - align 16 -@@: - ; col 0, 2 - - movdqa xmm0, [rsi] - movdqa xmm2, [rsi+r8] - lea rsi, [rsi+r8*2] - - pshufd xmm1, [rsi], 0b1h - pshufd xmm3, [rsi+r8], 0b1h - lea rsi, [rsi+r8*2] - - punpck bw, 0, 2, 1, 3, 4, 6 - punpck wd, 0, 2, 4, 6, 1, 3 - punpck qdq, 0, 1, 2, 3, 4, 5 - - movdqa [rdi+16*0], xmm0 - movdqa [rdi+16*1], xmm4 - movdqa [rdi+16*2], xmm1 - movdqa [rdi+16*3], xmm5 - - ; col 1, 3 - - pshufd xmm0, [rsi], 0b1h - pshufd xmm2, [rsi+r8], 0b1h - lea rsi, [rsi+r8*2] - - movdqa xmm1, [rsi] - movdqa xmm3, [rsi+r8] - lea rsi, [rsi+r8*2] - - punpck bw, 0, 2, 1, 3, 4, 6 - punpck wd, 0, 2, 4, 6, 1, 3 - punpck qdq, 0, 1, 2, 3, 4, 5 - - movdqa [rdi+16*4], xmm0 - movdqa [rdi+16*5], xmm4 - movdqa [rdi+16*6], xmm1 - movdqa [rdi+16*7], xmm5 - - add edi, 128 - - dec rcx - jnz @B - - pop rdi - pop rsi - - ret - -SwizzleBlock8_sse2 endp - -; -; SwizzleBlock4 -; - -SwizzleBlock4_sse2 proc public - - push rsi - push rdi - - mov rdi, rcx - mov rsi, rdx - mov rcx, 2 - - mov eax, 0f0f0f0fh - movd xmm7, eax - pshufd xmm7, xmm7, 0 - - align 16 -@@: - ; col 0, 2 - - movdqa xmm0, [rsi] - movdqa xmm2, [rsi+r8] - lea rsi, [rsi+r8*2] - - movdqa xmm1, [rsi] - movdqa xmm3, [rsi+r8] - lea rsi, [rsi+r8*2] - - pshuflw xmm1, xmm1, 0b1h - pshuflw xmm3, xmm3, 0b1h - pshufhw xmm1, xmm1, 0b1h - pshufhw xmm3, xmm3, 0b1h - - punpcknbl - punpck bw, 0, 2, 4, 6, 1, 3 - punpck bw, 0, 2, 1, 3, 4, 6 - punpck qdq, 0, 4, 2, 6, 1, 3 - - movdqa [rdi+16*0], xmm0 - movdqa [rdi+16*1], xmm1 - movdqa [rdi+16*2], xmm4 - movdqa [rdi+16*3], xmm3 - - ; col 1, 3 - - movdqa xmm0, [rsi] - movdqa xmm2, [rsi+r8] - lea esi, [rsi+r8*2] - - movdqa xmm1, [rsi] - movdqa xmm3, [rsi+r8] - lea rsi, [rsi+r8*2] - - pshuflw xmm0, xmm0, 0b1h - pshuflw xmm2, xmm2, 0b1h - pshufhw xmm0, xmm0, 0b1h - pshufhw xmm2, xmm2, 0b1h - - punpcknbl - punpck bw, 0, 2, 4, 6, 1, 3 - punpck bw, 0, 2, 1, 3, 4, 6 - punpck qdq, 0, 4, 2, 6, 1, 3 - - movdqa [rdi+16*4], xmm0 - movdqa [rdi+16*5], xmm1 - movdqa [rdi+16*6], xmm4 - movdqa [rdi+16*7], xmm3 - - add rdi, 128 - - dec rcx - jnz @B - - pop rdi - pop rsi - - ret - -SwizzleBlock4_sse2 endp - -; -; swizzling with unaligned reads -; - -; -; SwizzleBlock32u_sse2 -; - -SwizzleBlock32u_sse2 proc public - - push rsi - push rdi - - mov rdi, rcx - mov rsi, rdx - mov rcx, 4 - - cmp r9d, 0ffffffffh - jne SwizzleBlock32u_sse2@WM - - align 16 
-@@: - movdqu xmm0, [rsi] - movdqu xmm4, [rsi+16] - movdqu xmm1, [rsi+r8] - movdqu xmm5, [rsi+r8+16] - - punpck qdq, 0, 4, 1, 5, 2, 6 - - movdqa [rdi+16*0], xmm0 - movdqa [rdi+16*1], xmm2 - movdqa [rdi+16*2], xmm4 - movdqa [rdi+16*3], xmm6 - - lea rsi, [rsi+r8*2] - add rdi, 64 - - dec rcx - jnz @B - - pop rdi - pop rsi - - ret - -SwizzleBlock32u_sse2@WM: - - movd xmm7, r9d - pshufd xmm7, xmm7, 0 - - align 16 -@@: - movdqu xmm0, [rsi] - movdqu xmm4, [rsi+16] - movdqu xmm1, [rsi+r8] - movdqu xmm5, [rsi+r8+16] - - punpck qdq, 0, 4, 1, 5, 2, 6 - - movdqa xmm3, xmm7 - pshufd xmm5, xmm7, 0e4h - movdqa xmm9, xmm7 - pshufd xmm11, xmm7, 0e4h - - pandn xmm3, [rdi+16*0] - pand xmm0, xmm7 - por xmm0, xmm3 - movdqa [rdi+16*0], xmm0 - - pandn xmm5, [rdi+16*1] - pand xmm2, xmm7 - por xmm2, xmm5 - movdqa [rdi+16*1], xmm2 - - pandn xmm9, [rdi+16*2] - pand xmm4, xmm7 - por xmm4, xmm9 - movdqa [rdi+16*2], xmm4 - - pandn xmm11, [rdi+16*3] - pand xmm6, xmm7 - por xmm6, xmm11 - movdqa [edi+16*3], xmm6 - - lea rsi, [rsi+r8*2] - add rdi, 64 - - dec rcx - jnz @B - - pop rdi - pop rsi - - ret - -SwizzleBlock32u_sse2 endp - -; -; SwizzleBlock16u_sse2 -; - -SwizzleBlock16u_sse2 proc public - - push rsi - push rdi - - mov rdi, rcx - mov rsi, rdx - mov rcx, 4 - - align 16 -@@: - movdqu xmm0, [rsi] - movdqu xmm1, [rsi+16] - movdqu xmm2, [rsi+r8] - movdqu xmm3, [rsi+r8+16] - - punpck wd, 0, 2, 1, 3, 4, 6 - punpck qdq, 0, 4, 2, 6, 1, 5 - - movdqa [rdi+16*0], xmm0 - movdqa [rdi+16*1], xmm1 - movdqa [rdi+16*2], xmm4 - movdqa [rdi+16*3], xmm5 - - lea rsi, [rsi+r8*2] - add rdi, 64 - - dec rcx - jnz @B - - pop rdi - pop rsi - - ret - -SwizzleBlock16u_sse2 endp - -; -; SwizzleBlock8u -; - -SwizzleBlock8u_sse2 proc public - - push rsi - push rdi - - mov rdi, rcx - mov rsi, rdx - mov ecx, 2 - - align 16 -@@: - ; col 0, 2 - - movdqu xmm0, [rsi] - movdqu xmm2, [rsi+r8] - lea rsi, [rsi+r8*2] - - pshufd xmm1, xmm0, 0b1h - pshufd xmm3, xmm2, 0b1h - lea rsi, [rsi+r8*2] - - punpck bw, 0, 2, 1, 3, 4, 6 - punpck wd, 0, 2, 4, 6, 1, 3 - punpck qdq, 0, 1, 2, 3, 4, 5 - - movdqa [rdi+16*0], xmm0 - movdqa [rdi+16*1], xmm4 - movdqa [rdi+16*2], xmm1 - movdqa [rdi+16*3], xmm5 - - ; col 1, 3 - - movdqu xmm0, [rsi] - movdqu xmm2, [rsi+r8] - pshufd xmm0, xmm0, 0b1h - pshufd xmm2, xmm2, 0b1h - lea rsi, [rsi+r8*2] - - movdqu xmm1, [rsi] - movdqu xmm3, [rsi+r8] - lea rsi, [rsi+r8*2] - - punpck bw, 0, 2, 1, 3, 4, 6 - punpck wd, 0, 2, 4, 6, 1, 3 - punpck qdq, 0, 1, 2, 3, 4, 5 - - movdqa [rdi+16*4], xmm0 - movdqa [rdi+16*5], xmm4 - movdqa [rdi+16*6], xmm1 - movdqa [rdi+16*7], xmm5 - - add edi, 128 - - dec rcx - jnz @B - - pop rdi - pop rsi - - ret - -SwizzleBlock8u_sse2 endp - -; -; SwizzleBlock4u -; - -SwizzleBlock4u_sse2 proc public - - push rsi - push rdi - - mov rdi, rcx - mov rsi, rdx - mov rcx, 2 - - mov eax, 0f0f0f0fh - movd xmm7, eax - pshufd xmm7, xmm7, 0 - - align 16 -@@: - ; col 0, 2 - - movdqu xmm0, [rsi] - movdqu xmm2, [rsi+r8] - lea rsi, [rsi+r8*2] - - movdqu xmm1, [rsi] - movdqu xmm3, [rsi+r8] - lea rsi, [rsi+r8*2] - - pshuflw xmm1, xmm1, 0b1h - pshuflw xmm3, xmm3, 0b1h - pshufhw xmm1, xmm1, 0b1h - pshufhw xmm3, xmm3, 0b1h - - punpcknbl - punpck bw, 0, 2, 4, 6, 1, 3 - punpck bw, 0, 2, 1, 3, 4, 6 - punpck qdq, 0, 4, 2, 6, 1, 3 - - movdqa [rdi+16*0], xmm0 - movdqa [rdi+16*1], xmm1 - movdqa [rdi+16*2], xmm4 - movdqa [rdi+16*3], xmm3 - - ; col 1, 3 - - movdqu xmm0, [rsi] - movdqu xmm2, [rsi+r8] - lea esi, [rsi+r8*2] - - movdqu xmm1, [rsi] - movdqu xmm3, [rsi+r8] - lea rsi, [rsi+r8*2] - - pshuflw xmm0, xmm0, 0b1h - pshuflw xmm2, xmm2, 0b1h - 
pshufhw xmm0, xmm0, 0b1h - pshufhw xmm2, xmm2, 0b1h - - punpcknbl - punpck bw, 0, 2, 4, 6, 1, 3 - punpck bw, 0, 2, 1, 3, 4, 6 - punpck qdq, 0, 4, 2, 6, 1, 3 - - movdqa [rdi+16*4], xmm0 - movdqa [rdi+16*5], xmm1 - movdqa [rdi+16*6], xmm4 - movdqa [rdi+16*7], xmm3 - - add rdi, 128 - - dec rcx - jnz @B - - pop rdi - pop rsi - - ret - -SwizzleBlock4u_sse2 endp - -WriteCLUT_T16_I4_CSM1_sse2 proc public - movdqa xmm0, XMMWORD PTR [rcx] - movdqa xmm1, XMMWORD PTR [rcx+16] - movdqa xmm2, XMMWORD PTR [rcx+32] - movdqa xmm3, XMMWORD PTR [rcx+48] - - ;; rearrange - pshuflw xmm0, xmm0, 088h - pshufhw xmm0, xmm0, 088h - pshuflw xmm1, xmm1, 088h - pshufhw xmm1, xmm1, 088h - pshuflw xmm2, xmm2, 088h - pshufhw xmm2, xmm2, 088h - pshuflw xmm3, xmm3, 088h - pshufhw xmm3, xmm3, 088h - - shufps xmm0, xmm1, 088h - shufps xmm2, xmm3, 088h - - pshufd xmm0, xmm0, 0d8h - pshufd xmm2, xmm2, 0d8h - - pxor xmm6, xmm6 - mov rax, offset s_clut16mask - - test rdx, 15 - jnz WriteUnaligned - - movdqa xmm7, XMMWORD PTR [rax] ;; saves upper 16 bits - - ;; have to save interlaced with the old data - movdqa xmm4, [rdx] - movdqa xmm5, [rdx+32] - movhlps xmm1, xmm0 - movlhps xmm0, xmm2 ;; lower 8 colors - - pand xmm4, xmm7 - pand xmm5, xmm7 - - shufps xmm1, xmm2, 0e4h ;; upper 8 colors - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - punpcklwd xmm0, xmm6 - punpcklwd xmm1, xmm6 - por xmm0, xmm4 - por xmm1, xmm5 - - punpckhwd xmm2, xmm6 - punpckhwd xmm3, xmm6 - - movdqa [rdx], xmm0 - movdqa [rdx+32], xmm1 - - movdqa xmm5, xmm7 - pand xmm7, [rdx+16] - pand xmm5, [rdx+48] - - por xmm2, xmm7 - por xmm3, xmm5 - - movdqa [rdx+16], xmm2 - movdqa [rdx+48], xmm3 - jmp WriteCLUT_T16_I4_CSM1_End - -WriteUnaligned: - ;; rdx is offset by 2 - sub rdx, 2 - - movdqa xmm7, XMMWORD PTR [rax+16] ;; saves lower 16 bits - - ;; have to save interlaced with the old data - movdqa xmm4, [rdx] - movdqa xmm5, [rdx+32] - movhlps xmm1, xmm0 - movlhps xmm0, xmm2 ;; lower 8 colors - - pand xmm4, xmm7 - pand xmm5, xmm7 - - shufps xmm1, xmm2, 0e4h ;; upper 8 colors - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - punpcklwd xmm0, xmm6 - punpcklwd xmm1, xmm6 - pslld xmm0, 16 - pslld xmm1, 16 - por xmm0, xmm4 - por xmm1, xmm5 - - punpckhwd xmm2, xmm6 - punpckhwd xmm3, xmm6 - pslld xmm2, 16 - pslld xmm3, 16 - - movdqa [rdx], xmm0 - movdqa [rdx+32], xmm1 - - movdqa xmm5, xmm7 - pand xmm7, [rdx+16] - pand xmm5, [rdx+48] - - por xmm2, xmm7 - por xmm3, xmm5 - - movdqa [rdx+16], xmm2 - movdqa [rdx+48], xmm3 -WriteCLUT_T16_I4_CSM1_End: - ret - -WriteCLUT_T16_I4_CSM1_sse2 endp - -end \ No newline at end of file diff --git a/plugins/zzogl-pg/opengl/zerogs.cpp b/plugins/zzogl-pg/opengl/zerogs.cpp index 96d29c3771..e0cf3b7067 100644 --- a/plugins/zzogl-pg/opengl/zerogs.cpp +++ b/plugins/zzogl-pg/opengl/zerogs.cpp @@ -829,9 +829,6 @@ void ZeroGS::ExtWrite() //////////// // Caches // //////////// -#ifdef __x86_64__ -extern "C" void TestClutChangeMMX(void* src, void* dst, int entries, void* pret); -#endif bool ZeroGS::CheckChangeInClut(u32 highdword, u32 psm) { @@ -869,10 +866,6 @@ bool ZeroGS::CheckChangeInClut(u32 highdword, u32 psm) // do a fast test with MMX #ifdef _MSC_VER - -#ifdef __x86_64__ - TestClutChangeMMX(dst, src, entries, &bRet); -#else int storeebx; __asm { mov storeebx, ebx @@ -937,63 +930,8 @@ Return: emms mov ebx, storeebx } -#endif // __x86_64__ #else // linux - -#ifdef __x86_64__ - __asm__( - ".intel_syntax\n" -"Start:\n" - "movq %%mm0, [%%rcx]\n" - "movq %%mm1, [%%rcx+8]\n" - "pcmpeqd %%mm0, [%%rdx]\n" - "pcmpeqd %%mm1, [%%rdx+16]\n" - "movq %%mm2, [%%rcx+16]\n" - 
"movq %%mm3, [%%rcx+24]\n" - "pcmpeqd %%mm2, [%%rdx+32]\n" - "pcmpeqd %%mm3, [%%rdx+48]\n" - "pand %%mm0, %%mm1\n" - "pand %%mm2, %%mm3\n" - "movq %%mm4, [%%rcx+32]\n" - "movq %%mm5, [%%rcx+40]\n" - "pcmpeqd %%mm4, [%%rdx+8]\n" - "pcmpeqd %%mm5, [%%rdx+24]\n" - "pand %%mm0, %%mm2\n" - "pand %%mm4, %%mm5\n" - "movq %%mm6, [%%rcx+48]\n" - "movq %%mm7, [%%rcx+56]\n" - "pcmpeqd %%mm6, [%%rdx+40]\n" - "pcmpeqd %%mm7, [%%rdx+56]\n" - "pand %%mm0, %%mm4\n" - "pand %%mm6, %%mm7\n" - "pand %%mm0, %%mm6\n" - "pmovmskb %%eax, %%mm0\n" - "cmp %%eax, 0xff\n" - "je Continue\n" - ".att_syntax\n" - "movb $1, %0\n" - ".intel_syntax\n" - "jmp Return\n" -"Continue:\n" - "cmp %%rbx, 16\n" - "jle Return\n" - "test %%rbx, 0x10\n" - "jz AddRcx\n" - "sub %%rdx, 448\n" // go back and down one column -"AddRcx:\n" - "add %%rdx, 256\n" // go to the right block - "cmp %%rbx, 0x90\n" - "jne Continue1\n" - "add %%rdx, 256\n" // skip whole block -"Continue1:\n" - "add %%rcx, 64\n" - "sub %%rbx, 16\n" - "jmp Start\n" -"Return:\n" - "emms\n" - ".att_syntax\n" : "=m"(bRet) : "c"(dst), "d"(src), "b"(entries) : "rax", "memory"); -#else // do a fast test with MMX __asm__( ".intel_syntax\n" @@ -1046,7 +984,6 @@ Return: "Return:\n" "emms\n" ".att_syntax\n" : "=m"(bRet) : "c"(dst), "d"(src), "b"(entries) : "eax", "memory"); -#endif // __x86_64__ #endif // _WIN32