From 8528fd305f92b48f20fdc6ecc79111e0c131bf15 Mon Sep 17 00:00:00 2001 From: arcum42 Date: Fri, 8 Jan 2010 08:09:10 +0000 Subject: [PATCH] Fix Linux compiling, and remove vestigal x64 code in ZeroGS. git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2412 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/Linux/pcsx2.cbp | 1132 +++++++++++++------------- pcsx2/gui/wxAppWithHelpers.cpp | 2 +- pcsx2/gui/wxAppWithHelpers.h | 2 + plugins/zerogs/opengl/Mem.h | 9 +- plugins/zerogs/opengl/Regs.h | 4 - plugins/zerogs/opengl/memcpy_amd.cpp | 4 +- plugins/zerogs/opengl/targets.cpp | 8 - plugins/zerogs/opengl/x86-64.S | 906 --------------------- plugins/zerogs/opengl/x86-64.asm | 1091 ------------------------- plugins/zerogs/opengl/x86.cpp | 7 +- plugins/zerogs/opengl/zerogs.cpp | 61 -- 11 files changed, 575 insertions(+), 2651 deletions(-) delete mode 100644 plugins/zerogs/opengl/x86-64.S delete mode 100644 plugins/zerogs/opengl/x86-64.asm diff --git a/pcsx2/Linux/pcsx2.cbp b/pcsx2/Linux/pcsx2.cbp index 31edeeccee..a9d7eb013b 100644 --- a/pcsx2/Linux/pcsx2.cbp +++ b/pcsx2/Linux/pcsx2.cbp @@ -1,565 +1,567 @@ - - - - - - + + + + + + diff --git a/pcsx2/gui/wxAppWithHelpers.cpp b/pcsx2/gui/wxAppWithHelpers.cpp index 9958f3ba30..92d77dc49c 100644 --- a/pcsx2/gui/wxAppWithHelpers.cpp +++ b/pcsx2/gui/wxAppWithHelpers.cpp @@ -121,7 +121,7 @@ bool wxAppWithHelpers::OnInit() Connect( pxEvt_MessageBox, pxMessageBoxEventThing (wxAppWithHelpers::OnMessageBox) ); Connect( pxEvt_Assertion, pxMessageBoxEventThing (wxAppWithHelpers::OnMessageBox) ); Connect( pxEvt_Ping, pxPingEventHandler (wxAppWithHelpers::OnPingEvent) ); - Connect( wxEvt_Idle, wxIdleEventHandler (wxAppWithHelpers::OnIdleEvent) ); + Connect( wxEVT_IDLE, wxIdleEventHandler (wxAppWithHelpers::OnIdleEvent) ); Connect( m_PingTimer.GetId(), wxEVT_TIMER, wxTimerEventHandler(wxAppWithHelpers::OnPingTimeout) ); diff --git a/pcsx2/gui/wxAppWithHelpers.h b/pcsx2/gui/wxAppWithHelpers.h index 6df63ce54b..18842d1aa0 100644 --- a/pcsx2/gui/wxAppWithHelpers.h +++ b/pcsx2/gui/wxAppWithHelpers.h @@ -244,9 +244,11 @@ public: pxAssertionEvent& SetInstData( MsgboxEventResult& instdata ); pxAssertionEvent& SetStacktrace( const wxString& trace ); + ~pxAssertionEvent() throw() { } protected: virtual int _DoDialog() const; + }; // -------------------------------------------------------------------------------------- diff --git a/plugins/zerogs/opengl/Mem.h b/plugins/zerogs/opengl/Mem.h index 63317313a8..206b361de5 100644 --- a/plugins/zerogs/opengl/Mem.h +++ b/plugins/zerogs/opengl/Mem.h @@ -359,11 +359,8 @@ static __forceinline void writePixel32_0(void* pmem, int x, int y, u32 pixel, u3 static __forceinline void writePixel24_0(void* pmem, int x, int y, u32 pixel, u32 bw) { u8 *buf = (u8*)&((u32*)pmem)[getPixelAddress32_0(x, y, bw)]; u8 *pix = (u8*)&pixel; -#if defined(_MSC_VER) && defined(__x86_64__) - memcpy(buf, pix, 3); -#else + buf[0] = pix[0]; buf[1] = pix[1]; buf[2] = pix[2]; -#endif } static __forceinline void writePixel16_0(void* pmem, int x, int y, u32 pixel, u32 bw) { @@ -406,11 +403,7 @@ static __forceinline void writePixel32Z_0(void* pmem, int x, int y, u32 pixel, u static __forceinline void writePixel24Z_0(void* pmem, int x, int y, u32 pixel, u32 bw) { u8 *buf = (u8*)pmem + 4*getPixelAddress32Z_0(x, y, bw); u8 *pix = (u8*)&pixel; -#if defined(_MSC_VER) && defined(__x86_64__) - memcpy(buf, pix, 3); -#else buf[0] = pix[0]; buf[1] = pix[1]; buf[2] = pix[2]; -#endif } static __forceinline void writePixel16Z_0(void* pmem, int x, int y, u32 pixel, u32 bw) { diff --git a/plugins/zerogs/opengl/Regs.h b/plugins/zerogs/opengl/Regs.h index 40a4b4b065..c76fba67a0 100644 --- a/plugins/zerogs/opengl/Regs.h +++ b/plugins/zerogs/opengl/Regs.h @@ -23,11 +23,7 @@ typedef void (__fastcall *GIFRegHandler)(u32* data); #else -#ifdef __x86_64__ -typedef void (*GIFRegHandler)(u32* data); -#else typedef void (__fastcall *GIFRegHandler)(u32* data); -#endif #endif diff --git a/plugins/zerogs/opengl/memcpy_amd.cpp b/plugins/zerogs/opengl/memcpy_amd.cpp index dfd1119330..df0efd5478 100644 --- a/plugins/zerogs/opengl/memcpy_amd.cpp +++ b/plugins/zerogs/opengl/memcpy_amd.cpp @@ -78,7 +78,7 @@ MEMCPY_AMD.CPP extern "C" { #include "PS2Etypes.h" -#if defined(_MSC_VER) && !defined(__x86_64__) +#if defined(_MSC_VER) void * memcpy_amd(void *dest, const void *src, size_t n) { @@ -461,7 +461,7 @@ End: } #else // _MSC_VER -// assume gcc or mingw or win x64 +// assume gcc #include #include diff --git a/plugins/zerogs/opengl/targets.cpp b/plugins/zerogs/opengl/targets.cpp index 874ce4b8c7..c006084981 100644 --- a/plugins/zerogs/opengl/targets.cpp +++ b/plugins/zerogs/opengl/targets.cpp @@ -1602,10 +1602,6 @@ inline list::iterator ZeroGS::CMemoryTargetMngr::DestroyTargetIte return it; } -#if defined(_MSC_VER) && defined(__x86_64__) -extern "C" void UnswizzleZ16Target(void* dst, void* src, int iters); -#endif - ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info& tex0, int forcevalidate) { int nbStart, nbEnd; @@ -1915,9 +1911,6 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info #if defined(_MSC_VER) -#if defined(__x86_64__) - UnswizzleZ16Target(dst, src, iters); -#else __asm { mov edx, iters pxor xmm7, xmm7 @@ -1966,7 +1959,6 @@ Z16Loop: sub edx, 1 jne Z16Loop } -#endif // __x86_64__ #else // _MSC_VER __asm__(".intel_syntax\n" diff --git a/plugins/zerogs/opengl/x86-64.S b/plugins/zerogs/opengl/x86-64.S deleted file mode 100644 index 6f221e7b33..0000000000 --- a/plugins/zerogs/opengl/x86-64.S +++ /dev/null @@ -1,906 +0,0 @@ -## Copyright (C) 2005-2006 zerofrog(@gmail.com) -# -# This Program is free software you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation either ve%rsion 2, or (at your option) -# any later ve%rsion. -# -# This Program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GNU Make see the file COPYING. If not, write to -# the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. -# http://www.gnu.org/copyleft/gpl.html -# -# -.intel_syntax - -## mmx memcpy implementation, size has to be a multiple of 8 -## returns 0 is equal, nonzero value if not equal -## ~10 times faster than standard memcmp -## (zerofrog) -## u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize) -## %rdi - src1 -## %rsi - src2 -## edx - cmpsize -.globl memcmp_mmx - .type memcmp_mmx, @function -memcmp_mmx: - cmp %edx, 32 - jl Done4 - - ## custom test first 8 to make sure things are ok - movq %mm0, [%rsi] - movq %mm1, [%rsi+8] - pcmpeqd %mm0, [%rdi] - pcmpeqd %mm1, [%rdi+8] - pand %mm0, %mm1 - movq %mm2, [%rsi+16] - pmovmskb %eax, %mm0 - movq %mm3, [%rsi+24] - - // check if eq - cmp %eax, 0xff - je NextComp - mov %eax, 1 - jmp End - -NextComp: - pcmpeqd %mm2, [%rdi+16] - pcmpeqd %mm3, [%rdi+24] - pand %mm2, %mm3 - pmovmskb %eax, %mm2 - - sub %edx, 32 - add %rsi, 32 - add %rdi, 32 - - // check if eq - cmp %eax, 0xff - je ContinueTest - mov %eax, 1 - jmp End - - cmp %edx, 64 - jl Done8 - -Cmp8: - movq %mm0, [%rsi] - movq %mm1, [%rsi+8] - movq %mm2, [%rsi+16] - movq %mm3, [%rsi+24] - movq %mm4, [%rsi+32] - movq %mm5, [%rsi+40] - movq %mm6, [%rsi+48] - movq %mm7, [%rsi+56] - pcmpeqd %mm0, [%rdi] - pcmpeqd %mm1, [%rdi+8] - pcmpeqd %mm2, [%rdi+16] - pcmpeqd %mm3, [%rdi+24] - pand %mm0, %mm1 - pcmpeqd %mm4, [%rdi+32] - pand %mm0, %mm2 - pcmpeqd %mm5, [%rdi+40] - pand %mm0, %mm3 - pcmpeqd %mm6, [%rdi+48] - pand %mm0, %mm4 - pcmpeqd %mm7, [%rdi+56] - pand %mm0, %mm5 - pand %mm0, %mm6 - pand %mm0, %mm7 - pmovmskb %eax, %mm0 - - // check if eq - cmp %eax, 0xff - je Continue - mov %eax, 1 - jmp End - -Continue: - sub %edx, 64 - add %rsi, 64 - add %rdi, 64 -ContinueTest: - cmp %edx, 64 - jge Cmp8 - -Done8: - test %edx, 0x20 - jz Done4 - movq %mm0, [%rsi] - movq %mm1, [%rsi+8] - movq %mm2, [%rsi+16] - movq %mm3, [%rsi+24] - pcmpeqd %mm0, [%rdi] - pcmpeqd %mm1, [%rdi+8] - pcmpeqd %mm2, [%rdi+16] - pcmpeqd %mm3, [%rdi+24] - pand %mm0, %mm1 - pand %mm0, %mm2 - pand %mm0, %mm3 - pmovmskb %eax, %mm0 - sub %edx, 32 - add %rsi, 32 - add %rdi, 32 - - // check if eq - cmp %eax, 0xff - je Done4 - mov %eax, 1 - jmp End - -Done4: - cmp %edx, 24 - jne Done2 - movq %mm0, [%rsi] - movq %mm1, [%rsi+8] - movq %mm2, [%rsi+16] - pcmpeqd %mm0, [%rdi] - pcmpeqd %mm1, [%rdi+8] - pcmpeqd %mm2, [%rdi+16] - pand %mm0, %mm1 - pand %mm0, %mm2 - pmovmskb %eax, %mm0 - - // check if eq - cmp %eax, 0xff - je Done - mov %eax, 1 - jmp End - -Done2: - cmp %edx, 16 - jne Done1 - - movq %mm0, [%rsi] - movq %mm1, [%rsi+8] - pcmpeqd %mm0, [%rdi] - pcmpeqd %mm1, [%rdi+8] - pand %mm0, %mm1 - pmovmskb %eax, %mm0 - - // check if eq - cmp %eax, 0xff - je Done - mov %eax, 1 - jmp End - -Done1: - cmp %edx, 8 - jne Done - - mov %eax, [%rsi] - mov %rsi, [%rsi+4] - cmp %eax, [%rdi] - je Next - mov %eax, 1 - jmp End - -Next: - cmp %rsi, [%rdi+4] - je Done - mov %eax, 1 - jmp End - -Done: - xor %eax, %eax - -End: - emms - ret - -#ifdef ZEROGS_SSE2 -// SSE2 extensions - -#define punpck(op, sd0, sd2, s1, s3, d1, d3) \ - movdqa %xmm##d1, %xmm##sd0; \ - pshufd %xmm##d3, %xmm##sd2, 0xe4; \ - punpckl##op %xmm##sd0, %xmm##s1; \ - punpckh##op %xmm##d1, %xmm##s1; \ - punpckl##op %xmm##sd2, %xmm##s3; \ - punpckh##op %xmm##d3, %xmm##s3; \ - -#define punpcknbl \ - movdqa %xmm4, %xmm0; \ - pshufd %xmm5, %xmm1, 0xe4; \ - \ - psllq %xmm1, 4; \ - psrlq %xmm4, 4; \ - \ - movdqa %xmm6, %xmm7; \ - pand %xmm0, %xmm7; \ - pandn %xmm6, %xmm1; \ - por %xmm0, %xmm6; \ - \ - movdqa %xmm6, %xmm7; \ - pand %xmm4, %xmm7; \ - pandn %xmm6, %xmm5; \ - por %xmm4, %xmm6; \ - \ - movdqa %xmm1, %xmm4; \ - \ - movdqa %xmm4, %xmm2; \ - pshufd %xmm5, %xmm3, 0xe4; \ - \ - psllq %xmm3, 4; \ - psrlq %xmm4, 4; \ - \ - movdqa %xmm6, %xmm7; \ - pand %xmm2, %xmm7; \ - pandn %xmm6, %xmm3; \ - por %xmm2, %xmm6; \ - \ - movdqa %xmm6, %xmm7; \ - pand %xmm4, %xmm7; \ - pandn %xmm6, %xmm5; \ - por %xmm4, %xmm6; \ - \ - movdqa %xmm3, %xmm4; \ - \ - punpck(bw, 0, 2, 1, 3, 4, 6); \ - -#define punpcknbh \ - movdqa %xmm12, %xmm8; \ - pshufd %xmm13, %xmm9, 0xe4; \ - \ - psllq %xmm9, 4; \ - psrlq %xmm12, 4; \ - \ - movdqa %xmm14, %xmm15; \ - pand %xmm8, %xmm15; \ - pandn %xmm14, %xmm9; \ - por %xmm8, %xmm14; \ - \ - movdqa %xmm14, %xmm15; \ - pand %xmm12, %xmm15; \ - pandn %xmm14, %xmm13; \ - por %xmm12, %xmm14; \ - \ - movdqa %xmm9, %xmm12; \ - \ - movdqa %xmm12, %xmm10; \ - pshufd %xmm13, %xmm11, 0xe4; \ - \ - psllq %xmm11, 4; \ - psrlq %xmm12, 4; \ - \ - movdqa %xmm14, %xmm15; \ - pand %xmm10, %xmm15; \ - pandn %xmm14, %xmm11; \ - por %xmm10, %xmm14; \ - \ - movdqa %xmm14, %xmm15; \ - pand %xmm12, %xmm15; \ - pandn %xmm14, %xmm13; \ - por %xmm12, %xmm14; \ - \ - movdqa %xmm11, %xmm12; \ - \ - punpck(bw, 8, 10, 9, 11, 12, 14); \ - -// -// SwizzleBlock32_sse2 -// - -.globl SwizzleBlock32_sse2 - .type SwizzleBlock32_sse2, @function -SwizzleBlock32_sse2: - - mov %eax, 4 - - cmp %ecx, 0xffffffff - jne SwizzleBlock32_sse2_2 - - .align 16 -SwizzleBlock32_sse2_1: - movdqa %xmm0, [%rsi] - movdqa %xmm4, [%rsi+16] - movdqa %xmm1, [%rsi+%rdx] - movdqa %xmm5, [%rsi+%rdx+16] - - punpck(qdq, 0, 4, 1, 5, 2, 6) - - movdqa [%rdi+16*0], %xmm0 - movdqa [%rdi+16*1], %xmm2 - movdqa [%rdi+16*2], %xmm4 - movdqa [%rdi+16*3], %xmm6 - - lea %rsi, [%rsi+%rdx*2] - add %rdi, 64 - - dec %eax - jnz SwizzleBlock32_sse2_1 - - ret - -SwizzleBlock32_sse2_2: - - movd %xmm7, %rcx - pshufd %xmm7, %xmm7, 0 - - .align 16 -SwizzleBlock32_sse2_3: - movdqa %xmm0, [%rsi] - movdqa %xmm4, [%rsi+16] - movdqa %xmm1, [%rsi+%rdx] - movdqa %xmm5, [%rsi+%rdx+16] - - punpck(qdq, 0, 4, 1, 5, 2, 6) - - movdqa %xmm3, %xmm7 - pshufd %xmm5, %xmm7, 0xe4 - movdqa %xmm9, %xmm7 - pshufd %xmm11, %xmm7, 0xe4 - - pandn %xmm3, [%rdi+16*0] - pand %xmm0, %xmm7 - por %xmm0, %xmm3 - movdqa [%rdi+16*0], %xmm0 - - pandn %xmm5, [%rdi+16*1] - pand %xmm2, %xmm7 - por %xmm2, %xmm5 - movdqa [%rdi+16*1], %xmm2 - - pandn %xmm9, [%rdi+16*2] - pand %xmm4, %xmm7 - por %xmm4, %xmm9 - movdqa [%rdi+16*2], %xmm4 - - pandn %xmm11, [%rdi+16*3] - pand %xmm6, %xmm7 - por %xmm6, %xmm11 - movdqa [%rdi+16*3], %xmm6 - - lea %rsi, [%rsi+%rdx*2] - add %rdi, 64 - - dec %eax - jnz SwizzleBlock32_sse2_3 - - ret - -// -// SwizzleBlock16_sse2 -// - -.globl SwizzleBlock16_sse2 - .type SwizzleBlock16_sse2, @function -SwizzleBlock16_sse2: - - mov %eax, 4 - - .align 16 -SwizzleBlock16_sse2_1: - movdqa %xmm0, [%rsi] - movdqa %xmm1, [%rsi+16] - movdqa %xmm2, [%rsi+%rdx] - movdqa %xmm3, [%rsi+%rdx+16] - - punpck(wd, 0, 2, 1, 3, 4, 6) - punpck(qdq, 0, 4, 2, 6, 1, 5) - - movdqa [%rdi+16*0], %xmm0 - movdqa [%rdi+16*1], %xmm1 - movdqa [%rdi+16*2], %xmm4 - movdqa [%rdi+16*3], %xmm5 - - lea %rsi, [%rsi+%rdx*2] - add %rdi, 64 - - dec %eax - jnz SwizzleBlock16_sse2_1 - - ret - -// -// SwizzleBlock8 -// - -.globl SwizzleBlock8_sse2 - .type SwizzleBlock8_sse2, @function -SwizzleBlock8_sse2: - - mov %ecx, 2 - - .align 16 -SwizzleBlock8_sse2_1: - // col 0, 2 - - movdqa %xmm0, [%rsi] - movdqa %xmm2, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - pshufd %xmm1, [%rsi], 0xb1 - pshufd %xmm3, [%rsi+%rdx], 0xb1 - lea %rsi, [%rsi+%rdx*2] - - punpck(bw, 0, 2, 1, 3, 4, 6) - punpck(wd, 0, 2, 4, 6, 1, 3) - punpck(qdq, 0, 1, 2, 3, 4, 5) - - movdqa [%rdi+16*0], %xmm0 - movdqa [%rdi+16*1], %xmm4 - movdqa [%rdi+16*2], %xmm1 - movdqa [%rdi+16*3], %xmm5 - - // col 1, 3 - - pshufd %xmm0, [%rsi], 0xb1 - pshufd %xmm2, [%rsi+%rdx], 0xb1 - lea %rsi, [%rsi+%rdx*2] - - movdqa %xmm1, [%rsi] - movdqa %xmm3, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - punpck(bw, 0, 2, 1, 3, 4, 6) - punpck(wd, 0, 2, 4, 6, 1, 3) - punpck(qdq, 0, 1, 2, 3, 4, 5) - - movdqa [%rdi+16*4], %xmm0 - movdqa [%rdi+16*5], %xmm4 - movdqa [%rdi+16*6], %xmm1 - movdqa [%rdi+16*7], %xmm5 - - add %rdi, 128 - - dec %ecx - jnz SwizzleBlock8_sse2_1 - - ret - -// -// SwizzleBlock4 -// - -.globl SwizzleBlock4_sse2 - .type SwizzleBlock4_sse2, @function -SwizzleBlock4_sse2: - - mov %ecx, 2 - - mov %eax, 0x0f0f0f0f - movd %xmm7, %eax - pshufd %xmm7, %xmm7, 0 - - .align 16 -SwizzleBlock4_sse2_1: - // col 0, 2 - - movdqa %xmm0, [%rsi] - movdqa %xmm2, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - movdqa %xmm1, [%rsi] - movdqa %xmm3, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - pshuflw %xmm1, %xmm1, 0xb1 - pshuflw %xmm3, %xmm3, 0xb1 - pshufhw %xmm1, %xmm1, 0xb1 - pshufhw %xmm3, %xmm3, 0xb1 - - punpcknbl - punpck(bw, 0, 2, 4, 6, 1, 3) - punpck(bw, 0, 2, 1, 3, 4, 6) - punpck(qdq, 0, 4, 2, 6, 1, 3) - - movdqa [%rdi+16*0], %xmm0 - movdqa [%rdi+16*1], %xmm1 - movdqa [%rdi+16*2], %xmm4 - movdqa [%rdi+16*3], %xmm3 - - // col 1, 3 - - movdqa %xmm0, [%rsi] - movdqa %xmm2, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - movdqa %xmm1, [%rsi] - movdqa %xmm3, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - pshuflw %xmm0, %xmm0, 0xb1 - pshuflw %xmm2, %xmm2, 0xb1 - pshufhw %xmm0, %xmm0, 0xb1 - pshufhw %xmm2, %xmm2, 0xb1 - - punpcknbl - punpck(bw, 0, 2, 4, 6, 1, 3) - punpck(bw, 0, 2, 1, 3, 4, 6) - punpck(qdq, 0, 4, 2, 6, 1, 3) - - movdqa [%rdi+16*4], %xmm0 - movdqa [%rdi+16*5], %xmm1 - movdqa [%rdi+16*6], %xmm4 - movdqa [%rdi+16*7], %xmm3 - - add %rdi, 128 - - dec %ecx - jnz SwizzleBlock4_sse2_1 - - ret - -// -// swizzling with unaligned reads -// - -// -// SwizzleBlock32u_sse2 -// - -.globl SwizzleBlock32u_sse2 - .type SwizzleBlock32u_sse2, @function -SwizzleBlock32u_sse2: - - mov %eax, 4 - - cmp %ecx, 0xffffffff - jne SwizzleBlock32u_sse2_2 - - .align 16 -SwizzleBlock32u_sse2_1: - movdqu %xmm0, [%rsi] - movdqu %xmm4, [%rsi+16] - movdqu %xmm1, [%rsi+%rdx] - movdqu %xmm5, [%rsi+%rdx+16] - - punpck(qdq, 0, 4, 1, 5, 2, 6) - - movdqa [%rdi+16*0], %xmm0 - movdqa [%rdi+16*1], %xmm2 - movdqa [%rdi+16*2], %xmm4 - movdqa [%rdi+16*3], %xmm6 - - lea %rsi, [%rsi+%rdx*2] - add %rdi, 64 - - dec %eax - jnz SwizzleBlock32u_sse2_1 - - ret - -SwizzleBlock32u_sse2_2: - - movd %xmm7, %rcx - pshufd %xmm7, %xmm7, 0 - - .align 16 -SwizzleBlock32u_sse2_3: - movdqu %xmm0, [%rsi] - movdqu %xmm4, [%rsi+16] - movdqu %xmm1, [%rsi+%rdx] - movdqu %xmm5, [%rsi+%rdx+16] - - punpck(qdq, 0, 4, 1, 5, 2, 6) - - movdqa %xmm3, %xmm7 - pshufd %xmm5, %xmm7, 0xe4 - movdqa %xmm9, %xmm7 - pshufd %xmm11, %xmm7, 0xe4 - - pandn %xmm3, [%rdi+16*0] - pand %xmm0, %xmm7 - por %xmm0, %xmm3 - movdqa [%rdi+16*0], %xmm0 - - pandn %xmm5, [%rdi+16*1] - pand %xmm2, %xmm7 - por %xmm2, %xmm5 - movdqa [%rdi+16*1], %xmm2 - - pandn %xmm9, [%rdi+16*2] - pand %xmm4, %xmm7 - por %xmm4, %xmm9 - movdqa [%rdi+16*2], %xmm4 - - pandn %xmm11, [%rdi+16*3] - pand %xmm6, %xmm7 - por %xmm6, %xmm11 - movdqa [%rdi+16*3], %xmm6 - - lea %rsi, [%rsi+%rdx*2] - add %rdi, 64 - - dec %eax - jnz SwizzleBlock32u_sse2_3 - - ret - -// -// SwizzleBlock16u_sse2 -// - -.globl SwizzleBlock16u_sse2 - .type SwizzleBlock16u_sse2, @function -SwizzleBlock16u_sse2: - mov %eax, 4 - - .align 16 -SwizzleBlock16u_sse2_1: - movdqu %xmm0, [%rsi] - movdqu %xmm1, [%rsi+16] - movdqu %xmm2, [%rsi+%rdx] - movdqu %xmm3, [%rsi+%rdx+16] - - punpck(wd, 0, 2, 1, 3, 4, 6) - punpck(qdq, 0, 4, 2, 6, 1, 5) - - movdqa [%rdi+16*0], %xmm0 - movdqa [%rdi+16*1], %xmm1 - movdqa [%rdi+16*2], %xmm4 - movdqa [%rdi+16*3], %xmm5 - - lea %rsi, [%rsi+%rdx*2] - add %rdi, 64 - - dec %eax - jnz SwizzleBlock16u_sse2_1 - - ret - -// -// SwizzleBlock8u -// - -.globl SwizzleBlock8u_sse2 - .type SwizzleBlock8u_sse2, @function -SwizzleBlock8u_sse2: - mov %ecx, 2 - - .align 16 -SwizzleBlock8u_sse2_1: - // col 0, 2 - - movdqu %xmm0, [%rsi] - movdqu %xmm2, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - pshufd %xmm1, %xmm0, 0xb1 - pshufd %xmm3, %xmm2, 0xb1 - lea %rsi, [%rsi+%rdx*2] - - punpck(bw, 0, 2, 1, 3, 4, 6) - punpck(wd, 0, 2, 4, 6, 1, 3) - punpck(qdq, 0, 1, 2, 3, 4, 5) - - movdqa [%rdi+16*0], %xmm0 - movdqa [%rdi+16*1], %xmm4 - movdqa [%rdi+16*2], %xmm1 - movdqa [%rdi+16*3], %xmm5 - - // col 1, 3 - - movdqu %xmm0, [%rsi] - movdqu %xmm2, [%rsi+%rdx] - pshufd %xmm0, %xmm0, 0xb1 - pshufd %xmm2, %xmm2, 0xb1 - lea %rsi, [%rsi+%rdx*2] - - movdqu %xmm1, [%rsi] - movdqu %xmm3, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - punpck(bw, 0, 2, 1, 3, 4, 6) - punpck(wd, 0, 2, 4, 6, 1, 3) - punpck(qdq, 0, 1, 2, 3, 4, 5) - - movdqa [%rdi+16*4], %xmm0 - movdqa [%rdi+16*5], %xmm4 - movdqa [%rdi+16*6], %xmm1 - movdqa [%rdi+16*7], %xmm5 - - add %rdi, 128 - - dec %ecx - jnz SwizzleBlock8u_sse2_1 - - ret - -// -// SwizzleBlock4u -// - -.globl SwizzleBlock4u_sse2 - .type SwizzleBlock4u_sse2, @function -SwizzleBlock4u_sse2: - - mov %ecx, 2 - - mov %eax, 0xf0f0f0f - movd %xmm7, %eax - pshufd %xmm7, %xmm7, 0 - - .align 16 -SwizzleBlock4u_sse2_1: - // col 0, 2 - - movdqu %xmm0, [%rsi] - movdqu %xmm2, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - movdqu %xmm1, [%rsi] - movdqu %xmm3, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - pshuflw %xmm1, %xmm1, 0xb1 - pshuflw %xmm3, %xmm3, 0xb1 - pshufhw %xmm1, %xmm1, 0xb1 - pshufhw %xmm3, %xmm3, 0xb1 - - punpcknbl - punpck(bw, 0, 2, 4, 6, 1, 3) - punpck(bw, 0, 2, 1, 3, 4, 6) - punpck(qdq, 0, 4, 2, 6, 1, 3) - - movdqa [%rdi+16*0], %xmm0 - movdqa [%rdi+16*1], %xmm1 - movdqa [%rdi+16*2], %xmm4 - movdqa [%rdi+16*3], %xmm3 - - // col 1, 3 - - movdqu %xmm0, [%rsi] - movdqu %xmm2, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - movdqu %xmm1, [%rsi] - movdqu %xmm3, [%rsi+%rdx] - lea %rsi, [%rsi+%rdx*2] - - pshuflw %xmm0, %xmm0, 0xb1 - pshuflw %xmm2, %xmm2, 0xb1 - pshufhw %xmm0, %xmm0, 0xb1 - pshufhw %xmm2, %xmm2, 0xb1 - - punpcknbl - punpck(bw, 0, 2, 4, 6, 1, 3) - punpck(bw, 0, 2, 1, 3, 4, 6) - punpck(qdq, 0, 4, 2, 6, 1, 3) - - movdqa [%rdi+16*4], %xmm0 - movdqa [%rdi+16*5], %xmm1 - movdqa [%rdi+16*6], %xmm4 - movdqa [%rdi+16*7], %xmm3 - - add %rdi, 128 - - dec %ecx - jnz SwizzleBlock4u_sse2_1 - - ret - - - .align 16 -s_clut16mask: - .long 0xffff0000 - .long 0xffff0000 - .long 0xffff0000 - .long 0xffff0000 - - .align 16 -s_clut16mask2: - - .long 0x0000ffff - .long 0x0000ffff - .long 0x0000ffff - .long 0x0000ffff - -.globl WriteCLUT_T16_I4_CSM1_sse2 - .type WriteCLUT_T16_I4_CSM1_sse2, @function -WriteCLUT_T16_I4_CSM1_sse2: - movdqa %xmm0, xmmword ptr [%rdi] - movdqa %xmm1, xmmword ptr [%rdi+16] - movdqa %xmm2, xmmword ptr [%rdi+32] - movdqa %xmm3, xmmword ptr [%rdi+48] - - // rearrange - pshuflw %xmm0, %xmm0, 0x88 - pshufhw %xmm0, %xmm0, 0x88 - pshuflw %xmm1, %xmm1, 0x88 - pshufhw %xmm1, %xmm1, 0x88 - pshuflw %xmm2, %xmm2, 0x88 - pshufhw %xmm2, %xmm2, 0x88 - pshuflw %xmm3, %xmm3, 0x88 - pshufhw %xmm3, %xmm3, 0x88 - - shufps %xmm0, %xmm1, 0x88 - shufps %xmm2, %xmm3, 0x88 - - pshufd %xmm0, %xmm0, 0xd8 - pshufd %xmm2, %xmm2, 0xd8 - - pxor %xmm6, %xmm6 - - test %rsi, 15 - jnz WriteUnaligned - - movdqa %xmm7, [%rip+s_clut16mask] // saves upper 16 bits - - // have to save interlaced with the old data - movdqa %xmm4, [%rsi] - movdqa %xmm5, [%rsi+32] - movhlps %xmm1, %xmm0 - movlhps %xmm0, %xmm2 // lower 8 colors - - pand %xmm4, %xmm7 - pand %xmm5, %xmm7 - - shufps %xmm1, %xmm2, 0xe4 // upper 8 colors - movdqa %xmm2, %xmm0 - movdqa %xmm3, %xmm1 - - punpcklwd %xmm0, %xmm6 - punpcklwd %xmm1, %xmm6 - por %xmm0, %xmm4 - por %xmm1, %xmm5 - - punpckhwd %xmm2, %xmm6 - punpckhwd %xmm3, %xmm6 - - movdqa [%rsi], %xmm0 - movdqa [%rsi+32], %xmm1 - - movdqa %xmm5, %xmm7 - pand %xmm7, [%rsi+16] - pand %xmm5, [%rsi+48] - - por %xmm2, %xmm7 - por %xmm3, %xmm5 - - movdqa [%rsi+16], %xmm2 - movdqa [%rsi+48], %xmm3 - jmp WriteCLUT_T16_I4_CSM1_End - -WriteUnaligned: - // %rsi is offset by 2 - sub %rsi, 2 - - movdqa %xmm7, [%rip+s_clut16mask2] // saves lower 16 bits - - // have to save interlaced with the old data - movdqa %xmm4, [%rsi] - movdqa %xmm5, [%rsi+32] - movhlps %xmm1, %xmm0 - movlhps %xmm0, %xmm2 // lower 8 colors - - pand %xmm4, %xmm7 - pand %xmm5, %xmm7 - - shufps %xmm1, %xmm2, 0xe4 // upper 8 colors - movdqa %xmm2, %xmm0 - movdqa %xmm3, %xmm1 - - punpcklwd %xmm0, %xmm6 - punpcklwd %xmm1, %xmm6 - pslld %xmm0, 16 - pslld %xmm1, 16 - por %xmm0, %xmm4 - por %xmm1, %xmm5 - - punpckhwd %xmm2, %xmm6 - punpckhwd %xmm3, %xmm6 - pslld %xmm2, 16 - pslld %xmm3, 16 - - movdqa [%rsi], %xmm0 - movdqa [%rsi+32], %xmm1 - - movdqa %xmm5, %xmm7 - pand %xmm7, [%rsi+16] - pand %xmm5, [%rsi+48] - - por %xmm2, %xmm7 - por %xmm3, %xmm5 - - movdqa [%rsi+16], %xmm2 - movdqa [%rsi+48], %xmm3 -WriteCLUT_T16_I4_CSM1_End: - ret - -#endif diff --git a/plugins/zerogs/opengl/x86-64.asm b/plugins/zerogs/opengl/x86-64.asm deleted file mode 100644 index 6d476dfc6d..0000000000 --- a/plugins/zerogs/opengl/x86-64.asm +++ /dev/null @@ -1,1091 +0,0 @@ -; Copyright (C) 2003-2005 Gabest/zerofrog -; http:;;www.gabest.org -; -; This Program is free software; you can redistribute it and/or modify -; it under the terms of the GNU General Public License as published by -; the Free Software Foundation; either version 2, or (at your option) -; any later version. -; -; This Program is distributed in the hope that it will be useful, -; but WITHOUT ANY WARRANTY; without even the implied warranty of -; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -; GNU General Public License for more details. -; -; You should have received a copy of the GNU General Public License -; along with GNU Make; see the file COPYING. If not, write to -; the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. -; http:;;www.gnu.org/copyleft/gpl.html -; -; - -extern s_clut16mask:ptr - - .code - -; mmx memcpy implementation, size has to be a multiple of 8 -; returns 0 is equal, nonzero value if not equal -; ~10 times faster than standard memcmp -; (zerofrog) -; u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize) -; rcx - src1 -; rdx - src2 -; r8d - cmpsize -memcmp_mmx proc public - cmp r8d, 32 - jl Done4 - - ; custom test first 8 to make sure things are ok - movq mm0, [rdx] - movq mm1, [rdx+8] - pcmpeqd mm0, [rcx] - pcmpeqd mm1, [rcx+8] - pand mm0, mm1 - movq mm2, [rdx+16] - pmovmskb eax, mm0 - movq mm3, [rdx+24] - - ; check if eq - cmp eax, 0ffh - je NextComp - mov eax, 1 - jmp Finish - -NextComp: - pcmpeqd mm2, [rcx+16] - pcmpeqd mm3, [rcx+24] - pand mm2, mm3 - pmovmskb eax, mm2 - - sub r8d, 32 - add rdx, 32 - add rcx, 32 - - ; check if eq - cmp eax, 0ffh - je ContinueTest - mov eax, 1 - jmp Finish - - cmp r8d, 64 - jl Done8 - -Cmp8: - movq mm0, [rdx] - movq mm1, [rdx+8] - movq mm2, [rdx+16] - movq mm3, [rdx+24] - movq mm4, [rdx+32] - movq mm5, [rdx+40] - movq mm6, [rdx+48] - movq mm7, [rdx+56] - pcmpeqd mm0, [rcx] - pcmpeqd mm1, [rcx+8] - pcmpeqd mm2, [rcx+16] - pcmpeqd mm3, [rcx+24] - pand mm0, mm1 - pcmpeqd mm4, [rcx+32] - pand mm0, mm2 - pcmpeqd mm5, [rcx+40] - pand mm0, mm3 - pcmpeqd mm6, [rcx+48] - pand mm0, mm4 - pcmpeqd mm7, [rcx+56] - pand mm0, mm5 - pand mm0, mm6 - pand mm0, mm7 - pmovmskb eax, mm0 - - ; check if eq - cmp eax, 0ffh - je Continue - mov eax, 1 - jmp Finish - -Continue: - sub r8d, 64 - add rdx, 64 - add rcx, 64 -ContinueTest: - cmp r8d, 64 - jge Cmp8 - -Done8: - test r8d, 020h - jz Done4 - movq mm0, [rdx] - movq mm1, [rdx+8] - movq mm2, [rdx+16] - movq mm3, [rdx+24] - pcmpeqd mm0, [rcx] - pcmpeqd mm1, [rcx+8] - pcmpeqd mm2, [rcx+16] - pcmpeqd mm3, [rcx+24] - pand mm0, mm1 - pand mm0, mm2 - pand mm0, mm3 - pmovmskb eax, mm0 - sub r8d, 32 - add rdx, 32 - add rcx, 32 - - ; check if eq - cmp eax, 0ffh - je Done4 - mov eax, 1 - jmp Finish - -Done4: - cmp r8d, 24 - jne Done2 - movq mm0, [rdx] - movq mm1, [rdx+8] - movq mm2, [rdx+16] - pcmpeqd mm0, [rcx] - pcmpeqd mm1, [rcx+8] - pcmpeqd mm2, [rcx+16] - pand mm0, mm1 - pand mm0, mm2 - pmovmskb eax, mm0 - - ; check if eq - cmp eax, 0ffh - je Done - mov eax, 1 - jmp Finish - -Done2: - cmp r8d, 16 - jne Done1 - - movq mm0, [rdx] - movq mm1, [rdx+8] - pcmpeqd mm0, [rcx] - pcmpeqd mm1, [rcx+8] - pand mm0, mm1 - pmovmskb eax, mm0 - - ; check if eq - cmp eax, 0ffh - je Done - mov eax, 1 - jmp Finish - -Done1: - cmp r8d, 8 - jne Done - - mov eax, [rdx] - mov rdx, [rdx+4] - cmp eax, [rcx] - je Next - mov eax, 1 - jmp Finish - -Next: - cmp rdx, [rcx+4] - je Done - mov eax, 1 - jmp Finish - -Done: - xor eax, eax - -Finish: - emms - ret - -memcmp_mmx endp - -; TestClutChangeMMX -; mov rdx, dst -; mov rcx, src -; mov r8d, entries -TestClutChangeMMX proc public - -Start: - movq mm0, [rdx] - movq mm1, [rdx+8] - pcmpeqd mm0, [rcx] - pcmpeqd mm1, [rcx+16] - - movq mm2, [rdx+16] - movq mm3, [rdx+24] - pcmpeqd mm2, [rcx+32] - pcmpeqd mm3, [rcx+48] - - pand mm0, mm1 - pand mm2, mm3 - movq mm4, [rdx+32] - movq mm5, [rdx+40] - pcmpeqd mm4, [rcx+8] - pcmpeqd mm5, [rcx+24] - - pand mm0, mm2 - pand mm4, mm5 - movq mm6, [rdx+48] - movq mm7, [rdx+56] - pcmpeqd mm6, [rcx+40] - pcmpeqd mm7, [rcx+56] - - pand mm0, mm4 - pand mm6, mm7 - pand mm0, mm6 - - pmovmskb eax, mm0 - cmp eax, 0ffh - je Continue - mov byte ptr [r9], 1 - jmp Return - -Continue: - cmp r8d, 16 - jle Return - - test r8d, 010h - jz AddRcx - sub rcx, 448 ; go back and down one column, -AddRcx: - add rcx, 256 ; go to the right block - - - jne Continue1 - add rcx, 256 ; skip whole block -Continue1: - add rdx, 64 - sub r8d, 16 - jmp Start - -Return: - emms - ret - -TestClutChangeMMX endp - -UnswizzleZ16Target proc public - pxor xmm7, xmm7 - -Z16Loop: - ;; unpack 64 bytes at a time - movdqa xmm0, [rdx] - movdqa xmm2, [rdx+16] - movdqa xmm4, [rdx+32] - movdqa xmm6, [rdx+48] - - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - movdqa xmm5, xmm4 - - punpcklwd xmm0, xmm7 - punpckhwd xmm1, xmm7 - punpcklwd xmm2, xmm7 - punpckhwd xmm3, xmm7 - - ;; start saving - movdqa [rcx], xmm0 - movdqa [rcx+16], xmm1 - - punpcklwd xmm4, xmm7 - punpckhwd xmm5, xmm7 - - movdqa [rcx+32], xmm2 - movdqa [rcx+48], xmm3 - - movdqa xmm0, xmm6 - punpcklwd xmm6, xmm7 - - movdqa [rcx+64], xmm4 - movdqa [rcx+80], xmm5 - - punpckhwd xmm0, xmm7 - - movdqa [rcx+96], xmm6 - movdqa [rcx+112], xmm0 - - add rdx, 64 - add rcx, 128 - sub r9d, 1 - jne Z16Loop - - ret -UnswizzleZ16Target endp - -; -; swizzling -; - -punpck macro op, sd0, sd2, s1, s3, d1, d3 - - movdqa @CatStr(xmm, %d1), @CatStr(xmm, %sd0) - pshufd @CatStr(xmm, %d3), @CatStr(xmm, %sd2), 0e4h - - @CatStr(punpckl, op) @CatStr(xmm, %sd0), @CatStr(xmm, %s1) - @CatStr(punpckh, op) @CatStr(xmm, %d1), @CatStr(xmm, %s1) - @CatStr(punpckl, op) @CatStr(xmm, %sd2), @CatStr(xmm, %s3) - @CatStr(punpckh, op) @CatStr(xmm, %d3), @CatStr(xmm, %s3) - - endm - -punpcknbl macro - - movdqa xmm4, xmm0 - pshufd xmm5, xmm1, 0e4h - - psllq xmm1, 4 - psrlq xmm4, 4 - - movdqa xmm6, xmm7 - pand xmm0, xmm7 - pandn xmm6, xmm1 - por xmm0, xmm6 - - movdqa xmm6, xmm7 - pand xmm4, xmm7 - pandn xmm6, xmm5 - por xmm4, xmm6 - - movdqa xmm1, xmm4 - - movdqa xmm4, xmm2 - pshufd xmm5, xmm3, 0e4h - - psllq xmm3, 4 - psrlq xmm4, 4 - - movdqa xmm6, xmm7 - pand xmm2, xmm7 - pandn xmm6, xmm3 - por xmm2, xmm6 - - movdqa xmm6, xmm7 - pand xmm4, xmm7 - pandn xmm6, xmm5 - por xmm4, xmm6 - - movdqa xmm3, xmm4 - - punpck bw, 0, 2, 1, 3, 4, 6 - - endm - -punpcknbh macro - - movdqa xmm12, xmm8 - pshufd xmm13, xmm9, 0e4h - - psllq xmm9, 4 - psrlq xmm12, 4 - - movdqa xmm14, xmm15 - pand xmm8, xmm15 - pandn xmm14, xmm9 - por xmm8, xmm14 - - movdqa xmm14, xmm15 - pand xmm12, xmm15 - pandn xmm14, xmm13 - por xmm12, xmm14 - - movdqa xmm9, xmm12 - - movdqa xmm12, xmm10 - pshufd xmm13, xmm11, 0e4h - - psllq xmm11, 4 - psrlq xmm12, 4 - - movdqa xmm14, xmm15 - pand xmm10, xmm15 - pandn xmm14, xmm11 - por xmm10, xmm14 - - movdqa xmm14, xmm15 - pand xmm12, xmm15 - pandn xmm14, xmm13 - por xmm12, xmm14 - - movdqa xmm11, xmm12 - - punpck bw, 8, 10, 9, 11, 12, 14 - - endm - -; -; SwizzleBlock32_sse2 -; - -SwizzleBlock32_sse2 proc public - - push rsi - push rdi - - mov rdi, rcx - mov rsi, rdx - mov rcx, 4 - - cmp r9d, 0ffffffffh - jne SwizzleBlock32_sse2@WM - - align 16 -@@: - movdqa xmm0, [rsi] - movdqa xmm4, [rsi+16] - movdqa xmm1, [rsi+r8] - movdqa xmm5, [rsi+r8+16] - - punpck qdq, 0, 4, 1, 5, 2, 6 - - movdqa [rdi+16*0], xmm0 - movdqa [rdi+16*1], xmm2 - movdqa [rdi+16*2], xmm4 - movdqa [rdi+16*3], xmm6 - - lea rsi, [rsi+r8*2] - add rdi, 64 - - dec rcx - jnz @B - - pop rdi - pop rsi - - ret - -SwizzleBlock32_sse2@WM: - - movd xmm7, r9d - pshufd xmm7, xmm7, 0 - - align 16 -@@: - movdqa xmm0, [rsi] - movdqa xmm4, [rsi+16] - movdqa xmm1, [rsi+r8] - movdqa xmm5, [rsi+r8+16] - - punpck qdq, 0, 4, 1, 5, 2, 6 - - movdqa xmm3, xmm7 - pshufd xmm5, xmm7, 0e4h - movdqa xmm9, xmm7 - pshufd xmm11, xmm7, 0e4h - - pandn xmm3, [rdi+16*0] - pand xmm0, xmm7 - por xmm0, xmm3 - movdqa [rdi+16*0], xmm0 - - pandn xmm5, [rdi+16*1] - pand xmm2, xmm7 - por xmm2, xmm5 - movdqa [rdi+16*1], xmm2 - - pandn xmm9, [rdi+16*2] - pand xmm4, xmm7 - por xmm4, xmm9 - movdqa [rdi+16*2], xmm4 - - pandn xmm11, [rdi+16*3] - pand xmm6, xmm7 - por xmm6, xmm11 - movdqa [edi+16*3], xmm6 - - lea rsi, [rsi+r8*2] - add rdi, 64 - - dec rcx - jnz @B - - pop rdi - pop rsi - - ret - -SwizzleBlock32_sse2 endp - -; -; SwizzleBlock16_sse2 -; - -SwizzleBlock16_sse2 proc public - - push rsi - push rdi - - mov rdi, rcx - mov rsi, rdx - mov rcx, 4 - - align 16 -@@: - movdqa xmm0, [rsi] - movdqa xmm1, [rsi+16] - movdqa xmm2, [rsi+r8] - movdqa xmm3, [rsi+r8+16] - - punpck wd, 0, 2, 1, 3, 4, 6 - punpck qdq, 0, 4, 2, 6, 1, 5 - - movdqa [rdi+16*0], xmm0 - movdqa [rdi+16*1], xmm1 - movdqa [rdi+16*2], xmm4 - movdqa [rdi+16*3], xmm5 - - lea rsi, [rsi+r8*2] - add rdi, 64 - - dec rcx - jnz @B - - pop rdi - pop rsi - - ret - -SwizzleBlock16_sse2 endp - -; -; SwizzleBlock8 -; - -SwizzleBlock8_sse2 proc public - - push rsi - push rdi - - mov rdi, rcx - mov rsi, rdx - mov ecx, 2 - - align 16 -@@: - ; col 0, 2 - - movdqa xmm0, [rsi] - movdqa xmm2, [rsi+r8] - lea rsi, [rsi+r8*2] - - pshufd xmm1, [rsi], 0b1h - pshufd xmm3, [rsi+r8], 0b1h - lea rsi, [rsi+r8*2] - - punpck bw, 0, 2, 1, 3, 4, 6 - punpck wd, 0, 2, 4, 6, 1, 3 - punpck qdq, 0, 1, 2, 3, 4, 5 - - movdqa [rdi+16*0], xmm0 - movdqa [rdi+16*1], xmm4 - movdqa [rdi+16*2], xmm1 - movdqa [rdi+16*3], xmm5 - - ; col 1, 3 - - pshufd xmm0, [rsi], 0b1h - pshufd xmm2, [rsi+r8], 0b1h - lea rsi, [rsi+r8*2] - - movdqa xmm1, [rsi] - movdqa xmm3, [rsi+r8] - lea rsi, [rsi+r8*2] - - punpck bw, 0, 2, 1, 3, 4, 6 - punpck wd, 0, 2, 4, 6, 1, 3 - punpck qdq, 0, 1, 2, 3, 4, 5 - - movdqa [rdi+16*4], xmm0 - movdqa [rdi+16*5], xmm4 - movdqa [rdi+16*6], xmm1 - movdqa [rdi+16*7], xmm5 - - add edi, 128 - - dec rcx - jnz @B - - pop rdi - pop rsi - - ret - -SwizzleBlock8_sse2 endp - -; -; SwizzleBlock4 -; - -SwizzleBlock4_sse2 proc public - - push rsi - push rdi - - mov rdi, rcx - mov rsi, rdx - mov rcx, 2 - - mov eax, 0f0f0f0fh - movd xmm7, eax - pshufd xmm7, xmm7, 0 - - align 16 -@@: - ; col 0, 2 - - movdqa xmm0, [rsi] - movdqa xmm2, [rsi+r8] - lea rsi, [rsi+r8*2] - - movdqa xmm1, [rsi] - movdqa xmm3, [rsi+r8] - lea rsi, [rsi+r8*2] - - pshuflw xmm1, xmm1, 0b1h - pshuflw xmm3, xmm3, 0b1h - pshufhw xmm1, xmm1, 0b1h - pshufhw xmm3, xmm3, 0b1h - - punpcknbl - punpck bw, 0, 2, 4, 6, 1, 3 - punpck bw, 0, 2, 1, 3, 4, 6 - punpck qdq, 0, 4, 2, 6, 1, 3 - - movdqa [rdi+16*0], xmm0 - movdqa [rdi+16*1], xmm1 - movdqa [rdi+16*2], xmm4 - movdqa [rdi+16*3], xmm3 - - ; col 1, 3 - - movdqa xmm0, [rsi] - movdqa xmm2, [rsi+r8] - lea esi, [rsi+r8*2] - - movdqa xmm1, [rsi] - movdqa xmm3, [rsi+r8] - lea rsi, [rsi+r8*2] - - pshuflw xmm0, xmm0, 0b1h - pshuflw xmm2, xmm2, 0b1h - pshufhw xmm0, xmm0, 0b1h - pshufhw xmm2, xmm2, 0b1h - - punpcknbl - punpck bw, 0, 2, 4, 6, 1, 3 - punpck bw, 0, 2, 1, 3, 4, 6 - punpck qdq, 0, 4, 2, 6, 1, 3 - - movdqa [rdi+16*4], xmm0 - movdqa [rdi+16*5], xmm1 - movdqa [rdi+16*6], xmm4 - movdqa [rdi+16*7], xmm3 - - add rdi, 128 - - dec rcx - jnz @B - - pop rdi - pop rsi - - ret - -SwizzleBlock4_sse2 endp - -; -; swizzling with unaligned reads -; - -; -; SwizzleBlock32u_sse2 -; - -SwizzleBlock32u_sse2 proc public - - push rsi - push rdi - - mov rdi, rcx - mov rsi, rdx - mov rcx, 4 - - cmp r9d, 0ffffffffh - jne SwizzleBlock32u_sse2@WM - - align 16 -@@: - movdqu xmm0, [rsi] - movdqu xmm4, [rsi+16] - movdqu xmm1, [rsi+r8] - movdqu xmm5, [rsi+r8+16] - - punpck qdq, 0, 4, 1, 5, 2, 6 - - movdqa [rdi+16*0], xmm0 - movdqa [rdi+16*1], xmm2 - movdqa [rdi+16*2], xmm4 - movdqa [rdi+16*3], xmm6 - - lea rsi, [rsi+r8*2] - add rdi, 64 - - dec rcx - jnz @B - - pop rdi - pop rsi - - ret - -SwizzleBlock32u_sse2@WM: - - movd xmm7, r9d - pshufd xmm7, xmm7, 0 - - align 16 -@@: - movdqu xmm0, [rsi] - movdqu xmm4, [rsi+16] - movdqu xmm1, [rsi+r8] - movdqu xmm5, [rsi+r8+16] - - punpck qdq, 0, 4, 1, 5, 2, 6 - - movdqa xmm3, xmm7 - pshufd xmm5, xmm7, 0e4h - movdqa xmm9, xmm7 - pshufd xmm11, xmm7, 0e4h - - pandn xmm3, [rdi+16*0] - pand xmm0, xmm7 - por xmm0, xmm3 - movdqa [rdi+16*0], xmm0 - - pandn xmm5, [rdi+16*1] - pand xmm2, xmm7 - por xmm2, xmm5 - movdqa [rdi+16*1], xmm2 - - pandn xmm9, [rdi+16*2] - pand xmm4, xmm7 - por xmm4, xmm9 - movdqa [rdi+16*2], xmm4 - - pandn xmm11, [rdi+16*3] - pand xmm6, xmm7 - por xmm6, xmm11 - movdqa [edi+16*3], xmm6 - - lea rsi, [rsi+r8*2] - add rdi, 64 - - dec rcx - jnz @B - - pop rdi - pop rsi - - ret - -SwizzleBlock32u_sse2 endp - -; -; SwizzleBlock16u_sse2 -; - -SwizzleBlock16u_sse2 proc public - - push rsi - push rdi - - mov rdi, rcx - mov rsi, rdx - mov rcx, 4 - - align 16 -@@: - movdqu xmm0, [rsi] - movdqu xmm1, [rsi+16] - movdqu xmm2, [rsi+r8] - movdqu xmm3, [rsi+r8+16] - - punpck wd, 0, 2, 1, 3, 4, 6 - punpck qdq, 0, 4, 2, 6, 1, 5 - - movdqa [rdi+16*0], xmm0 - movdqa [rdi+16*1], xmm1 - movdqa [rdi+16*2], xmm4 - movdqa [rdi+16*3], xmm5 - - lea rsi, [rsi+r8*2] - add rdi, 64 - - dec rcx - jnz @B - - pop rdi - pop rsi - - ret - -SwizzleBlock16u_sse2 endp - -; -; SwizzleBlock8u -; - -SwizzleBlock8u_sse2 proc public - - push rsi - push rdi - - mov rdi, rcx - mov rsi, rdx - mov ecx, 2 - - align 16 -@@: - ; col 0, 2 - - movdqu xmm0, [rsi] - movdqu xmm2, [rsi+r8] - lea rsi, [rsi+r8*2] - - pshufd xmm1, xmm0, 0b1h - pshufd xmm3, xmm2, 0b1h - lea rsi, [rsi+r8*2] - - punpck bw, 0, 2, 1, 3, 4, 6 - punpck wd, 0, 2, 4, 6, 1, 3 - punpck qdq, 0, 1, 2, 3, 4, 5 - - movdqa [rdi+16*0], xmm0 - movdqa [rdi+16*1], xmm4 - movdqa [rdi+16*2], xmm1 - movdqa [rdi+16*3], xmm5 - - ; col 1, 3 - - movdqu xmm0, [rsi] - movdqu xmm2, [rsi+r8] - pshufd xmm0, xmm0, 0b1h - pshufd xmm2, xmm2, 0b1h - lea rsi, [rsi+r8*2] - - movdqu xmm1, [rsi] - movdqu xmm3, [rsi+r8] - lea rsi, [rsi+r8*2] - - punpck bw, 0, 2, 1, 3, 4, 6 - punpck wd, 0, 2, 4, 6, 1, 3 - punpck qdq, 0, 1, 2, 3, 4, 5 - - movdqa [rdi+16*4], xmm0 - movdqa [rdi+16*5], xmm4 - movdqa [rdi+16*6], xmm1 - movdqa [rdi+16*7], xmm5 - - add edi, 128 - - dec rcx - jnz @B - - pop rdi - pop rsi - - ret - -SwizzleBlock8u_sse2 endp - -; -; SwizzleBlock4u -; - -SwizzleBlock4u_sse2 proc public - - push rsi - push rdi - - mov rdi, rcx - mov rsi, rdx - mov rcx, 2 - - mov eax, 0f0f0f0fh - movd xmm7, eax - pshufd xmm7, xmm7, 0 - - align 16 -@@: - ; col 0, 2 - - movdqu xmm0, [rsi] - movdqu xmm2, [rsi+r8] - lea rsi, [rsi+r8*2] - - movdqu xmm1, [rsi] - movdqu xmm3, [rsi+r8] - lea rsi, [rsi+r8*2] - - pshuflw xmm1, xmm1, 0b1h - pshuflw xmm3, xmm3, 0b1h - pshufhw xmm1, xmm1, 0b1h - pshufhw xmm3, xmm3, 0b1h - - punpcknbl - punpck bw, 0, 2, 4, 6, 1, 3 - punpck bw, 0, 2, 1, 3, 4, 6 - punpck qdq, 0, 4, 2, 6, 1, 3 - - movdqa [rdi+16*0], xmm0 - movdqa [rdi+16*1], xmm1 - movdqa [rdi+16*2], xmm4 - movdqa [rdi+16*3], xmm3 - - ; col 1, 3 - - movdqu xmm0, [rsi] - movdqu xmm2, [rsi+r8] - lea esi, [rsi+r8*2] - - movdqu xmm1, [rsi] - movdqu xmm3, [rsi+r8] - lea rsi, [rsi+r8*2] - - pshuflw xmm0, xmm0, 0b1h - pshuflw xmm2, xmm2, 0b1h - pshufhw xmm0, xmm0, 0b1h - pshufhw xmm2, xmm2, 0b1h - - punpcknbl - punpck bw, 0, 2, 4, 6, 1, 3 - punpck bw, 0, 2, 1, 3, 4, 6 - punpck qdq, 0, 4, 2, 6, 1, 3 - - movdqa [rdi+16*4], xmm0 - movdqa [rdi+16*5], xmm1 - movdqa [rdi+16*6], xmm4 - movdqa [rdi+16*7], xmm3 - - add rdi, 128 - - dec rcx - jnz @B - - pop rdi - pop rsi - - ret - -SwizzleBlock4u_sse2 endp - -WriteCLUT_T16_I4_CSM1_sse2 proc public - movdqa xmm0, XMMWORD PTR [rcx] - movdqa xmm1, XMMWORD PTR [rcx+16] - movdqa xmm2, XMMWORD PTR [rcx+32] - movdqa xmm3, XMMWORD PTR [rcx+48] - - ;; rearrange - pshuflw xmm0, xmm0, 088h - pshufhw xmm0, xmm0, 088h - pshuflw xmm1, xmm1, 088h - pshufhw xmm1, xmm1, 088h - pshuflw xmm2, xmm2, 088h - pshufhw xmm2, xmm2, 088h - pshuflw xmm3, xmm3, 088h - pshufhw xmm3, xmm3, 088h - - shufps xmm0, xmm1, 088h - shufps xmm2, xmm3, 088h - - pshufd xmm0, xmm0, 0d8h - pshufd xmm2, xmm2, 0d8h - - pxor xmm6, xmm6 - mov rax, offset s_clut16mask - - test rdx, 15 - jnz WriteUnaligned - - movdqa xmm7, XMMWORD PTR [rax] ;; saves upper 16 bits - - ;; have to save interlaced with the old data - movdqa xmm4, [rdx] - movdqa xmm5, [rdx+32] - movhlps xmm1, xmm0 - movlhps xmm0, xmm2 ;; lower 8 colors - - pand xmm4, xmm7 - pand xmm5, xmm7 - - shufps xmm1, xmm2, 0e4h ;; upper 8 colors - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - punpcklwd xmm0, xmm6 - punpcklwd xmm1, xmm6 - por xmm0, xmm4 - por xmm1, xmm5 - - punpckhwd xmm2, xmm6 - punpckhwd xmm3, xmm6 - - movdqa [rdx], xmm0 - movdqa [rdx+32], xmm1 - - movdqa xmm5, xmm7 - pand xmm7, [rdx+16] - pand xmm5, [rdx+48] - - por xmm2, xmm7 - por xmm3, xmm5 - - movdqa [rdx+16], xmm2 - movdqa [rdx+48], xmm3 - jmp WriteCLUT_T16_I4_CSM1_End - -WriteUnaligned: - ;; rdx is offset by 2 - sub rdx, 2 - - movdqa xmm7, XMMWORD PTR [rax+16] ;; saves lower 16 bits - - ;; have to save interlaced with the old data - movdqa xmm4, [rdx] - movdqa xmm5, [rdx+32] - movhlps xmm1, xmm0 - movlhps xmm0, xmm2 ;; lower 8 colors - - pand xmm4, xmm7 - pand xmm5, xmm7 - - shufps xmm1, xmm2, 0e4h ;; upper 8 colors - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - punpcklwd xmm0, xmm6 - punpcklwd xmm1, xmm6 - pslld xmm0, 16 - pslld xmm1, 16 - por xmm0, xmm4 - por xmm1, xmm5 - - punpckhwd xmm2, xmm6 - punpckhwd xmm3, xmm6 - pslld xmm2, 16 - pslld xmm3, 16 - - movdqa [rdx], xmm0 - movdqa [rdx+32], xmm1 - - movdqa xmm5, xmm7 - pand xmm7, [rdx+16] - pand xmm5, [rdx+48] - - por xmm2, xmm7 - por xmm3, xmm5 - - movdqa [rdx+16], xmm2 - movdqa [rdx+48], xmm3 -WriteCLUT_T16_I4_CSM1_End: - ret - -WriteCLUT_T16_I4_CSM1_sse2 endp - -end \ No newline at end of file diff --git a/plugins/zerogs/opengl/x86.cpp b/plugins/zerogs/opengl/x86.cpp index 26b988b2fd..712eacd9d4 100644 --- a/plugins/zerogs/opengl/x86.cpp +++ b/plugins/zerogs/opengl/x86.cpp @@ -23,7 +23,7 @@ #include "Mem.h" #include "x86.h" -#if defined(ZEROGS_SSE2) && (defined(_WIN32)||defined(__x86_64__)) +#if defined(ZEROGS_SSE2) && defined(_WIN32) #include #include #endif @@ -292,7 +292,7 @@ _FrameSwizzleBlock(A4_, (src[2*j]+src[2*j+1]+src[2*j+srcpitch]+src[2*j+srcpitch+ // } //} -#if (defined(_WIN32)||defined(__x86_64__)) +#if defined(_WIN32) extern "C" void __fastcall WriteCLUT_T32_I8_CSM1_sse2(u32* vm, u32* clut) { @@ -351,8 +351,6 @@ PCSX2_ALIGNED16(int s_clut16mask[8]) = { 0xffff0000, 0xffff0000, 0xffff0000, 0xf 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff}; } -#if !defined(__x86_64__) - extern "C" void __fastcall WriteCLUT_T16_I4_CSM1_sse2(u32* vm, u32* clut) { __asm { @@ -467,7 +465,6 @@ WriteUnaligned: End: } } -#endif // __x86_64__ #endif // _MSC_VER #endif // ZEROGS_SSE2 diff --git a/plugins/zerogs/opengl/zerogs.cpp b/plugins/zerogs/opengl/zerogs.cpp index 04ab21c810..ae3cca5c57 100644 --- a/plugins/zerogs/opengl/zerogs.cpp +++ b/plugins/zerogs/opengl/zerogs.cpp @@ -5108,9 +5108,6 @@ void ZeroGS::ExtWrite() //////////// // Caches // //////////// -#ifdef __x86_64__ -extern "C" void TestClutChangeMMX(void* src, void* dst, int entries, void* pret); -#endif bool ZeroGS::CheckChangeInClut(u32 highdword, u32 psm) { @@ -5148,9 +5145,6 @@ bool ZeroGS::CheckChangeInClut(u32 highdword, u32 psm) // do a fast test with MMX #ifdef _MSC_VER -#ifdef __x86_64__ - TestClutChangeMMX(dst, src, entries, &bRet); -#else int storeebx; __asm { mov storeebx, ebx @@ -5215,63 +5209,9 @@ Return: emms mov ebx, storeebx } -#endif // __x86_64__ #else // linux -#ifdef __x86_64__ - __asm__( - ".intel_syntax\n" -"Start:\n" - "movq %%mm0, [%%rcx]\n" - "movq %%mm1, [%%rcx+8]\n" - "pcmpeqd %%mm0, [%%rdx]\n" - "pcmpeqd %%mm1, [%%rdx+16]\n" - "movq %%mm2, [%%rcx+16]\n" - "movq %%mm3, [%%rcx+24]\n" - "pcmpeqd %%mm2, [%%rdx+32]\n" - "pcmpeqd %%mm3, [%%rdx+48]\n" - "pand %%mm0, %%mm1\n" - "pand %%mm2, %%mm3\n" - "movq %%mm4, [%%rcx+32]\n" - "movq %%mm5, [%%rcx+40]\n" - "pcmpeqd %%mm4, [%%rdx+8]\n" - "pcmpeqd %%mm5, [%%rdx+24]\n" - "pand %%mm0, %%mm2\n" - "pand %%mm4, %%mm5\n" - "movq %%mm6, [%%rcx+48]\n" - "movq %%mm7, [%%rcx+56]\n" - "pcmpeqd %%mm6, [%%rdx+40]\n" - "pcmpeqd %%mm7, [%%rdx+56]\n" - "pand %%mm0, %%mm4\n" - "pand %%mm6, %%mm7\n" - "pand %%mm0, %%mm6\n" - "pmovmskb %%eax, %%mm0\n" - "cmp %%eax, 0xff\n" - "je Continue\n" - ".att_syntax\n" - "movb $1, %0\n" - ".intel_syntax\n" - "jmp Return\n" -"Continue:\n" - "cmp %%rbx, 16\n" - "jle Return\n" - "test %%rbx, 0x10\n" - "jz AddRcx\n" - "sub %%rdx, 448\n" // go back and down one column -"AddRcx:\n" - "add %%rdx, 256\n" // go to the right block - "cmp %%rbx, 0x90\n" - "jne Continue1\n" - "add %%rdx, 256\n" // skip whole block -"Continue1:\n" - "add %%rcx, 64\n" - "sub %%rbx, 16\n" - "jmp Start\n" -"Return:\n" - "emms\n" - ".att_syntax\n" : "=m"(bRet) : "c"(dst), "d"(src), "b"(entries) : "rax", "memory");// Breaks -fPIC -#else // do a fast test with MMX __asm__( ".intel_syntax\n" @@ -5324,7 +5264,6 @@ Return: "Return:\n" "emms\n" ".att_syntax\n" : "=m"(bRet) : "c"(dst), "d"(src), "b"(entries) : "eax", "memory"); // Breaks -fPIC -#endif // __x86_64__ #endif // _WIN32