diff --git a/pcsx2/Linux/pcsx2.cbp b/pcsx2/Linux/pcsx2.cbp
index 31edeeccee..a9d7eb013b 100644
--- a/pcsx2/Linux/pcsx2.cbp
+++ b/pcsx2/Linux/pcsx2.cbp
@@ -1,565 +1,567 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/pcsx2/gui/wxAppWithHelpers.cpp b/pcsx2/gui/wxAppWithHelpers.cpp
index 9958f3ba30..92d77dc49c 100644
--- a/pcsx2/gui/wxAppWithHelpers.cpp
+++ b/pcsx2/gui/wxAppWithHelpers.cpp
@@ -121,7 +121,7 @@ bool wxAppWithHelpers::OnInit()
Connect( pxEvt_MessageBox, pxMessageBoxEventThing (wxAppWithHelpers::OnMessageBox) );
Connect( pxEvt_Assertion, pxMessageBoxEventThing (wxAppWithHelpers::OnMessageBox) );
Connect( pxEvt_Ping, pxPingEventHandler (wxAppWithHelpers::OnPingEvent) );
- Connect( wxEvt_Idle, wxIdleEventHandler (wxAppWithHelpers::OnIdleEvent) );
+ Connect( wxEVT_IDLE, wxIdleEventHandler (wxAppWithHelpers::OnIdleEvent) );
Connect( m_PingTimer.GetId(), wxEVT_TIMER, wxTimerEventHandler(wxAppWithHelpers::OnPingTimeout) );
diff --git a/pcsx2/gui/wxAppWithHelpers.h b/pcsx2/gui/wxAppWithHelpers.h
index 6df63ce54b..18842d1aa0 100644
--- a/pcsx2/gui/wxAppWithHelpers.h
+++ b/pcsx2/gui/wxAppWithHelpers.h
@@ -244,9 +244,11 @@ public:
pxAssertionEvent& SetInstData( MsgboxEventResult& instdata );
pxAssertionEvent& SetStacktrace( const wxString& trace );
+ ~pxAssertionEvent() throw() { }
protected:
virtual int _DoDialog() const;
+
};
// --------------------------------------------------------------------------------------
diff --git a/plugins/zerogs/opengl/Mem.h b/plugins/zerogs/opengl/Mem.h
index 63317313a8..206b361de5 100644
--- a/plugins/zerogs/opengl/Mem.h
+++ b/plugins/zerogs/opengl/Mem.h
@@ -359,11 +359,8 @@ static __forceinline void writePixel32_0(void* pmem, int x, int y, u32 pixel, u3
static __forceinline void writePixel24_0(void* pmem, int x, int y, u32 pixel, u32 bw) {
u8 *buf = (u8*)&((u32*)pmem)[getPixelAddress32_0(x, y, bw)];
u8 *pix = (u8*)&pixel;
-#if defined(_MSC_VER) && defined(__x86_64__)
- memcpy(buf, pix, 3);
-#else
+
buf[0] = pix[0]; buf[1] = pix[1]; buf[2] = pix[2];
-#endif
}
static __forceinline void writePixel16_0(void* pmem, int x, int y, u32 pixel, u32 bw) {
@@ -406,11 +403,7 @@ static __forceinline void writePixel32Z_0(void* pmem, int x, int y, u32 pixel, u
static __forceinline void writePixel24Z_0(void* pmem, int x, int y, u32 pixel, u32 bw) {
u8 *buf = (u8*)pmem + 4*getPixelAddress32Z_0(x, y, bw);
u8 *pix = (u8*)&pixel;
-#if defined(_MSC_VER) && defined(__x86_64__)
- memcpy(buf, pix, 3);
-#else
buf[0] = pix[0]; buf[1] = pix[1]; buf[2] = pix[2];
-#endif
}
static __forceinline void writePixel16Z_0(void* pmem, int x, int y, u32 pixel, u32 bw) {
diff --git a/plugins/zerogs/opengl/Regs.h b/plugins/zerogs/opengl/Regs.h
index 40a4b4b065..c76fba67a0 100644
--- a/plugins/zerogs/opengl/Regs.h
+++ b/plugins/zerogs/opengl/Regs.h
@@ -23,11 +23,7 @@
typedef void (__fastcall *GIFRegHandler)(u32* data);
#else
-#ifdef __x86_64__
-typedef void (*GIFRegHandler)(u32* data);
-#else
typedef void (__fastcall *GIFRegHandler)(u32* data);
-#endif
#endif
diff --git a/plugins/zerogs/opengl/memcpy_amd.cpp b/plugins/zerogs/opengl/memcpy_amd.cpp
index dfd1119330..df0efd5478 100644
--- a/plugins/zerogs/opengl/memcpy_amd.cpp
+++ b/plugins/zerogs/opengl/memcpy_amd.cpp
@@ -78,7 +78,7 @@ MEMCPY_AMD.CPP
extern "C" {
#include "PS2Etypes.h"
-#if defined(_MSC_VER) && !defined(__x86_64__)
+#if defined(_MSC_VER)
void * memcpy_amd(void *dest, const void *src, size_t n)
{
@@ -461,7 +461,7 @@ End:
}
#else // _MSC_VER
-// assume gcc or mingw or win x64
+// assume gcc
#include
#include
diff --git a/plugins/zerogs/opengl/targets.cpp b/plugins/zerogs/opengl/targets.cpp
index 874ce4b8c7..c006084981 100644
--- a/plugins/zerogs/opengl/targets.cpp
+++ b/plugins/zerogs/opengl/targets.cpp
@@ -1602,10 +1602,6 @@ inline list::iterator ZeroGS::CMemoryTargetMngr::DestroyTargetIte
return it;
}
-#if defined(_MSC_VER) && defined(__x86_64__)
-extern "C" void UnswizzleZ16Target(void* dst, void* src, int iters);
-#endif
-
ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info& tex0, int forcevalidate)
{
int nbStart, nbEnd;
@@ -1915,9 +1911,6 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
#if defined(_MSC_VER)
-#if defined(__x86_64__)
- UnswizzleZ16Target(dst, src, iters);
-#else
__asm {
mov edx, iters
pxor xmm7, xmm7
@@ -1966,7 +1959,6 @@ Z16Loop:
sub edx, 1
jne Z16Loop
}
-#endif // __x86_64__
#else // _MSC_VER
__asm__(".intel_syntax\n"
diff --git a/plugins/zerogs/opengl/x86-64.S b/plugins/zerogs/opengl/x86-64.S
deleted file mode 100644
index 6f221e7b33..0000000000
--- a/plugins/zerogs/opengl/x86-64.S
+++ /dev/null
@@ -1,906 +0,0 @@
-## Copyright (C) 2005-2006 zerofrog(@gmail.com)
-#
-# This Program is free software you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation either ve%rsion 2, or (at your option)
-# any later ve%rsion.
-#
-# This Program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with GNU Make see the file COPYING. If not, write to
-# the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
-# http://www.gnu.org/copyleft/gpl.html
-#
-#
-.intel_syntax
-
-## mmx memcpy implementation, size has to be a multiple of 8
-## returns 0 is equal, nonzero value if not equal
-## ~10 times faster than standard memcmp
-## (zerofrog)
-## u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize)
-## %rdi - src1
-## %rsi - src2
-## edx - cmpsize
-.globl memcmp_mmx
- .type memcmp_mmx, @function
-memcmp_mmx:
- cmp %edx, 32
- jl Done4
-
- ## custom test first 8 to make sure things are ok
- movq %mm0, [%rsi]
- movq %mm1, [%rsi+8]
- pcmpeqd %mm0, [%rdi]
- pcmpeqd %mm1, [%rdi+8]
- pand %mm0, %mm1
- movq %mm2, [%rsi+16]
- pmovmskb %eax, %mm0
- movq %mm3, [%rsi+24]
-
- // check if eq
- cmp %eax, 0xff
- je NextComp
- mov %eax, 1
- jmp End
-
-NextComp:
- pcmpeqd %mm2, [%rdi+16]
- pcmpeqd %mm3, [%rdi+24]
- pand %mm2, %mm3
- pmovmskb %eax, %mm2
-
- sub %edx, 32
- add %rsi, 32
- add %rdi, 32
-
- // check if eq
- cmp %eax, 0xff
- je ContinueTest
- mov %eax, 1
- jmp End
-
- cmp %edx, 64
- jl Done8
-
-Cmp8:
- movq %mm0, [%rsi]
- movq %mm1, [%rsi+8]
- movq %mm2, [%rsi+16]
- movq %mm3, [%rsi+24]
- movq %mm4, [%rsi+32]
- movq %mm5, [%rsi+40]
- movq %mm6, [%rsi+48]
- movq %mm7, [%rsi+56]
- pcmpeqd %mm0, [%rdi]
- pcmpeqd %mm1, [%rdi+8]
- pcmpeqd %mm2, [%rdi+16]
- pcmpeqd %mm3, [%rdi+24]
- pand %mm0, %mm1
- pcmpeqd %mm4, [%rdi+32]
- pand %mm0, %mm2
- pcmpeqd %mm5, [%rdi+40]
- pand %mm0, %mm3
- pcmpeqd %mm6, [%rdi+48]
- pand %mm0, %mm4
- pcmpeqd %mm7, [%rdi+56]
- pand %mm0, %mm5
- pand %mm0, %mm6
- pand %mm0, %mm7
- pmovmskb %eax, %mm0
-
- // check if eq
- cmp %eax, 0xff
- je Continue
- mov %eax, 1
- jmp End
-
-Continue:
- sub %edx, 64
- add %rsi, 64
- add %rdi, 64
-ContinueTest:
- cmp %edx, 64
- jge Cmp8
-
-Done8:
- test %edx, 0x20
- jz Done4
- movq %mm0, [%rsi]
- movq %mm1, [%rsi+8]
- movq %mm2, [%rsi+16]
- movq %mm3, [%rsi+24]
- pcmpeqd %mm0, [%rdi]
- pcmpeqd %mm1, [%rdi+8]
- pcmpeqd %mm2, [%rdi+16]
- pcmpeqd %mm3, [%rdi+24]
- pand %mm0, %mm1
- pand %mm0, %mm2
- pand %mm0, %mm3
- pmovmskb %eax, %mm0
- sub %edx, 32
- add %rsi, 32
- add %rdi, 32
-
- // check if eq
- cmp %eax, 0xff
- je Done4
- mov %eax, 1
- jmp End
-
-Done4:
- cmp %edx, 24
- jne Done2
- movq %mm0, [%rsi]
- movq %mm1, [%rsi+8]
- movq %mm2, [%rsi+16]
- pcmpeqd %mm0, [%rdi]
- pcmpeqd %mm1, [%rdi+8]
- pcmpeqd %mm2, [%rdi+16]
- pand %mm0, %mm1
- pand %mm0, %mm2
- pmovmskb %eax, %mm0
-
- // check if eq
- cmp %eax, 0xff
- je Done
- mov %eax, 1
- jmp End
-
-Done2:
- cmp %edx, 16
- jne Done1
-
- movq %mm0, [%rsi]
- movq %mm1, [%rsi+8]
- pcmpeqd %mm0, [%rdi]
- pcmpeqd %mm1, [%rdi+8]
- pand %mm0, %mm1
- pmovmskb %eax, %mm0
-
- // check if eq
- cmp %eax, 0xff
- je Done
- mov %eax, 1
- jmp End
-
-Done1:
- cmp %edx, 8
- jne Done
-
- mov %eax, [%rsi]
- mov %rsi, [%rsi+4]
- cmp %eax, [%rdi]
- je Next
- mov %eax, 1
- jmp End
-
-Next:
- cmp %rsi, [%rdi+4]
- je Done
- mov %eax, 1
- jmp End
-
-Done:
- xor %eax, %eax
-
-End:
- emms
- ret
-
-#ifdef ZEROGS_SSE2
-// SSE2 extensions
-
-#define punpck(op, sd0, sd2, s1, s3, d1, d3) \
- movdqa %xmm##d1, %xmm##sd0; \
- pshufd %xmm##d3, %xmm##sd2, 0xe4; \
- punpckl##op %xmm##sd0, %xmm##s1; \
- punpckh##op %xmm##d1, %xmm##s1; \
- punpckl##op %xmm##sd2, %xmm##s3; \
- punpckh##op %xmm##d3, %xmm##s3; \
-
-#define punpcknbl \
- movdqa %xmm4, %xmm0; \
- pshufd %xmm5, %xmm1, 0xe4; \
- \
- psllq %xmm1, 4; \
- psrlq %xmm4, 4; \
- \
- movdqa %xmm6, %xmm7; \
- pand %xmm0, %xmm7; \
- pandn %xmm6, %xmm1; \
- por %xmm0, %xmm6; \
- \
- movdqa %xmm6, %xmm7; \
- pand %xmm4, %xmm7; \
- pandn %xmm6, %xmm5; \
- por %xmm4, %xmm6; \
- \
- movdqa %xmm1, %xmm4; \
- \
- movdqa %xmm4, %xmm2; \
- pshufd %xmm5, %xmm3, 0xe4; \
- \
- psllq %xmm3, 4; \
- psrlq %xmm4, 4; \
- \
- movdqa %xmm6, %xmm7; \
- pand %xmm2, %xmm7; \
- pandn %xmm6, %xmm3; \
- por %xmm2, %xmm6; \
- \
- movdqa %xmm6, %xmm7; \
- pand %xmm4, %xmm7; \
- pandn %xmm6, %xmm5; \
- por %xmm4, %xmm6; \
- \
- movdqa %xmm3, %xmm4; \
- \
- punpck(bw, 0, 2, 1, 3, 4, 6); \
-
-#define punpcknbh \
- movdqa %xmm12, %xmm8; \
- pshufd %xmm13, %xmm9, 0xe4; \
- \
- psllq %xmm9, 4; \
- psrlq %xmm12, 4; \
- \
- movdqa %xmm14, %xmm15; \
- pand %xmm8, %xmm15; \
- pandn %xmm14, %xmm9; \
- por %xmm8, %xmm14; \
- \
- movdqa %xmm14, %xmm15; \
- pand %xmm12, %xmm15; \
- pandn %xmm14, %xmm13; \
- por %xmm12, %xmm14; \
- \
- movdqa %xmm9, %xmm12; \
- \
- movdqa %xmm12, %xmm10; \
- pshufd %xmm13, %xmm11, 0xe4; \
- \
- psllq %xmm11, 4; \
- psrlq %xmm12, 4; \
- \
- movdqa %xmm14, %xmm15; \
- pand %xmm10, %xmm15; \
- pandn %xmm14, %xmm11; \
- por %xmm10, %xmm14; \
- \
- movdqa %xmm14, %xmm15; \
- pand %xmm12, %xmm15; \
- pandn %xmm14, %xmm13; \
- por %xmm12, %xmm14; \
- \
- movdqa %xmm11, %xmm12; \
- \
- punpck(bw, 8, 10, 9, 11, 12, 14); \
-
-//
-// SwizzleBlock32_sse2
-//
-
-.globl SwizzleBlock32_sse2
- .type SwizzleBlock32_sse2, @function
-SwizzleBlock32_sse2:
-
- mov %eax, 4
-
- cmp %ecx, 0xffffffff
- jne SwizzleBlock32_sse2_2
-
- .align 16
-SwizzleBlock32_sse2_1:
- movdqa %xmm0, [%rsi]
- movdqa %xmm4, [%rsi+16]
- movdqa %xmm1, [%rsi+%rdx]
- movdqa %xmm5, [%rsi+%rdx+16]
-
- punpck(qdq, 0, 4, 1, 5, 2, 6)
-
- movdqa [%rdi+16*0], %xmm0
- movdqa [%rdi+16*1], %xmm2
- movdqa [%rdi+16*2], %xmm4
- movdqa [%rdi+16*3], %xmm6
-
- lea %rsi, [%rsi+%rdx*2]
- add %rdi, 64
-
- dec %eax
- jnz SwizzleBlock32_sse2_1
-
- ret
-
-SwizzleBlock32_sse2_2:
-
- movd %xmm7, %rcx
- pshufd %xmm7, %xmm7, 0
-
- .align 16
-SwizzleBlock32_sse2_3:
- movdqa %xmm0, [%rsi]
- movdqa %xmm4, [%rsi+16]
- movdqa %xmm1, [%rsi+%rdx]
- movdqa %xmm5, [%rsi+%rdx+16]
-
- punpck(qdq, 0, 4, 1, 5, 2, 6)
-
- movdqa %xmm3, %xmm7
- pshufd %xmm5, %xmm7, 0xe4
- movdqa %xmm9, %xmm7
- pshufd %xmm11, %xmm7, 0xe4
-
- pandn %xmm3, [%rdi+16*0]
- pand %xmm0, %xmm7
- por %xmm0, %xmm3
- movdqa [%rdi+16*0], %xmm0
-
- pandn %xmm5, [%rdi+16*1]
- pand %xmm2, %xmm7
- por %xmm2, %xmm5
- movdqa [%rdi+16*1], %xmm2
-
- pandn %xmm9, [%rdi+16*2]
- pand %xmm4, %xmm7
- por %xmm4, %xmm9
- movdqa [%rdi+16*2], %xmm4
-
- pandn %xmm11, [%rdi+16*3]
- pand %xmm6, %xmm7
- por %xmm6, %xmm11
- movdqa [%rdi+16*3], %xmm6
-
- lea %rsi, [%rsi+%rdx*2]
- add %rdi, 64
-
- dec %eax
- jnz SwizzleBlock32_sse2_3
-
- ret
-
-//
-// SwizzleBlock16_sse2
-//
-
-.globl SwizzleBlock16_sse2
- .type SwizzleBlock16_sse2, @function
-SwizzleBlock16_sse2:
-
- mov %eax, 4
-
- .align 16
-SwizzleBlock16_sse2_1:
- movdqa %xmm0, [%rsi]
- movdqa %xmm1, [%rsi+16]
- movdqa %xmm2, [%rsi+%rdx]
- movdqa %xmm3, [%rsi+%rdx+16]
-
- punpck(wd, 0, 2, 1, 3, 4, 6)
- punpck(qdq, 0, 4, 2, 6, 1, 5)
-
- movdqa [%rdi+16*0], %xmm0
- movdqa [%rdi+16*1], %xmm1
- movdqa [%rdi+16*2], %xmm4
- movdqa [%rdi+16*3], %xmm5
-
- lea %rsi, [%rsi+%rdx*2]
- add %rdi, 64
-
- dec %eax
- jnz SwizzleBlock16_sse2_1
-
- ret
-
-//
-// SwizzleBlock8
-//
-
-.globl SwizzleBlock8_sse2
- .type SwizzleBlock8_sse2, @function
-SwizzleBlock8_sse2:
-
- mov %ecx, 2
-
- .align 16
-SwizzleBlock8_sse2_1:
- // col 0, 2
-
- movdqa %xmm0, [%rsi]
- movdqa %xmm2, [%rsi+%rdx]
- lea %rsi, [%rsi+%rdx*2]
-
- pshufd %xmm1, [%rsi], 0xb1
- pshufd %xmm3, [%rsi+%rdx], 0xb1
- lea %rsi, [%rsi+%rdx*2]
-
- punpck(bw, 0, 2, 1, 3, 4, 6)
- punpck(wd, 0, 2, 4, 6, 1, 3)
- punpck(qdq, 0, 1, 2, 3, 4, 5)
-
- movdqa [%rdi+16*0], %xmm0
- movdqa [%rdi+16*1], %xmm4
- movdqa [%rdi+16*2], %xmm1
- movdqa [%rdi+16*3], %xmm5
-
- // col 1, 3
-
- pshufd %xmm0, [%rsi], 0xb1
- pshufd %xmm2, [%rsi+%rdx], 0xb1
- lea %rsi, [%rsi+%rdx*2]
-
- movdqa %xmm1, [%rsi]
- movdqa %xmm3, [%rsi+%rdx]
- lea %rsi, [%rsi+%rdx*2]
-
- punpck(bw, 0, 2, 1, 3, 4, 6)
- punpck(wd, 0, 2, 4, 6, 1, 3)
- punpck(qdq, 0, 1, 2, 3, 4, 5)
-
- movdqa [%rdi+16*4], %xmm0
- movdqa [%rdi+16*5], %xmm4
- movdqa [%rdi+16*6], %xmm1
- movdqa [%rdi+16*7], %xmm5
-
- add %rdi, 128
-
- dec %ecx
- jnz SwizzleBlock8_sse2_1
-
- ret
-
-//
-// SwizzleBlock4
-//
-
-.globl SwizzleBlock4_sse2
- .type SwizzleBlock4_sse2, @function
-SwizzleBlock4_sse2:
-
- mov %ecx, 2
-
- mov %eax, 0x0f0f0f0f
- movd %xmm7, %eax
- pshufd %xmm7, %xmm7, 0
-
- .align 16
-SwizzleBlock4_sse2_1:
- // col 0, 2
-
- movdqa %xmm0, [%rsi]
- movdqa %xmm2, [%rsi+%rdx]
- lea %rsi, [%rsi+%rdx*2]
-
- movdqa %xmm1, [%rsi]
- movdqa %xmm3, [%rsi+%rdx]
- lea %rsi, [%rsi+%rdx*2]
-
- pshuflw %xmm1, %xmm1, 0xb1
- pshuflw %xmm3, %xmm3, 0xb1
- pshufhw %xmm1, %xmm1, 0xb1
- pshufhw %xmm3, %xmm3, 0xb1
-
- punpcknbl
- punpck(bw, 0, 2, 4, 6, 1, 3)
- punpck(bw, 0, 2, 1, 3, 4, 6)
- punpck(qdq, 0, 4, 2, 6, 1, 3)
-
- movdqa [%rdi+16*0], %xmm0
- movdqa [%rdi+16*1], %xmm1
- movdqa [%rdi+16*2], %xmm4
- movdqa [%rdi+16*3], %xmm3
-
- // col 1, 3
-
- movdqa %xmm0, [%rsi]
- movdqa %xmm2, [%rsi+%rdx]
- lea %rsi, [%rsi+%rdx*2]
-
- movdqa %xmm1, [%rsi]
- movdqa %xmm3, [%rsi+%rdx]
- lea %rsi, [%rsi+%rdx*2]
-
- pshuflw %xmm0, %xmm0, 0xb1
- pshuflw %xmm2, %xmm2, 0xb1
- pshufhw %xmm0, %xmm0, 0xb1
- pshufhw %xmm2, %xmm2, 0xb1
-
- punpcknbl
- punpck(bw, 0, 2, 4, 6, 1, 3)
- punpck(bw, 0, 2, 1, 3, 4, 6)
- punpck(qdq, 0, 4, 2, 6, 1, 3)
-
- movdqa [%rdi+16*4], %xmm0
- movdqa [%rdi+16*5], %xmm1
- movdqa [%rdi+16*6], %xmm4
- movdqa [%rdi+16*7], %xmm3
-
- add %rdi, 128
-
- dec %ecx
- jnz SwizzleBlock4_sse2_1
-
- ret
-
-//
-// swizzling with unaligned reads
-//
-
-//
-// SwizzleBlock32u_sse2
-//
-
-.globl SwizzleBlock32u_sse2
- .type SwizzleBlock32u_sse2, @function
-SwizzleBlock32u_sse2:
-
- mov %eax, 4
-
- cmp %ecx, 0xffffffff
- jne SwizzleBlock32u_sse2_2
-
- .align 16
-SwizzleBlock32u_sse2_1:
- movdqu %xmm0, [%rsi]
- movdqu %xmm4, [%rsi+16]
- movdqu %xmm1, [%rsi+%rdx]
- movdqu %xmm5, [%rsi+%rdx+16]
-
- punpck(qdq, 0, 4, 1, 5, 2, 6)
-
- movdqa [%rdi+16*0], %xmm0
- movdqa [%rdi+16*1], %xmm2
- movdqa [%rdi+16*2], %xmm4
- movdqa [%rdi+16*3], %xmm6
-
- lea %rsi, [%rsi+%rdx*2]
- add %rdi, 64
-
- dec %eax
- jnz SwizzleBlock32u_sse2_1
-
- ret
-
-SwizzleBlock32u_sse2_2:
-
- movd %xmm7, %rcx
- pshufd %xmm7, %xmm7, 0
-
- .align 16
-SwizzleBlock32u_sse2_3:
- movdqu %xmm0, [%rsi]
- movdqu %xmm4, [%rsi+16]
- movdqu %xmm1, [%rsi+%rdx]
- movdqu %xmm5, [%rsi+%rdx+16]
-
- punpck(qdq, 0, 4, 1, 5, 2, 6)
-
- movdqa %xmm3, %xmm7
- pshufd %xmm5, %xmm7, 0xe4
- movdqa %xmm9, %xmm7
- pshufd %xmm11, %xmm7, 0xe4
-
- pandn %xmm3, [%rdi+16*0]
- pand %xmm0, %xmm7
- por %xmm0, %xmm3
- movdqa [%rdi+16*0], %xmm0
-
- pandn %xmm5, [%rdi+16*1]
- pand %xmm2, %xmm7
- por %xmm2, %xmm5
- movdqa [%rdi+16*1], %xmm2
-
- pandn %xmm9, [%rdi+16*2]
- pand %xmm4, %xmm7
- por %xmm4, %xmm9
- movdqa [%rdi+16*2], %xmm4
-
- pandn %xmm11, [%rdi+16*3]
- pand %xmm6, %xmm7
- por %xmm6, %xmm11
- movdqa [%rdi+16*3], %xmm6
-
- lea %rsi, [%rsi+%rdx*2]
- add %rdi, 64
-
- dec %eax
- jnz SwizzleBlock32u_sse2_3
-
- ret
-
-//
-// SwizzleBlock16u_sse2
-//
-
-.globl SwizzleBlock16u_sse2
- .type SwizzleBlock16u_sse2, @function
-SwizzleBlock16u_sse2:
- mov %eax, 4
-
- .align 16
-SwizzleBlock16u_sse2_1:
- movdqu %xmm0, [%rsi]
- movdqu %xmm1, [%rsi+16]
- movdqu %xmm2, [%rsi+%rdx]
- movdqu %xmm3, [%rsi+%rdx+16]
-
- punpck(wd, 0, 2, 1, 3, 4, 6)
- punpck(qdq, 0, 4, 2, 6, 1, 5)
-
- movdqa [%rdi+16*0], %xmm0
- movdqa [%rdi+16*1], %xmm1
- movdqa [%rdi+16*2], %xmm4
- movdqa [%rdi+16*3], %xmm5
-
- lea %rsi, [%rsi+%rdx*2]
- add %rdi, 64
-
- dec %eax
- jnz SwizzleBlock16u_sse2_1
-
- ret
-
-//
-// SwizzleBlock8u
-//
-
-.globl SwizzleBlock8u_sse2
- .type SwizzleBlock8u_sse2, @function
-SwizzleBlock8u_sse2:
- mov %ecx, 2
-
- .align 16
-SwizzleBlock8u_sse2_1:
- // col 0, 2
-
- movdqu %xmm0, [%rsi]
- movdqu %xmm2, [%rsi+%rdx]
- lea %rsi, [%rsi+%rdx*2]
-
- pshufd %xmm1, %xmm0, 0xb1
- pshufd %xmm3, %xmm2, 0xb1
- lea %rsi, [%rsi+%rdx*2]
-
- punpck(bw, 0, 2, 1, 3, 4, 6)
- punpck(wd, 0, 2, 4, 6, 1, 3)
- punpck(qdq, 0, 1, 2, 3, 4, 5)
-
- movdqa [%rdi+16*0], %xmm0
- movdqa [%rdi+16*1], %xmm4
- movdqa [%rdi+16*2], %xmm1
- movdqa [%rdi+16*3], %xmm5
-
- // col 1, 3
-
- movdqu %xmm0, [%rsi]
- movdqu %xmm2, [%rsi+%rdx]
- pshufd %xmm0, %xmm0, 0xb1
- pshufd %xmm2, %xmm2, 0xb1
- lea %rsi, [%rsi+%rdx*2]
-
- movdqu %xmm1, [%rsi]
- movdqu %xmm3, [%rsi+%rdx]
- lea %rsi, [%rsi+%rdx*2]
-
- punpck(bw, 0, 2, 1, 3, 4, 6)
- punpck(wd, 0, 2, 4, 6, 1, 3)
- punpck(qdq, 0, 1, 2, 3, 4, 5)
-
- movdqa [%rdi+16*4], %xmm0
- movdqa [%rdi+16*5], %xmm4
- movdqa [%rdi+16*6], %xmm1
- movdqa [%rdi+16*7], %xmm5
-
- add %rdi, 128
-
- dec %ecx
- jnz SwizzleBlock8u_sse2_1
-
- ret
-
-//
-// SwizzleBlock4u
-//
-
-.globl SwizzleBlock4u_sse2
- .type SwizzleBlock4u_sse2, @function
-SwizzleBlock4u_sse2:
-
- mov %ecx, 2
-
- mov %eax, 0xf0f0f0f
- movd %xmm7, %eax
- pshufd %xmm7, %xmm7, 0
-
- .align 16
-SwizzleBlock4u_sse2_1:
- // col 0, 2
-
- movdqu %xmm0, [%rsi]
- movdqu %xmm2, [%rsi+%rdx]
- lea %rsi, [%rsi+%rdx*2]
-
- movdqu %xmm1, [%rsi]
- movdqu %xmm3, [%rsi+%rdx]
- lea %rsi, [%rsi+%rdx*2]
-
- pshuflw %xmm1, %xmm1, 0xb1
- pshuflw %xmm3, %xmm3, 0xb1
- pshufhw %xmm1, %xmm1, 0xb1
- pshufhw %xmm3, %xmm3, 0xb1
-
- punpcknbl
- punpck(bw, 0, 2, 4, 6, 1, 3)
- punpck(bw, 0, 2, 1, 3, 4, 6)
- punpck(qdq, 0, 4, 2, 6, 1, 3)
-
- movdqa [%rdi+16*0], %xmm0
- movdqa [%rdi+16*1], %xmm1
- movdqa [%rdi+16*2], %xmm4
- movdqa [%rdi+16*3], %xmm3
-
- // col 1, 3
-
- movdqu %xmm0, [%rsi]
- movdqu %xmm2, [%rsi+%rdx]
- lea %rsi, [%rsi+%rdx*2]
-
- movdqu %xmm1, [%rsi]
- movdqu %xmm3, [%rsi+%rdx]
- lea %rsi, [%rsi+%rdx*2]
-
- pshuflw %xmm0, %xmm0, 0xb1
- pshuflw %xmm2, %xmm2, 0xb1
- pshufhw %xmm0, %xmm0, 0xb1
- pshufhw %xmm2, %xmm2, 0xb1
-
- punpcknbl
- punpck(bw, 0, 2, 4, 6, 1, 3)
- punpck(bw, 0, 2, 1, 3, 4, 6)
- punpck(qdq, 0, 4, 2, 6, 1, 3)
-
- movdqa [%rdi+16*4], %xmm0
- movdqa [%rdi+16*5], %xmm1
- movdqa [%rdi+16*6], %xmm4
- movdqa [%rdi+16*7], %xmm3
-
- add %rdi, 128
-
- dec %ecx
- jnz SwizzleBlock4u_sse2_1
-
- ret
-
-
- .align 16
-s_clut16mask:
- .long 0xffff0000
- .long 0xffff0000
- .long 0xffff0000
- .long 0xffff0000
-
- .align 16
-s_clut16mask2:
-
- .long 0x0000ffff
- .long 0x0000ffff
- .long 0x0000ffff
- .long 0x0000ffff
-
-.globl WriteCLUT_T16_I4_CSM1_sse2
- .type WriteCLUT_T16_I4_CSM1_sse2, @function
-WriteCLUT_T16_I4_CSM1_sse2:
- movdqa %xmm0, xmmword ptr [%rdi]
- movdqa %xmm1, xmmword ptr [%rdi+16]
- movdqa %xmm2, xmmword ptr [%rdi+32]
- movdqa %xmm3, xmmword ptr [%rdi+48]
-
- // rearrange
- pshuflw %xmm0, %xmm0, 0x88
- pshufhw %xmm0, %xmm0, 0x88
- pshuflw %xmm1, %xmm1, 0x88
- pshufhw %xmm1, %xmm1, 0x88
- pshuflw %xmm2, %xmm2, 0x88
- pshufhw %xmm2, %xmm2, 0x88
- pshuflw %xmm3, %xmm3, 0x88
- pshufhw %xmm3, %xmm3, 0x88
-
- shufps %xmm0, %xmm1, 0x88
- shufps %xmm2, %xmm3, 0x88
-
- pshufd %xmm0, %xmm0, 0xd8
- pshufd %xmm2, %xmm2, 0xd8
-
- pxor %xmm6, %xmm6
-
- test %rsi, 15
- jnz WriteUnaligned
-
- movdqa %xmm7, [%rip+s_clut16mask] // saves upper 16 bits
-
- // have to save interlaced with the old data
- movdqa %xmm4, [%rsi]
- movdqa %xmm5, [%rsi+32]
- movhlps %xmm1, %xmm0
- movlhps %xmm0, %xmm2 // lower 8 colors
-
- pand %xmm4, %xmm7
- pand %xmm5, %xmm7
-
- shufps %xmm1, %xmm2, 0xe4 // upper 8 colors
- movdqa %xmm2, %xmm0
- movdqa %xmm3, %xmm1
-
- punpcklwd %xmm0, %xmm6
- punpcklwd %xmm1, %xmm6
- por %xmm0, %xmm4
- por %xmm1, %xmm5
-
- punpckhwd %xmm2, %xmm6
- punpckhwd %xmm3, %xmm6
-
- movdqa [%rsi], %xmm0
- movdqa [%rsi+32], %xmm1
-
- movdqa %xmm5, %xmm7
- pand %xmm7, [%rsi+16]
- pand %xmm5, [%rsi+48]
-
- por %xmm2, %xmm7
- por %xmm3, %xmm5
-
- movdqa [%rsi+16], %xmm2
- movdqa [%rsi+48], %xmm3
- jmp WriteCLUT_T16_I4_CSM1_End
-
-WriteUnaligned:
- // %rsi is offset by 2
- sub %rsi, 2
-
- movdqa %xmm7, [%rip+s_clut16mask2] // saves lower 16 bits
-
- // have to save interlaced with the old data
- movdqa %xmm4, [%rsi]
- movdqa %xmm5, [%rsi+32]
- movhlps %xmm1, %xmm0
- movlhps %xmm0, %xmm2 // lower 8 colors
-
- pand %xmm4, %xmm7
- pand %xmm5, %xmm7
-
- shufps %xmm1, %xmm2, 0xe4 // upper 8 colors
- movdqa %xmm2, %xmm0
- movdqa %xmm3, %xmm1
-
- punpcklwd %xmm0, %xmm6
- punpcklwd %xmm1, %xmm6
- pslld %xmm0, 16
- pslld %xmm1, 16
- por %xmm0, %xmm4
- por %xmm1, %xmm5
-
- punpckhwd %xmm2, %xmm6
- punpckhwd %xmm3, %xmm6
- pslld %xmm2, 16
- pslld %xmm3, 16
-
- movdqa [%rsi], %xmm0
- movdqa [%rsi+32], %xmm1
-
- movdqa %xmm5, %xmm7
- pand %xmm7, [%rsi+16]
- pand %xmm5, [%rsi+48]
-
- por %xmm2, %xmm7
- por %xmm3, %xmm5
-
- movdqa [%rsi+16], %xmm2
- movdqa [%rsi+48], %xmm3
-WriteCLUT_T16_I4_CSM1_End:
- ret
-
-#endif
diff --git a/plugins/zerogs/opengl/x86-64.asm b/plugins/zerogs/opengl/x86-64.asm
deleted file mode 100644
index 6d476dfc6d..0000000000
--- a/plugins/zerogs/opengl/x86-64.asm
+++ /dev/null
@@ -1,1091 +0,0 @@
-; Copyright (C) 2003-2005 Gabest/zerofrog
-; http:;;www.gabest.org
-;
-; This Program is free software; you can redistribute it and/or modify
-; it under the terms of the GNU General Public License as published by
-; the Free Software Foundation; either version 2, or (at your option)
-; any later version.
-;
-; This Program is distributed in the hope that it will be useful,
-; but WITHOUT ANY WARRANTY; without even the implied warranty of
-; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-; GNU General Public License for more details.
-;
-; You should have received a copy of the GNU General Public License
-; along with GNU Make; see the file COPYING. If not, write to
-; the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
-; http:;;www.gnu.org/copyleft/gpl.html
-;
-;
-
-extern s_clut16mask:ptr
-
- .code
-
-; mmx memcpy implementation, size has to be a multiple of 8
-; returns 0 is equal, nonzero value if not equal
-; ~10 times faster than standard memcmp
-; (zerofrog)
-; u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize)
-; rcx - src1
-; rdx - src2
-; r8d - cmpsize
-memcmp_mmx proc public
- cmp r8d, 32
- jl Done4
-
- ; custom test first 8 to make sure things are ok
- movq mm0, [rdx]
- movq mm1, [rdx+8]
- pcmpeqd mm0, [rcx]
- pcmpeqd mm1, [rcx+8]
- pand mm0, mm1
- movq mm2, [rdx+16]
- pmovmskb eax, mm0
- movq mm3, [rdx+24]
-
- ; check if eq
- cmp eax, 0ffh
- je NextComp
- mov eax, 1
- jmp Finish
-
-NextComp:
- pcmpeqd mm2, [rcx+16]
- pcmpeqd mm3, [rcx+24]
- pand mm2, mm3
- pmovmskb eax, mm2
-
- sub r8d, 32
- add rdx, 32
- add rcx, 32
-
- ; check if eq
- cmp eax, 0ffh
- je ContinueTest
- mov eax, 1
- jmp Finish
-
- cmp r8d, 64
- jl Done8
-
-Cmp8:
- movq mm0, [rdx]
- movq mm1, [rdx+8]
- movq mm2, [rdx+16]
- movq mm3, [rdx+24]
- movq mm4, [rdx+32]
- movq mm5, [rdx+40]
- movq mm6, [rdx+48]
- movq mm7, [rdx+56]
- pcmpeqd mm0, [rcx]
- pcmpeqd mm1, [rcx+8]
- pcmpeqd mm2, [rcx+16]
- pcmpeqd mm3, [rcx+24]
- pand mm0, mm1
- pcmpeqd mm4, [rcx+32]
- pand mm0, mm2
- pcmpeqd mm5, [rcx+40]
- pand mm0, mm3
- pcmpeqd mm6, [rcx+48]
- pand mm0, mm4
- pcmpeqd mm7, [rcx+56]
- pand mm0, mm5
- pand mm0, mm6
- pand mm0, mm7
- pmovmskb eax, mm0
-
- ; check if eq
- cmp eax, 0ffh
- je Continue
- mov eax, 1
- jmp Finish
-
-Continue:
- sub r8d, 64
- add rdx, 64
- add rcx, 64
-ContinueTest:
- cmp r8d, 64
- jge Cmp8
-
-Done8:
- test r8d, 020h
- jz Done4
- movq mm0, [rdx]
- movq mm1, [rdx+8]
- movq mm2, [rdx+16]
- movq mm3, [rdx+24]
- pcmpeqd mm0, [rcx]
- pcmpeqd mm1, [rcx+8]
- pcmpeqd mm2, [rcx+16]
- pcmpeqd mm3, [rcx+24]
- pand mm0, mm1
- pand mm0, mm2
- pand mm0, mm3
- pmovmskb eax, mm0
- sub r8d, 32
- add rdx, 32
- add rcx, 32
-
- ; check if eq
- cmp eax, 0ffh
- je Done4
- mov eax, 1
- jmp Finish
-
-Done4:
- cmp r8d, 24
- jne Done2
- movq mm0, [rdx]
- movq mm1, [rdx+8]
- movq mm2, [rdx+16]
- pcmpeqd mm0, [rcx]
- pcmpeqd mm1, [rcx+8]
- pcmpeqd mm2, [rcx+16]
- pand mm0, mm1
- pand mm0, mm2
- pmovmskb eax, mm0
-
- ; check if eq
- cmp eax, 0ffh
- je Done
- mov eax, 1
- jmp Finish
-
-Done2:
- cmp r8d, 16
- jne Done1
-
- movq mm0, [rdx]
- movq mm1, [rdx+8]
- pcmpeqd mm0, [rcx]
- pcmpeqd mm1, [rcx+8]
- pand mm0, mm1
- pmovmskb eax, mm0
-
- ; check if eq
- cmp eax, 0ffh
- je Done
- mov eax, 1
- jmp Finish
-
-Done1:
- cmp r8d, 8
- jne Done
-
- mov eax, [rdx]
- mov rdx, [rdx+4]
- cmp eax, [rcx]
- je Next
- mov eax, 1
- jmp Finish
-
-Next:
- cmp rdx, [rcx+4]
- je Done
- mov eax, 1
- jmp Finish
-
-Done:
- xor eax, eax
-
-Finish:
- emms
- ret
-
-memcmp_mmx endp
-
-; TestClutChangeMMX
-; mov rdx, dst
-; mov rcx, src
-; mov r8d, entries
-TestClutChangeMMX proc public
-
-Start:
- movq mm0, [rdx]
- movq mm1, [rdx+8]
- pcmpeqd mm0, [rcx]
- pcmpeqd mm1, [rcx+16]
-
- movq mm2, [rdx+16]
- movq mm3, [rdx+24]
- pcmpeqd mm2, [rcx+32]
- pcmpeqd mm3, [rcx+48]
-
- pand mm0, mm1
- pand mm2, mm3
- movq mm4, [rdx+32]
- movq mm5, [rdx+40]
- pcmpeqd mm4, [rcx+8]
- pcmpeqd mm5, [rcx+24]
-
- pand mm0, mm2
- pand mm4, mm5
- movq mm6, [rdx+48]
- movq mm7, [rdx+56]
- pcmpeqd mm6, [rcx+40]
- pcmpeqd mm7, [rcx+56]
-
- pand mm0, mm4
- pand mm6, mm7
- pand mm0, mm6
-
- pmovmskb eax, mm0
- cmp eax, 0ffh
- je Continue
- mov byte ptr [r9], 1
- jmp Return
-
-Continue:
- cmp r8d, 16
- jle Return
-
- test r8d, 010h
- jz AddRcx
- sub rcx, 448 ; go back and down one column,
-AddRcx:
- add rcx, 256 ; go to the right block
-
-
- jne Continue1
- add rcx, 256 ; skip whole block
-Continue1:
- add rdx, 64
- sub r8d, 16
- jmp Start
-
-Return:
- emms
- ret
-
-TestClutChangeMMX endp
-
-UnswizzleZ16Target proc public
- pxor xmm7, xmm7
-
-Z16Loop:
- ;; unpack 64 bytes at a time
- movdqa xmm0, [rdx]
- movdqa xmm2, [rdx+16]
- movdqa xmm4, [rdx+32]
- movdqa xmm6, [rdx+48]
-
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- movdqa xmm5, xmm4
-
- punpcklwd xmm0, xmm7
- punpckhwd xmm1, xmm7
- punpcklwd xmm2, xmm7
- punpckhwd xmm3, xmm7
-
- ;; start saving
- movdqa [rcx], xmm0
- movdqa [rcx+16], xmm1
-
- punpcklwd xmm4, xmm7
- punpckhwd xmm5, xmm7
-
- movdqa [rcx+32], xmm2
- movdqa [rcx+48], xmm3
-
- movdqa xmm0, xmm6
- punpcklwd xmm6, xmm7
-
- movdqa [rcx+64], xmm4
- movdqa [rcx+80], xmm5
-
- punpckhwd xmm0, xmm7
-
- movdqa [rcx+96], xmm6
- movdqa [rcx+112], xmm0
-
- add rdx, 64
- add rcx, 128
- sub r9d, 1
- jne Z16Loop
-
- ret
-UnswizzleZ16Target endp
-
-;
-; swizzling
-;
-
-punpck macro op, sd0, sd2, s1, s3, d1, d3
-
- movdqa @CatStr(xmm, %d1), @CatStr(xmm, %sd0)
- pshufd @CatStr(xmm, %d3), @CatStr(xmm, %sd2), 0e4h
-
- @CatStr(punpckl, op) @CatStr(xmm, %sd0), @CatStr(xmm, %s1)
- @CatStr(punpckh, op) @CatStr(xmm, %d1), @CatStr(xmm, %s1)
- @CatStr(punpckl, op) @CatStr(xmm, %sd2), @CatStr(xmm, %s3)
- @CatStr(punpckh, op) @CatStr(xmm, %d3), @CatStr(xmm, %s3)
-
- endm
-
-punpcknbl macro
-
- movdqa xmm4, xmm0
- pshufd xmm5, xmm1, 0e4h
-
- psllq xmm1, 4
- psrlq xmm4, 4
-
- movdqa xmm6, xmm7
- pand xmm0, xmm7
- pandn xmm6, xmm1
- por xmm0, xmm6
-
- movdqa xmm6, xmm7
- pand xmm4, xmm7
- pandn xmm6, xmm5
- por xmm4, xmm6
-
- movdqa xmm1, xmm4
-
- movdqa xmm4, xmm2
- pshufd xmm5, xmm3, 0e4h
-
- psllq xmm3, 4
- psrlq xmm4, 4
-
- movdqa xmm6, xmm7
- pand xmm2, xmm7
- pandn xmm6, xmm3
- por xmm2, xmm6
-
- movdqa xmm6, xmm7
- pand xmm4, xmm7
- pandn xmm6, xmm5
- por xmm4, xmm6
-
- movdqa xmm3, xmm4
-
- punpck bw, 0, 2, 1, 3, 4, 6
-
- endm
-
-punpcknbh macro
-
- movdqa xmm12, xmm8
- pshufd xmm13, xmm9, 0e4h
-
- psllq xmm9, 4
- psrlq xmm12, 4
-
- movdqa xmm14, xmm15
- pand xmm8, xmm15
- pandn xmm14, xmm9
- por xmm8, xmm14
-
- movdqa xmm14, xmm15
- pand xmm12, xmm15
- pandn xmm14, xmm13
- por xmm12, xmm14
-
- movdqa xmm9, xmm12
-
- movdqa xmm12, xmm10
- pshufd xmm13, xmm11, 0e4h
-
- psllq xmm11, 4
- psrlq xmm12, 4
-
- movdqa xmm14, xmm15
- pand xmm10, xmm15
- pandn xmm14, xmm11
- por xmm10, xmm14
-
- movdqa xmm14, xmm15
- pand xmm12, xmm15
- pandn xmm14, xmm13
- por xmm12, xmm14
-
- movdqa xmm11, xmm12
-
- punpck bw, 8, 10, 9, 11, 12, 14
-
- endm
-
-;
-; SwizzleBlock32_sse2
-;
-
-SwizzleBlock32_sse2 proc public
-
- push rsi
- push rdi
-
- mov rdi, rcx
- mov rsi, rdx
- mov rcx, 4
-
- cmp r9d, 0ffffffffh
- jne SwizzleBlock32_sse2@WM
-
- align 16
-@@:
- movdqa xmm0, [rsi]
- movdqa xmm4, [rsi+16]
- movdqa xmm1, [rsi+r8]
- movdqa xmm5, [rsi+r8+16]
-
- punpck qdq, 0, 4, 1, 5, 2, 6
-
- movdqa [rdi+16*0], xmm0
- movdqa [rdi+16*1], xmm2
- movdqa [rdi+16*2], xmm4
- movdqa [rdi+16*3], xmm6
-
- lea rsi, [rsi+r8*2]
- add rdi, 64
-
- dec rcx
- jnz @B
-
- pop rdi
- pop rsi
-
- ret
-
-SwizzleBlock32_sse2@WM:
-
- movd xmm7, r9d
- pshufd xmm7, xmm7, 0
-
- align 16
-@@:
- movdqa xmm0, [rsi]
- movdqa xmm4, [rsi+16]
- movdqa xmm1, [rsi+r8]
- movdqa xmm5, [rsi+r8+16]
-
- punpck qdq, 0, 4, 1, 5, 2, 6
-
- movdqa xmm3, xmm7
- pshufd xmm5, xmm7, 0e4h
- movdqa xmm9, xmm7
- pshufd xmm11, xmm7, 0e4h
-
- pandn xmm3, [rdi+16*0]
- pand xmm0, xmm7
- por xmm0, xmm3
- movdqa [rdi+16*0], xmm0
-
- pandn xmm5, [rdi+16*1]
- pand xmm2, xmm7
- por xmm2, xmm5
- movdqa [rdi+16*1], xmm2
-
- pandn xmm9, [rdi+16*2]
- pand xmm4, xmm7
- por xmm4, xmm9
- movdqa [rdi+16*2], xmm4
-
- pandn xmm11, [rdi+16*3]
- pand xmm6, xmm7
- por xmm6, xmm11
- movdqa [edi+16*3], xmm6
-
- lea rsi, [rsi+r8*2]
- add rdi, 64
-
- dec rcx
- jnz @B
-
- pop rdi
- pop rsi
-
- ret
-
-SwizzleBlock32_sse2 endp
-
-;
-; SwizzleBlock16_sse2
-;
-
-SwizzleBlock16_sse2 proc public
-
- push rsi
- push rdi
-
- mov rdi, rcx
- mov rsi, rdx
- mov rcx, 4
-
- align 16
-@@:
- movdqa xmm0, [rsi]
- movdqa xmm1, [rsi+16]
- movdqa xmm2, [rsi+r8]
- movdqa xmm3, [rsi+r8+16]
-
- punpck wd, 0, 2, 1, 3, 4, 6
- punpck qdq, 0, 4, 2, 6, 1, 5
-
- movdqa [rdi+16*0], xmm0
- movdqa [rdi+16*1], xmm1
- movdqa [rdi+16*2], xmm4
- movdqa [rdi+16*3], xmm5
-
- lea rsi, [rsi+r8*2]
- add rdi, 64
-
- dec rcx
- jnz @B
-
- pop rdi
- pop rsi
-
- ret
-
-SwizzleBlock16_sse2 endp
-
-;
-; SwizzleBlock8
-;
-
-SwizzleBlock8_sse2 proc public
-
- push rsi
- push rdi
-
- mov rdi, rcx
- mov rsi, rdx
- mov ecx, 2
-
- align 16
-@@:
- ; col 0, 2
-
- movdqa xmm0, [rsi]
- movdqa xmm2, [rsi+r8]
- lea rsi, [rsi+r8*2]
-
- pshufd xmm1, [rsi], 0b1h
- pshufd xmm3, [rsi+r8], 0b1h
- lea rsi, [rsi+r8*2]
-
- punpck bw, 0, 2, 1, 3, 4, 6
- punpck wd, 0, 2, 4, 6, 1, 3
- punpck qdq, 0, 1, 2, 3, 4, 5
-
- movdqa [rdi+16*0], xmm0
- movdqa [rdi+16*1], xmm4
- movdqa [rdi+16*2], xmm1
- movdqa [rdi+16*3], xmm5
-
- ; col 1, 3
-
- pshufd xmm0, [rsi], 0b1h
- pshufd xmm2, [rsi+r8], 0b1h
- lea rsi, [rsi+r8*2]
-
- movdqa xmm1, [rsi]
- movdqa xmm3, [rsi+r8]
- lea rsi, [rsi+r8*2]
-
- punpck bw, 0, 2, 1, 3, 4, 6
- punpck wd, 0, 2, 4, 6, 1, 3
- punpck qdq, 0, 1, 2, 3, 4, 5
-
- movdqa [rdi+16*4], xmm0
- movdqa [rdi+16*5], xmm4
- movdqa [rdi+16*6], xmm1
- movdqa [rdi+16*7], xmm5
-
- add edi, 128
-
- dec rcx
- jnz @B
-
- pop rdi
- pop rsi
-
- ret
-
-SwizzleBlock8_sse2 endp
-
-;
-; SwizzleBlock4
-;
-
-SwizzleBlock4_sse2 proc public
-
- push rsi
- push rdi
-
- mov rdi, rcx
- mov rsi, rdx
- mov rcx, 2
-
- mov eax, 0f0f0f0fh
- movd xmm7, eax
- pshufd xmm7, xmm7, 0
-
- align 16
-@@:
- ; col 0, 2
-
- movdqa xmm0, [rsi]
- movdqa xmm2, [rsi+r8]
- lea rsi, [rsi+r8*2]
-
- movdqa xmm1, [rsi]
- movdqa xmm3, [rsi+r8]
- lea rsi, [rsi+r8*2]
-
- pshuflw xmm1, xmm1, 0b1h
- pshuflw xmm3, xmm3, 0b1h
- pshufhw xmm1, xmm1, 0b1h
- pshufhw xmm3, xmm3, 0b1h
-
- punpcknbl
- punpck bw, 0, 2, 4, 6, 1, 3
- punpck bw, 0, 2, 1, 3, 4, 6
- punpck qdq, 0, 4, 2, 6, 1, 3
-
- movdqa [rdi+16*0], xmm0
- movdqa [rdi+16*1], xmm1
- movdqa [rdi+16*2], xmm4
- movdqa [rdi+16*3], xmm3
-
- ; col 1, 3
-
- movdqa xmm0, [rsi]
- movdqa xmm2, [rsi+r8]
- lea esi, [rsi+r8*2]
-
- movdqa xmm1, [rsi]
- movdqa xmm3, [rsi+r8]
- lea rsi, [rsi+r8*2]
-
- pshuflw xmm0, xmm0, 0b1h
- pshuflw xmm2, xmm2, 0b1h
- pshufhw xmm0, xmm0, 0b1h
- pshufhw xmm2, xmm2, 0b1h
-
- punpcknbl
- punpck bw, 0, 2, 4, 6, 1, 3
- punpck bw, 0, 2, 1, 3, 4, 6
- punpck qdq, 0, 4, 2, 6, 1, 3
-
- movdqa [rdi+16*4], xmm0
- movdqa [rdi+16*5], xmm1
- movdqa [rdi+16*6], xmm4
- movdqa [rdi+16*7], xmm3
-
- add rdi, 128
-
- dec rcx
- jnz @B
-
- pop rdi
- pop rsi
-
- ret
-
-SwizzleBlock4_sse2 endp
-
-;
-; swizzling with unaligned reads
-;
-
-;
-; SwizzleBlock32u_sse2
-;
-
-SwizzleBlock32u_sse2 proc public
-
- push rsi
- push rdi
-
- mov rdi, rcx
- mov rsi, rdx
- mov rcx, 4
-
- cmp r9d, 0ffffffffh
- jne SwizzleBlock32u_sse2@WM
-
- align 16
-@@:
- movdqu xmm0, [rsi]
- movdqu xmm4, [rsi+16]
- movdqu xmm1, [rsi+r8]
- movdqu xmm5, [rsi+r8+16]
-
- punpck qdq, 0, 4, 1, 5, 2, 6
-
- movdqa [rdi+16*0], xmm0
- movdqa [rdi+16*1], xmm2
- movdqa [rdi+16*2], xmm4
- movdqa [rdi+16*3], xmm6
-
- lea rsi, [rsi+r8*2]
- add rdi, 64
-
- dec rcx
- jnz @B
-
- pop rdi
- pop rsi
-
- ret
-
-SwizzleBlock32u_sse2@WM:
-
- movd xmm7, r9d
- pshufd xmm7, xmm7, 0
-
- align 16
-@@:
- movdqu xmm0, [rsi]
- movdqu xmm4, [rsi+16]
- movdqu xmm1, [rsi+r8]
- movdqu xmm5, [rsi+r8+16]
-
- punpck qdq, 0, 4, 1, 5, 2, 6
-
- movdqa xmm3, xmm7
- pshufd xmm5, xmm7, 0e4h
- movdqa xmm9, xmm7
- pshufd xmm11, xmm7, 0e4h
-
- pandn xmm3, [rdi+16*0]
- pand xmm0, xmm7
- por xmm0, xmm3
- movdqa [rdi+16*0], xmm0
-
- pandn xmm5, [rdi+16*1]
- pand xmm2, xmm7
- por xmm2, xmm5
- movdqa [rdi+16*1], xmm2
-
- pandn xmm9, [rdi+16*2]
- pand xmm4, xmm7
- por xmm4, xmm9
- movdqa [rdi+16*2], xmm4
-
- pandn xmm11, [rdi+16*3]
- pand xmm6, xmm7
- por xmm6, xmm11
- movdqa [edi+16*3], xmm6
-
- lea rsi, [rsi+r8*2]
- add rdi, 64
-
- dec rcx
- jnz @B
-
- pop rdi
- pop rsi
-
- ret
-
-SwizzleBlock32u_sse2 endp
-
-;
-; SwizzleBlock16u_sse2
-;
-
-SwizzleBlock16u_sse2 proc public
-
- push rsi
- push rdi
-
- mov rdi, rcx
- mov rsi, rdx
- mov rcx, 4
-
- align 16
-@@:
- movdqu xmm0, [rsi]
- movdqu xmm1, [rsi+16]
- movdqu xmm2, [rsi+r8]
- movdqu xmm3, [rsi+r8+16]
-
- punpck wd, 0, 2, 1, 3, 4, 6
- punpck qdq, 0, 4, 2, 6, 1, 5
-
- movdqa [rdi+16*0], xmm0
- movdqa [rdi+16*1], xmm1
- movdqa [rdi+16*2], xmm4
- movdqa [rdi+16*3], xmm5
-
- lea rsi, [rsi+r8*2]
- add rdi, 64
-
- dec rcx
- jnz @B
-
- pop rdi
- pop rsi
-
- ret
-
-SwizzleBlock16u_sse2 endp
-
-;
-; SwizzleBlock8u
-;
-
-SwizzleBlock8u_sse2 proc public
-
- push rsi
- push rdi
-
- mov rdi, rcx
- mov rsi, rdx
- mov ecx, 2
-
- align 16
-@@:
- ; col 0, 2
-
- movdqu xmm0, [rsi]
- movdqu xmm2, [rsi+r8]
- lea rsi, [rsi+r8*2]
-
- pshufd xmm1, xmm0, 0b1h
- pshufd xmm3, xmm2, 0b1h
- lea rsi, [rsi+r8*2]
-
- punpck bw, 0, 2, 1, 3, 4, 6
- punpck wd, 0, 2, 4, 6, 1, 3
- punpck qdq, 0, 1, 2, 3, 4, 5
-
- movdqa [rdi+16*0], xmm0
- movdqa [rdi+16*1], xmm4
- movdqa [rdi+16*2], xmm1
- movdqa [rdi+16*3], xmm5
-
- ; col 1, 3
-
- movdqu xmm0, [rsi]
- movdqu xmm2, [rsi+r8]
- pshufd xmm0, xmm0, 0b1h
- pshufd xmm2, xmm2, 0b1h
- lea rsi, [rsi+r8*2]
-
- movdqu xmm1, [rsi]
- movdqu xmm3, [rsi+r8]
- lea rsi, [rsi+r8*2]
-
- punpck bw, 0, 2, 1, 3, 4, 6
- punpck wd, 0, 2, 4, 6, 1, 3
- punpck qdq, 0, 1, 2, 3, 4, 5
-
- movdqa [rdi+16*4], xmm0
- movdqa [rdi+16*5], xmm4
- movdqa [rdi+16*6], xmm1
- movdqa [rdi+16*7], xmm5
-
- add edi, 128
-
- dec rcx
- jnz @B
-
- pop rdi
- pop rsi
-
- ret
-
-SwizzleBlock8u_sse2 endp
-
-;
-; SwizzleBlock4u
-;
-
-SwizzleBlock4u_sse2 proc public
-
- push rsi
- push rdi
-
- mov rdi, rcx
- mov rsi, rdx
- mov rcx, 2
-
- mov eax, 0f0f0f0fh
- movd xmm7, eax
- pshufd xmm7, xmm7, 0
-
- align 16
-@@:
- ; col 0, 2
-
- movdqu xmm0, [rsi]
- movdqu xmm2, [rsi+r8]
- lea rsi, [rsi+r8*2]
-
- movdqu xmm1, [rsi]
- movdqu xmm3, [rsi+r8]
- lea rsi, [rsi+r8*2]
-
- pshuflw xmm1, xmm1, 0b1h
- pshuflw xmm3, xmm3, 0b1h
- pshufhw xmm1, xmm1, 0b1h
- pshufhw xmm3, xmm3, 0b1h
-
- punpcknbl
- punpck bw, 0, 2, 4, 6, 1, 3
- punpck bw, 0, 2, 1, 3, 4, 6
- punpck qdq, 0, 4, 2, 6, 1, 3
-
- movdqa [rdi+16*0], xmm0
- movdqa [rdi+16*1], xmm1
- movdqa [rdi+16*2], xmm4
- movdqa [rdi+16*3], xmm3
-
- ; col 1, 3
-
- movdqu xmm0, [rsi]
- movdqu xmm2, [rsi+r8]
- lea esi, [rsi+r8*2]
-
- movdqu xmm1, [rsi]
- movdqu xmm3, [rsi+r8]
- lea rsi, [rsi+r8*2]
-
- pshuflw xmm0, xmm0, 0b1h
- pshuflw xmm2, xmm2, 0b1h
- pshufhw xmm0, xmm0, 0b1h
- pshufhw xmm2, xmm2, 0b1h
-
- punpcknbl
- punpck bw, 0, 2, 4, 6, 1, 3
- punpck bw, 0, 2, 1, 3, 4, 6
- punpck qdq, 0, 4, 2, 6, 1, 3
-
- movdqa [rdi+16*4], xmm0
- movdqa [rdi+16*5], xmm1
- movdqa [rdi+16*6], xmm4
- movdqa [rdi+16*7], xmm3
-
- add rdi, 128
-
- dec rcx
- jnz @B
-
- pop rdi
- pop rsi
-
- ret
-
-SwizzleBlock4u_sse2 endp
-
-WriteCLUT_T16_I4_CSM1_sse2 proc public
- movdqa xmm0, XMMWORD PTR [rcx]
- movdqa xmm1, XMMWORD PTR [rcx+16]
- movdqa xmm2, XMMWORD PTR [rcx+32]
- movdqa xmm3, XMMWORD PTR [rcx+48]
-
- ;; rearrange
- pshuflw xmm0, xmm0, 088h
- pshufhw xmm0, xmm0, 088h
- pshuflw xmm1, xmm1, 088h
- pshufhw xmm1, xmm1, 088h
- pshuflw xmm2, xmm2, 088h
- pshufhw xmm2, xmm2, 088h
- pshuflw xmm3, xmm3, 088h
- pshufhw xmm3, xmm3, 088h
-
- shufps xmm0, xmm1, 088h
- shufps xmm2, xmm3, 088h
-
- pshufd xmm0, xmm0, 0d8h
- pshufd xmm2, xmm2, 0d8h
-
- pxor xmm6, xmm6
- mov rax, offset s_clut16mask
-
- test rdx, 15
- jnz WriteUnaligned
-
- movdqa xmm7, XMMWORD PTR [rax] ;; saves upper 16 bits
-
- ;; have to save interlaced with the old data
- movdqa xmm4, [rdx]
- movdqa xmm5, [rdx+32]
- movhlps xmm1, xmm0
- movlhps xmm0, xmm2 ;; lower 8 colors
-
- pand xmm4, xmm7
- pand xmm5, xmm7
-
- shufps xmm1, xmm2, 0e4h ;; upper 8 colors
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
-
- punpcklwd xmm0, xmm6
- punpcklwd xmm1, xmm6
- por xmm0, xmm4
- por xmm1, xmm5
-
- punpckhwd xmm2, xmm6
- punpckhwd xmm3, xmm6
-
- movdqa [rdx], xmm0
- movdqa [rdx+32], xmm1
-
- movdqa xmm5, xmm7
- pand xmm7, [rdx+16]
- pand xmm5, [rdx+48]
-
- por xmm2, xmm7
- por xmm3, xmm5
-
- movdqa [rdx+16], xmm2
- movdqa [rdx+48], xmm3
- jmp WriteCLUT_T16_I4_CSM1_End
-
-WriteUnaligned:
- ;; rdx is offset by 2
- sub rdx, 2
-
- movdqa xmm7, XMMWORD PTR [rax+16] ;; saves lower 16 bits
-
- ;; have to save interlaced with the old data
- movdqa xmm4, [rdx]
- movdqa xmm5, [rdx+32]
- movhlps xmm1, xmm0
- movlhps xmm0, xmm2 ;; lower 8 colors
-
- pand xmm4, xmm7
- pand xmm5, xmm7
-
- shufps xmm1, xmm2, 0e4h ;; upper 8 colors
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
-
- punpcklwd xmm0, xmm6
- punpcklwd xmm1, xmm6
- pslld xmm0, 16
- pslld xmm1, 16
- por xmm0, xmm4
- por xmm1, xmm5
-
- punpckhwd xmm2, xmm6
- punpckhwd xmm3, xmm6
- pslld xmm2, 16
- pslld xmm3, 16
-
- movdqa [rdx], xmm0
- movdqa [rdx+32], xmm1
-
- movdqa xmm5, xmm7
- pand xmm7, [rdx+16]
- pand xmm5, [rdx+48]
-
- por xmm2, xmm7
- por xmm3, xmm5
-
- movdqa [rdx+16], xmm2
- movdqa [rdx+48], xmm3
-WriteCLUT_T16_I4_CSM1_End:
- ret
-
-WriteCLUT_T16_I4_CSM1_sse2 endp
-
-end
\ No newline at end of file
diff --git a/plugins/zerogs/opengl/x86.cpp b/plugins/zerogs/opengl/x86.cpp
index 26b988b2fd..712eacd9d4 100644
--- a/plugins/zerogs/opengl/x86.cpp
+++ b/plugins/zerogs/opengl/x86.cpp
@@ -23,7 +23,7 @@
#include "Mem.h"
#include "x86.h"
-#if defined(ZEROGS_SSE2) && (defined(_WIN32)||defined(__x86_64__))
+#if defined(ZEROGS_SSE2) && defined(_WIN32)
#include
#include
#endif
@@ -292,7 +292,7 @@ _FrameSwizzleBlock(A4_, (src[2*j]+src[2*j+1]+src[2*j+srcpitch]+src[2*j+srcpitch+
// }
//}
-#if (defined(_WIN32)||defined(__x86_64__))
+#if defined(_WIN32)
extern "C" void __fastcall WriteCLUT_T32_I8_CSM1_sse2(u32* vm, u32* clut)
{
@@ -351,8 +351,6 @@ PCSX2_ALIGNED16(int s_clut16mask[8]) = { 0xffff0000, 0xffff0000, 0xffff0000, 0xf
0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff};
}
-#if !defined(__x86_64__)
-
extern "C" void __fastcall WriteCLUT_T16_I4_CSM1_sse2(u32* vm, u32* clut)
{
__asm {
@@ -467,7 +465,6 @@ WriteUnaligned:
End:
}
}
-#endif // __x86_64__
#endif // _MSC_VER
#endif // ZEROGS_SSE2
diff --git a/plugins/zerogs/opengl/zerogs.cpp b/plugins/zerogs/opengl/zerogs.cpp
index 04ab21c810..ae3cca5c57 100644
--- a/plugins/zerogs/opengl/zerogs.cpp
+++ b/plugins/zerogs/opengl/zerogs.cpp
@@ -5108,9 +5108,6 @@ void ZeroGS::ExtWrite()
////////////
// Caches //
////////////
-#ifdef __x86_64__
-extern "C" void TestClutChangeMMX(void* src, void* dst, int entries, void* pret);
-#endif
bool ZeroGS::CheckChangeInClut(u32 highdword, u32 psm)
{
@@ -5148,9 +5145,6 @@ bool ZeroGS::CheckChangeInClut(u32 highdword, u32 psm)
// do a fast test with MMX
#ifdef _MSC_VER
-#ifdef __x86_64__
- TestClutChangeMMX(dst, src, entries, &bRet);
-#else
int storeebx;
__asm {
mov storeebx, ebx
@@ -5215,63 +5209,9 @@ Return:
emms
mov ebx, storeebx
}
-#endif // __x86_64__
#else // linux
-#ifdef __x86_64__
- __asm__(
- ".intel_syntax\n"
-"Start:\n"
- "movq %%mm0, [%%rcx]\n"
- "movq %%mm1, [%%rcx+8]\n"
- "pcmpeqd %%mm0, [%%rdx]\n"
- "pcmpeqd %%mm1, [%%rdx+16]\n"
- "movq %%mm2, [%%rcx+16]\n"
- "movq %%mm3, [%%rcx+24]\n"
- "pcmpeqd %%mm2, [%%rdx+32]\n"
- "pcmpeqd %%mm3, [%%rdx+48]\n"
- "pand %%mm0, %%mm1\n"
- "pand %%mm2, %%mm3\n"
- "movq %%mm4, [%%rcx+32]\n"
- "movq %%mm5, [%%rcx+40]\n"
- "pcmpeqd %%mm4, [%%rdx+8]\n"
- "pcmpeqd %%mm5, [%%rdx+24]\n"
- "pand %%mm0, %%mm2\n"
- "pand %%mm4, %%mm5\n"
- "movq %%mm6, [%%rcx+48]\n"
- "movq %%mm7, [%%rcx+56]\n"
- "pcmpeqd %%mm6, [%%rdx+40]\n"
- "pcmpeqd %%mm7, [%%rdx+56]\n"
- "pand %%mm0, %%mm4\n"
- "pand %%mm6, %%mm7\n"
- "pand %%mm0, %%mm6\n"
- "pmovmskb %%eax, %%mm0\n"
- "cmp %%eax, 0xff\n"
- "je Continue\n"
- ".att_syntax\n"
- "movb $1, %0\n"
- ".intel_syntax\n"
- "jmp Return\n"
-"Continue:\n"
- "cmp %%rbx, 16\n"
- "jle Return\n"
- "test %%rbx, 0x10\n"
- "jz AddRcx\n"
- "sub %%rdx, 448\n" // go back and down one column
-"AddRcx:\n"
- "add %%rdx, 256\n" // go to the right block
- "cmp %%rbx, 0x90\n"
- "jne Continue1\n"
- "add %%rdx, 256\n" // skip whole block
-"Continue1:\n"
- "add %%rcx, 64\n"
- "sub %%rbx, 16\n"
- "jmp Start\n"
-"Return:\n"
- "emms\n"
- ".att_syntax\n" : "=m"(bRet) : "c"(dst), "d"(src), "b"(entries) : "rax", "memory");// Breaks -fPIC
-#else
// do a fast test with MMX
__asm__(
".intel_syntax\n"
@@ -5324,7 +5264,6 @@ Return:
"Return:\n"
"emms\n"
".att_syntax\n" : "=m"(bRet) : "c"(dst), "d"(src), "b"(entries) : "eax", "memory"); // Breaks -fPIC
-#endif // __x86_64__
#endif // _WIN32