diff --git a/plugins/zerogs/dx/GS.h b/plugins/zerogs/dx/GS.h index be6278c306..09220c5ae3 100644 --- a/plugins/zerogs/dx/GS.h +++ b/plugins/zerogs/dx/GS.h @@ -647,7 +647,6 @@ char *SysLibError(); // Gets previous error loading sysbols void SysCloseLibrary(void *lib); // Closes Library void SysMessage(char *fmt, ...); -extern "C" void * memcpy_amd(void *dest, const void *src, size_t n); extern "C" u8 memcmp_mmx(const void *dest, const void *src, int n); template diff --git a/plugins/zerogs/dx/Windows/zerogs.vcxproj b/plugins/zerogs/dx/Windows/zerogs.vcxproj index d6a6ee394d..e6a1bb3458 100644 --- a/plugins/zerogs/dx/Windows/zerogs.vcxproj +++ b/plugins/zerogs/dx/Windows/zerogs.vcxproj @@ -148,7 +148,6 @@ - @@ -199,4 +198,4 @@ - \ No newline at end of file + diff --git a/plugins/zerogs/dx/Windows/zerogs.vcxproj.filters b/plugins/zerogs/dx/Windows/zerogs.vcxproj.filters index 3cb5dfc59c..3ad0324729 100644 --- a/plugins/zerogs/dx/Windows/zerogs.vcxproj.filters +++ b/plugins/zerogs/dx/Windows/zerogs.vcxproj.filters @@ -24,9 +24,6 @@ Source Files - - Source Files - Source Files @@ -94,4 +91,4 @@ Source Files - \ No newline at end of file + diff --git a/plugins/zerogs/dx/Windows/zerogs_vs2012.vcxproj b/plugins/zerogs/dx/Windows/zerogs_vs2012.vcxproj index bb37a5c7dd..0dae19a784 100644 --- a/plugins/zerogs/dx/Windows/zerogs_vs2012.vcxproj +++ b/plugins/zerogs/dx/Windows/zerogs_vs2012.vcxproj @@ -154,7 +154,6 @@ - @@ -205,4 +204,4 @@ - \ No newline at end of file + diff --git a/plugins/zerogs/dx/Windows/zerogs_vs2012.vcxproj.filters b/plugins/zerogs/dx/Windows/zerogs_vs2012.vcxproj.filters index 3cb5dfc59c..3ad0324729 100644 --- a/plugins/zerogs/dx/Windows/zerogs_vs2012.vcxproj.filters +++ b/plugins/zerogs/dx/Windows/zerogs_vs2012.vcxproj.filters @@ -24,9 +24,6 @@ Source Files - - Source Files - Source Files @@ -94,4 +91,4 @@ Source Files - \ No newline at end of file + diff --git a/plugins/zerogs/dx/Windows/zerogs_vs2013.vcxproj b/plugins/zerogs/dx/Windows/zerogs_vs2013.vcxproj index 44b52f8b2b..12ee597037 100644 --- a/plugins/zerogs/dx/Windows/zerogs_vs2013.vcxproj +++ b/plugins/zerogs/dx/Windows/zerogs_vs2013.vcxproj @@ -154,7 +154,6 @@ - @@ -205,4 +204,4 @@ - \ No newline at end of file + diff --git a/plugins/zerogs/dx/Windows/zerogs_vs2013.vcxproj.filters b/plugins/zerogs/dx/Windows/zerogs_vs2013.vcxproj.filters index 9a6158833a..e273346c31 100644 --- a/plugins/zerogs/dx/Windows/zerogs_vs2013.vcxproj.filters +++ b/plugins/zerogs/dx/Windows/zerogs_vs2013.vcxproj.filters @@ -24,9 +24,6 @@ Source Files - - Source Files - Source Files @@ -94,4 +91,4 @@ Source Files - \ No newline at end of file + diff --git a/plugins/zerogs/dx/memcpy_amd.cpp b/plugins/zerogs/dx/memcpy_amd.cpp deleted file mode 100644 index 0e16a64e5a..0000000000 --- a/plugins/zerogs/dx/memcpy_amd.cpp +++ /dev/null @@ -1,479 +0,0 @@ -/****************************************************************************** - - Copyright (c) 2001 Advanced Micro Devices, Inc. - - LIMITATION OF LIABILITY: THE MATERIALS ARE PROVIDED *AS IS* WITHOUT ANY - EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, - NONINFRINGEMENT OF THIRD-PARTY INTELLECTUAL PROPERTY, OR FITNESS FOR ANY - PARTICULAR PURPOSE. IN NO EVENT SHALL AMD OR ITS SUPPLIERS BE LIABLE FOR ANY - DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF PROFITS, - BUSINESS INTERRUPTION, LOSS OF INFORMATION) ARISING OUT OF THE USE OF OR - INABILITY TO USE THE MATERIALS, EVEN IF AMD HAS BEEN ADVISED OF THE POSSIBILITY - OF SUCH DAMAGES. 
BECAUSE SOME JURISDICTIONS PROHIBIT THE EXCLUSION OR LIMITATION - OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY - NOT APPLY TO YOU. - - AMD does not assume any responsibility for any errors which may appear in the - Materials nor any responsibility to support or update the Materials. AMD retains - the right to make changes to its test specifications at any time, without notice. - - NO SUPPORT OBLIGATION: AMD is not obligated to furnish, support, or make any - further information, software, technical information, know-how, or show-how - available to you. - - So that all may benefit from your experience, please report any problems - or suggestions about this software to 3dsdk.support@amd.com - - AMD Developer Technologies, M/S 585 - Advanced Micro Devices, Inc. - 5900 E. Ben White Blvd. - Austin, TX 78741 - 3dsdk.support@amd.com -******************************************************************************/ - -#include - -/***************************************************************************** -MEMCPY_AMD.CPP -******************************************************************************/ - -// Very optimized memcpy() routine for AMD Athlon and Duron family. -// This code uses any of FOUR different basic copy methods, depending -// on the transfer size. -// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or -// "Streaming Store"), and also uses the software prefetch instructions, -// be sure you're running on Athlon/Duron or other recent CPU before calling! - -#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy -// The smallest copy uses the X86 "movsd" instruction, in an optimized -// form which is an "unrolled loop". - -#define IN_CACHE_COPY 2 * 1024 // upper limit for movq/movq copy w/SW prefetch -// Next is a copy that uses the MMX registers to copy 8 bytes at a time, -// also using the "unrolled loop" optimization. This code uses -// the software prefetch instruction to get the data into the cache. - -#define UNCACHED_COPY 4 * 1024 // upper limit for movq/movntq w/SW prefetch -// For larger blocks, which will spill beyond the cache, it's faster to -// use the Streaming Store instruction MOVNTQ. This write instruction -// bypasses the cache and writes straight to main memory. This code also -// uses the software prefetch instruction to pre-read the data. -// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE" - -#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch -#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch -// For the largest size blocks, a special technique called Block Prefetch -// can be used to accelerate the read operations. Block Prefetch reads -// one address per cache line, for a series of cache lines, in a short loop. -// This is faster than using software prefetch. The technique is great for -// getting maximum read bandwidth, especially in DDR memory systems. - -//#include - -// Inline assembly syntax for use with Visual C++ -#ifdef _WIN32 -#include -#endif - -#include "PS2Etypes.h" - -extern "C" { - -#if defined(_MSC_VER) && !defined(__x86_64__) - -void * memcpy_amd(void *dest, const void *src, size_t n) -{ - __asm { - mov ecx, [n] ; number of bytes to copy - mov edi, [dest] ; destination - mov esi, [src] ; source - mov ebx, ecx ; keep a copy of count - - cld - cmp ecx, TINY_BLOCK_COPY - jb $memcpy_ic_3 ; tiny? 
skip mmx copy - - cmp ecx, 32*1024 ; don't align between 32k-64k because - jbe $memcpy_do_align ; it appears to be slower - cmp ecx, 64*1024 - jbe $memcpy_align_done -$memcpy_do_align: - mov ecx, 8 ; a trick that's faster than rep movsb... - sub ecx, edi ; align destination to qword - and ecx, 111b ; get the low bits - sub ebx, ecx ; update copy count - neg ecx ; set up to jump into the array - add ecx, offset $memcpy_align_done - jmp ecx ; jump to array of movsb's - -align 4 - movsb - movsb - movsb - movsb - movsb - movsb - movsb - movsb - -$memcpy_align_done: ; destination is dword aligned - mov ecx, ebx ; number of bytes left to copy - shr ecx, 6 ; get 64-byte block count - jz $memcpy_ic_2 ; finish the last few bytes - - cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy - jae $memcpy_uc_test - -// This is small block copy that uses the MMX registers to copy 8 bytes -// at a time. It uses the "unrolled loop" optimization, and also uses -// the software prefetch instruction to get the data into the cache. -align 16 -$memcpy_ic_1: ; 64-byte block copies, in-cache copy - - prefetchnta [esi + (200*64/34+192)] ; start reading ahead - - movq mm0, [esi+0] ; read 64 bits - movq mm1, [esi+8] - movq [edi+0], mm0 ; write 64 bits - movq [edi+8], mm1 ; note: the normal movq writes the - movq mm2, [esi+16] ; data to cache; a cache line will be - movq mm3, [esi+24] ; allocated as needed, to store the data - movq [edi+16], mm2 - movq [edi+24], mm3 - movq mm0, [esi+32] - movq mm1, [esi+40] - movq [edi+32], mm0 - movq [edi+40], mm1 - movq mm2, [esi+48] - movq mm3, [esi+56] - movq [edi+48], mm2 - movq [edi+56], mm3 - - add esi, 64 ; update source pointer - add edi, 64 ; update destination pointer - dec ecx ; count down - jnz $memcpy_ic_1 ; last 64-byte block? - -$memcpy_ic_2: - mov ecx, ebx ; has valid low 6 bits of the byte count -$memcpy_ic_3: - shr ecx, 2 ; dword count - and ecx, 1111b ; only look at the "remainder" bits - neg ecx ; set up to jump into the array - add ecx, offset $memcpy_last_few - jmp ecx ; jump to array of movsd's - -$memcpy_uc_test: - cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy - jae $memcpy_bp_1 - -$memcpy_64_test: - or ecx, ecx ; tail end of block prefetch will jump here - jz $memcpy_ic_2 ; no more 64-byte blocks left - -// For larger blocks, which will spill beyond the cache, it's faster to -// use the Streaming Store instruction MOVNTQ. This write instruction -// bypasses the cache and writes straight to main memory. This code also -// uses the software prefetch instruction to pre-read the data. -align 16 -$memcpy_uc_1: ; 64-byte blocks, uncached copy - - prefetchnta [esi + (200*64/34+192)] ; start reading ahead - - movq mm0,[esi+0] ; read 64 bits - add edi,64 ; update destination pointer - movq mm1,[esi+8] - add esi,64 ; update source pointer - movq mm2,[esi-48] - movntq [edi-64], mm0 ; write 64 bits, bypassing the cache - movq mm0,[esi-40] ; note: movntq also prevents the CPU - movntq [edi-56], mm1 ; from READING the destination address - movq mm1,[esi-32] ; into the cache, only to be over-written - movntq [edi-48], mm2 ; so that also helps performance - movq mm2,[esi-24] - movntq [edi-40], mm0 - movq mm0,[esi-16] - movntq [edi-32], mm1 - movq mm1,[esi-8] - movntq [edi-24], mm2 - movntq [edi-16], mm0 - dec ecx - movntq [edi-8], mm1 - jnz $memcpy_uc_1 ; last 64-byte block? - - jmp $memcpy_ic_2 ; almost done - -// For the largest size blocks, a special technique called Block Prefetch -// can be used to accelerate the read operations. 
Block Prefetch reads -// one address per cache line, for a series of cache lines, in a short loop. -// This is faster than using software prefetch. The technique is great for -// getting maximum read bandwidth, especially in DDR memory systems. -$memcpy_bp_1: ; large blocks, block prefetch copy - - cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop? - jl $memcpy_64_test ; no, back to regular uncached copy - - mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X - add esi, CACHEBLOCK * 64 ; move to the top of the block -align 16 -$memcpy_bp_2: - mov edx, [esi-64] ; grab one address per cache line - mov edx, [esi-128] ; grab one address per cache line - sub esi, 128 ; go reverse order to suppress HW prefetcher - dec eax ; count down the cache lines - jnz $memcpy_bp_2 ; keep grabbing more lines into cache - - mov eax, CACHEBLOCK ; now that it's in cache, do the copy -align 16 -$memcpy_bp_3: - movq mm0, [esi ] ; read 64 bits - movq mm1, [esi+ 8] - movq mm2, [esi+16] - movq mm3, [esi+24] - movq mm4, [esi+32] - movq mm5, [esi+40] - movq mm6, [esi+48] - movq mm7, [esi+56] - add esi, 64 ; update source pointer - movntq [edi ], mm0 ; write 64 bits, bypassing cache - movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU - movntq [edi+16], mm2 ; from READING the destination address - movntq [edi+24], mm3 ; into the cache, only to be over-written, - movntq [edi+32], mm4 ; so that also helps performance - movntq [edi+40], mm5 - movntq [edi+48], mm6 - movntq [edi+56], mm7 - add edi, 64 ; update dest pointer - - dec eax ; count down - - jnz $memcpy_bp_3 ; keep copying - sub ecx, CACHEBLOCK ; update the 64-byte block count - jmp $memcpy_bp_1 ; keep processing chunks - -// The smallest copy uses the X86 "movsd" instruction, in an optimized -// form which is an "unrolled loop". Then it handles the last few bytes. 
-align 4 - movsd - movsd ; perform last 1-15 dword copies - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd ; perform last 1-7 dword copies - movsd - movsd - movsd - movsd - movsd - movsd - -$memcpy_last_few: ; dword aligned from before movsd's - mov ecx, ebx ; has valid low 2 bits of the byte count - and ecx, 11b ; the last few cows must come home - jz $memcpy_final ; no more, let's leave - rep movsb ; the last 1, 2, or 3 bytes - -$memcpy_final: - emms ; clean up the MMX state - sfence ; flush the write buffer - mov eax, [dest] ; ret value = destination pointer - - } -} - -// mmx memcpy implementation, size has to be a multiple of 8 -// returns 0 is equal, nonzero value if not equal -// ~10 times faster than standard memcmp -// (zerofrog) -u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize) -{ - assert( (cmpsize&7) == 0 ); - - __asm { -push esi - mov ecx, cmpsize - mov edx, src1 - mov esi, src2 - - cmp ecx, 32 - jl Done4 - - // custom test first 8 to make sure things are ok - movq mm0, [esi] - movq mm1, [esi+8] - pcmpeqd mm0, [edx] - pcmpeqd mm1, [edx+8] - pand mm0, mm1 - movq mm2, [esi+16] - pmovmskb eax, mm0 - movq mm3, [esi+24] - - // check if eq - cmp eax, 0xff - je NextComp - mov eax, 1 - jmp End - -NextComp: - pcmpeqd mm2, [edx+16] - pcmpeqd mm3, [edx+24] - pand mm2, mm3 - pmovmskb eax, mm2 - - sub ecx, 32 - add esi, 32 - add edx, 32 - - // check if eq - cmp eax, 0xff - je ContinueTest - mov eax, 1 - jmp End - - cmp ecx, 64 - jl Done8 - -Cmp8: - movq mm0, [esi] - movq mm1, [esi+8] - movq mm2, [esi+16] - movq mm3, [esi+24] - movq mm4, [esi+32] - movq mm5, [esi+40] - movq mm6, [esi+48] - movq mm7, [esi+56] - pcmpeqd mm0, [edx] - pcmpeqd mm1, [edx+8] - pcmpeqd mm2, [edx+16] - pcmpeqd mm3, [edx+24] - pand mm0, mm1 - pcmpeqd mm4, [edx+32] - pand mm0, mm2 - pcmpeqd mm5, [edx+40] - pand mm0, mm3 - pcmpeqd mm6, [edx+48] - pand mm0, mm4 - pcmpeqd mm7, [edx+56] - pand mm0, mm5 - pand mm0, mm6 - pand mm0, mm7 - pmovmskb eax, mm0 - - // check if eq - cmp eax, 0xff - je Continue - mov eax, 1 - jmp End - -Continue: - sub ecx, 64 - add esi, 64 - add edx, 64 -ContinueTest: - cmp ecx, 64 - jge Cmp8 - -Done8: - test ecx, 0x20 - jz Done4 - movq mm0, [esi] - movq mm1, [esi+8] - movq mm2, [esi+16] - movq mm3, [esi+24] - pcmpeqd mm0, [edx] - pcmpeqd mm1, [edx+8] - pcmpeqd mm2, [edx+16] - pcmpeqd mm3, [edx+24] - pand mm0, mm1 - pand mm0, mm2 - pand mm0, mm3 - pmovmskb eax, mm0 - sub ecx, 32 - add esi, 32 - add edx, 32 - - // check if eq - cmp eax, 0xff - je Done4 - mov eax, 1 - jmp End - -Done4: - cmp ecx, 24 - jne Done2 - movq mm0, [esi] - movq mm1, [esi+8] - movq mm2, [esi+16] - pcmpeqd mm0, [edx] - pcmpeqd mm1, [edx+8] - pcmpeqd mm2, [edx+16] - pand mm0, mm1 - pand mm0, mm2 - pmovmskb eax, mm0 - - // check if eq - cmp eax, 0xff - setne al - jmp End - -Done2: - cmp ecx, 16 - jne Done1 - - movq mm0, [esi] - movq mm1, [esi+8] - pcmpeqd mm0, [edx] - pcmpeqd mm1, [edx+8] - pand mm0, mm1 - pmovmskb eax, mm0 - - // check if eq - cmp eax, 0xff - setne al - jmp End - -Done1: - cmp ecx, 8 - jne Done - - mov eax, [esi] - mov esi, [esi+4] - cmp eax, [edx] - je Next - mov eax, 1 - jmp End - -Next: - cmp esi, [edx+4] - setne al - jmp End - -Done: - xor eax, eax - -End: - pop esi - emms - } -} - -#else // _MSC_VER -// assume gcc or mingw or win x64 - -#include -#include - -void * memcpy_amd(void *dest, const void *src, size_t n) -{ -memcpy(dest, src, n); -return dest; -} - - -#endif - -} diff --git a/plugins/zerogs/dx/targets.cpp b/plugins/zerogs/dx/targets.cpp index f7a0726577..67699c15fd 
100644 --- a/plugins/zerogs/dx/targets.cpp +++ b/plugins/zerogs/dx/targets.cpp @@ -2026,7 +2026,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info targ->clut.resize(clutsize); if( tex0.cpsm <= 1 ) { // 32 bit - memcpy_amd(&targ->clut[0], ZeroGS::g_pbyGSClut+nClutOffset, clutsize); + memcpy(&targ->clut[0], ZeroGS::g_pbyGSClut+nClutOffset, clutsize); } else { u16* pClutBuffer = (u16*)(ZeroGS::g_pbyGSClut + nClutOffset); @@ -2110,7 +2110,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info targ->memory->ref = 1; } - memcpy_amd(targ->memory->ptr, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height); + memcpy(targ->memory->ptr, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height); u8* psrc = (u8*)(ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy); @@ -2136,7 +2136,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info targ->memory->ref = 1; } - memcpy_amd(targ->memory->ptr, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height); + memcpy(targ->memory->ptr, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height); // needs to be 8 bit, use xmm for unpacking u16* dst = (u16*)lock.pBits; @@ -2219,7 +2219,7 @@ Z16Loop: targ->memory = NULL; } - memcpy_amd(lock.pBits, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height ); + memcpy(lock.pBits, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height ); } } diff --git a/plugins/zerogs/dx/zerogs.cpp b/plugins/zerogs/dx/zerogs.cpp index 7ff6637598..2f712e98ac 100644 --- a/plugins/zerogs/dx/zerogs.cpp +++ b/plugins/zerogs/dx/zerogs.cpp @@ -2239,7 +2239,7 @@ void ZeroGS::Flush(int context) } if( curvb.tex0.cpsm <= 1 ) { // 32 bit - memcpy_amd(lock.pBits, ZeroGS::g_pbyGSClut+nClutOffset, clutsize); + memcpy(lock.pBits, ZeroGS::g_pbyGSClut+nClutOffset, clutsize); } else { u16* pClutBuffer = (u16*)(ZeroGS::g_pbyGSClut + nClutOffset); @@ -5087,7 +5087,7 @@ void ZeroGS::CaptureFrame() BYTE* pend = (BYTE*)lock.pBits + (conf.height-1)*width*4; for(int i = 0; i < conf.height; ++i) { - memcpy_amd(&mem[width*4*i], pend - width*4*i, width * 4); + memcpy(&mem[width*4*i], pend - width*4*i, width * 4); } s_ptexAVICapture->UnlockRect(); diff --git a/plugins/zerogs/opengl/CMakeLists.txt b/plugins/zerogs/opengl/CMakeLists.txt index d365e4df1d..fd33949c9a 100644 --- a/plugins/zerogs/opengl/CMakeLists.txt +++ b/plugins/zerogs/opengl/CMakeLists.txt @@ -36,7 +36,6 @@ set(zerogsSources GSmain.cpp GLWinX11.cpp Mem.cpp - memcpy_amd.cpp rasterfont.cpp Regs.cpp targets.cpp diff --git a/plugins/zerogs/opengl/GS.h b/plugins/zerogs/opengl/GS.h index 86b350619b..27686382e2 100644 --- a/plugins/zerogs/opengl/GS.h +++ b/plugins/zerogs/opengl/GS.h @@ -728,7 +728,6 @@ char *SysLibError(); // Gets previous error loading sysbols void SysCloseLibrary(void *lib); // Closes Library void SysMessage(char *fmt, ...); -extern "C" void * memcpy_amd(void *dest, const void *src, size_t n); extern "C" u8 memcmp_mmx(const void *dest, const void *src, int n); template diff --git a/plugins/zerogs/opengl/Makefile.am b/plugins/zerogs/opengl/Makefile.am index cbb71329c5..ffef92ae04 100644 --- a/plugins/zerogs/opengl/Makefile.am +++ b/plugins/zerogs/opengl/Makefile.am @@ -23,7 +23,7 @@ libZeroGSogl_LDFLAGS+=-Wl,-soname,@ZEROGS_SONAME@ libZeroGSogl_LDADD=$(libZeroGSogl_a_OBJECTS) libZeroGSogl_a_SOURCES = \ 
-GSmain.cpp memcpy_amd.cpp Regs.cpp x86.cpp zpipe.cpp \ +GSmain.cpp Regs.cpp x86.cpp zpipe.cpp \ Mem.cpp rasterfont.cpp targets.cpp zerogs.cpp GifTransfer.cpp GLWinX11.cpp libZeroGSogl_a_SOURCES += x86-32.S diff --git a/plugins/zerogs/opengl/memcpy_amd.cpp b/plugins/zerogs/opengl/memcpy_amd.cpp deleted file mode 100644 index 3b8318086d..0000000000 --- a/plugins/zerogs/opengl/memcpy_amd.cpp +++ /dev/null @@ -1,478 +0,0 @@ -/****************************************************************************** - - Copyright (c) 2001 Advanced Micro Devices, Inc. - - LIMITATION OF LIABILITY: THE MATERIALS ARE PROVIDED *AS IS* WITHOUT ANY - EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, - NONINFRINGEMENT OF THIRD-PARTY INTELLECTUAL PROPERTY, OR FITNESS FOR ANY - PARTICULAR PURPOSE. IN NO EVENT SHALL AMD OR ITS SUPPLIERS BE LIABLE FOR ANY - DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF PROFITS, - BUSINESS INTERRUPTION, LOSS OF INFORMATION) ARISING OUT OF THE USE OF OR - INABILITY TO USE THE MATERIALS, EVEN IF AMD HAS BEEN ADVISED OF THE POSSIBILITY - OF SUCH DAMAGES. BECAUSE SOME JURISDICTIONS PROHIBIT THE EXCLUSION OR LIMITATION - OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY - NOT APPLY TO YOU. - - AMD does not assume any responsibility for any errors which may appear in the - Materials nor any responsibility to support or update the Materials. AMD retains - the right to make changes to its test specifications at any time, without notice. - - NO SUPPORT OBLIGATION: AMD is not obligated to furnish, support, or make any - further information, software, technical information, know-how, or show-how - available to you. - - So that all may benefit from your experience, please report any problems - or suggestions about this software to 3dsdk.support@amd.com - - AMD Developer Technologies, M/S 585 - Advanced Micro Devices, Inc. - 5900 E. Ben White Blvd. - Austin, TX 78741 - 3dsdk.support@amd.com -******************************************************************************/ - -#include - -/***************************************************************************** -MEMCPY_AMD.CPP -******************************************************************************/ - -// Very optimized memcpy() routine for AMD Athlon and Duron family. -// This code uses any of FOUR different basic copy methods, depending -// on the transfer size. -// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or -// "Streaming Store"), and also uses the software prefetch instructions, -// be sure you're running on Athlon/Duron or other recent CPU before calling! - -#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy -// The smallest copy uses the X86 "movsd" instruction, in an optimized -// form which is an "unrolled loop". - -#define IN_CACHE_COPY 2 * 1024 // upper limit for movq/movq copy w/SW prefetch -// Next is a copy that uses the MMX registers to copy 8 bytes at a time, -// also using the "unrolled loop" optimization. This code uses -// the software prefetch instruction to get the data into the cache. - -#define UNCACHED_COPY 4 * 1024 // upper limit for movq/movntq w/SW prefetch -// For larger blocks, which will spill beyond the cache, it's faster to -// use the Streaming Store instruction MOVNTQ. This write instruction -// bypasses the cache and writes straight to main memory. This code also -// uses the software prefetch instruction to pre-read the data. 
-// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE" - -#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch -#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch -// For the largest size blocks, a special technique called Block Prefetch -// can be used to accelerate the read operations. Block Prefetch reads -// one address per cache line, for a series of cache lines, in a short loop. -// This is faster than using software prefetch. The technique is great for -// getting maximum read bandwidth, especially in DDR memory systems. - -//#include - -// Inline assembly syntax for use with Visual C++ -#ifdef _WIN32 -#include -#endif - -extern "C" { -#include "PS2Etypes.h" - -#if defined(_MSC_VER) - -void * memcpy_amd(void *dest, const void *src, size_t n) -{ - __asm { - mov ecx, [n] ; number of bytes to copy - mov edi, [dest] ; destination - mov esi, [src] ; source - mov ebx, ecx ; keep a copy of count - - cld - cmp ecx, TINY_BLOCK_COPY - jb $memcpy_ic_3 ; tiny? skip mmx copy - - cmp ecx, 32*1024 ; don't align between 32k-64k because - jbe $memcpy_do_align ; it appears to be slower - cmp ecx, 64*1024 - jbe $memcpy_align_done -$memcpy_do_align: - mov ecx, 8 ; a trick that's faster than rep movsb... - sub ecx, edi ; align destination to qword - and ecx, 111b ; get the low bits - sub ebx, ecx ; update copy count - neg ecx ; set up to jump into the array - add ecx, offset $memcpy_align_done - jmp ecx ; jump to array of movsb's - -align 4 - movsb - movsb - movsb - movsb - movsb - movsb - movsb - movsb - -$memcpy_align_done: ; destination is dword aligned - mov ecx, ebx ; number of bytes left to copy - shr ecx, 6 ; get 64-byte block count - jz $memcpy_ic_2 ; finish the last few bytes - - cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy - jae $memcpy_uc_test - -// This is small block copy that uses the MMX registers to copy 8 bytes -// at a time. It uses the "unrolled loop" optimization, and also uses -// the software prefetch instruction to get the data into the cache. -align 16 -$memcpy_ic_1: ; 64-byte block copies, in-cache copy - - prefetchnta [esi + (200*64/34+192)] ; start reading ahead - - movq mm0, [esi+0] ; read 64 bits - movq mm1, [esi+8] - movq [edi+0], mm0 ; write 64 bits - movq [edi+8], mm1 ; note: the normal movq writes the - movq mm2, [esi+16] ; data to cache; a cache line will be - movq mm3, [esi+24] ; allocated as needed, to store the data - movq [edi+16], mm2 - movq [edi+24], mm3 - movq mm0, [esi+32] - movq mm1, [esi+40] - movq [edi+32], mm0 - movq [edi+40], mm1 - movq mm2, [esi+48] - movq mm3, [esi+56] - movq [edi+48], mm2 - movq [edi+56], mm3 - - add esi, 64 ; update source pointer - add edi, 64 ; update destination pointer - dec ecx ; count down - jnz $memcpy_ic_1 ; last 64-byte block? - -$memcpy_ic_2: - mov ecx, ebx ; has valid low 6 bits of the byte count -$memcpy_ic_3: - shr ecx, 2 ; dword count - and ecx, 1111b ; only look at the "remainder" bits - neg ecx ; set up to jump into the array - add ecx, offset $memcpy_last_few - jmp ecx ; jump to array of movsd's - -$memcpy_uc_test: - cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy - jae $memcpy_bp_1 - -$memcpy_64_test: - or ecx, ecx ; tail end of block prefetch will jump here - jz $memcpy_ic_2 ; no more 64-byte blocks left - -// For larger blocks, which will spill beyond the cache, it's faster to -// use the Streaming Store instruction MOVNTQ. This write instruction -// bypasses the cache and writes straight to main memory. 
This code also -// uses the software prefetch instruction to pre-read the data. -align 16 -$memcpy_uc_1: ; 64-byte blocks, uncached copy - - prefetchnta [esi + (200*64/34+192)] ; start reading ahead - - movq mm0,[esi+0] ; read 64 bits - add edi,64 ; update destination pointer - movq mm1,[esi+8] - add esi,64 ; update source pointer - movq mm2,[esi-48] - movntq [edi-64], mm0 ; write 64 bits, bypassing the cache - movq mm0,[esi-40] ; note: movntq also prevents the CPU - movntq [edi-56], mm1 ; from READING the destination address - movq mm1,[esi-32] ; into the cache, only to be over-written - movntq [edi-48], mm2 ; so that also helps performance - movq mm2,[esi-24] - movntq [edi-40], mm0 - movq mm0,[esi-16] - movntq [edi-32], mm1 - movq mm1,[esi-8] - movntq [edi-24], mm2 - movntq [edi-16], mm0 - dec ecx - movntq [edi-8], mm1 - jnz $memcpy_uc_1 ; last 64-byte block? - - jmp $memcpy_ic_2 ; almost done - -// For the largest size blocks, a special technique called Block Prefetch -// can be used to accelerate the read operations. Block Prefetch reads -// one address per cache line, for a series of cache lines, in a short loop. -// This is faster than using software prefetch. The technique is great for -// getting maximum read bandwidth, especially in DDR memory systems. -$memcpy_bp_1: ; large blocks, block prefetch copy - - cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop? - jl $memcpy_64_test ; no, back to regular uncached copy - - mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X - add esi, CACHEBLOCK * 64 ; move to the top of the block -align 16 -$memcpy_bp_2: - mov edx, [esi-64] ; grab one address per cache line - mov edx, [esi-128] ; grab one address per cache line - sub esi, 128 ; go reverse order to suppress HW prefetcher - dec eax ; count down the cache lines - jnz $memcpy_bp_2 ; keep grabbing more lines into cache - - mov eax, CACHEBLOCK ; now that it's in cache, do the copy -align 16 -$memcpy_bp_3: - movq mm0, [esi ] ; read 64 bits - movq mm1, [esi+ 8] - movq mm2, [esi+16] - movq mm3, [esi+24] - movq mm4, [esi+32] - movq mm5, [esi+40] - movq mm6, [esi+48] - movq mm7, [esi+56] - add esi, 64 ; update source pointer - movntq [edi ], mm0 ; write 64 bits, bypassing cache - movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU - movntq [edi+16], mm2 ; from READING the destination address - movntq [edi+24], mm3 ; into the cache, only to be over-written, - movntq [edi+32], mm4 ; so that also helps performance - movntq [edi+40], mm5 - movntq [edi+48], mm6 - movntq [edi+56], mm7 - add edi, 64 ; update dest pointer - - dec eax ; count down - - jnz $memcpy_bp_3 ; keep copying - sub ecx, CACHEBLOCK ; update the 64-byte block count - jmp $memcpy_bp_1 ; keep processing chunks - -// The smallest copy uses the X86 "movsd" instruction, in an optimized -// form which is an "unrolled loop". Then it handles the last few bytes. 
-align 4 - movsd - movsd ; perform last 1-15 dword copies - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd ; perform last 1-7 dword copies - movsd - movsd - movsd - movsd - movsd - movsd - -$memcpy_last_few: ; dword aligned from before movsd's - mov ecx, ebx ; has valid low 2 bits of the byte count - and ecx, 11b ; the last few cows must come home - jz $memcpy_final ; no more, let's leave - rep movsb ; the last 1, 2, or 3 bytes - -$memcpy_final: - emms ; clean up the MMX state - sfence ; flush the write buffer - mov eax, [dest] ; ret value = destination pointer - - } -} - -// mmx memcpy implementation, size has to be a multiple of 8 -// returns 0 is equal, nonzero value if not equal -// ~10 times faster than standard memcmp -// (zerofrog) -u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize) -{ - assert( (cmpsize&7) == 0 ); - - __asm { -push esi - mov ecx, cmpsize - mov edx, src1 - mov esi, src2 - - cmp ecx, 32 - jl Done4 - - // custom test first 8 to make sure things are ok - movq mm0, [esi] - movq mm1, [esi+8] - pcmpeqd mm0, [edx] - pcmpeqd mm1, [edx+8] - pand mm0, mm1 - movq mm2, [esi+16] - pmovmskb eax, mm0 - movq mm3, [esi+24] - - // check if eq - cmp eax, 0xff - je NextComp - mov eax, 1 - jmp End - -NextComp: - pcmpeqd mm2, [edx+16] - pcmpeqd mm3, [edx+24] - pand mm2, mm3 - pmovmskb eax, mm2 - - sub ecx, 32 - add esi, 32 - add edx, 32 - - // check if eq - cmp eax, 0xff - je ContinueTest - mov eax, 1 - jmp End - - cmp ecx, 64 - jl Done8 - -Cmp8: - movq mm0, [esi] - movq mm1, [esi+8] - movq mm2, [esi+16] - movq mm3, [esi+24] - movq mm4, [esi+32] - movq mm5, [esi+40] - movq mm6, [esi+48] - movq mm7, [esi+56] - pcmpeqd mm0, [edx] - pcmpeqd mm1, [edx+8] - pcmpeqd mm2, [edx+16] - pcmpeqd mm3, [edx+24] - pand mm0, mm1 - pcmpeqd mm4, [edx+32] - pand mm0, mm2 - pcmpeqd mm5, [edx+40] - pand mm0, mm3 - pcmpeqd mm6, [edx+48] - pand mm0, mm4 - pcmpeqd mm7, [edx+56] - pand mm0, mm5 - pand mm0, mm6 - pand mm0, mm7 - pmovmskb eax, mm0 - - // check if eq - cmp eax, 0xff - je Continue - mov eax, 1 - jmp End - -Continue: - sub ecx, 64 - add esi, 64 - add edx, 64 -ContinueTest: - cmp ecx, 64 - jge Cmp8 - -Done8: - test ecx, 0x20 - jz Done4 - movq mm0, [esi] - movq mm1, [esi+8] - movq mm2, [esi+16] - movq mm3, [esi+24] - pcmpeqd mm0, [edx] - pcmpeqd mm1, [edx+8] - pcmpeqd mm2, [edx+16] - pcmpeqd mm3, [edx+24] - pand mm0, mm1 - pand mm0, mm2 - pand mm0, mm3 - pmovmskb eax, mm0 - sub ecx, 32 - add esi, 32 - add edx, 32 - - // check if eq - cmp eax, 0xff - je Done4 - mov eax, 1 - jmp End - -Done4: - cmp ecx, 24 - jne Done2 - movq mm0, [esi] - movq mm1, [esi+8] - movq mm2, [esi+16] - pcmpeqd mm0, [edx] - pcmpeqd mm1, [edx+8] - pcmpeqd mm2, [edx+16] - pand mm0, mm1 - pand mm0, mm2 - pmovmskb eax, mm0 - - // check if eq - cmp eax, 0xff - setne al - jmp End - -Done2: - cmp ecx, 16 - jne Done1 - - movq mm0, [esi] - movq mm1, [esi+8] - pcmpeqd mm0, [edx] - pcmpeqd mm1, [edx+8] - pand mm0, mm1 - pmovmskb eax, mm0 - - // check if eq - cmp eax, 0xff - setne al - jmp End - -Done1: - cmp ecx, 8 - jne Done - - mov eax, [esi] - mov esi, [esi+4] - cmp eax, [edx] - je Next - mov eax, 1 - jmp End - -Next: - cmp esi, [edx+4] - setne al - jmp End - -Done: - xor eax, eax - -End: - pop esi - emms - } -} - -#else // _MSC_VER -// assume gcc - -#include -#include - -void * memcpy_amd(void *dest, const void *src, size_t n) -{ -memcpy(dest, src, n); -return dest; -} - - -#endif - -} diff --git a/plugins/zerogs/opengl/targets.cpp b/plugins/zerogs/opengl/targets.cpp index 4e143a98ad..d2995a3530 100644 --- 
a/plugins/zerogs/opengl/targets.cpp +++ b/plugins/zerogs/opengl/targets.cpp @@ -1789,7 +1789,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info targ->clut.resize(clutsize); if( tex0.cpsm <= 1 ) { // 32 bit - memcpy_amd(&targ->clut[0], g_pbyGSClut+nClutOffset, clutsize); + memcpy(&targ->clut[0], g_pbyGSClut+nClutOffset, clutsize); } else { u16* pClutBuffer = (u16*)(g_pbyGSClut + nClutOffset); @@ -1854,7 +1854,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info assert(targ->ptex->ref > 0 ); } - memcpy_amd(targ->ptex->memptr, g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height); + memcpy(targ->ptex->memptr, g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height); vector texdata; u8* ptexdata = NULL; diff --git a/plugins/zerogs/opengl/zerogs.cpp b/plugins/zerogs/opengl/zerogs.cpp index 35927b0ef3..94b33a8845 100644 --- a/plugins/zerogs/opengl/zerogs.cpp +++ b/plugins/zerogs/opengl/zerogs.cpp @@ -2568,7 +2568,7 @@ void ZeroGS::Flush(int context) g_nCurVBOIndex = (g_nCurVBOIndex+1)%g_vboBuffers.size(); glBufferData(GL_ARRAY_BUFFER, curvb.nCount * sizeof(VertexGPU), curvb.pBufferData, GL_STREAM_DRAW); // void* pdata = glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); -// memcpy_amd(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU)); +// memcpy(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU)); // glUnmapBuffer(GL_ARRAY_BUFFER); SET_STREAM(); @@ -2652,7 +2652,7 @@ void ZeroGS::Flush(int context) } if( curvb.tex0.cpsm <= 1 ) { // 32 bit - memcpy_amd(&data[0], g_pbyGSClut+nClutOffset, clutsize); + memcpy(&data[0], g_pbyGSClut+nClutOffset, clutsize); } else { u16* pClutBuffer = (u16*)(g_pbyGSClut + nClutOffset); @@ -5779,7 +5779,7 @@ void ZeroGS::CaptureFrame() // u8* pend = (u8*)&data[0] + (nBackbufferHeight-1)*nBackbufferWidth*4; // for(int i = 0; i < conf.height; ++i) { -// memcpy_amd(&mem[nBackbufferWidth*4*i], pend - nBackbufferWidth*4*i, nBackbufferWidth * 4); +// memcpy(&mem[nBackbufferWidth*4*i], pend - nBackbufferWidth*4*i, nBackbufferWidth * 4); // } int fps = SMODE1->CMOD == 3 ? 50 : 60; diff --git a/plugins/zerogs/opengl/zerogs.h b/plugins/zerogs/opengl/zerogs.h index a0ee9c3143..5f06c61ffd 100644 --- a/plugins/zerogs/opengl/zerogs.h +++ b/plugins/zerogs/opengl/zerogs.h @@ -436,7 +436,7 @@ namespace ZeroGS { if( nCount + nVerts > nNumVertices ) { // recreate except with a bigger count VertexGPU* ptemp = (VertexGPU*)_aligned_malloc(sizeof(VertexGPU)*nNumVertices*2, 256); - memcpy_amd(ptemp, pBufferData, sizeof(VertexGPU) * nCount); + memcpy(ptemp, pBufferData, sizeof(VertexGPU) * nCount); nNumVertices *= 2; assert( nCount + nVerts <= nNumVertices ); _aligned_free(pBufferData); diff --git a/plugins/zzogl-pg-cg/opengl/CMakeLists.txt b/plugins/zzogl-pg-cg/opengl/CMakeLists.txt index 2630cf1bb2..641e3e3375 100644 --- a/plugins/zzogl-pg-cg/opengl/CMakeLists.txt +++ b/plugins/zzogl-pg-cg/opengl/CMakeLists.txt @@ -49,7 +49,6 @@ set(zzoglSources GSmain.cpp HostMemory.cpp Mem.cpp - # memcpy_amd.cpp Mem_Swizzle.cpp Mem_Tables.cpp Profile.cpp
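Note: this patch deletes the AMD-tuned assembly copy routine outright and switches every touched call site to the standard library memcpy. The deleted memcpy_amd.cpp already carried an equivalent fallback for its non-MSVC / x86-64 branch (plain memcpy returning dest), so the sketch below simply mirrors that fallback. It is a hypothetical compatibility shim, not part of this patch; it would only matter for out-of-tree code that still references the old symbol, and the memcpy_amd name is kept here purely for illustration.

    // Hypothetical shim (assumption, not in this patch): mirrors the fallback
    // branch the deleted memcpy_amd.cpp used for gcc/mingw/x64 builds.
    // Standard memcpy on current runtimes already uses SSE/AVX copy loops,
    // so no hand-written MMX/MOVNTQ path is needed.
    #include <cstring>

    extern "C" void* memcpy_amd(void* dest, const void* src, size_t n)
    {
        memcpy(dest, src, n);   // delegate to the C runtime copy
        return dest;            // preserve the original return-value contract
    }

For the call sites changed above the shim is unnecessary, since they now invoke memcpy directly.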