From e6b450ed9c353dc7a2d008fb89de331669ff1808 Mon Sep 17 00:00:00 2001 From: mtabachenko Date: Tue, 30 Dec 2008 00:37:38 +0000 Subject: [PATCH] core: - rewrited and moved memcpy_fast to new file (.asm) for compiling x86/x64 versions (sorry, if I broke others ports... check it please) --- desmume/src/ARM9.h | 10 +- desmume/src/GPU.cpp | 3 +- desmume/src/GPU.h | 1 + desmume/src/OGLRender.cpp | 88 +-- desmume/src/OGLRender.h | 7 + desmume/src/common-x64.asm | 27 + desmume/src/common-x86.asm | 104 ++++ desmume/src/common.h | 17 + desmume/src/common_sse2-x64.asm | 27 + desmume/src/common_sse2-x86.asm | 151 +++++ desmume/src/windows/DeSmuME_2005.vcproj | 537 +++++----------- desmume/src/windows/DeSmuME_2008.vcproj | 791 +++++++++++++++--------- 12 files changed, 1001 insertions(+), 762 deletions(-) create mode 100644 desmume/src/common-x64.asm create mode 100644 desmume/src/common-x86.asm create mode 100644 desmume/src/common_sse2-x64.asm create mode 100644 desmume/src/common_sse2-x86.asm diff --git a/desmume/src/ARM9.h b/desmume/src/ARM9.h index 668505422..72955330a 100644 --- a/desmume/src/ARM9.h +++ b/desmume/src/ARM9.h @@ -3,7 +3,7 @@ #include "types.h" -typedef struct { +typedef ALIGN(16) struct { //ARM9 mem u8 ARM9_ITCM[0x8000]; u8 ARM9_DTCM[0x4000]; @@ -18,13 +18,13 @@ typedef struct { u8 ARM9_LCD[0xA4000]; u8 ARM9_OAM[0x800]; - u8 * ExtPal[2][4]; - u8 * ObjExtPal[2][2]; - u8 * texPalSlot[6]; + u8 * ExtPal[2][4]; + u8 * ObjExtPal[2][2]; + u8 * texPalSlot[6]; u8 *textureSlotAddr[4]; - u8 *blank_memory[0x20000]; + u8 *blank_memory[0x20000]; } ARM9_struct; extern ARM9_struct ARM9Mem; diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index 42b6b5afb..d39c27cb1 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -2536,7 +2536,6 @@ static INLINE void GPU_ligne_MasterBrightness(NDS_Screen * screen, u16 l) #endif } -extern void* memcpy_fast(void* dest, const void* src, size_t count); void GPU_ligne(NDS_Screen * screen, u16 l) { GPU * gpu = screen->gpu; @@ 
-2565,7 +2564,7 @@ void GPU_ligne(NDS_Screen * screen, u16 l) { u8 * dst = GPU_screen + (screen->offset + l) * 512; u8 * src = gpu->VRAMaddr + (l*512); - memcpy(dst, src, 512); + GPU_copyLine(dst, src); } break; case 3: diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index bac20b5f3..c6f25050b 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -28,6 +28,7 @@ #include "ARM9.h" #include #include "mem.h" +#include "common.h" #include "registers.h" #include "FIFO.h" #include "MMU.h" diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index 176aa42ac..358f4693a 100644 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -24,12 +24,7 @@ //so, it doesnt composite to 2d correctly. //(re: new super mario brothers renders the stormclouds at the beginning) -#include -#include -#include -#include -#include -#include "debug.h" +#include "OGLRender.h" //#define DEBUG_DUMP_TEXTURE @@ -238,7 +233,11 @@ struct TextureCache int coord; float invSizeX; float invSizeY; +#ifdef SSE2 + ALIGN(16) unsigned char texture[128*1024]; // 128Kb texture slot +#else unsigned char texture[128*1024]; // 128Kb texture slot +#endif //set if this texture is suspected be invalid due to a vram reconfigure bool suspectedInvalid; @@ -510,80 +509,6 @@ static void OGLClose() //todo - make all color conversions go through a properly spread table!! -//I think this is slower than the regular memcmp.. doesnt make sense to me, but my -//asm optimization knowlege is 15 years old.. 
- -#if defined(_MSC_VER) || defined(__INTEL_COMPILER) -int memcmp_slow(const void* src, const void* dst, u32 count) { - int retval; - __asm { - mov [retval], 0; - mov ecx, [count]; - shr ecx, 2; - mov esi, [src]; - mov edi, [dst]; - repe cmpsd; - setc byte ptr [retval]; - } - return retval; -} - -void* memcpy_fast(void* dest, const void* src, size_t count) -{ - size_t blockCnt = count / 64; - size_t remainder = count % 64; - - __asm - { - mov esi, [src] - mov edi, [dest] - mov ecx, [blockCnt] - - test ecx, ecx - jz copy_remainder - - copyloop: - //prefetchnta [esi] - mov eax, [esi] - - movq mm0, qword ptr [esi] - movq mm1, qword ptr [esi+8] - movq mm2, qword ptr [esi+16] - movq mm3, qword ptr [esi+24] - movq mm4, qword ptr [esi+32] - movq mm5, qword ptr [esi+40] - movq mm6, qword ptr [esi+48] - movq mm7, qword ptr [esi+56] - movntq qword ptr [edi], mm0 - movntq qword ptr [edi+8], mm1 - movntq qword ptr [edi+16], mm2 - movntq qword ptr [edi+24], mm3 - movntq qword ptr [edi+32], mm4 - movntq qword ptr [edi+40], mm5 - movntq qword ptr [edi+48], mm6 - movntq qword ptr [edi+56], mm7 - - add edi, 64 - add esi, 64 - dec ecx - jnz copyloop - - sfence - emms - - copy_remainder: - - mov ecx, remainder - rep movsb - } - - return dest; -} -#else -#define memcpy_fast(d,s,c) memcpy(d,s,c) -#endif - - #if defined (DEBUG_DUMP_TEXTURE) && defined (WIN32) static void DebugDumpTexture(int which) { @@ -700,8 +625,7 @@ static void setTexture(unsigned int format, unsigned int texpal) texcache[i].coord=(format>>30); texcache[i].invSizeX=1.0f/((float)(sizeX*(1<<4))); texcache[i].invSizeY=1.0f/((float)(sizeY*(1<<4))); - //memcpy(texcache[i].texture,adr,imageSize); //======================= copy - memcpy_fast(texcache[i].texture,adr,std::min((size_t)imageSize,sizeof(texcache[i].texture))); //======================= copy + memcpy_fast(texcache[i].texture,adr,std::min((size_t)imageSize,sizeof(texcache[i].texture))); texcache[i].numcolors=palSize[texcache[i].mode]; texcache[i].frm=format; diff 
--git a/desmume/src/OGLRender.h b/desmume/src/OGLRender.h index 896ae76f2..b62ef8e5a 100644 --- a/desmume/src/OGLRender.h +++ b/desmume/src/OGLRender.h @@ -22,6 +22,13 @@ #ifndef OGLRENDER_H #define OGLRENDER_H +#include "common.h" +#include +#include +#include +#include +#include +#include "debug.h" #include "render3D.h" extern GPU3DInterface gpu3Dgl; diff --git a/desmume/src/common-x64.asm b/desmume/src/common-x64.asm new file mode 100644 index 000000000..4663d188f --- /dev/null +++ b/desmume/src/common-x64.asm @@ -0,0 +1,27 @@ +; +; Copyright (C) 2006 yopyop +; Copyright (C) 2008 CrazyMax +; +; This file is part of DeSmuME +; +; DeSmuME is free software; you can redistribute it and/or modify +; it under the terms of the GNU General Public License as published by +; the Free Software Foundation; either version 2 of the License, or +; (at your option) any later version. +; +; DeSmuME is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. +; +; You should have received a copy of the GNU General Public License +; along with DeSmuME; if not, write to the Free Software +; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + TITLE common-x64.asm + .code + +NOP_x64 PROC PUBLIC +ENDP + +end diff --git a/desmume/src/common-x86.asm b/desmume/src/common-x86.asm new file mode 100644 index 000000000..cf4c8c9ac --- /dev/null +++ b/desmume/src/common-x86.asm @@ -0,0 +1,104 @@ +; +; Copyright (C) 2006 yopyop +; Copyright (C) 2008 CrazyMax +; +; This file is part of DeSmuME +; +; DeSmuME is free software; you can redistribute it and/or modify +; it under the terms of the GNU General Public License as published by +; the Free Software Foundation; either version 2 of the License, or +; (at your option) any later version. 
+; +; DeSmuME is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. +; +; You should have received a copy of the GNU General Public License +; along with DeSmuME; if not, write to the Free Software +; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + TITLE common-x86.asm + .686P + .XMM + .model flat + .code + +@memcpy_fast@12 PROC PUBLIC + push esi + push edi ; caution: stack grew by 8 bytes + + mov esi, edx + mov edi, ecx + mov ecx, [esp+12] + + cmp ecx, 40h ; 64 bytes + jl remain_loop ;not enough bytes + + mov eax, ecx + shr eax, 6 + and ecx, 63 + +ALIGN 8 +loop_copy: + movq mm0, [esi] + movq mm1, [esi+ 8] + movntq [edi], mm0 + movntq [edi+8], mm1 + movq mm2, [esi+16] + movq mm3, [esi+24] + movntq [edi+16], mm2 + movntq [edi+24], mm3 + movq mm4, [esi+32] + movq mm5, [esi+40] + movntq [edi+32], mm4 + movntq [edi+40], mm5 + movq mm6, [esi+48] + movq mm7, [esi+56] + movntq [edi+48], mm6 + movntq [edi+56], mm7 + add esi, 64 + add edi, 64 + dec eax + jne loop_copy + + emms + +remain_loop: + rep movsb +end_loop: + pop edi + pop esi + ret 4 +@memcpy_fast@12 ENDP + +@GPU_copyLine@8 PROC PUBLIC + mov eax, 8 +ALIGN 8 +loop_copy: + movq mm0, [edx] + movq mm1, [edx+8] + movntq [ecx], mm0 + movntq [ecx+8], mm1 + movq mm2, [edx+16] + movq mm3, [edx+24] + movntq [ecx+16], mm2 + movntq [ecx+24], mm3 + movq mm4, [edx+32] + movq mm5, [edx+40] + movntq [ecx+32], mm4 + movntq [ecx+40], mm5 + movq mm6, [edx+48] + movq mm7, [edx+56] + movntq [ecx+48], mm6 + movntq [ecx+56], mm7 + add edx, 64 + add ecx, 64 + dec eax + jne loop_copy + + emms + ret 0 +@GPU_copyLine@8 ENDP + +end diff --git a/desmume/src/common.h b/desmume/src/common.h index beedea716..0f971e038 100644 --- a/desmume/src/common.h +++ b/desmume/src/common.h @@ -38,6 +38,23 @@ extern HINSTANCE hAppInst; extern char 
IniName[MAX_PATH]; extern void GetINIPath(); extern void WritePrivateProfileInt(char* appname, char* keyname, int val, char* file); + +// temporary, until the x64 build is fixed +#ifndef _WIN64 +#define memcpy_fast(d,s,c) memcpy(d,s,c) +#define GPU_copyLine(d,s) memcpy(d,s,512) +#else +extern "C" +{ + void __fastcall memcpy_fast(void* dest, void* src, size_t count); + void __fastcall GPU_copyLine(void* dest, const void* src); +} +#endif + +// TODO: verify this on other ports +#else +#define memcpy_fast(d,s,c) memcpy(d,s,c) +#define GPU_copyLine(d,s) memcpy(d,s,512) #endif extern u8 reverseBitsInByte(u8 x); diff --git a/desmume/src/common_sse2-x64.asm b/desmume/src/common_sse2-x64.asm new file mode 100644 index 000000000..77aeeef4c --- /dev/null +++ b/desmume/src/common_sse2-x64.asm @@ -0,0 +1,27 @@ +; +; Copyright (C) 2006 yopyop +; Copyright (C) 2008 CrazyMax +; +; This file is part of DeSmuME +; +; DeSmuME is free software; you can redistribute it and/or modify +; it under the terms of the GNU General Public License as published by +; the Free Software Foundation; either version 2 of the License, or +; (at your option) any later version. +; +; DeSmuME is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. 
+; +; You should have received a copy of the GNU General Public License +; along with DeSmuME; if not, write to the Free Software +; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + TITLE common_sse2-x64.asm + .code + +NOP_SSE2_x64 PROC PUBLIC +ENDP + +end diff --git a/desmume/src/common_sse2-x86.asm b/desmume/src/common_sse2-x86.asm new file mode 100644 index 000000000..c9ae2569f --- /dev/null +++ b/desmume/src/common_sse2-x86.asm @@ -0,0 +1,151 @@ +; +; Copyright (C) 2006 yopyop +; Copyright (C) 2008 CrazyMax +; +; This file is part of DeSmuME +; +; DeSmuME is free software; you can redistribute it and/or modify +; it under the terms of the GNU General Public License as published by +; the Free Software Foundation; either version 2 of the License, or +; (at your option) any later version. +; +; DeSmuME is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. 
+; +; You should have received a copy of the GNU General Public License +; along with DeSmuME; if not, write to the Free Software +; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + TITLE common_sse2-x86.asm + .686P + .XMM + .model flat + .code + +@memcpy_fast@12 PROC PUBLIC + push esi + push edi + + mov esi, edx + mov edi, ecx + mov ecx, [esp+12] + + prefetchnta [esi] + prefetchnta [esi+64] + prefetchnta [esi+128] + prefetchnta [esi+192] + prefetchnta [esi+256] + + cmp ecx, 40h ; 64 bytes + jl remain_loop ;not enough bytes + + mov edx, edi + and edx, 15 + je _aligned + + mov eax, ecx + sub edx, 16 + neg edx + sub eax, edx + mov ecx, edx + rep movsb + mov ecx, eax + +_aligned: + mov eax, ecx + shr eax, 6 + and ecx, 63 + test esi, 15 + je aligned_copy_loop + +ALIGN 8 +unaligned_copy_loop: + prefetchnta [esi+320] + movups xmm0, [esi] + movups xmm1, [esi+16] + movntps [edi], xmm0 + movntps [edi+16], xmm1 + movups xmm2, [esi+32] + movups xmm3, [esi+48] + movntps [edi+32], xmm2 + movntps [edi+48], xmm3 + add esi, 64 + add edi, 64 + dec eax + jne unaligned_copy_loop + sfence + jmp remain_loop + +ALIGN 8 +aligned_copy_loop: + prefetchnta [esi+320] + movaps xmm0, [esi] + movaps xmm1, [esi+16] + movntps [edi], xmm0 + movntps [edi+16], xmm1 + movaps xmm2, [esi+32] + movaps xmm3, [esi+48] + movntps [edi+32], xmm2 + movntps [edi+48], xmm3 + add esi, 64 + add edi, 64 + dec eax + jne aligned_copy_loop + sfence + +remain_loop: + cmp ecx, 3 + jg remain_loop2 + rep movsb + pop edi + pop esi + ret 4 +remain_loop2: + mov eax, ecx + shr ecx, 2 + rep movsd + test al, 2 + je skip_word + movsw +skip_word: + test al, 1 + je end_loop + movsb + +end_loop: + pop edi + pop esi + ret 4 +@memcpy_fast@12 ENDP + +@GPU_copyLine@8 PROC PUBLIC + prefetchnta [edx] + prefetchnta [edx+64] + prefetchnta [edx+128] + prefetchnta [edx+192] + prefetchnta [edx+256] + mov eax, 8 + +aligned_copy_loop: + prefetchnta [edx+320] + movaps xmm0, [edx] + movaps xmm1, [edx+16] + movntps 
[ecx], xmm0 + movntps [ecx+16], xmm1 + movaps xmm2, [edx+32] + movaps xmm3, [edx+48] + movntps [ecx+32], xmm2 + movntps [ecx+48], xmm3 + add edx, 64 + add ecx, 64 + dec eax + jne aligned_copy_loop + sfence + + ret 0 +@GPU_copyLine@8 ENDP + +end + diff --git a/desmume/src/windows/DeSmuME_2005.vcproj b/desmume/src/windows/DeSmuME_2005.vcproj index d2a565675..533563410 100644 --- a/desmume/src/windows/DeSmuME_2005.vcproj +++ b/desmume/src/windows/DeSmuME_2005.vcproj @@ -10,9 +10,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -655,22 +393,6 @@ CompileAs="0" /> - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -988,113 +856,6 @@ RelativePath="..\matrix.h" > - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/desmume/src/windows/DeSmuME_2008.vcproj b/desmume/src/windows/DeSmuME_2008.vcproj index 2ae2229eb..a78ce0727 100644 --- a/desmume/src/windows/DeSmuME_2008.vcproj +++ b/desmume/src/windows/DeSmuME_2008.vcproj @@ -1,7 +1,7 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -834,6 +736,325 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +