diff --git a/desmume/src/Makefile.am b/desmume/src/Makefile.am index 7d35f3b7a..755b08edb 100644 --- a/desmume/src/Makefile.am +++ b/desmume/src/Makefile.am @@ -1,9 +1,8 @@ include $(top_srcdir)/src/desmume.mk -AM_CPPFLAGS += $(SDL_CFLAGS) $(GTK_CFLAGS) $(GTHREAD_CFLAGS) $(X_CFLAGS) $(LUA_CFLAGS) $(ALSA_CFLAGS) $(LIBAGG_CFLAGS) +AM_CPPFLAGS += $(SDL_CFLAGS) $(GTK_CFLAGS) $(GTHREAD_CFLAGS) $(X_CFLAGS) $(LUA_CFLAGS) $(ALSA_CFLAGS) $(LIBAGG_CFLAGS) -EXTRA_DIST = build.bat instruction_tabdef.inc thumb_tabdef.inc fs-linux.cpp fs-windows.cpp \ - matrix_sse2-x64.asm matrix_sse2-x86.asm +EXTRA_DIST = build.bat instruction_tabdef.inc thumb_tabdef.inc fs-linux.cpp fs-windows.cpp if HAVE_GDB_STUB SUBDIRS = . gdbstub $(UI_DIR) else diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index a594058ae..5c9e6ddef 100644 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -73,15 +73,8 @@ static void ENDGL() { #include "shaders.h" #include "texcache.h" - - -#ifndef CTASSERT -#define CTASSERT(x) typedef char __assert ## y[(x) ? 
1 : -1] -#endif - static ALIGN(16) u8 GPU_screen3D [256*192*4]; - static const unsigned short map3d_cull[4] = {GL_FRONT_AND_BACK, GL_FRONT, GL_BACK, 0}; static const int texEnv[4] = { GL_MODULATE, GL_DECAL, GL_MODULATE, GL_MODULATE }; static const int depthFunc[2] = { GL_LESS, GL_EQUAL }; @@ -703,17 +696,15 @@ static void GL_ReadFramebuffer() u16* dst = gfx3d_convertedScreen + (y<<8); u8* dstAlpha = gfx3d_convertedAlpha + (y<<8); - #ifndef NOSSE2 //I dont know much about this kind of stuff, but this seems to help //for some reason I couldnt make the intrinsics work - u8* u8screen3D = (u8*)&((u32*)GPU_screen3D)[i]; - #define PREFETCH32(X,Y) __asm { prefetchnta [u8screen3D+32*0x##X##Y] } + //u8* u8screen3D = (u8*)&((u32*)GPU_screen3D)[i]; + /*#define PREFETCH32(X,Y) __asm { prefetchnta [u8screen3D+32*0x##X##Y] } #define PREFETCH128(X) PREFETCH32(X,0) PREFETCH32(X,1) PREFETCH32(X,2) PREFETCH32(X,3) \ PREFETCH32(X,4) PREFETCH32(X,5) PREFETCH32(X,6) PREFETCH32(X,7) \ PREFETCH32(X,8) PREFETCH32(X,9) PREFETCH32(X,A) PREFETCH32(X,B) \ PREFETCH32(X,C) PREFETCH32(X,D) PREFETCH32(X,E) PREFETCH32(X,F) - PREFETCH128(0); PREFETCH128(1); - #endif + PREFETCH128(0); PREFETCH128(1);*/ for(int x=0;x<256;x++,i++) { diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp index c914cb010..73d5ede2e 100644 --- a/desmume/src/gfx3d.cpp +++ b/desmume/src/gfx3d.cpp @@ -1,3 +1,4 @@ +//2 /* Copyright (C) 2006 yopyop yopyop156@ifrance.com yopyop156.ifrance.com @@ -146,11 +147,6 @@ static float float10Table[1024]; static float float10RelTable[1024]; static float normalTable[1024]; -#ifndef NOSSE2 -float ALIGN(16) _fix2float_divizor_mask[4] = { 4096.f, 4096.f, 4096.f, 4096.f }; -float ALIGN(16) _fix10_2float_divizor_mask[4] = { 512.f, 512.f, 512.f, 512.f }; -#endif - #define fix2float(v) (((float)((s32)(v))) / (float)(1<<12)) #define fix10_2float(v) (((float)((s32)(v))) / (float)(1<<9)) @@ -317,6 +313,20 @@ static void makeTables() { void gfx3d_init() { + //DWORD start = timeGetTime(); + 
//for(int i=0;i<1000000000;i++) + // MatrixMultVec4x4(mtxCurrent[0],mtxCurrent[1]); + //DWORD end = timeGetTime(); + //DWORD diff = end-start; + + //start = timeGetTime(); + //for(int i=0;i<1000000000;i++) + // MatrixMultVec4x4_b(mtxCurrent[0],mtxCurrent[1]); + //end = timeGetTime(); + //DWORD diff2 = end-start; + + //printf("SPEED TEST %d %d\n",diff,diff2); + if(polylists == NULL) { polylists = new POLYLIST[2]; polylist = &polylists[0]; } if(vertlists == NULL) { vertlists = new VERTLIST[2]; vertlist = &vertlists[0]; } makeTables(); @@ -409,15 +419,11 @@ static void SetVertex() if(polylist->count >= POLYLIST_SIZE) return; -#ifdef NOSSE2 - //apply modelview matrix - MatrixMultVec4x4 (mtxCurrent[1], coordTransformed); - - //apply projection matrix - MatrixMultVec4x4 (mtxCurrent[0], coordTransformed); -#else - _sse2_MatrixMultVec4x4_M2(mtxCurrent[0], coordTransformed); -#endif + //TODO - think about keeping the clip matrix concatenated, + //so that we only have to multiply one matrix here + //(we could lazy cache the concatenated clip matrix and only generate it + //when we need to) + MatrixMultVec4x4_M2(mtxCurrent[0], coordTransformed); //TODO - culling should be done here. //TODO - viewport transform? 
@@ -694,11 +700,7 @@ void FORCEINLINE gfx3d_glLoadIdentity() void FORCEINLINE gfx3d_glLoadMatrix4x4(s32 v) { -#ifdef NOSSE2 - mtxCurrent[mode][0] = fix2float(v); -#else mtxCurrent[mode][0] = v; -#endif for (int i = 1; i < 16; i++) { @@ -707,16 +709,10 @@ void FORCEINLINE gfx3d_glLoadMatrix4x4(s32 v) if (!GFX_PIPErecv(&cmd, ¶m)) break; dEXEC("glLoadMatrix4x4", 0x16, cmd); -#ifdef NOSSE2 - mtxCurrent[mode][i] = fix2float((s32)param); -#else mtxCurrent[mode][i] = (s32)param; -#endif } -#ifndef NOSSE2 - _sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask); -#endif + vector_fix2float<4>(mtxCurrent[mode], 4096.f); GFX_DELAY(19); @@ -726,11 +722,7 @@ void FORCEINLINE gfx3d_glLoadMatrix4x4(s32 v) void FORCEINLINE gfx3d_glLoadMatrix4x3(s32 v) { -#ifdef NOSSE2 - mtxCurrent[mode][0] = fix2float(v); -#else mtxCurrent[mode][0] = v; -#endif for (int i = 1; i < 16; i++) { @@ -740,16 +732,10 @@ void FORCEINLINE gfx3d_glLoadMatrix4x3(s32 v) if (!GFX_PIPErecv(&cmd, ¶m)) break; dEXEC("glLoadMatrix4x3", 0x17, cmd); -#ifdef NOSSE2 - mtxCurrent[mode][i] = fix2float((s32)param); -#else mtxCurrent[mode][i] = (s32)param; -#endif } -#ifndef NOSSE2 - _sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask); -#endif + vector_fix2float<4>(mtxCurrent[mode], 4096.f); //fill in the unusued matrix values mtxCurrent[mode][3] = mtxCurrent[mode][7] = mtxCurrent[mode][11] = 0.f; @@ -763,11 +749,7 @@ void FORCEINLINE gfx3d_glLoadMatrix4x3(s32 v) void FORCEINLINE gfx3d_glMultMatrix4x4(s32 v) { -#ifdef NOSSE2 - mtxTemporal[0] = fix2float(v); -#else mtxTemporal[0] = v; -#endif for (int i = 1; i < 16; i++) { @@ -776,16 +758,10 @@ void FORCEINLINE gfx3d_glMultMatrix4x4(s32 v) if (!GFX_PIPErecv(&cmd, ¶m)) break; dEXEC("glMultMatrix4x4", 0x18, cmd); -#ifdef NOSSE2 - mtxTemporal[i] = fix2float((s32)param); -#else mtxTemporal[i] = (s32)param; -#endif } -#ifndef NOSSE2 - _sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask); -#endif + vector_fix2float<4>(mtxTemporal, 4096.f); MatrixMultiply 
(mtxCurrent[mode], mtxTemporal); @@ -802,11 +778,7 @@ void FORCEINLINE gfx3d_glMultMatrix4x4(s32 v) void FORCEINLINE gfx3d_glMultMatrix4x3(s32 v) { -#ifdef NOSSE2 - mtxTemporal[0] = fix2float(v); -#else mtxTemporal[0] = v; -#endif for (int i = 1; i < 16; i++) { @@ -816,16 +788,10 @@ void FORCEINLINE gfx3d_glMultMatrix4x3(s32 v) if (!GFX_PIPErecv(&cmd, ¶m)) break; dEXEC("glMultMatrix4x3", 0x19, cmd); -#ifdef NOSSE2 - mtxTemporal[i] = fix2float((s32)param); -#else mtxTemporal[i] = (s32)param; -#endif } -#ifndef NOSSE2 - _sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask); -#endif + vector_fix2float<4>(mtxTemporal, 4096.f); //fill in the unusued matrix values mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0.f; @@ -847,12 +813,7 @@ void FORCEINLINE gfx3d_glMultMatrix4x3(s32 v) void FORCEINLINE gfx3d_glMultMatrix3x3(s32 v) { -#ifdef NOSSE2 - mtxTemporal[0] = fix2float(v); -#else mtxTemporal[0] = v; -#endif - for (int i = 1; i < 12; i++) { @@ -862,16 +823,10 @@ void FORCEINLINE gfx3d_glMultMatrix3x3(s32 v) if (!GFX_PIPErecv(&cmd, ¶m)) break; dEXEC("glMultMatrix3x3", 0x1A, cmd); -#ifdef NOSSE2 - mtxTemporal[i] = fix2float((s32)param); -#else mtxTemporal[i] = (s32)param; -#endif } -#ifndef NOSSE2 - _sse2_fix2float_12(mtxTemporal, _fix2float_divizor_mask); -#endif + vector_fix2float<3>(mtxTemporal, 4096.f); //fill in the unusued matrix values mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0; @@ -1276,12 +1231,7 @@ void FORCEINLINE gfx3d_glPosTest(u32 v) PTcoords[2] = float16table[param & 0xFFFF]; PTcoords[3] = 1.0f; -#ifdef NOSSE2 - MatrixMultVec4x4 (mtxCurrent[1], PTcoords); - MatrixMultVec4x4 (mtxCurrent[0], PTcoords); -#else - _sse2_MatrixMultVec4x4_M2(mtxCurrent[0], PTcoords); -#endif + MatrixMultVec4x4_M2(mtxCurrent[0], PTcoords); gxstat &= 0xFFFFFFFE; // cleay busy bit T1WriteLong(MMU.MMU_MEM[ARMCPU_ARM9][0x40], 0x600, gxstat); @@ -1413,11 +1363,7 @@ void gfx3d_glLoadIdentity() BOOL gfx3d_glLoadMatrix4x4(s32 v) { -#ifdef NOSSE2 - 
mtxCurrent[mode][ML4x4ind] = fix2float(v); -#else mtxCurrent[mode][ML4x4ind] = v; -#endif ++ML4x4ind; if(ML4x4ind<16) return FALSE; @@ -1425,9 +1371,7 @@ BOOL gfx3d_glLoadMatrix4x4(s32 v) GFX_DELAY(19); -#ifndef NOSSE2 - _sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask); -#endif + vector_fix2float<4>(mtxCurrent[mode], 4096.f); if (mode == 2) MatrixCopy (mtxCurrent[1], mtxCurrent[2]); @@ -1436,20 +1380,14 @@ BOOL gfx3d_glLoadMatrix4x4(s32 v) BOOL gfx3d_glLoadMatrix4x3(s32 v) { -#ifdef NOSSE2 - mtxCurrent[mode][ML4x3ind] = fix2float(v); -#else mtxCurrent[mode][ML4x3ind] = v; -#endif ML4x3ind++; if((ML4x3ind & 0x03) == 3) ML4x3ind++; if(ML4x3ind<16) return FALSE; ML4x3ind = 0; -#ifndef NOSSE2 - _sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask); -#endif + vector_fix2float<4>(mtxCurrent[mode], 4096.f); //fill in the unusued matrix values mtxCurrent[mode][3] = mtxCurrent[mode][7] = mtxCurrent[mode][11] = 0.f; @@ -1464,11 +1402,7 @@ BOOL gfx3d_glLoadMatrix4x3(s32 v) BOOL gfx3d_glMultMatrix4x4(s32 v) { -#ifdef NOSSE2 - mtxTemporal[MM4x4ind] = fix2float(v); -#else mtxTemporal[MM4x4ind] = v; -#endif MM4x4ind++; if(MM4x4ind<16) return FALSE; @@ -1476,9 +1410,7 @@ BOOL gfx3d_glMultMatrix4x4(s32 v) GFX_DELAY(35); -#ifndef NOSSE2 - _sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask); -#endif + vector_fix2float<4>(mtxTemporal, 4096.f); MatrixMultiply (mtxCurrent[mode], mtxTemporal); @@ -1494,11 +1426,7 @@ BOOL gfx3d_glMultMatrix4x4(s32 v) BOOL gfx3d_glMultMatrix4x3(s32 v) { -#ifdef NOSSE2 - mtxTemporal[MM4x3ind] = fix2float(v); -#else mtxTemporal[MM4x3ind] = v; -#endif MM4x3ind++; if((MM4x3ind & 0x03) == 3) MM4x3ind++; @@ -1507,9 +1435,7 @@ BOOL gfx3d_glMultMatrix4x3(s32 v) GFX_DELAY(31); -#ifndef NOSSE2 - _sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask); -#endif + vector_fix2float<4>(mtxTemporal, 4096.f); //fill in the unusued matrix values mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0.f; @@ -1530,11 +1456,7 @@ BOOL 
gfx3d_glMultMatrix4x3(s32 v) BOOL gfx3d_glMultMatrix3x3(s32 v) { -#ifdef NOSSE2 - mtxTemporal[MM3x3ind] = fix2float(v); -#else mtxTemporal[MM3x3ind] = v; -#endif MM3x3ind++; @@ -1544,9 +1466,7 @@ BOOL gfx3d_glMultMatrix3x3(s32 v) GFX_DELAY(28); -#ifndef NOSSE2 - _sse2_fix2float_12(mtxTemporal, _fix2float_divizor_mask); -#endif + vector_fix2float<3>(mtxTemporal, 4096.f); //fill in the unusued matrix values mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0; diff --git a/desmume/src/matrix.cpp b/desmume/src/matrix.cpp index 3951e204e..4261abee0 100644 --- a/desmume/src/matrix.cpp +++ b/desmume/src/matrix.cpp @@ -25,17 +25,10 @@ #include #include "matrix.h" -extern "C" { - - -void MatrixInit (float *matrix) -{ - memset (matrix, 0, sizeof(float)*16); - matrix[0] = matrix[5] = matrix[10] = matrix[15] = 1.f; -} - -#ifdef NOSSE2 -void MATRIXFASTCALL MatrixMultVec4x4 (const float *matrix, float *vecPtr) +//------------------------- +//switched SSE functions: implementations for no SSE +#ifndef ENABLE_SSE +void MatrixMultVec4x4 (const float *matrix, float *vecPtr) { float x = vecPtr[0]; float y = vecPtr[1]; @@ -48,7 +41,8 @@ void MATRIXFASTCALL MatrixMultVec4x4 (const float *matrix, float *vecPtr) vecPtr[3] = x * matrix[3] + y * matrix[7] + z * matrix[11] + w * matrix[15]; } -void MATRIXFASTCALL MatrixMultVec3x3 (const float *matrix, float *vecPtr) + +void MatrixMultVec3x3 (const float *matrix, float *vecPtr) { float x = vecPtr[0]; float y = vecPtr[1]; @@ -59,7 +53,7 @@ void MATRIXFASTCALL MatrixMultVec3x3 (const float *matrix, float *vecPtr) vecPtr[2] = x * matrix[2] + y * matrix[6] + z * matrix[10]; } -void MATRIXFASTCALL MatrixMultiply (float *matrix, const float *rightMatrix) +void MatrixMultiply (float *matrix, const float *rightMatrix) { float tmpMatrix[16]; @@ -86,7 +80,7 @@ void MATRIXFASTCALL MatrixMultiply (float *matrix, const float *rightMatrix) memcpy (matrix, tmpMatrix, sizeof(float)*16); } -void MATRIXFASTCALL MatrixTranslate (float *matrix, const float 
*ptr) +void MatrixTranslate (float *matrix, const float *ptr) { matrix[12] += (matrix[0]*ptr[0])+(matrix[4]*ptr[1])+(matrix[ 8]*ptr[2]); matrix[13] += (matrix[1]*ptr[0])+(matrix[5]*ptr[1])+(matrix[ 9]*ptr[2]); @@ -94,7 +88,7 @@ void MATRIXFASTCALL MatrixTranslate (float *matrix, const float *ptr) matrix[15] += (matrix[3]*ptr[0])+(matrix[7]*ptr[1])+(matrix[11]*ptr[2]); } -void MATRIXFASTCALL MatrixScale (float *matrix, const float *ptr) +void MatrixScale (float *matrix, const float *ptr) { matrix[0] *= ptr[0]; matrix[1] *= ptr[0]; @@ -111,9 +105,16 @@ void MATRIXFASTCALL MatrixScale (float *matrix, const float *ptr) matrix[10] *= ptr[2]; matrix[11] *= ptr[2]; } + #endif //switched c/asm functions //----------------------------------------- +void MatrixInit (float *matrix) +{ + memset (matrix, 0, sizeof(float)*16); + matrix[0] = matrix[5] = matrix[10] = matrix[15] = 1.f; +} + void MatrixTranspose(float *matrix) { float temp; @@ -127,7 +128,7 @@ void MatrixTranspose(float *matrix) #undef swap } -void MATRIXFASTCALL MatrixIdentity (float *matrix) +void MatrixIdentity (float *matrix) { matrix[1] = matrix[2] = matrix[3] = matrix[4] = 0.0f; matrix[6] = matrix[7] = matrix[8] = matrix[9] = 0.0f; @@ -135,7 +136,7 @@ void MATRIXFASTCALL MatrixIdentity (float *matrix) matrix[0] = matrix[5] = matrix[10] = matrix[15] = 1.f; } -float MATRIXFASTCALL MatrixGetMultipliedIndex (int index, float *matrix, float *rightMatrix) +float MatrixGetMultipliedIndex (int index, float *matrix, float *rightMatrix) { int iMod = index%4, iDiv = (index>>2)<<2; @@ -143,12 +144,12 @@ float MATRIXFASTCALL MatrixGetMultipliedIndex (int index, float *matrix, float * (matrix[iMod+8]*rightMatrix[iDiv+2])+(matrix[iMod+12]*rightMatrix[iDiv+3]); } -void MATRIXFASTCALL MatrixSet (float *matrix, int x, int y, float value) // TODO +void MatrixSet (float *matrix, int x, int y, float value) // TODO { matrix [x+(y<<2)] = value; } -void MATRIXFASTCALL MatrixCopy (float* matrixDST, const float* matrixSRC) +void 
MatrixCopy (float* matrixDST, const float* matrixSRC) { matrixDST[0] = matrixSRC[0]; matrixDST[1] = matrixSRC[1]; @@ -169,7 +170,7 @@ void MATRIXFASTCALL MatrixCopy (float* matrixDST, const float* matrixSRC) } -int MATRIXFASTCALL MatrixCompare (const float* matrixDST, const float* matrixSRC) +int MatrixCompare (const float* matrixDST, const float* matrixSRC) { return memcmp((void*)matrixDST, matrixSRC, sizeof(float)*16); } @@ -340,5 +341,4 @@ void Vector4Copy(float *dst, const float *src) dst[3] = src[3]; } -} //extern "C" diff --git a/desmume/src/matrix.h b/desmume/src/matrix.h index b67dd2675..0f15e6269 100644 --- a/desmume/src/matrix.h +++ b/desmume/src/matrix.h @@ -1,5 +1,5 @@ -/* - Copyright (C) 2006-2007 shash +/* Copyright (C) 2006-2007 shash + Copyright (C) 2009 DeSmuME team This file is part of DeSmuME @@ -27,17 +27,14 @@ #include "types.h" #include "mem.h" -#if !defined(NOSSE2) && !defined(SSE2_NOINTRIN) -#define SSE2_INTRIN +#ifdef ENABLE_SSE +#include #endif -#ifdef SSE2_INTRIN -#include +#ifdef ENABLE_SSE2 #include #endif -extern "C" { - struct MatrixStack { MatrixStack(int size); @@ -48,42 +45,15 @@ struct MatrixStack void MatrixInit (float *matrix); -#if defined(_MSC_VER) || defined(__INTEL_COMPILER) -#define MATRIXFASTCALL __fastcall -#else -#define MATRIXFASTCALL -#endif - //In order to conditionally use these asm optimized functions in visual studio //without having to make new build types to exclude the assembly files. 
//a bit sloppy, but there aint much to it -#ifndef NOSSE2 -#define SSE2_FUNC(X) _sse2_##X -#define MatrixMultVec4x4 _sse2_MatrixMultVec4x4 -#define MatrixMultVec3x3 _sse2_MatrixMultVec3x3 -#define MatrixMultiply _sse2_MatrixMultiply -#define MatrixTranslate _sse2_MatrixTranslate -#define MatrixScale _sse2_MatrixScale -void MATRIXFASTCALL _sse2_fix2float_16 (float* matrix, float* divizor_mask); -void MATRIXFASTCALL _sse2_fix2float_12 (float* matrix, float* divizor_mask); -void MATRIXFASTCALL _sse2_MatrixMultVec4x4_M2 (const float * matrix, float * vecPtr); // mode 2 -#else -#define SSE2_FUNC(X) X -#endif -void MATRIXFASTCALL SSE2_FUNC(MatrixMultVec3x3) (const float * matrix, float * vecPtr); -void MATRIXFASTCALL SSE2_FUNC(MatrixMultVec4x4) (const float * matrix, float * vecPtr); -void MATRIXFASTCALL SSE2_FUNC(MatrixMultiply) (float * matrix, const float * rightMatrix); -void MATRIXFASTCALL SSE2_FUNC(MatrixTranslate) (float *matrix, const float *ptr); -void MATRIXFASTCALL SSE2_FUNC(MatrixScale) (float * matrix, const float * ptr); - - - -float MATRIXFASTCALL MatrixGetMultipliedIndex (int index, float *matrix, float *rightMatrix); -void MATRIXFASTCALL MatrixSet (float *matrix, int x, int y, float value); -void MATRIXFASTCALL MatrixCopy (float * matrixDST, const float * matrixSRC); -int MATRIXFASTCALL MatrixCompare (const float * matrixDST, const float * matrixSRC); -void MATRIXFASTCALL MatrixIdentity (float *matrix); +float MatrixGetMultipliedIndex (int index, float *matrix, float *rightMatrix); +void MatrixSet (float *matrix, int x, int y, float value); +void MatrixCopy (float * matrixDST, const float * matrixSRC); +int MatrixCompare (const float * matrixDST, const float * matrixSRC); +void MatrixIdentity (float *matrix); void MatrixTranspose (float *matrix); void MatrixStackInit (MatrixStack *stack); @@ -112,27 +82,21 @@ void Vector3Normalize(float *dst); void Vector4Copy(float *dst, const float *src); -} //extern "C" - //these functions are an unreliable, 
inaccurate floor. //it should only be used for positive numbers //this isnt as fast as it could be if we used a visual c++ intrinsic, but those appear not to be universally available FORCEINLINE u32 u32floor(float f) { -#if defined(SSE2_INTRIN) - return (u32)_mm_cvttss_si32(_mm_set_ss(f)); -#elif !defined(NOSSE2) - __asm cvttss2si eax, f; +#ifdef ENABLE_SSE2 + return (u32)_mm_cvtt_ss2si(_mm_set_ss(f)); #else return (u32)f; #endif } FORCEINLINE u32 u32floor(double d) { -#if defined(SSE2_INTRIN) +#ifdef ENABLE_SSE2 return (u32)_mm_cvttsd_si32(_mm_set_sd(d)); -#elif !defined(NOSSE2) - __asm cvttsd2si eax, d; #else return (u32)d; #endif @@ -142,66 +106,212 @@ FORCEINLINE u32 u32floor(double d) //be sure that the results are the same thing as floorf! FORCEINLINE s32 s32floor(float f) { -#if defined(SSE2_INTRIN) +#ifdef ENABLE_SSE2 return _mm_cvtss_si32( _mm_add_ss(_mm_set_ss(-0.5f),_mm_add_ss(_mm_set_ss(f), _mm_set_ss(f))) ) >> 1; -#elif !defined(NOSSE2) - static const float c = -0.5f; - __asm - { - movss xmm0, f; - addss xmm0, xmm0; - addss xmm0, c; - cvtss2si eax, xmm0 - sar eax, 1 - } #else return (s32)floorf(f); #endif } -//now comes some sse2 functions coded solely with intrinsics. -//let's wait and see how many people this upsets. -//they can always #define SSE2_NOINTRIN in their userconfig.h.... 
- -#ifdef SSE2_INTRIN +//switched SSE2 functions +//------------- +#ifdef ENABLE_SSE template -static FORCEINLINE void memset_u16_le(void* dst, u16 val) +FORCEINLINE void memset_u16_le(void* dst, u16 val) { u32 u32val; //just for the endian safety T1WriteWord((u8*)&u32val,0,val); T1WriteWord((u8*)&u32val,2,val); - const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val); - MACRODO_N(NUM/8,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), temp)); + ////const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val); + __m128 temp; temp.m128_i32[0] = u32val; + //MACRODO_N(NUM/8,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), temp)); + MACRODO_N(NUM/8,_mm_store_ps1((float*)((u8*)dst+(X)*16), temp)); } -#else + +#else //no sse2 + template static FORCEINLINE void memset_u16_le(void* dst, u16 val) { for(int i=0;i +FORCEINLINE void vector_fix2float(float* matrix, const float divisor) +{ + CTASSERT(NUM_ROWS==3 || NUM_ROWS==4); + + const __m128 val = _mm_set_ps1(divisor); + + _mm_store_ps(matrix,_mm_div_ps(_mm_load_ps(matrix),val)); + _mm_store_ps(matrix+4,_mm_div_ps(_mm_load_ps(matrix+4),val)); + _mm_store_ps(matrix+8,_mm_div_ps(_mm_load_ps(matrix+8),val)); + if(NUM_ROWS==4) + _mm_store_ps(matrix+12,_mm_div_ps(_mm_load_ps(matrix+12),val)); +} + //WARNING: I do not think this is as fast as a memset, for some reason. //at least in vc2005 with sse enabled. 
better figure out why before using it -#ifdef SSE2_INTRIN template static FORCEINLINE void memset_u8(void* _dst, u8 val) { - const u8* dst = (u8*)_dst; - u32 u32val = (val<<24)|(val<<16)|(val<<8)|val; - const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val); - MACRODO_N(NUM/16,_mm_store_si128((__m128i*)(dst+(X)*16), temp)); + memset(_dst,val,NUM); + //const u8* dst = (u8*)_dst; + //u32 u32val = (val<<24)|(val<<16)|(val<<8)|val; + //const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val); + //MACRODO_N(NUM/16,_mm_store_si128((__m128i*)(dst+(X)*16), temp)); } -#else + +#else //no sse + +void MatrixMultVec4x4 (const float *matrix, float *vecPtr); +void MatrixMultVec3x3(const float * matrix, float * vecPtr); +void MatrixMultiply(float * matrix, const float * rightMatrix); +void MatrixTranslate(float *matrix, const float *ptr); +void MatrixScale(float * matrix, const float * ptr); + +FORCEINLINE void MatrixMultVec4x4_M2(const float *matrix, float *vecPtr) +{ + //there are hardly any gains from merging these manually + MatrixMultVec4x4(matrix+16,vecPtr); + MatrixMultVec4x4(matrix,vecPtr); +} + +template +FORCEINLINE void vector_fix2float(float* matrix, const float divisor) +{ + for(int i=0;i static FORCEINLINE void memset_u8(void* dst, u8 val) { memset(dst,val,NUM); } -#endif + +#endif //switched SSE functions #endif diff --git a/desmume/src/matrix_sse2-x64.asm b/desmume/src/matrix_sse2-x64.asm deleted file mode 100644 index ffbeb7a2f..000000000 --- a/desmume/src/matrix_sse2-x64.asm +++ /dev/null @@ -1,182 +0,0 @@ -; -; Copyright (C) 2006 yopyop -; Copyright (C) 2008 CrazyMax -; -; This file is part of DeSmuME -; -; DeSmuME is free software; you can redistribute it and/or modify -; it under the terms of the GNU General Public License as published by -; the Free Software Foundation; either version 2 of the License, or -; (at your option) any later version. 
-; -; DeSmuME is distributed in the hope that it will be useful, -; but WITHOUT ANY WARRANTY; without even the implied warranty of -; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -; GNU General Public License for more details. -; -; You should have received a copy of the GNU General Public License -; along with DeSmuME; if not, write to the Free Software -; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - - TITLE matrix_sse2-x64.asm - .code - -_sse2_MatrixMultVec4x4 PROC PUBLIC - movaps xmm0, XMMWORD PTR [rcx] - movaps xmm1, XMMWORD PTR [rcx+16] - movaps xmm2, XMMWORD PTR [rcx+32] - movaps xmm3, XMMWORD PTR [rcx+48] - movaps xmm4, XMMWORD PTR [rdx] - movaps xmm5, xmm4 - movaps xmm6, xmm4 - movaps xmm7, xmm4 - shufps xmm4, xmm4, 00000000b - shufps xmm5, xmm5, 01010101b - shufps xmm6, xmm6, 10101010b - shufps xmm7, xmm7, 11111111b - mulps xmm4, xmm0 - mulps xmm5, xmm1 - mulps xmm6, xmm2 - mulps xmm7, xmm3 - addps xmm4, xmm5 - addps xmm4, xmm6 - addps xmm4, xmm7 - movaps XMMWORD PTR [rdx], xmm4 - ret 0 -_sse2_MatrixMultVec4x4 ENDP - -_sse2_MatrixMultVec3x3 PROC PUBLIC - movaps xmm0, XMMWORD PTR [rcx] - movaps xmm1, XMMWORD PTR [rcx+16] - movaps xmm2, XMMWORD PTR [rcx+32] - movaps xmm4, XMMWORD PTR [rdx] - movaps xmm5, xmm4 - movaps xmm6, xmm4 - movaps xmm7, xmm4 - shufps xmm4, xmm4, 00000000b - shufps xmm5, xmm5, 01010101b - shufps xmm6, xmm6, 10101010b - mulps xmm4, xmm0 - mulps xmm5, xmm1 - mulps xmm6, xmm2 - addps xmm4, xmm5 - addps xmm4, xmm6 - movaps XMMWORD PTR [rdx], xmm4 - ret 0 -_sse2_MatrixMultVec3x3 ENDP - -_sse2_MatrixMultiply PROC PUBLIC - movaps xmm0, XMMWORD PTR [rcx] - movaps xmm1, XMMWORD PTR [rcx+16] - movaps xmm2, XMMWORD PTR [rcx+32] - movaps xmm3, XMMWORD PTR [rcx+48] - movaps xmm4, XMMWORD PTR [rdx] ; r00, r01, r02, r03 - movaps xmm8, XMMWORD PTR [rdx+16] ; r04, r05, r06, r07 - movaps xmm5,xmm4 - movaps xmm6,xmm4 - movaps xmm7,xmm4 - movaps xmm9,xmm8 ; - movaps xmm10,xmm8 - movaps xmm11,xmm8 - shufps 
xmm4,xmm4,00000000b - shufps xmm5,xmm5,01010101b - shufps xmm6,xmm6,10101010b - shufps xmm7,xmm7,11111111b - shufps xmm8, xmm8, 00000000b ; - shufps xmm9, xmm9, 01010101b - shufps xmm10,xmm10,10101010b - shufps xmm11,xmm11,11111111b - mulps xmm4,xmm0 - mulps xmm5,xmm1 - mulps xmm6,xmm2 - mulps xmm7,xmm3 - mulps xmm8, xmm0 ; - mulps xmm9, xmm1 - mulps xmm10,xmm2 - mulps xmm11,xmm3 - addps xmm4,xmm5 - addps xmm4,xmm6 - addps xmm4,xmm7 - addps xmm8,xmm9 ; - addps xmm8,xmm10 - addps xmm8,xmm11 - movaps XMMWORD PTR [rcx],xmm4 - movaps XMMWORD PTR [rcx+16],xmm8 - - movaps xmm4, XMMWORD PTR [rdx+32] ; r00, r01, r02, r03 - movaps xmm8, XMMWORD PTR [rdx+48] ; r04, r05, r06, r07 - movaps xmm5,xmm4 - movaps xmm6,xmm4 - movaps xmm7,xmm4 - movaps xmm9,xmm8 ; - movaps xmm10,xmm8 - movaps xmm11,xmm8 - shufps xmm4,xmm4,00000000b - shufps xmm5,xmm5,01010101b - shufps xmm6,xmm6,10101010b - shufps xmm7,xmm7,11111111b - shufps xmm8, xmm8, 00000000b ; - shufps xmm9, xmm9, 01010101b - shufps xmm10,xmm10,10101010b - shufps xmm11,xmm11,11111111b - mulps xmm4,xmm0 - mulps xmm5,xmm1 - mulps xmm6,xmm2 - mulps xmm7,xmm3 - mulps xmm8, xmm0 ; - mulps xmm9, xmm1 - mulps xmm10,xmm2 - mulps xmm11,xmm3 - addps xmm4,xmm5 - addps xmm4,xmm6 - addps xmm4,xmm7 - addps xmm8,xmm9 ; - addps xmm8,xmm10 - addps xmm8,xmm11 - movaps XMMWORD PTR [rcx+32],xmm4 - movaps XMMWORD PTR [rcx+48],xmm8 - ret 0 -_sse2_MatrixMultiply ENDP - -_sse2_MatrixTranslate PROC PUBLIC - movaps xmm0, XMMWORD PTR [rcx] - movaps xmm1, XMMWORD PTR [rcx+16] - movaps xmm2, XMMWORD PTR [rcx+32] - movaps xmm3, XMMWORD PTR [rcx+48] - movaps xmm4, XMMWORD PTR [rdx] - movaps xmm5, xmm4 - movaps xmm6, xmm4 - movaps xmm7, xmm4 - shufps xmm4, xmm4, 00000000b - shufps xmm5, xmm5, 01010101b - shufps xmm6, xmm6, 10101010b - mulps xmm4, xmm0 - mulps xmm5, xmm1 - mulps xmm6, xmm2 - addps xmm4, xmm5 - addps xmm4, xmm6 - addps xmm4, xmm3 - movaps XMMWORD PTR [rcx+48], xmm4 - ret 0 -_sse2_MatrixTranslate ENDP - -_sse2_MatrixScale PROC PUBLIC - movaps 
xmm0, XMMWORD PTR [rcx] - movaps xmm1, XMMWORD PTR [rcx+16] - movaps xmm2, XMMWORD PTR [rcx+32] - movaps xmm4, XMMWORD PTR [rdx] - movaps xmm5, xmm4 - movaps xmm6, xmm4 - shufps xmm4, xmm4, 00000000b - shufps xmm5, xmm5, 01010101b - shufps xmm6, xmm6, 10101010b - mulps xmm4, xmm0 - mulps xmm5, xmm1 - mulps xmm6, xmm2 - movaps XMMWORD PTR [rcx],xmm4 - movaps XMMWORD PTR [rcx+16],xmm5 - movaps XMMWORD PTR [rcx+32],xmm6 - ret 0 -_sse2_MatrixScale ENDP - -end diff --git a/desmume/src/matrix_sse2-x86.asm b/desmume/src/matrix_sse2-x86.asm deleted file mode 100644 index 4f2ddf30e..000000000 --- a/desmume/src/matrix_sse2-x86.asm +++ /dev/null @@ -1,214 +0,0 @@ -; -; Copyright (C) 2006 yopyop -; Copyright (C) 2008 CrazyMax -; -; This file is part of DeSmuME -; -; DeSmuME is free software; you can redistribute it and/or modify -; it under the terms of the GNU General Public License as published by -; the Free Software Foundation; either version 2 of the License, or -; (at your option) any later version. -; -; DeSmuME is distributed in the hope that it will be useful, -; but WITHOUT ANY WARRANTY; without even the implied warranty of -; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -; GNU General Public License for more details. 
-; -; You should have received a copy of the GNU General Public License -; along with DeSmuME; if not, write to the Free Software -; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - - TITLE matrix_sse2-x86.asm - .686P - .XMM - .model flat - .code - -@_sse2_MatrixMultVec4x4@8 PROC PUBLIC - movaps xmm4, XMMWORD PTR [edx] - pshufd xmm5, xmm4, 01010101b - pshufd xmm6, xmm4, 10101010b - pshufd xmm7, xmm4, 11111111b - shufps xmm4, xmm4, 00000000b - mulps xmm4, XMMWORD PTR [ecx] - mulps xmm5, XMMWORD PTR [ecx+16] - mulps xmm6, XMMWORD PTR [ecx+32] - mulps xmm7, XMMWORD PTR [ecx+48] - addps xmm4, xmm5 - addps xmm4, xmm6 - addps xmm4, xmm7 - movaps XMMWORD PTR [edx], xmm4 - ret 0 -@_sse2_MatrixMultVec4x4@8 ENDP - -@_sse2_MatrixMultVec4x4_M2@8 PROC PUBLIC - movaps xmm4, XMMWORD PTR [edx] - pshufd xmm5, xmm4, 01010101b - pshufd xmm6, xmm4, 10101010b - pshufd xmm7, xmm4, 11111111b - shufps xmm4, xmm4, 00000000b - mulps xmm4, XMMWORD PTR [ecx+64] - mulps xmm5, XMMWORD PTR [ecx+80] - mulps xmm6, XMMWORD PTR [ecx+96] - mulps xmm7, XMMWORD PTR [ecx+112] - addps xmm4, xmm5 - addps xmm4, xmm6 - addps xmm4, xmm7 - pshufd xmm5, xmm4, 01010101b - pshufd xmm6, xmm4, 10101010b - pshufd xmm7, xmm4, 11111111b - shufps xmm4, xmm4, 00000000b - mulps xmm4, XMMWORD PTR [ecx] - mulps xmm5, XMMWORD PTR [ecx+16] - mulps xmm6, XMMWORD PTR [ecx+32] - mulps xmm7, XMMWORD PTR [ecx+48] - addps xmm4, xmm5 - addps xmm4, xmm6 - addps xmm4, xmm7 - movaps XMMWORD PTR [edx], xmm4 - ret 0 -@_sse2_MatrixMultVec4x4_M2@8 ENDP - - -@_sse2_MatrixMultVec3x3@8 PROC PUBLIC - movaps xmm4, XMMWORD PTR [edx] - pshufd xmm5, xmm4, 01010101b - pshufd xmm6, xmm4, 10101010b - shufps xmm4, xmm4, 00000000b - mulps xmm4, XMMWORD PTR [ecx] - mulps xmm5, XMMWORD PTR [ecx+16] - mulps xmm6, XMMWORD PTR [ecx+32] - addps xmm4, xmm5 - addps xmm4, xmm6 - movaps XMMWORD PTR [edx], xmm4 - ret 0 -@_sse2_MatrixMultVec3x3@8 ENDP - -@_sse2_MatrixMultiply@8 PROC PUBLIC - movaps xmm0, XMMWORD PTR [ecx] - movaps xmm1, 
XMMWORD PTR [ecx+16] - movaps xmm2, XMMWORD PTR [ecx+32] - movaps xmm3, XMMWORD PTR [ecx+48] - movaps xmm4, XMMWORD PTR [edx] ; r00, r01, r02, r03 - pshufd xmm5, xmm4, 01010101b - pshufd xmm6, xmm4, 10101010b - pshufd xmm7, xmm4, 11111111b - shufps xmm4, xmm4, 00000000b - mulps xmm4,xmm0 - mulps xmm5,xmm1 - mulps xmm6,xmm2 - mulps xmm7,xmm3 - addps xmm4,xmm5 - addps xmm4,xmm6 - addps xmm4,xmm7 - movaps XMMWORD PTR [ecx],xmm4 - - movaps xmm4, XMMWORD PTR [edx+16] ; r04, r05, r06, r07 - pshufd xmm5, xmm4, 01010101b - pshufd xmm6, xmm4, 10101010b - pshufd xmm7, xmm4, 11111111b - shufps xmm4, xmm4, 00000000b - mulps xmm4,xmm0 - mulps xmm5,xmm1 - mulps xmm6,xmm2 - mulps xmm7,xmm3 - addps xmm4,xmm5 - addps xmm4,xmm6 - addps xmm4,xmm7 - movaps XMMWORD PTR [ecx+16],xmm4 - - movaps xmm4, XMMWORD PTR [edx+32] ; r08, r09, r10, r11 - pshufd xmm5, xmm4, 01010101b - pshufd xmm6, xmm4, 10101010b - pshufd xmm7, xmm4, 11111111b - shufps xmm4, xmm4, 00000000b - mulps xmm4,xmm0 - mulps xmm5,xmm1 - mulps xmm6,xmm2 - mulps xmm7,xmm3 - addps xmm4,xmm5 - addps xmm4,xmm6 - addps xmm4,xmm7 - movaps XMMWORD PTR [ecx+32],xmm4 - - movaps xmm4, XMMWORD PTR [edx+48] ; r12, r13, r14, r15 - pshufd xmm5, xmm4, 01010101b - pshufd xmm6, xmm4, 10101010b - pshufd xmm7, xmm4, 11111111b - shufps xmm4, xmm4, 00000000b - mulps xmm4,xmm0 - mulps xmm5,xmm1 - mulps xmm6,xmm2 - mulps xmm7,xmm3 - addps xmm4,xmm5 - addps xmm4,xmm6 - addps xmm4,xmm7 - movaps XMMWORD PTR [ecx+48],xmm4 - - ret 0 -@_sse2_MatrixMultiply@8 ENDP - -@_sse2_MatrixTranslate@8 PROC PUBLIC - movaps xmm4, XMMWORD PTR [edx] - pshufd xmm5, xmm4, 01010101b - pshufd xmm6, xmm4, 10101010b - shufps xmm4, xmm4, 00000000b - mulps xmm4, XMMWORD PTR [ecx] - mulps xmm5, XMMWORD PTR [ecx+16] - mulps xmm6, XMMWORD PTR [ecx+32] - addps xmm4, xmm5 - addps xmm4, xmm6 - addps xmm4, XMMWORD PTR [ecx+48] - movaps XMMWORD PTR [ecx+48], xmm4 - ret 0 -@_sse2_MatrixTranslate@8 ENDP - -@_sse2_MatrixScale@8 PROC PUBLIC - movaps xmm4, XMMWORD PTR [edx] - pshufd 
xmm5, xmm4, 01010101b - pshufd xmm6, xmm4, 10101010b - shufps xmm4, xmm4, 00000000b - mulps xmm4, XMMWORD PTR [ecx] - mulps xmm5, XMMWORD PTR [ecx+16] - mulps xmm6, XMMWORD PTR [ecx+32] - movaps XMMWORD PTR [ecx], xmm4 - movaps XMMWORD PTR [ecx+16], xmm5 - movaps XMMWORD PTR [ecx+32], xmm6 - ret 0 -@_sse2_MatrixScale@8 ENDP - -@_sse2_fix2float_12@8 PROC PUBLIC - movaps xmm0, XMMWORD PTR[ecx] - movaps xmm1, XMMWORD PTR[ecx+16] - movaps xmm2, XMMWORD PTR[ecx+32] - movaps xmm4, XMMWORD PTR [edx] - ;prefetchnta [ecx+64] - divps xmm0, xmm4 - divps xmm1, xmm4 - divps xmm2, xmm4 - movaps XMMWORD PTR[ecx], xmm0 - movaps XMMWORD PTR[ecx+16],xmm1 - movaps XMMWORD PTR[ecx+32],xmm2 - ret 0 -@_sse2_fix2float_12@8 ENDP - -@_sse2_fix2float_16@8 PROC PUBLIC - movaps xmm0, XMMWORD PTR[ecx] - movaps xmm1, XMMWORD PTR[ecx+16] - movaps xmm2, XMMWORD PTR[ecx+32] - movaps xmm3, XMMWORD PTR[ecx+48] - movaps xmm4, XMMWORD PTR [edx] - ;prefetchnta [ecx+64] - divps xmm0, xmm4 - divps xmm1, xmm4 - divps xmm2, xmm4 - divps xmm3, xmm4 - movaps XMMWORD PTR[ecx], xmm0 - movaps XMMWORD PTR[ecx+16],xmm1 - movaps XMMWORD PTR[ecx+32],xmm2 - movaps XMMWORD PTR[ecx+48],xmm3 - ret 0 -@_sse2_fix2float_16@8 ENDP - -end - diff --git a/desmume/src/rasterize.cpp b/desmume/src/rasterize.cpp index 4bc71fd23..37ea79865 100644 --- a/desmume/src/rasterize.cpp +++ b/desmume/src/rasterize.cpp @@ -1022,12 +1022,10 @@ static void SoftRastConvertFramebuffer() for(int i=0,y=0;y<192;y++) { - #ifndef NOSSE2 - u8* wanx = (u8*)&src[i]; - #define ASS(X,Y) __asm { prefetchnta [wanx+32*0x##X##Y] } - #define PUNK(X) ASS(X,0) ASS(X,1) ASS(X,2) ASS(X,3) ASS(X,4) ASS(X,5) ASS(X,6) ASS(X,7) ASS(X,8) ASS(X,9) ASS(X,A) ASS(X,B) ASS(X,C) ASS(X,D) ASS(X,E) ASS(X,F) - PUNK(0); PUNK(1); - #endif + // u8* wanx = (u8*)&src[i]; + // #define ASS(X,Y) __asm { prefetchnta [wanx+32*0x##X##Y] } + // #define PUNK(X) ASS(X,0) ASS(X,1) ASS(X,2) ASS(X,3) ASS(X,4) ASS(X,5) ASS(X,6) ASS(X,7) ASS(X,8) ASS(X,9) ASS(X,A) ASS(X,B) ASS(X,C) ASS(X,D) 
ASS(X,E) ASS(X,F) + // PUNK(0); PUNK(1); for(int x=0;x<256;x++,i++) { diff --git a/desmume/src/texcache.h b/desmume/src/texcache.h index 8d7f2c48a..d51de2f0f 100644 --- a/desmume/src/texcache.h +++ b/desmume/src/texcache.h @@ -10,11 +10,9 @@ enum TexCache_TexFormat }; #define MAX_TEXTURE 500 -#ifndef NOSSE2 -struct ALIGN(16) TextureCache -#else -struct ALIGN(8) TextureCache -#endif + + +struct CACHE_ALIGN TextureCache { u32 id; u32 frm; @@ -33,7 +31,6 @@ struct ALIGN(8) TextureCache //set if this texture is suspected be invalid due to a vram reconfigure bool suspectedInvalid; - }; extern TextureCache *texcache; diff --git a/desmume/src/types.h b/desmume/src/types.h index 1f72cf193..07539035c 100644 --- a/desmume/src/types.h +++ b/desmume/src/types.h @@ -26,13 +26,17 @@ #include "config.h" #endif -#ifndef _MSC_VER -#define NOSSE2 +#ifdef _MSC_VER +#define ENABLE_SSE +#define ENABLE_SSE2 #endif -//if theres no sse2, also enforce no intrinsics -#if defined(NOSSE2) -#define SSE2_NOINTRIN +#ifdef NOSSE +#undef ENABLE_SSE +#endif + +#ifdef NOSSE2 +#undef ENABLE_SSE2 #endif #ifdef _WIN32 @@ -92,20 +96,6 @@ #endif #endif -//#ifndef _PREFETCH -//#if (defined(_MSC_VER) || defined(__INTEL_COMPILER)) && !defined(NOSSE2) -//#include -//#include -//#define _PREFETCH(X) _mm_prefetch((char*)(X),_MM_HINT_T0); -//#define _PREFETCHNTA(X) _mm_prefetch((char*)(X),_MM_HINT_NTA); -//#else -#define _PREFETCH(X) {} -#define _PREFETCHNTA(X) {} -//#endif -//#endif - - - #if defined(__LP64__) typedef unsigned char u8; typedef unsigned short u16; @@ -360,7 +350,45 @@ char (*BLAHBLAHBLAH( UNALIGNED T (&)[N] ))[N]; if((N)&0x001) MACRODO1((N)&(0x100|0x080|0x040|0x020|0x010|0x008|0x004|0x002),TODO); \ } +//--------------------------- +//Binary constant generator macro By Tom Torfs - donated to the public domain +//turn a numeric literal into a hex constant +//(avoids problems with leading zeroes) +//8-bit constants max value 0x11111111, always fits in unsigned long +#define HEX__(n) 0x##n##LU 
+//8-bit conversion function +#define B8__(x) ((x&0x0000000FLU)?1:0) \ ++((x&0x000000F0LU)?2:0) \ ++((x&0x00000F00LU)?4:0) \ ++((x&0x0000F000LU)?8:0) \ ++((x&0x000F0000LU)?16:0) \ ++((x&0x00F00000LU)?32:0) \ ++((x&0x0F000000LU)?64:0) \ ++((x&0xF0000000LU)?128:0) + +//for up to 8-bit binary constants +#define B8(d) ((unsigned char)B8__(HEX__(d))) + +// for up to 16-bit binary constants, MSB first +#define B16(dmsb,dlsb) (((unsigned short)B8(dmsb)<<8) \ ++ B8(dlsb)) + +// for up to 32-bit binary constants, MSB first +#define B32(dmsb,db2,db3,dlsb) (((unsigned long)B8(dmsb)<<24) \ ++ ((unsigned long)B8(db2)<<16) \ ++ ((unsigned long)B8(db3)<<8) \ ++ B8(dlsb)) + +//Sample usage: +//B8(01010101) = 85 +//B16(10101010,01010101) = 43605 +//B32(10000000,11111111,10101010,01010101) = 2164238933 +//--------------------------- +//compile-time assert: a false x gives the typedef'd array a size of -1, which does not compile +#ifndef CTASSERT +#define CTASSERT(x) typedef char ctassert_failed[(x) ? 1 : -1] +#endif #endif diff --git a/desmume/src/version.h b/desmume/src/version.h index 32e6f6de2..7ceb38dee 100644 --- a/desmume/src/version.h +++ b/desmume/src/version.h @@ -41,8 +41,12 @@ #endif #endif -#ifdef NOSSE2 -#define DESMUME_CPUEXT_STRING " NOSSE2" +#ifndef ENABLE_SSE2 + #ifndef ENABLE_SSE + #define DESMUME_CPUEXT_STRING " NOSSE" + #else + #define DESMUME_CPUEXT_STRING " NOSSE2" + #endif #else #define DESMUME_CPUEXT_STRING "" #endif diff --git a/desmume/src/windows/DeSmuME_2005.vcproj b/desmume/src/windows/DeSmuME_2005.vcproj index 3a9464de1..22701f6d6 100644 --- a/desmume/src/windows/DeSmuME_2005.vcproj +++ b/desmume/src/windows/DeSmuME_2005.vcproj @@ -710,42 +710,6 @@ - - - - - - - - - - - - - - - diff --git a/desmume/src/windows/DeSmuME_2008.vcproj b/desmume/src/windows/DeSmuME_2008.vcproj index cb4d8fa91..897f55afa 100644 --- a/desmume/src/windows/DeSmuME_2008.vcproj +++ b/desmume/src/windows/DeSmuME_2008.vcproj @@ -521,142 +521,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git
a/desmume/src/windows/defaultconfig/userconfig.h b/desmume/src/windows/defaultconfig/userconfig.h index f523f6253..c0b89c83c 100644 --- a/desmume/src/windows/defaultconfig/userconfig.h +++ b/desmume/src/windows/defaultconfig/userconfig.h @@ -5,10 +5,13 @@ //to customize your build, place a customized copy in the userconfig directory //(alongside this defaultconfig directory) -//#define NOSSE2 //disables SSE2 optimizations (better change it in the vc++ codegen options too) +//disables SSE and SSE2 optimizations (better change it in the vc++ codegen options too) +//note that you may have to use this if your compiler doesn't support standard SSE intrinsics +//#define NOSSE +//#define NOSSE2 + //#define DEVELOPER //enables dev+ features //#define GDB_STUB //enables the gdb stub. for some reason this is separate from dev+ for now -//#define SSE2_NOINTRIN //indicates that you have a crippled compiler with no sse2 intrinsics (only relevant for SSE2 builds) #endif //_USERCONFIG_H