Reconfigure SSE optimizations. All asm routines have been rewritten with intrinsics so that they inline more neatly; this gains a couple of FPS, works in gcc, and lets the paths be enabled separately via ENABLE_SSE and ENABLE_SSE2. Roughly 95% of the speedup comes from the plain SSE path, so SSE2 should not be necessary to enjoy the bulk of the optimizations.

This commit is contained in:
zeromus 2009-07-20 23:33:39 +00:00
parent 3abc58ae36
commit 0d36fecf93
14 changed files with 308 additions and 826 deletions
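
For context on the ENABLE_SSE / ENABLE_SSE2 switches described above, here is a minimal sketch of the switching scheme, assuming the guard logic from types.h and the inline style of matrix.h shown further down; it is illustrative, not a verbatim copy of either header:

#if defined(_MSC_VER) && !defined(NOSSE)
#define ENABLE_SSE                 // types.h turns the switches on by default; NOSSE/NOSSE2 veto them
#endif

#ifdef ENABLE_SSE
#include <xmmintrin.h>
// intrinsic path: an inlineable function the compiler can fold into each call site
static inline void MatrixMultVec4x4(const float* matrix, float* vecPtr)  // pointers must be 16-byte aligned
{
    __m128 v = _mm_load_ps(vecPtr);
    __m128 r =                _mm_mul_ps(_mm_shuffle_ps(v, v, 0x00), _mm_load_ps(matrix));
    r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(v, v, 0x55), _mm_load_ps(matrix + 4)));
    r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(v, v, 0xAA), _mm_load_ps(matrix + 8)));
    r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(v, v, 0xFF), _mm_load_ps(matrix + 12)));
    _mm_store_ps(vecPtr, r);
}
#else
// scalar fallback: same signature, so call sites do not change
static inline void MatrixMultVec4x4(const float* matrix, float* vecPtr)
{
    float x = vecPtr[0], y = vecPtr[1], z = vecPtr[2], w = vecPtr[3];
    for (int i = 0; i < 4; i++)
        vecPtr[i] = x*matrix[i] + y*matrix[i+4] + z*matrix[i+8] + w*matrix[i+12];
}
#endif

Because both variants share one name, callers such as gfx3d.cpp only depend on the configuration macros and never need per-call #ifdefs.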

View File

@ -1,9 +1,8 @@
include $(top_srcdir)/src/desmume.mk
AM_CPPFLAGS += $(SDL_CFLAGS) $(GTK_CFLAGS) $(GTHREAD_CFLAGS) $(X_CFLAGS) $(LUA_CFLAGS) $(ALSA_CFLAGS) $(LIBAGG_CFLAGS)
EXTRA_DIST = build.bat instruction_tabdef.inc thumb_tabdef.inc fs-linux.cpp fs-windows.cpp \
matrix_sse2-x64.asm matrix_sse2-x86.asm
EXTRA_DIST = build.bat instruction_tabdef.inc thumb_tabdef.inc fs-linux.cpp fs-windows.cpp
if HAVE_GDB_STUB
SUBDIRS = . gdbstub $(UI_DIR)
else

View File

@ -73,15 +73,8 @@ static void ENDGL() {
#include "shaders.h"
#include "texcache.h"
#ifndef CTASSERT
#define CTASSERT(x) typedef char __assert ## y[(x) ? 1 : -1]
#endif
static ALIGN(16) u8 GPU_screen3D [256*192*4];
static const unsigned short map3d_cull[4] = {GL_FRONT_AND_BACK, GL_FRONT, GL_BACK, 0};
static const int texEnv[4] = { GL_MODULATE, GL_DECAL, GL_MODULATE, GL_MODULATE };
static const int depthFunc[2] = { GL_LESS, GL_EQUAL };
@ -703,17 +696,15 @@ static void GL_ReadFramebuffer()
u16* dst = gfx3d_convertedScreen + (y<<8);
u8* dstAlpha = gfx3d_convertedAlpha + (y<<8);
#ifndef NOSSE2
//I dont know much about this kind of stuff, but this seems to help
//for some reason I couldnt make the intrinsics work
u8* u8screen3D = (u8*)&((u32*)GPU_screen3D)[i];
#define PREFETCH32(X,Y) __asm { prefetchnta [u8screen3D+32*0x##X##Y] }
//u8* u8screen3D = (u8*)&((u32*)GPU_screen3D)[i];
/*#define PREFETCH32(X,Y) __asm { prefetchnta [u8screen3D+32*0x##X##Y] }
#define PREFETCH128(X) PREFETCH32(X,0) PREFETCH32(X,1) PREFETCH32(X,2) PREFETCH32(X,3) \
PREFETCH32(X,4) PREFETCH32(X,5) PREFETCH32(X,6) PREFETCH32(X,7) \
PREFETCH32(X,8) PREFETCH32(X,9) PREFETCH32(X,A) PREFETCH32(X,B) \
PREFETCH32(X,C) PREFETCH32(X,D) PREFETCH32(X,E) PREFETCH32(X,F)
PREFETCH128(0); PREFETCH128(1);
#endif
PREFETCH128(0); PREFETCH128(1);*/
for(int x=0;x<256;x++,i++)
{

View File

@ -1,3 +1,4 @@
//2
/* Copyright (C) 2006 yopyop
yopyop156@ifrance.com
yopyop156.ifrance.com
@ -146,11 +147,6 @@ static float float10Table[1024];
static float float10RelTable[1024];
static float normalTable[1024];
#ifndef NOSSE2
float ALIGN(16) _fix2float_divizor_mask[4] = { 4096.f, 4096.f, 4096.f, 4096.f };
float ALIGN(16) _fix10_2float_divizor_mask[4] = { 512.f, 512.f, 512.f, 512.f };
#endif
#define fix2float(v) (((float)((s32)(v))) / (float)(1<<12))
#define fix10_2float(v) (((float)((s32)(v))) / (float)(1<<9))
@ -317,6 +313,20 @@ static void makeTables() {
void gfx3d_init()
{
//DWORD start = timeGetTime();
//for(int i=0;i<1000000000;i++)
// MatrixMultVec4x4(mtxCurrent[0],mtxCurrent[1]);
//DWORD end = timeGetTime();
//DWORD diff = end-start;
//start = timeGetTime();
//for(int i=0;i<1000000000;i++)
// MatrixMultVec4x4_b(mtxCurrent[0],mtxCurrent[1]);
//end = timeGetTime();
//DWORD diff2 = end-start;
//printf("SPEED TEST %d %d\n",diff,diff2);
if(polylists == NULL) { polylists = new POLYLIST[2]; polylist = &polylists[0]; }
if(vertlists == NULL) { vertlists = new VERTLIST[2]; vertlist = &vertlists[0]; }
makeTables();
@ -409,15 +419,11 @@ static void SetVertex()
if(polylist->count >= POLYLIST_SIZE)
return;
#ifdef NOSSE2
//apply modelview matrix
MatrixMultVec4x4 (mtxCurrent[1], coordTransformed);
//apply projection matrix
MatrixMultVec4x4 (mtxCurrent[0], coordTransformed);
#else
_sse2_MatrixMultVec4x4_M2(mtxCurrent[0], coordTransformed);
#endif
//TODO - think about keeping the clip matrix concatenated,
//so that we only have to multiply one matrix here
//(we could lazy cache the concatenated clip matrix and only generate it
//when we need to)
MatrixMultVec4x4_M2(mtxCurrent[0], coordTransformed);
//TODO - culling should be done here.
//TODO - viewport transform?
@ -694,11 +700,7 @@ void FORCEINLINE gfx3d_glLoadIdentity()
void FORCEINLINE gfx3d_glLoadMatrix4x4(s32 v)
{
#ifdef NOSSE2
mtxCurrent[mode][0] = fix2float(v);
#else
mtxCurrent[mode][0] = v;
#endif
for (int i = 1; i < 16; i++)
{
@ -707,16 +709,10 @@ void FORCEINLINE gfx3d_glLoadMatrix4x4(s32 v)
if (!GFX_PIPErecv(&cmd, &param)) break;
dEXEC("glLoadMatrix4x4", 0x16, cmd);
#ifdef NOSSE2
mtxCurrent[mode][i] = fix2float((s32)param);
#else
mtxCurrent[mode][i] = (s32)param;
#endif
}
#ifndef NOSSE2
_sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask);
#endif
vector_fix2float<4>(mtxCurrent[mode], 4096.f);
GFX_DELAY(19);
@ -726,11 +722,7 @@ void FORCEINLINE gfx3d_glLoadMatrix4x4(s32 v)
void FORCEINLINE gfx3d_glLoadMatrix4x3(s32 v)
{
#ifdef NOSSE2
mtxCurrent[mode][0] = fix2float(v);
#else
mtxCurrent[mode][0] = v;
#endif
for (int i = 1; i < 16; i++)
{
@ -740,16 +732,10 @@ void FORCEINLINE gfx3d_glLoadMatrix4x3(s32 v)
if (!GFX_PIPErecv(&cmd, &param)) break;
dEXEC("glLoadMatrix4x3", 0x17, cmd);
#ifdef NOSSE2
mtxCurrent[mode][i] = fix2float((s32)param);
#else
mtxCurrent[mode][i] = (s32)param;
#endif
}
#ifndef NOSSE2
_sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask);
#endif
vector_fix2float<4>(mtxCurrent[mode], 4096.f);
//fill in the unusued matrix values
mtxCurrent[mode][3] = mtxCurrent[mode][7] = mtxCurrent[mode][11] = 0.f;
@ -763,11 +749,7 @@ void FORCEINLINE gfx3d_glLoadMatrix4x3(s32 v)
void FORCEINLINE gfx3d_glMultMatrix4x4(s32 v)
{
#ifdef NOSSE2
mtxTemporal[0] = fix2float(v);
#else
mtxTemporal[0] = v;
#endif
for (int i = 1; i < 16; i++)
{
@ -776,16 +758,10 @@ void FORCEINLINE gfx3d_glMultMatrix4x4(s32 v)
if (!GFX_PIPErecv(&cmd, &param)) break;
dEXEC("glMultMatrix4x4", 0x18, cmd);
#ifdef NOSSE2
mtxTemporal[i] = fix2float((s32)param);
#else
mtxTemporal[i] = (s32)param;
#endif
}
#ifndef NOSSE2
_sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask);
#endif
vector_fix2float<4>(mtxTemporal, 4096.f);
MatrixMultiply (mtxCurrent[mode], mtxTemporal);
@ -802,11 +778,7 @@ void FORCEINLINE gfx3d_glMultMatrix4x4(s32 v)
void FORCEINLINE gfx3d_glMultMatrix4x3(s32 v)
{
#ifdef NOSSE2
mtxTemporal[0] = fix2float(v);
#else
mtxTemporal[0] = v;
#endif
for (int i = 1; i < 16; i++)
{
@ -816,16 +788,10 @@ void FORCEINLINE gfx3d_glMultMatrix4x3(s32 v)
if (!GFX_PIPErecv(&cmd, &param)) break;
dEXEC("glMultMatrix4x3", 0x19, cmd);
#ifdef NOSSE2
mtxTemporal[i] = fix2float((s32)param);
#else
mtxTemporal[i] = (s32)param;
#endif
}
#ifndef NOSSE2
_sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask);
#endif
vector_fix2float<4>(mtxTemporal, 4096.f);
//fill in the unusued matrix values
mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0.f;
@ -847,12 +813,7 @@ void FORCEINLINE gfx3d_glMultMatrix4x3(s32 v)
void FORCEINLINE gfx3d_glMultMatrix3x3(s32 v)
{
#ifdef NOSSE2
mtxTemporal[0] = fix2float(v);
#else
mtxTemporal[0] = v;
#endif
for (int i = 1; i < 12; i++)
{
@ -862,16 +823,10 @@ void FORCEINLINE gfx3d_glMultMatrix3x3(s32 v)
if (!GFX_PIPErecv(&cmd, &param)) break;
dEXEC("glMultMatrix3x3", 0x1A, cmd);
#ifdef NOSSE2
mtxTemporal[i] = fix2float((s32)param);
#else
mtxTemporal[i] = (s32)param;
#endif
}
#ifndef NOSSE2
_sse2_fix2float_12(mtxTemporal, _fix2float_divizor_mask);
#endif
vector_fix2float<3>(mtxTemporal, 4096.f);
//fill in the unusued matrix values
mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0;
@ -1276,12 +1231,7 @@ void FORCEINLINE gfx3d_glPosTest(u32 v)
PTcoords[2] = float16table[param & 0xFFFF];
PTcoords[3] = 1.0f;
#ifdef NOSSE2
MatrixMultVec4x4 (mtxCurrent[1], PTcoords);
MatrixMultVec4x4 (mtxCurrent[0], PTcoords);
#else
_sse2_MatrixMultVec4x4_M2(mtxCurrent[0], PTcoords);
#endif
MatrixMultVec4x4_M2(mtxCurrent[0], PTcoords);
gxstat &= 0xFFFFFFFE; // cleay busy bit
T1WriteLong(MMU.MMU_MEM[ARMCPU_ARM9][0x40], 0x600, gxstat);
@ -1413,11 +1363,7 @@ void gfx3d_glLoadIdentity()
BOOL gfx3d_glLoadMatrix4x4(s32 v)
{
#ifdef NOSSE2
mtxCurrent[mode][ML4x4ind] = fix2float(v);
#else
mtxCurrent[mode][ML4x4ind] = v;
#endif
++ML4x4ind;
if(ML4x4ind<16) return FALSE;
@ -1425,9 +1371,7 @@ BOOL gfx3d_glLoadMatrix4x4(s32 v)
GFX_DELAY(19);
#ifndef NOSSE2
_sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask);
#endif
vector_fix2float<4>(mtxCurrent[mode], 4096.f);
if (mode == 2)
MatrixCopy (mtxCurrent[1], mtxCurrent[2]);
@ -1436,20 +1380,14 @@ BOOL gfx3d_glLoadMatrix4x4(s32 v)
BOOL gfx3d_glLoadMatrix4x3(s32 v)
{
#ifdef NOSSE2
mtxCurrent[mode][ML4x3ind] = fix2float(v);
#else
mtxCurrent[mode][ML4x3ind] = v;
#endif
ML4x3ind++;
if((ML4x3ind & 0x03) == 3) ML4x3ind++;
if(ML4x3ind<16) return FALSE;
ML4x3ind = 0;
#ifndef NOSSE2
_sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask);
#endif
vector_fix2float<4>(mtxCurrent[mode], 4096.f);
//fill in the unusued matrix values
mtxCurrent[mode][3] = mtxCurrent[mode][7] = mtxCurrent[mode][11] = 0.f;
@ -1464,11 +1402,7 @@ BOOL gfx3d_glLoadMatrix4x3(s32 v)
BOOL gfx3d_glMultMatrix4x4(s32 v)
{
#ifdef NOSSE2
mtxTemporal[MM4x4ind] = fix2float(v);
#else
mtxTemporal[MM4x4ind] = v;
#endif
MM4x4ind++;
if(MM4x4ind<16) return FALSE;
@ -1476,9 +1410,7 @@ BOOL gfx3d_glMultMatrix4x4(s32 v)
GFX_DELAY(35);
#ifndef NOSSE2
_sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask);
#endif
vector_fix2float<4>(mtxTemporal, 4096.f);
MatrixMultiply (mtxCurrent[mode], mtxTemporal);
@ -1494,11 +1426,7 @@ BOOL gfx3d_glMultMatrix4x4(s32 v)
BOOL gfx3d_glMultMatrix4x3(s32 v)
{
#ifdef NOSSE2
mtxTemporal[MM4x3ind] = fix2float(v);
#else
mtxTemporal[MM4x3ind] = v;
#endif
MM4x3ind++;
if((MM4x3ind & 0x03) == 3) MM4x3ind++;
@ -1507,9 +1435,7 @@ BOOL gfx3d_glMultMatrix4x3(s32 v)
GFX_DELAY(31);
#ifndef NOSSE2
_sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask);
#endif
vector_fix2float<4>(mtxTemporal, 4096.f);
//fill in the unusued matrix values
mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0.f;
@ -1530,11 +1456,7 @@ BOOL gfx3d_glMultMatrix4x3(s32 v)
BOOL gfx3d_glMultMatrix3x3(s32 v)
{
#ifdef NOSSE2
mtxTemporal[MM3x3ind] = fix2float(v);
#else
mtxTemporal[MM3x3ind] = v;
#endif
MM3x3ind++;
@ -1544,9 +1466,7 @@ BOOL gfx3d_glMultMatrix3x3(s32 v)
GFX_DELAY(28);
#ifndef NOSSE2
_sse2_fix2float_12(mtxTemporal, _fix2float_divizor_mask);
#endif
vector_fix2float<3>(mtxTemporal, 4096.f);
//fill in the unusued matrix values
mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0;
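
As a quick sanity check on the 4096.f used by the vector_fix2float<4> calls above: the hardware supplies signed fixed-point values with 12 fractional bits (9 for the fix10 variant), so the conversion is just a divide, exactly as in the fix2float macro. A small standalone example with illustrative values:

#include <cstdint>
#include <cstdio>

int main()
{
    // fix2float(v) == ((float)((s32)(v))) / (1 << 12): 12 fractional bits
    int32_t one      = 0x1000;                // 1.0 in the fixed format
    int32_t minusOne = (int32_t)0xFFFFF000;   // -1.0 (two's complement)
    printf("%f %f\n", one / 4096.f, minusOne / 4096.f);  // prints 1.000000 -1.000000
    // fix10_2float(v) divides by (1 << 9) == 512 instead (9 fractional bits)
    return 0;
}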

View File

@ -25,17 +25,10 @@
#include <assert.h>
#include "matrix.h"
extern "C" {
void MatrixInit (float *matrix)
{
memset (matrix, 0, sizeof(float)*16);
matrix[0] = matrix[5] = matrix[10] = matrix[15] = 1.f;
}
#ifdef NOSSE2
void MATRIXFASTCALL MatrixMultVec4x4 (const float *matrix, float *vecPtr)
//-------------------------
//switched SSE functions: implementations for no SSE
#ifndef ENABLE_SSE
void MatrixMultVec4x4 (const float *matrix, float *vecPtr)
{
float x = vecPtr[0];
float y = vecPtr[1];
@ -48,7 +41,8 @@ void MATRIXFASTCALL MatrixMultVec4x4 (const float *matrix, float *vecPtr)
vecPtr[3] = x * matrix[3] + y * matrix[7] + z * matrix[11] + w * matrix[15];
}
void MATRIXFASTCALL MatrixMultVec3x3 (const float *matrix, float *vecPtr)
void MatrixMultVec3x3 (const float *matrix, float *vecPtr)
{
float x = vecPtr[0];
float y = vecPtr[1];
@ -59,7 +53,7 @@ void MATRIXFASTCALL MatrixMultVec3x3 (const float *matrix, float *vecPtr)
vecPtr[2] = x * matrix[2] + y * matrix[6] + z * matrix[10];
}
void MATRIXFASTCALL MatrixMultiply (float *matrix, const float *rightMatrix)
void MatrixMultiply (float *matrix, const float *rightMatrix)
{
float tmpMatrix[16];
@ -86,7 +80,7 @@ void MATRIXFASTCALL MatrixMultiply (float *matrix, const float *rightMatrix)
memcpy (matrix, tmpMatrix, sizeof(float)*16);
}
void MATRIXFASTCALL MatrixTranslate (float *matrix, const float *ptr)
void MatrixTranslate (float *matrix, const float *ptr)
{
matrix[12] += (matrix[0]*ptr[0])+(matrix[4]*ptr[1])+(matrix[ 8]*ptr[2]);
matrix[13] += (matrix[1]*ptr[0])+(matrix[5]*ptr[1])+(matrix[ 9]*ptr[2]);
@ -94,7 +88,7 @@ void MATRIXFASTCALL MatrixTranslate (float *matrix, const float *ptr)
matrix[15] += (matrix[3]*ptr[0])+(matrix[7]*ptr[1])+(matrix[11]*ptr[2]);
}
void MATRIXFASTCALL MatrixScale (float *matrix, const float *ptr)
void MatrixScale (float *matrix, const float *ptr)
{
matrix[0] *= ptr[0];
matrix[1] *= ptr[0];
@ -111,9 +105,16 @@ void MATRIXFASTCALL MatrixScale (float *matrix, const float *ptr)
matrix[10] *= ptr[2];
matrix[11] *= ptr[2];
}
#endif //switched c/asm functions
//-----------------------------------------
void MatrixInit (float *matrix)
{
memset (matrix, 0, sizeof(float)*16);
matrix[0] = matrix[5] = matrix[10] = matrix[15] = 1.f;
}
void MatrixTranspose(float *matrix)
{
float temp;
@ -127,7 +128,7 @@ void MatrixTranspose(float *matrix)
#undef swap
}
void MATRIXFASTCALL MatrixIdentity (float *matrix)
void MatrixIdentity (float *matrix)
{
matrix[1] = matrix[2] = matrix[3] = matrix[4] = 0.0f;
matrix[6] = matrix[7] = matrix[8] = matrix[9] = 0.0f;
@ -135,7 +136,7 @@ void MATRIXFASTCALL MatrixIdentity (float *matrix)
matrix[0] = matrix[5] = matrix[10] = matrix[15] = 1.f;
}
float MATRIXFASTCALL MatrixGetMultipliedIndex (int index, float *matrix, float *rightMatrix)
float MatrixGetMultipliedIndex (int index, float *matrix, float *rightMatrix)
{
int iMod = index%4, iDiv = (index>>2)<<2;
@ -143,12 +144,12 @@ float MATRIXFASTCALL MatrixGetMultipliedIndex (int index, float *matrix, float *
(matrix[iMod+8]*rightMatrix[iDiv+2])+(matrix[iMod+12]*rightMatrix[iDiv+3]);
}
void MATRIXFASTCALL MatrixSet (float *matrix, int x, int y, float value) // TODO
void MatrixSet (float *matrix, int x, int y, float value) // TODO
{
matrix [x+(y<<2)] = value;
}
void MATRIXFASTCALL MatrixCopy (float* matrixDST, const float* matrixSRC)
void MatrixCopy (float* matrixDST, const float* matrixSRC)
{
matrixDST[0] = matrixSRC[0];
matrixDST[1] = matrixSRC[1];
@ -169,7 +170,7 @@ void MATRIXFASTCALL MatrixCopy (float* matrixDST, const float* matrixSRC)
}
int MATRIXFASTCALL MatrixCompare (const float* matrixDST, const float* matrixSRC)
int MatrixCompare (const float* matrixDST, const float* matrixSRC)
{
return memcmp((void*)matrixDST, matrixSRC, sizeof(float)*16);
}
@ -340,5 +341,4 @@ void Vector4Copy(float *dst, const float *src)
dst[3] = src[3];
}
} //extern "C"

View File

@ -1,5 +1,5 @@
/*
Copyright (C) 2006-2007 shash
/* Copyright (C) 2006-2007 shash
Copyright (C) 2009 DeSmuME team
This file is part of DeSmuME
@ -27,17 +27,14 @@
#include "types.h"
#include "mem.h"
#if !defined(NOSSE2) && !defined(SSE2_NOINTRIN)
#define SSE2_INTRIN
#ifdef ENABLE_SSE
#include <xmmintrin.h>
#endif
#ifdef SSE2_INTRIN
#include <xmmintrin.h>
#ifdef ENABLE_SSE2
#include <emmintrin.h>
#endif
extern "C" {
struct MatrixStack
{
MatrixStack(int size);
@ -48,42 +45,15 @@ struct MatrixStack
void MatrixInit (float *matrix);
#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
#define MATRIXFASTCALL __fastcall
#else
#define MATRIXFASTCALL
#endif
//In order to conditionally use these asm optimized functions in visual studio
//without having to make new build types to exclude the assembly files.
//a bit sloppy, but there aint much to it
#ifndef NOSSE2
#define SSE2_FUNC(X) _sse2_##X
#define MatrixMultVec4x4 _sse2_MatrixMultVec4x4
#define MatrixMultVec3x3 _sse2_MatrixMultVec3x3
#define MatrixMultiply _sse2_MatrixMultiply
#define MatrixTranslate _sse2_MatrixTranslate
#define MatrixScale _sse2_MatrixScale
void MATRIXFASTCALL _sse2_fix2float_16 (float* matrix, float* divizor_mask);
void MATRIXFASTCALL _sse2_fix2float_12 (float* matrix, float* divizor_mask);
void MATRIXFASTCALL _sse2_MatrixMultVec4x4_M2 (const float * matrix, float * vecPtr); // mode 2
#else
#define SSE2_FUNC(X) X
#endif
void MATRIXFASTCALL SSE2_FUNC(MatrixMultVec3x3) (const float * matrix, float * vecPtr);
void MATRIXFASTCALL SSE2_FUNC(MatrixMultVec4x4) (const float * matrix, float * vecPtr);
void MATRIXFASTCALL SSE2_FUNC(MatrixMultiply) (float * matrix, const float * rightMatrix);
void MATRIXFASTCALL SSE2_FUNC(MatrixTranslate) (float *matrix, const float *ptr);
void MATRIXFASTCALL SSE2_FUNC(MatrixScale) (float * matrix, const float * ptr);
float MATRIXFASTCALL MatrixGetMultipliedIndex (int index, float *matrix, float *rightMatrix);
void MATRIXFASTCALL MatrixSet (float *matrix, int x, int y, float value);
void MATRIXFASTCALL MatrixCopy (float * matrixDST, const float * matrixSRC);
int MATRIXFASTCALL MatrixCompare (const float * matrixDST, const float * matrixSRC);
void MATRIXFASTCALL MatrixIdentity (float *matrix);
float MatrixGetMultipliedIndex (int index, float *matrix, float *rightMatrix);
void MatrixSet (float *matrix, int x, int y, float value);
void MatrixCopy (float * matrixDST, const float * matrixSRC);
int MatrixCompare (const float * matrixDST, const float * matrixSRC);
void MatrixIdentity (float *matrix);
void MatrixTranspose (float *matrix);
void MatrixStackInit (MatrixStack *stack);
@ -112,27 +82,21 @@ void Vector3Normalize(float *dst);
void Vector4Copy(float *dst, const float *src);
} //extern "C"
//these functions are an unreliable, inaccurate floor.
//it should only be used for positive numbers
//this isnt as fast as it could be if we used a visual c++ intrinsic, but those appear not to be universally available
FORCEINLINE u32 u32floor(float f)
{
#if defined(SSE2_INTRIN)
return (u32)_mm_cvttss_si32(_mm_set_ss(f));
#elif !defined(NOSSE2)
__asm cvttss2si eax, f;
#ifdef ENABLE_SSE2
return (u32)_mm_cvtt_ss2si(_mm_set_ss(f));
#else
return (u32)f;
#endif
}
FORCEINLINE u32 u32floor(double d)
{
#if defined(SSE2_INTRIN)
#ifdef ENABLE_SSE2
return (u32)_mm_cvttsd_si32(_mm_set_sd(d));
#elif !defined(NOSSE2)
__asm cvttsd2si eax, d;
#else
return (u32)d;
#endif
@ -142,66 +106,212 @@ FORCEINLINE u32 u32floor(double d)
//be sure that the results are the same thing as floorf!
FORCEINLINE s32 s32floor(float f)
{
#if defined(SSE2_INTRIN)
#ifdef ENABLE_SSE2
return _mm_cvtss_si32( _mm_add_ss(_mm_set_ss(-0.5f),_mm_add_ss(_mm_set_ss(f), _mm_set_ss(f))) ) >> 1;
#elif !defined(NOSSE2)
static const float c = -0.5f;
__asm
{
movss xmm0, f;
addss xmm0, xmm0;
addss xmm0, c;
cvtss2si eax, xmm0
sar eax, 1
}
#else
return (s32)floorf(f);
#endif
}
//now comes some sse2 functions coded solely with intrinsics.
//let's wait and see how many people this upsets.
//they can always #define SSE2_NOINTRIN in their userconfig.h....
#ifdef SSE2_INTRIN
//switched SSE2 functions
//-------------
#ifdef ENABLE_SSE
template<int NUM>
static FORCEINLINE void memset_u16_le(void* dst, u16 val)
FORCEINLINE void memset_u16_le(void* dst, u16 val)
{
u32 u32val;
//just for the endian safety
T1WriteWord((u8*)&u32val,0,val);
T1WriteWord((u8*)&u32val,2,val);
const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
MACRODO_N(NUM/8,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), temp));
////const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
__m128 temp; temp.m128_i32[0] = u32val;
//MACRODO_N(NUM/8,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), temp));
MACRODO_N(NUM/8,_mm_store_ps1((float*)((u8*)dst+(X)*16), temp));
}
#else
#else //no sse2
template<int NUM>
static FORCEINLINE void memset_u16_le(void* dst, u16 val)
{
for(int i=0;i<NUM;i++)
T1WriteWord((u8*)dst,i<<1,val);
}
#endif
//---------------------------
//switched SSE functions
#ifdef ENABLE_SSE
struct SSE_MATRIX
{
SSE_MATRIX(const float *matrix)
: row0(_mm_load_ps(matrix))
, row1(_mm_load_ps(matrix+4))
, row2(_mm_load_ps(matrix+8))
, row3(_mm_load_ps(matrix+12))
{}
union {
__m128 rows[4];
struct { __m128 row0; __m128 row1; __m128 row2; __m128 row3; };
};
};
FORCEINLINE __m128 _util_MatrixMultVec4x4_(const SSE_MATRIX &mat, __m128 vec)
{
__m128 xmm5 = _mm_shuffle_ps(vec, vec, B8(01010101));
__m128 xmm6 = _mm_shuffle_ps(vec, vec, B8(10101010));
__m128 xmm7 = _mm_shuffle_ps(vec, vec, B8(11111111));
__m128 xmm4 = _mm_shuffle_ps(vec, vec, B8(00000000));
xmm4 = _mm_mul_ps(xmm4,mat.row0);
xmm5 = _mm_mul_ps(xmm5,mat.row1);
xmm6 = _mm_mul_ps(xmm6,mat.row2);
xmm7 = _mm_mul_ps(xmm7,mat.row3);
xmm4 = _mm_add_ps(xmm4,xmm5);
xmm4 = _mm_add_ps(xmm4,xmm6);
xmm4 = _mm_add_ps(xmm4,xmm7);
return xmm4;
}
FORCEINLINE void MatrixMultiply(float * matrix, const float * rightMatrix)
{
//this seems to generate larger code, including many movaps, but maybe it is less harsh on the registers than the
//more hand-tailored approach
__m128 row0 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix));
__m128 row1 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix+4));
__m128 row2 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix+8));
__m128 row3 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix+12));
_mm_store_ps(matrix,row0);
_mm_store_ps(matrix+4,row1);
_mm_store_ps(matrix+8,row2);
_mm_store_ps(matrix+12,row3);
}
FORCEINLINE void MatrixMultVec4x4(const float *matrix, float *vecPtr)
{
_mm_store_ps(vecPtr,_util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(vecPtr)));
}
FORCEINLINE void MatrixMultVec4x4_M2(const float *matrix, float *vecPtr)
{
//there are hardly any gains from merging these manually
MatrixMultVec4x4(matrix+16,vecPtr);
MatrixMultVec4x4(matrix,vecPtr);
}
FORCEINLINE void MatrixMultVec3x3(const float * matrix, float * vecPtr)
{
const __m128 vec = _mm_load_ps(vecPtr);
__m128 xmm5 = _mm_shuffle_ps(vec, vec, B8(01010101));
__m128 xmm6 = _mm_shuffle_ps(vec, vec, B8(10101010));
__m128 xmm4 = _mm_shuffle_ps(vec, vec, B8(00000000));
const SSE_MATRIX mat(matrix);
xmm4 = _mm_mul_ps(xmm4,mat.row0);
xmm5 = _mm_mul_ps(xmm5,mat.row1);
xmm6 = _mm_mul_ps(xmm6,mat.row2);
xmm4 = _mm_add_ps(xmm4,xmm5);
xmm4 = _mm_add_ps(xmm4,xmm6);
_mm_store_ps(vecPtr,xmm4);
}
FORCEINLINE void MatrixTranslate(float *matrix, const float *ptr)
{
__m128 xmm4 = _mm_load_ps(ptr);
__m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101));
__m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010));
xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000));
xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(matrix));
xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(matrix+4));
xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(matrix+8));
xmm4 = _mm_add_ps(xmm4,xmm5);
xmm4 = _mm_add_ps(xmm4,xmm6);
xmm4 = _mm_add_ps(xmm4,_mm_load_ps(matrix+12));
_mm_store_ps(matrix+12,xmm4);
}
FORCEINLINE void MatrixScale(float *matrix, const float *ptr)
{
__m128 xmm4 = _mm_load_ps(ptr);
__m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101));
__m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010));
xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000));
xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(matrix));
xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(matrix+4));
xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(matrix+8));
_mm_store_ps(matrix,xmm4);
_mm_store_ps(matrix+4,xmm5);
_mm_store_ps(matrix+8,xmm6);
}
template<int NUM_ROWS>
FORCEINLINE void vector_fix2float(float* matrix, const float divisor)
{
CTASSERT(NUM_ROWS==3 || NUM_ROWS==4);
const __m128 val = _mm_set_ps1(divisor);
_mm_store_ps(matrix,_mm_div_ps(_mm_load_ps(matrix),val));
_mm_store_ps(matrix+4,_mm_div_ps(_mm_load_ps(matrix+4),val));
_mm_store_ps(matrix+8,_mm_div_ps(_mm_load_ps(matrix+8),val));
if(NUM_ROWS==4)
_mm_store_ps(matrix+12,_mm_div_ps(_mm_load_ps(matrix+12),val));
}
//WARNING: I do not think this is as fast as a memset, for some reason.
//at least in vc2005 with sse enabled. better figure out why before using it
#ifdef SSE2_INTRIN
template<int NUM>
static FORCEINLINE void memset_u8(void* _dst, u8 val)
{
const u8* dst = (u8*)_dst;
u32 u32val = (val<<24)|(val<<16)|(val<<8)|val;
const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
MACRODO_N(NUM/16,_mm_store_si128((__m128i*)(dst+(X)*16), temp));
memset(_dst,val,NUM);
//const u8* dst = (u8*)_dst;
//u32 u32val = (val<<24)|(val<<16)|(val<<8)|val;
//const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
//MACRODO_N(NUM/16,_mm_store_si128((__m128i*)(dst+(X)*16), temp));
}
#else
#else //no sse
void MatrixMultVec4x4 (const float *matrix, float *vecPtr);
void MatrixMultVec3x3(const float * matrix, float * vecPtr);
void MatrixMultiply(float * matrix, const float * rightMatrix);
void MatrixTranslate(float *matrix, const float *ptr);
void MatrixScale(float * matrix, const float * ptr);
FORCEINLINE void MatrixMultVec4x4_M2(const float *matrix, float *vecPtr)
{
//there are hardly any gains from merging these manually
MatrixMultVec4x4(matrix+16,vecPtr);
MatrixMultVec4x4(matrix,vecPtr);
}
template<int NUM_ROWS>
FORCEINLINE void vector_fix2float(float* matrix, const float divisor)
{
for(int i=0;i<NUM_ROWS*4;i++)
matrix[i] /= divisor;
}
template<int NUM>
static FORCEINLINE void memset_u8(void* dst, u8 val)
{
memset(dst,val,NUM);
}
#endif
#endif //switched SSE functions
#endif
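
One note on the s32floor intrinsic path above: cvtss2si uses the current MXCSR rounding mode (round-to-nearest-even by default), so the routine rounds 2*f - 0.5 and then arithmetic-shifts right by one, which matches floorf for the moderate magnitudes the 3D pipeline feeds it. A standalone check under that assumption (hypothetical test harness, needs an SSE-capable build):

#include <xmmintrin.h>
#include <cmath>
#include <cstdio>

static int s32floor_sse(float f)
{
    // round-to-nearest(2*f - 0.5), then arithmetic shift right: equals floor(f)
    return _mm_cvtss_si32(_mm_add_ss(_mm_set_ss(-0.5f),
                          _mm_add_ss(_mm_set_ss(f), _mm_set_ss(f)))) >> 1;
}

int main()
{
    const float tests[] = { -1.3f, -0.5f, 0.0f, 0.5f, 1.5f, 2.0f, 123.999f };
    for (float f : tests)
        printf("%8.3f -> %d (floorf: %d)\n", f, s32floor_sse(f), (int)floorf(f));
    return 0;
}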

View File

@ -1,182 +0,0 @@
;
; Copyright (C) 2006 yopyop
; Copyright (C) 2008 CrazyMax
;
; This file is part of DeSmuME
;
; DeSmuME is free software; you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation; either version 2 of the License, or
; (at your option) any later version.
;
; DeSmuME is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with DeSmuME; if not, write to the Free Software
; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
TITLE matrix_sse2-x64.asm
.code
_sse2_MatrixMultVec4x4 PROC PUBLIC
movaps xmm0, XMMWORD PTR [rcx]
movaps xmm1, XMMWORD PTR [rcx+16]
movaps xmm2, XMMWORD PTR [rcx+32]
movaps xmm3, XMMWORD PTR [rcx+48]
movaps xmm4, XMMWORD PTR [rdx]
movaps xmm5, xmm4
movaps xmm6, xmm4
movaps xmm7, xmm4
shufps xmm4, xmm4, 00000000b
shufps xmm5, xmm5, 01010101b
shufps xmm6, xmm6, 10101010b
shufps xmm7, xmm7, 11111111b
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
mulps xmm7, xmm3
addps xmm4, xmm5
addps xmm4, xmm6
addps xmm4, xmm7
movaps XMMWORD PTR [rdx], xmm4
ret 0
_sse2_MatrixMultVec4x4 ENDP
_sse2_MatrixMultVec3x3 PROC PUBLIC
movaps xmm0, XMMWORD PTR [rcx]
movaps xmm1, XMMWORD PTR [rcx+16]
movaps xmm2, XMMWORD PTR [rcx+32]
movaps xmm4, XMMWORD PTR [rdx]
movaps xmm5, xmm4
movaps xmm6, xmm4
movaps xmm7, xmm4
shufps xmm4, xmm4, 00000000b
shufps xmm5, xmm5, 01010101b
shufps xmm6, xmm6, 10101010b
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
addps xmm4, xmm5
addps xmm4, xmm6
movaps XMMWORD PTR [rdx], xmm4
ret 0
_sse2_MatrixMultVec3x3 ENDP
_sse2_MatrixMultiply PROC PUBLIC
movaps xmm0, XMMWORD PTR [rcx]
movaps xmm1, XMMWORD PTR [rcx+16]
movaps xmm2, XMMWORD PTR [rcx+32]
movaps xmm3, XMMWORD PTR [rcx+48]
movaps xmm4, XMMWORD PTR [rdx] ; r00, r01, r02, r03
movaps xmm8, XMMWORD PTR [rdx+16] ; r04, r05, r06, r07
movaps xmm5,xmm4
movaps xmm6,xmm4
movaps xmm7,xmm4
movaps xmm9,xmm8 ;
movaps xmm10,xmm8
movaps xmm11,xmm8
shufps xmm4,xmm4,00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
shufps xmm8, xmm8, 00000000b ;
shufps xmm9, xmm9, 01010101b
shufps xmm10,xmm10,10101010b
shufps xmm11,xmm11,11111111b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
mulps xmm8, xmm0 ;
mulps xmm9, xmm1
mulps xmm10,xmm2
mulps xmm11,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
addps xmm8,xmm9 ;
addps xmm8,xmm10
addps xmm8,xmm11
movaps XMMWORD PTR [rcx],xmm4
movaps XMMWORD PTR [rcx+16],xmm8
movaps xmm4, XMMWORD PTR [rdx+32] ; r00, r01, r02, r03
movaps xmm8, XMMWORD PTR [rdx+48] ; r04, r05, r06, r07
movaps xmm5,xmm4
movaps xmm6,xmm4
movaps xmm7,xmm4
movaps xmm9,xmm8 ;
movaps xmm10,xmm8
movaps xmm11,xmm8
shufps xmm4,xmm4,00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
shufps xmm8, xmm8, 00000000b ;
shufps xmm9, xmm9, 01010101b
shufps xmm10,xmm10,10101010b
shufps xmm11,xmm11,11111111b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
mulps xmm8, xmm0 ;
mulps xmm9, xmm1
mulps xmm10,xmm2
mulps xmm11,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
addps xmm8,xmm9 ;
addps xmm8,xmm10
addps xmm8,xmm11
movaps XMMWORD PTR [rcx+32],xmm4
movaps XMMWORD PTR [rcx+48],xmm8
ret 0
_sse2_MatrixMultiply ENDP
_sse2_MatrixTranslate PROC PUBLIC
movaps xmm0, XMMWORD PTR [rcx]
movaps xmm1, XMMWORD PTR [rcx+16]
movaps xmm2, XMMWORD PTR [rcx+32]
movaps xmm3, XMMWORD PTR [rcx+48]
movaps xmm4, XMMWORD PTR [rdx]
movaps xmm5, xmm4
movaps xmm6, xmm4
movaps xmm7, xmm4
shufps xmm4, xmm4, 00000000b
shufps xmm5, xmm5, 01010101b
shufps xmm6, xmm6, 10101010b
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
addps xmm4, xmm5
addps xmm4, xmm6
addps xmm4, xmm3
movaps XMMWORD PTR [rcx+48], xmm4
ret 0
_sse2_MatrixTranslate ENDP
_sse2_MatrixScale PROC PUBLIC
movaps xmm0, XMMWORD PTR [rcx]
movaps xmm1, XMMWORD PTR [rcx+16]
movaps xmm2, XMMWORD PTR [rcx+32]
movaps xmm4, XMMWORD PTR [rdx]
movaps xmm5, xmm4
movaps xmm6, xmm4
shufps xmm4, xmm4, 00000000b
shufps xmm5, xmm5, 01010101b
shufps xmm6, xmm6, 10101010b
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
movaps XMMWORD PTR [rcx],xmm4
movaps XMMWORD PTR [rcx+16],xmm5
movaps XMMWORD PTR [rcx+32],xmm6
ret 0
_sse2_MatrixScale ENDP
end

View File

@ -1,214 +0,0 @@
;
; Copyright (C) 2006 yopyop
; Copyright (C) 2008 CrazyMax
;
; This file is part of DeSmuME
;
; DeSmuME is free software; you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation; either version 2 of the License, or
; (at your option) any later version.
;
; DeSmuME is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with DeSmuME; if not, write to the Free Software
; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
TITLE matrix_sse2-x86.asm
.686P
.XMM
.model flat
.code
@_sse2_MatrixMultVec4x4@8 PROC PUBLIC
movaps xmm4, XMMWORD PTR [edx]
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
pshufd xmm7, xmm4, 11111111b
shufps xmm4, xmm4, 00000000b
mulps xmm4, XMMWORD PTR [ecx]
mulps xmm5, XMMWORD PTR [ecx+16]
mulps xmm6, XMMWORD PTR [ecx+32]
mulps xmm7, XMMWORD PTR [ecx+48]
addps xmm4, xmm5
addps xmm4, xmm6
addps xmm4, xmm7
movaps XMMWORD PTR [edx], xmm4
ret 0
@_sse2_MatrixMultVec4x4@8 ENDP
@_sse2_MatrixMultVec4x4_M2@8 PROC PUBLIC
movaps xmm4, XMMWORD PTR [edx]
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
pshufd xmm7, xmm4, 11111111b
shufps xmm4, xmm4, 00000000b
mulps xmm4, XMMWORD PTR [ecx+64]
mulps xmm5, XMMWORD PTR [ecx+80]
mulps xmm6, XMMWORD PTR [ecx+96]
mulps xmm7, XMMWORD PTR [ecx+112]
addps xmm4, xmm5
addps xmm4, xmm6
addps xmm4, xmm7
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
pshufd xmm7, xmm4, 11111111b
shufps xmm4, xmm4, 00000000b
mulps xmm4, XMMWORD PTR [ecx]
mulps xmm5, XMMWORD PTR [ecx+16]
mulps xmm6, XMMWORD PTR [ecx+32]
mulps xmm7, XMMWORD PTR [ecx+48]
addps xmm4, xmm5
addps xmm4, xmm6
addps xmm4, xmm7
movaps XMMWORD PTR [edx], xmm4
ret 0
@_sse2_MatrixMultVec4x4_M2@8 ENDP
@_sse2_MatrixMultVec3x3@8 PROC PUBLIC
movaps xmm4, XMMWORD PTR [edx]
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
shufps xmm4, xmm4, 00000000b
mulps xmm4, XMMWORD PTR [ecx]
mulps xmm5, XMMWORD PTR [ecx+16]
mulps xmm6, XMMWORD PTR [ecx+32]
addps xmm4, xmm5
addps xmm4, xmm6
movaps XMMWORD PTR [edx], xmm4
ret 0
@_sse2_MatrixMultVec3x3@8 ENDP
@_sse2_MatrixMultiply@8 PROC PUBLIC
movaps xmm0, XMMWORD PTR [ecx]
movaps xmm1, XMMWORD PTR [ecx+16]
movaps xmm2, XMMWORD PTR [ecx+32]
movaps xmm3, XMMWORD PTR [ecx+48]
movaps xmm4, XMMWORD PTR [edx] ; r00, r01, r02, r03
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
pshufd xmm7, xmm4, 11111111b
shufps xmm4, xmm4, 00000000b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
movaps XMMWORD PTR [ecx],xmm4
movaps xmm4, XMMWORD PTR [edx+16] ; r04, r05, r06, r07
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
pshufd xmm7, xmm4, 11111111b
shufps xmm4, xmm4, 00000000b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
movaps XMMWORD PTR [ecx+16],xmm4
movaps xmm4, XMMWORD PTR [edx+32] ; r08, r09, r10, r11
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
pshufd xmm7, xmm4, 11111111b
shufps xmm4, xmm4, 00000000b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
movaps XMMWORD PTR [ecx+32],xmm4
movaps xmm4, XMMWORD PTR [edx+48] ; r12, r13, r14, r15
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
pshufd xmm7, xmm4, 11111111b
shufps xmm4, xmm4, 00000000b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
movaps XMMWORD PTR [ecx+48],xmm4
ret 0
@_sse2_MatrixMultiply@8 ENDP
@_sse2_MatrixTranslate@8 PROC PUBLIC
movaps xmm4, XMMWORD PTR [edx]
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
shufps xmm4, xmm4, 00000000b
mulps xmm4, XMMWORD PTR [ecx]
mulps xmm5, XMMWORD PTR [ecx+16]
mulps xmm6, XMMWORD PTR [ecx+32]
addps xmm4, xmm5
addps xmm4, xmm6
addps xmm4, XMMWORD PTR [ecx+48]
movaps XMMWORD PTR [ecx+48], xmm4
ret 0
@_sse2_MatrixTranslate@8 ENDP
@_sse2_MatrixScale@8 PROC PUBLIC
movaps xmm4, XMMWORD PTR [edx]
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
shufps xmm4, xmm4, 00000000b
mulps xmm4, XMMWORD PTR [ecx]
mulps xmm5, XMMWORD PTR [ecx+16]
mulps xmm6, XMMWORD PTR [ecx+32]
movaps XMMWORD PTR [ecx], xmm4
movaps XMMWORD PTR [ecx+16], xmm5
movaps XMMWORD PTR [ecx+32], xmm6
ret 0
@_sse2_MatrixScale@8 ENDP
@_sse2_fix2float_12@8 PROC PUBLIC
movaps xmm0, XMMWORD PTR[ecx]
movaps xmm1, XMMWORD PTR[ecx+16]
movaps xmm2, XMMWORD PTR[ecx+32]
movaps xmm4, XMMWORD PTR [edx]
;prefetchnta [ecx+64]
divps xmm0, xmm4
divps xmm1, xmm4
divps xmm2, xmm4
movaps XMMWORD PTR[ecx], xmm0
movaps XMMWORD PTR[ecx+16],xmm1
movaps XMMWORD PTR[ecx+32],xmm2
ret 0
@_sse2_fix2float_12@8 ENDP
@_sse2_fix2float_16@8 PROC PUBLIC
movaps xmm0, XMMWORD PTR[ecx]
movaps xmm1, XMMWORD PTR[ecx+16]
movaps xmm2, XMMWORD PTR[ecx+32]
movaps xmm3, XMMWORD PTR[ecx+48]
movaps xmm4, XMMWORD PTR [edx]
;prefetchnta [ecx+64]
divps xmm0, xmm4
divps xmm1, xmm4
divps xmm2, xmm4
divps xmm3, xmm4
movaps XMMWORD PTR[ecx], xmm0
movaps XMMWORD PTR[ecx+16],xmm1
movaps XMMWORD PTR[ecx+32],xmm2
movaps XMMWORD PTR[ecx+48],xmm3
ret 0
@_sse2_fix2float_16@8 ENDP
end

View File

@ -1022,12 +1022,10 @@ static void SoftRastConvertFramebuffer()
for(int i=0,y=0;y<192;y++)
{
#ifndef NOSSE2
u8* wanx = (u8*)&src[i];
#define ASS(X,Y) __asm { prefetchnta [wanx+32*0x##X##Y] }
#define PUNK(X) ASS(X,0) ASS(X,1) ASS(X,2) ASS(X,3) ASS(X,4) ASS(X,5) ASS(X,6) ASS(X,7) ASS(X,8) ASS(X,9) ASS(X,A) ASS(X,B) ASS(X,C) ASS(X,D) ASS(X,E) ASS(X,F)
PUNK(0); PUNK(1);
#endif
// u8* wanx = (u8*)&src[i];
// #define ASS(X,Y) __asm { prefetchnta [wanx+32*0x##X##Y] }
// #define PUNK(X) ASS(X,0) ASS(X,1) ASS(X,2) ASS(X,3) ASS(X,4) ASS(X,5) ASS(X,6) ASS(X,7) ASS(X,8) ASS(X,9) ASS(X,A) ASS(X,B) ASS(X,C) ASS(X,D) ASS(X,E) ASS(X,F)
// PUNK(0); PUNK(1);
for(int x=0;x<256;x++,i++)
{

View File

@ -10,11 +10,9 @@ enum TexCache_TexFormat
};
#define MAX_TEXTURE 500
#ifndef NOSSE2
struct ALIGN(16) TextureCache
#else
struct ALIGN(8) TextureCache
#endif
struct CACHE_ALIGN TextureCache
{
u32 id;
u32 frm;
@ -33,7 +31,6 @@ struct ALIGN(8) TextureCache
//set if this texture is suspected be invalid due to a vram reconfigure
bool suspectedInvalid;
};
extern TextureCache *texcache;

View File

@ -26,13 +26,17 @@
#include "config.h"
#endif
#ifndef _MSC_VER
#define NOSSE2
#ifdef _MSC_VER
#define ENABLE_SSE
#define ENABLE_SSE2
#endif
//if theres no sse2, also enforce no intrinsics
#if defined(NOSSE2)
#define SSE2_NOINTRIN
#ifdef NOSSE
#undef ENABLE_SSE
#endif
#ifdef NOSSE2
#undef ENABLE_SSE2
#endif
#ifdef _WIN32
@ -92,20 +96,6 @@
#endif
#endif
//#ifndef _PREFETCH
//#if (defined(_MSC_VER) || defined(__INTEL_COMPILER)) && !defined(NOSSE2)
//#include <xmmintrin.h>
//#include <intrin.h>
//#define _PREFETCH(X) _mm_prefetch((char*)(X),_MM_HINT_T0);
//#define _PREFETCHNTA(X) _mm_prefetch((char*)(X),_MM_HINT_NTA);
//#else
#define _PREFETCH(X) {}
#define _PREFETCHNTA(X) {}
//#endif
//#endif
#if defined(__LP64__)
typedef unsigned char u8;
typedef unsigned short u16;
@ -360,7 +350,45 @@ char (*BLAHBLAHBLAH( UNALIGNED T (&)[N] ))[N];
if((N)&0x001) MACRODO1((N)&(0x100|0x080|0x040|0x020|0x010|0x008|0x004|0x002),TODO); \
}
//---------------------------
//Binary constant generator macro By Tom Torfs - donated to the public domain
//turn a numeric literal into a hex constant
//(avoids problems with leading zeroes)
//8-bit constants max value 0x11111111, always fits in unsigned long
#define HEX__(n) 0x##n##LU
//8-bit conversion function
#define B8__(x) ((x&0x0000000FLU)?1:0) \
+((x&0x000000F0LU)?2:0) \
+((x&0x00000F00LU)?4:0) \
+((x&0x0000F000LU)?8:0) \
+((x&0x000F0000LU)?16:0) \
+((x&0x00F00000LU)?32:0) \
+((x&0x0F000000LU)?64:0) \
+((x&0xF0000000LU)?128:0)
//for upto 8-bit binary constants
#define B8(d) ((unsigned char)B8__(HEX__(d)))
// for upto 16-bit binary constants, MSB first
#define B16(dmsb,dlsb) (((unsigned short)B8(dmsb)<<8) \
+ B8(dlsb))
// for upto 32-bit binary constants, MSB first */
#define B32(dmsb,db2,db3,dlsb) (((unsigned long)B8(dmsb)<<24) \
+ ((unsigned long)B8(db2)<<16) \
+ ((unsigned long)B8(db3)<<8) \
+ B8(dlsb))
//Sample usage:
//B8(01010101) = 85
//B16(10101010,01010101) = 43605
//B32(10000000,11111111,10101010,01010101) = 2164238933
//---------------------------
#ifndef CTASSERT
#define CTASSERT(x) typedef char __assert ## y[(x) ? 1 : -1]
#endif
#endif

View File

@ -41,8 +41,12 @@
#endif
#endif
#ifdef NOSSE2
#define DESMUME_CPUEXT_STRING " NOSSE2"
#ifndef ENABLE_SSE2
#ifndef ENABLE_SSE
#define DESMUME_CPUEXT_STRING " NOSSE"
#else
#define DESMUME_CPUEXT_STRING " NOSSE2"
#endif
#else
#define DESMUME_CPUEXT_STRING ""
#endif

View File

@ -710,42 +710,6 @@
</File>
</Filter>
</Filter>
<Filter
Name="asm"
>
<File
RelativePath="..\matrix_sse2-x64.asm"
>
<FileConfiguration
Name="Debug|Win32"
ExcludedFromBuild="true"
>
<Tool
Name="MASM"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
ExcludedFromBuild="true"
>
<Tool
Name="MASM"
/>
</FileConfiguration>
<FileConfiguration
Name="Release FastBuild|Win32"
ExcludedFromBuild="true"
>
<Tool
Name="MASM"
/>
</FileConfiguration>
</File>
<File
RelativePath="..\matrix_sse2-x86.asm"
>
</File>
</Filter>
<Filter
Name="addons"
>

View File

@ -521,142 +521,6 @@
</File>
</Filter>
</Filter>
<Filter
Name="asm"
>
<File
RelativePath="..\matrix_sse2-x64.asm"
>
<FileConfiguration
Name="Debug|Win32"
ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
Description="Assembling x64..."
CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
Description="Assembling x64..."
CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release FastBuild|Win32"
ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
Description="Assembling x64..."
CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|x64"
ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
Description="Assembling x64..."
CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
/>
</FileConfiguration>
<FileConfiguration
Name="Interim SSE2|x64"
>
<Tool
Name="VCCustomBuildTool"
Description="Assembling x64..."
CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Interim|x64"
ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
Description="Assembling x64..."
CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release (public)|x64"
ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
Description="Assembling x64..."
CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release SSE2 (public)|x64"
>
<Tool
Name="VCCustomBuildTool"
Description="Assembling x64..."
CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
RelativePath="..\matrix_sse2-x86.asm"
>
<FileConfiguration
Name="Debug|x64"
ExcludedFromBuild="true"
>
<Tool
Name="MASM"
/>
</FileConfiguration>
<FileConfiguration
Name="Interim SSE2|x64"
ExcludedFromBuild="true"
>
<Tool
Name="MASM"
/>
</FileConfiguration>
<FileConfiguration
Name="Interim|x64"
ExcludedFromBuild="true"
>
<Tool
Name="MASM"
/>
</FileConfiguration>
<FileConfiguration
Name="Release (public)|x64"
ExcludedFromBuild="true"
>
<Tool
Name="MASM"
/>
</FileConfiguration>
<FileConfiguration
Name="Release SSE2 (public)|x64"
ExcludedFromBuild="true"
>
<Tool
Name="MASM"
/>
</FileConfiguration>
</File>
</Filter>
<Filter
Name="addons"
>

View File

@ -5,10 +5,13 @@
//to customize your build, place a customized copy in the userconfig directory
//(alongside this defaultconfig directory)
//#define NOSSE2 //disables SSE2 optimizations (better change it in the vc++ codegen options too)
//disables SSE and SSE2 optimizations (better change it in the vc++ codegen options too)
//note that you may have to use this if your compiler doesn't support standard SSE intrinsics
//#define NOSSE
//#define NOSSE2
//#define DEVELOPER //enables dev+ features
//#define GDB_STUB //enables the gdb stub. for some reason this is separate from dev+ for now
//#define SSE2_NOINTRIN //indicates that you have a crippled compiler with no sse2 intrinsics (only relevant for SSE2 builds)
#endif //_USERCONFIG_H
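
As the comments above describe, a userconfig.h placed in the userconfig directory overrides these defaults. A minimal hypothetical example that keeps the SSE routines but opts out of the SSE2-only ones:

// userconfig/userconfig.h (hypothetical contents)
// opt out of the SSE2-only code while keeping the SSE matrix routines
#define NOSSE2
// or disable both and fall back to the plain C paths:
//#define NOSSE
//#define NOSSE2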