Reconfigure SSE optimizations. All asm routines have been rewritten with intrinsics so that they inline more neatly; this speeds them up by a couple of FPS. They also now work in gcc, and they can be enabled separately with ENABLE_SSE and ENABLE_SSE2. Roughly 95% of the gain comes from the SSE optimizations alone, so SSE2 should not be necessary to enjoy the bulk of the speedup.
This commit is contained in:
parent 3abc58ae36
commit 0d36fecf93
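The commit replaces the single NOSSE2 switch with two opt-in gates that the headers test separately. A minimal sketch of the idiom as it appears in the matrix.h hunk below; how a gcc build turns the flags on (configure options or -D defines) is an assumption here, since this diff only auto-defines them under _MSC_VER:

    // hypothetical non-MSVC opt-in, e.g.:  g++ -msse -msse2 -DENABLE_SSE -DENABLE_SSE2
    #ifdef ENABLE_SSE
    #include <xmmintrin.h>    // SSE intrinsics: _mm_load_ps, _mm_mul_ps, _mm_shuffle_ps
    #endif
    #ifdef ENABLE_SSE2
    #include <emmintrin.h>    // SSE2 intrinsics: _mm_set_epi32, _mm_store_si128
    #endif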
@@ -1,9 +1,8 @@
 include $(top_srcdir)/src/desmume.mk
 
 AM_CPPFLAGS += $(SDL_CFLAGS) $(GTK_CFLAGS) $(GTHREAD_CFLAGS) $(X_CFLAGS) $(LUA_CFLAGS) $(ALSA_CFLAGS) $(LIBAGG_CFLAGS)
 
-EXTRA_DIST = build.bat instruction_tabdef.inc thumb_tabdef.inc fs-linux.cpp fs-windows.cpp \
-	matrix_sse2-x64.asm matrix_sse2-x86.asm
+EXTRA_DIST = build.bat instruction_tabdef.inc thumb_tabdef.inc fs-linux.cpp fs-windows.cpp
 if HAVE_GDB_STUB
 SUBDIRS = . gdbstub $(UI_DIR)
 else

@@ -73,15 +73,8 @@ static void ENDGL() {
 #include "shaders.h"
 #include "texcache.h"
 
-
-
-#ifndef CTASSERT
-#define CTASSERT(x) typedef char __assert ## y[(x) ? 1 : -1]
-#endif
-
 static ALIGN(16) u8 GPU_screen3D [256*192*4];
-
 
 static const unsigned short map3d_cull[4] = {GL_FRONT_AND_BACK, GL_FRONT, GL_BACK, 0};
 static const int texEnv[4] = { GL_MODULATE, GL_DECAL, GL_MODULATE, GL_MODULATE };
 static const int depthFunc[2] = { GL_LESS, GL_EQUAL };
@@ -703,17 +696,15 @@ static void GL_ReadFramebuffer()
 		u16* dst = gfx3d_convertedScreen + (y<<8);
 		u8* dstAlpha = gfx3d_convertedAlpha + (y<<8);
 
-#ifndef NOSSE2
 		//I dont know much about this kind of stuff, but this seems to help
 		//for some reason I couldnt make the intrinsics work
-		u8* u8screen3D = (u8*)&((u32*)GPU_screen3D)[i];
-		#define PREFETCH32(X,Y) __asm { prefetchnta [u8screen3D+32*0x##X##Y] }
+		//u8* u8screen3D = (u8*)&((u32*)GPU_screen3D)[i];
+		/*#define PREFETCH32(X,Y) __asm { prefetchnta [u8screen3D+32*0x##X##Y] }
 		#define PREFETCH128(X) PREFETCH32(X,0) PREFETCH32(X,1) PREFETCH32(X,2) PREFETCH32(X,3) \
 			PREFETCH32(X,4) PREFETCH32(X,5) PREFETCH32(X,6) PREFETCH32(X,7) \
 			PREFETCH32(X,8) PREFETCH32(X,9) PREFETCH32(X,A) PREFETCH32(X,B) \
 			PREFETCH32(X,C) PREFETCH32(X,D) PREFETCH32(X,E) PREFETCH32(X,F)
-		PREFETCH128(0); PREFETCH128(1);
-#endif
+		PREFETCH128(0); PREFETCH128(1);*/
 
 		for(int x=0;x<256;x++,i++)
 		{
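The block above is disabled because it depends on MSVC's __asm syntax, which gcc rejects (and, per the comment, the author could not get the intrinsic form working at the time). For reference, a hedged sketch of what a portable intrinsic equivalent could look like; it matches the commented-out _PREFETCHNTA macro in the types.h hunk further down, but the loop shape is illustrative, not benchmarked:

    #include <xmmintrin.h>
    // prefetch one 256-pixel scanline (256*4 bytes) of GPU_screen3D with the
    // non-temporal hint, 32 bytes per prefetch
    static FORCEINLINE void prefetch_scanline_nta(const u8* p)
    {
    	for (int ofs = 0; ofs < 256*4; ofs += 32)
    		_mm_prefetch((const char*)p + ofs, _MM_HINT_NTA);
    }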
@ -1,3 +1,4 @@
|
|||
//2
|
||||
/* Copyright (C) 2006 yopyop
|
||||
yopyop156@ifrance.com
|
||||
yopyop156.ifrance.com
|
||||
|
@@ -146,11 +147,6 @@ static float float10Table[1024];
 static float float10RelTable[1024];
 static float normalTable[1024];
 
-#ifndef NOSSE2
-float ALIGN(16) _fix2float_divizor_mask[4] = { 4096.f, 4096.f, 4096.f, 4096.f };
-float ALIGN(16) _fix10_2float_divizor_mask[4] = { 512.f, 512.f, 512.f, 512.f };
-#endif
-
 #define fix2float(v)    (((float)((s32)(v))) / (float)(1<<12))
 #define fix10_2float(v) (((float)((s32)(v))) / (float)(1<<9))
 
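Both macros decode the DS's signed fixed-point words by dividing out the fraction bits: 12 for the 20.12 format, 9 for the 10-bit tables. A worked example:

    // fix2float: the low 12 bits are the fraction, so divide the raw s32 by 4096
    //   fix2float(0x1800)     == 6144 / 4096.0f  ==  1.5f
    //   fix2float(0xFFFFF000) == -4096 / 4096.0f == -1.0f
    // fix10_2float is the same idea with 9 fraction bits (divide by 512).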
@@ -317,6 +313,20 @@ static void makeTables() {
 
 void gfx3d_init()
 {
+	//DWORD start = timeGetTime();
+	//for(int i=0;i<1000000000;i++)
+	//	MatrixMultVec4x4(mtxCurrent[0],mtxCurrent[1]);
+	//DWORD end = timeGetTime();
+	//DWORD diff = end-start;
+
+	//start = timeGetTime();
+	//for(int i=0;i<1000000000;i++)
+	//	MatrixMultVec4x4_b(mtxCurrent[0],mtxCurrent[1]);
+	//end = timeGetTime();
+	//DWORD diff2 = end-start;
+
+	//printf("SPEED TEST %d %d\n",diff,diff2);
+
 	if(polylists == NULL) { polylists = new POLYLIST[2]; polylist = &polylists[0]; }
 	if(vertlists == NULL) { vertlists = new VERTLIST[2]; vertlist = &vertlists[0]; }
 	makeTables();
@@ -409,15 +419,11 @@ static void SetVertex()
 	if(polylist->count >= POLYLIST_SIZE)
 		return;
 
-#ifdef NOSSE2
-	//apply modelview matrix
-	MatrixMultVec4x4 (mtxCurrent[1], coordTransformed);
-
-	//apply projection matrix
-	MatrixMultVec4x4 (mtxCurrent[0], coordTransformed);
-#else
-	_sse2_MatrixMultVec4x4_M2(mtxCurrent[0], coordTransformed);
-#endif
+	//TODO - think about keeping the clip matrix concatenated,
+	//so that we only have to multiply one matrix here
+	//(we could lazy cache the concatenated clip matrix and only generate it
+	//when we need to)
+	MatrixMultVec4x4_M2(mtxCurrent[0], coordTransformed);
 
 	//TODO - culling should be done here.
 	//TODO - viewport transform?
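The new TODO comment describes caching the concatenated clip matrix. A hedged sketch of the lazy scheme it suggests; the names and the dirty flag are hypothetical, not part of this commit, and it assumes MatrixMultiply(m, r) computes m = m*r, which is how gfx3d.cpp uses it elsewhere in this diff:

    static float mtxClip[16];      // hypothetical cache of projection*modelview
    static bool  clipDirty = true; // would be set by every matrix-mutating command

    static void UpdateClipMatrix()
    {
    	if(!clipDirty) return;
    	MatrixCopy(mtxClip, mtxCurrent[0]);     // projection
    	MatrixMultiply(mtxClip, mtxCurrent[1]); // clip = projection * modelview
    	clipDirty = false;
    }
    // SetVertex could then perform a single MatrixMultVec4x4(mtxClip, coordTransformed);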
@@ -694,11 +700,7 @@ void FORCEINLINE gfx3d_glLoadIdentity()
 
 void FORCEINLINE gfx3d_glLoadMatrix4x4(s32 v)
 {
-#ifdef NOSSE2
-	mtxCurrent[mode][0] = fix2float(v);
-#else
 	mtxCurrent[mode][0] = v;
-#endif
 
 	for (int i = 1; i < 16; i++)
 	{
@@ -707,16 +709,10 @@ void FORCEINLINE gfx3d_glLoadMatrix4x4(s32 v)
 
 		if (!GFX_PIPErecv(&cmd, &param)) break;
 		dEXEC("glLoadMatrix4x4", 0x16, cmd);
-#ifdef NOSSE2
-		mtxCurrent[mode][i] = fix2float((s32)param);
-#else
 		mtxCurrent[mode][i] = (s32)param;
-#endif
 	}
 
-#ifndef NOSSE2
-	_sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask);
-#endif
+	vector_fix2float<4>(mtxCurrent[mode], 4096.f);
 
 	GFX_DELAY(19);
 
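vector_fix2float<4> is the portable (or SSE) replacement for the deleted _sse2_fix2float_16 asm: one pass dividing all 16 entries by the fixed-point scale.

    // mtxCurrent[mode] holds raw s32 fixed-point words after the loop above;
    // vector_fix2float<4>(mtxCurrent[mode], 4096.f) then maps
    //   4096 -> 1.0f,  2048 -> 0.5f,  -8192 -> -2.0f
    // the same result as applying fix2float() to each entry individually.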
@@ -726,11 +722,7 @@ void FORCEINLINE gfx3d_glLoadMatrix4x4(s32 v)
 
 void FORCEINLINE gfx3d_glLoadMatrix4x3(s32 v)
 {
-#ifdef NOSSE2
-	mtxCurrent[mode][0] = fix2float(v);
-#else
 	mtxCurrent[mode][0] = v;
-#endif
 
 	for (int i = 1; i < 16; i++)
 	{
@@ -740,16 +732,10 @@ void FORCEINLINE gfx3d_glLoadMatrix4x3(s32 v)
 
 		if (!GFX_PIPErecv(&cmd, &param)) break;
 		dEXEC("glLoadMatrix4x3", 0x17, cmd);
-#ifdef NOSSE2
-		mtxCurrent[mode][i] = fix2float((s32)param);
-#else
 		mtxCurrent[mode][i] = (s32)param;
-#endif
 	}
 
-#ifndef NOSSE2
-	_sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask);
-#endif
+	vector_fix2float<4>(mtxCurrent[mode], 4096.f);
 
 	//fill in the unusued matrix values
 	mtxCurrent[mode][3] = mtxCurrent[mode][7] = mtxCurrent[mode][11] = 0.f;
@@ -763,11 +749,7 @@ void FORCEINLINE gfx3d_glLoadMatrix4x3(s32 v)
 
 void FORCEINLINE gfx3d_glMultMatrix4x4(s32 v)
 {
-#ifdef NOSSE2
-	mtxTemporal[0] = fix2float(v);
-#else
 	mtxTemporal[0] = v;
-#endif
 
 	for (int i = 1; i < 16; i++)
 	{
@@ -776,16 +758,10 @@ void FORCEINLINE gfx3d_glMultMatrix4x4(s32 v)
 
 		if (!GFX_PIPErecv(&cmd, &param)) break;
 		dEXEC("glMultMatrix4x4", 0x18, cmd);
-#ifdef NOSSE2
-		mtxTemporal[i] = fix2float((s32)param);
-#else
 		mtxTemporal[i] = (s32)param;
-#endif
 	}
 
-#ifndef NOSSE2
-	_sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask);
-#endif
+	vector_fix2float<4>(mtxTemporal, 4096.f);
 
 	MatrixMultiply (mtxCurrent[mode], mtxTemporal);
 
@@ -802,11 +778,7 @@ void FORCEINLINE gfx3d_glMultMatrix4x4(s32 v)
 
 void FORCEINLINE gfx3d_glMultMatrix4x3(s32 v)
 {
-#ifdef NOSSE2
-	mtxTemporal[0] = fix2float(v);
-#else
 	mtxTemporal[0] = v;
-#endif
 
 	for (int i = 1; i < 16; i++)
 	{
@@ -816,16 +788,10 @@ void FORCEINLINE gfx3d_glMultMatrix4x3(s32 v)
 
 		if (!GFX_PIPErecv(&cmd, &param)) break;
 		dEXEC("glMultMatrix4x3", 0x19, cmd);
-#ifdef NOSSE2
-		mtxTemporal[i] = fix2float((s32)param);
-#else
 		mtxTemporal[i] = (s32)param;
-#endif
 	}
 
-#ifndef NOSSE2
-	_sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask);
-#endif
+	vector_fix2float<4>(mtxTemporal, 4096.f);
 
 	//fill in the unusued matrix values
 	mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0.f;
@@ -847,12 +813,7 @@ void FORCEINLINE gfx3d_glMultMatrix4x3(s32 v)
 
 void FORCEINLINE gfx3d_glMultMatrix3x3(s32 v)
 {
-#ifdef NOSSE2
-	mtxTemporal[0] = fix2float(v);
-#else
 	mtxTemporal[0] = v;
-#endif
 
-
 	for (int i = 1; i < 12; i++)
 	{
@@ -862,16 +823,10 @@ void FORCEINLINE gfx3d_glMultMatrix3x3(s32 v)
 
 		if (!GFX_PIPErecv(&cmd, &param)) break;
 		dEXEC("glMultMatrix3x3", 0x1A, cmd);
-#ifdef NOSSE2
-		mtxTemporal[i] = fix2float((s32)param);
-#else
 		mtxTemporal[i] = (s32)param;
-#endif
 	}
 
-#ifndef NOSSE2
-	_sse2_fix2float_12(mtxTemporal, _fix2float_divizor_mask);
-#endif
+	vector_fix2float<3>(mtxTemporal, 4096.f);
 
 	//fill in the unusued matrix values
 	mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0;
@@ -1276,12 +1231,7 @@ void FORCEINLINE gfx3d_glPosTest(u32 v)
 	PTcoords[2] = float16table[param & 0xFFFF];
 	PTcoords[3] = 1.0f;
 
-#ifdef NOSSE2
-	MatrixMultVec4x4 (mtxCurrent[1], PTcoords);
-	MatrixMultVec4x4 (mtxCurrent[0], PTcoords);
-#else
-	_sse2_MatrixMultVec4x4_M2(mtxCurrent[0], PTcoords);
-#endif
+	MatrixMultVec4x4_M2(mtxCurrent[0], PTcoords);
 
 	gxstat &= 0xFFFFFFFE; // cleay busy bit
 	T1WriteLong(MMU.MMU_MEM[ARMCPU_ARM9][0x40], 0x600, gxstat);
@@ -1413,11 +1363,7 @@ void gfx3d_glLoadIdentity()
 
 BOOL gfx3d_glLoadMatrix4x4(s32 v)
 {
-#ifdef NOSSE2
-	mtxCurrent[mode][ML4x4ind] = fix2float(v);
-#else
 	mtxCurrent[mode][ML4x4ind] = v;
-#endif
 
 	++ML4x4ind;
 	if(ML4x4ind<16) return FALSE;
@@ -1425,9 +1371,7 @@ BOOL gfx3d_glLoadMatrix4x4(s32 v)
 
 	GFX_DELAY(19);
 
-#ifndef NOSSE2
-	_sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask);
-#endif
+	vector_fix2float<4>(mtxCurrent[mode], 4096.f);
 
 	if (mode == 2)
 		MatrixCopy (mtxCurrent[1], mtxCurrent[2]);
@@ -1436,20 +1380,14 @@ BOOL gfx3d_glLoadMatrix4x4(s32 v)
 
 BOOL gfx3d_glLoadMatrix4x3(s32 v)
 {
-#ifdef NOSSE2
-	mtxCurrent[mode][ML4x3ind] = fix2float(v);
-#else
 	mtxCurrent[mode][ML4x3ind] = v;
-#endif
 
 	ML4x3ind++;
 	if((ML4x3ind & 0x03) == 3) ML4x3ind++;
 	if(ML4x3ind<16) return FALSE;
 	ML4x3ind = 0;
 
-#ifndef NOSSE2
-	_sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask);
-#endif
+	vector_fix2float<4>(mtxCurrent[mode], 4096.f);
 
 	//fill in the unusued matrix values
 	mtxCurrent[mode][3] = mtxCurrent[mode][7] = mtxCurrent[mode][11] = 0.f;
@@ -1464,11 +1402,7 @@ BOOL gfx3d_glLoadMatrix4x3(s32 v)
 
 BOOL gfx3d_glMultMatrix4x4(s32 v)
 {
-#ifdef NOSSE2
-	mtxTemporal[MM4x4ind] = fix2float(v);
-#else
 	mtxTemporal[MM4x4ind] = v;
-#endif
 
 	MM4x4ind++;
 	if(MM4x4ind<16) return FALSE;
@@ -1476,9 +1410,7 @@ BOOL gfx3d_glMultMatrix4x4(s32 v)
 
 	GFX_DELAY(35);
 
-#ifndef NOSSE2
-	_sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask);
-#endif
+	vector_fix2float<4>(mtxTemporal, 4096.f);
 
 	MatrixMultiply (mtxCurrent[mode], mtxTemporal);
 
@@ -1494,11 +1426,7 @@ BOOL gfx3d_glMultMatrix4x4(s32 v)
 
 BOOL gfx3d_glMultMatrix4x3(s32 v)
 {
-#ifdef NOSSE2
-	mtxTemporal[MM4x3ind] = fix2float(v);
-#else
 	mtxTemporal[MM4x3ind] = v;
-#endif
 
 	MM4x3ind++;
 	if((MM4x3ind & 0x03) == 3) MM4x3ind++;
 
@@ -1507,9 +1435,7 @@ BOOL gfx3d_glMultMatrix4x3(s32 v)
 
 	GFX_DELAY(31);
 
-#ifndef NOSSE2
-	_sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask);
-#endif
+	vector_fix2float<4>(mtxTemporal, 4096.f);
 
 	//fill in the unusued matrix values
 	mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0.f;
@@ -1530,11 +1456,7 @@ BOOL gfx3d_glMultMatrix4x3(s32 v)
 
 BOOL gfx3d_glMultMatrix3x3(s32 v)
 {
-#ifdef NOSSE2
-	mtxTemporal[MM3x3ind] = fix2float(v);
-#else
 	mtxTemporal[MM3x3ind] = v;
-#endif
 
 
 	MM3x3ind++;
 
@@ -1544,9 +1466,7 @@ BOOL gfx3d_glMultMatrix3x3(s32 v)
 
 	GFX_DELAY(28);
 
-#ifndef NOSSE2
-	_sse2_fix2float_12(mtxTemporal, _fix2float_divizor_mask);
-#endif
+	vector_fix2float<3>(mtxTemporal, 4096.f);
 
 	//fill in the unusued matrix values
 	mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0;

@@ -25,17 +25,10 @@
 #include <assert.h>
 #include "matrix.h"
 
-extern "C" {
-
-
-void MatrixInit (float *matrix)
-{
-	memset (matrix, 0, sizeof(float)*16);
-	matrix[0] = matrix[5] = matrix[10] = matrix[15] = 1.f;
-}
-
-#ifdef NOSSE2
-void MATRIXFASTCALL MatrixMultVec4x4 (const float *matrix, float *vecPtr)
+//-------------------------
+//switched SSE functions: implementations for no SSE
+#ifndef ENABLE_SSE
+void MatrixMultVec4x4 (const float *matrix, float *vecPtr)
 {
 	float x = vecPtr[0];
 	float y = vecPtr[1];
@@ -48,7 +41,8 @@ void MATRIXFASTCALL MatrixMultVec4x4 (const float *matrix, float *vecPtr)
 	vecPtr[3] = x * matrix[3] + y * matrix[7] + z * matrix[11] + w * matrix[15];
 }
 
-void MATRIXFASTCALL MatrixMultVec3x3 (const float *matrix, float *vecPtr)
+
+void MatrixMultVec3x3 (const float *matrix, float *vecPtr)
 {
 	float x = vecPtr[0];
 	float y = vecPtr[1];
@@ -59,7 +53,7 @@ void MATRIXFASTCALL MatrixMultVec3x3 (const float *matrix, float *vecPtr)
 	vecPtr[2] = x * matrix[2] + y * matrix[6] + z * matrix[10];
 }
 
-void MATRIXFASTCALL MatrixMultiply (float *matrix, const float *rightMatrix)
+void MatrixMultiply (float *matrix, const float *rightMatrix)
 {
 	float tmpMatrix[16];
 
@@ -86,7 +80,7 @@ void MATRIXFASTCALL MatrixMultiply (float *matrix, const float *rightMatrix)
 	memcpy (matrix, tmpMatrix, sizeof(float)*16);
 }
 
-void MATRIXFASTCALL MatrixTranslate (float *matrix, const float *ptr)
+void MatrixTranslate (float *matrix, const float *ptr)
 {
 	matrix[12] += (matrix[0]*ptr[0])+(matrix[4]*ptr[1])+(matrix[ 8]*ptr[2]);
 	matrix[13] += (matrix[1]*ptr[0])+(matrix[5]*ptr[1])+(matrix[ 9]*ptr[2]);
@@ -94,7 +88,7 @@ void MATRIXFASTCALL MatrixTranslate (float *matrix, const float *ptr)
 	matrix[15] += (matrix[3]*ptr[0])+(matrix[7]*ptr[1])+(matrix[11]*ptr[2]);
 }
 
-void MATRIXFASTCALL MatrixScale (float *matrix, const float *ptr)
+void MatrixScale (float *matrix, const float *ptr)
 {
 	matrix[0] *= ptr[0];
 	matrix[1] *= ptr[0];
@@ -111,9 +105,16 @@ void MATRIXFASTCALL MatrixScale (float *matrix, const float *ptr)
 	matrix[10] *= ptr[2];
 	matrix[11] *= ptr[2];
 }
+
+#endif //switched c/asm functions
+//-----------------------------------------
+
+void MatrixInit (float *matrix)
+{
+	memset (matrix, 0, sizeof(float)*16);
+	matrix[0] = matrix[5] = matrix[10] = matrix[15] = 1.f;
+}
 
 void MatrixTranspose(float *matrix)
 {
 	float temp;
@@ -127,7 +128,7 @@ void MatrixTranspose(float *matrix)
 #undef swap
 }
 
-void MATRIXFASTCALL MatrixIdentity (float *matrix)
+void MatrixIdentity (float *matrix)
 {
 	matrix[1] = matrix[2] = matrix[3] = matrix[4] = 0.0f;
 	matrix[6] = matrix[7] = matrix[8] = matrix[9] = 0.0f;
@@ -135,7 +136,7 @@ void MATRIXFASTCALL MatrixIdentity (float *matrix)
 	matrix[0] = matrix[5] = matrix[10] = matrix[15] = 1.f;
 }
 
-float MATRIXFASTCALL MatrixGetMultipliedIndex (int index, float *matrix, float *rightMatrix)
+float MatrixGetMultipliedIndex (int index, float *matrix, float *rightMatrix)
 {
 	int iMod = index%4, iDiv = (index>>2)<<2;
 
@@ -143,12 +144,12 @@ float MATRIXFASTCALL MatrixGetMultipliedIndex (int index, float *matrix, float *
 		(matrix[iMod+8]*rightMatrix[iDiv+2])+(matrix[iMod+12]*rightMatrix[iDiv+3]);
 }
 
-void MATRIXFASTCALL MatrixSet (float *matrix, int x, int y, float value)	// TODO
+void MatrixSet (float *matrix, int x, int y, float value)	// TODO
 {
 	matrix [x+(y<<2)] = value;
 }
 
-void MATRIXFASTCALL MatrixCopy (float* matrixDST, const float* matrixSRC)
+void MatrixCopy (float* matrixDST, const float* matrixSRC)
 {
 	matrixDST[0] = matrixSRC[0];
 	matrixDST[1] = matrixSRC[1];
@@ -169,7 +170,7 @@ void MATRIXFASTCALL MatrixCopy (float* matrixDST, const float* matrixSRC)
 
 }
 
-int MATRIXFASTCALL MatrixCompare (const float* matrixDST, const float* matrixSRC)
+int MatrixCompare (const float* matrixDST, const float* matrixSRC)
 {
 	return memcmp((void*)matrixDST, matrixSRC, sizeof(float)*16);
 }
@@ -340,5 +341,4 @@ void Vector4Copy(float *dst, const float *src)
 	dst[3] = src[3];
 }
 
-} //extern "C"
 

@@ -1,5 +1,5 @@
-/*
-	Copyright (C) 2006-2007 shash
+/* Copyright (C) 2006-2007 shash
+	Copyright (C) 2009 DeSmuME team
 
 	This file is part of DeSmuME
 
@@ -27,17 +27,14 @@
 #include "types.h"
 #include "mem.h"
 
-#if !defined(NOSSE2) && !defined(SSE2_NOINTRIN)
-#define SSE2_INTRIN
+#ifdef ENABLE_SSE
+#include <xmmintrin.h>
 #endif
 
-#ifdef SSE2_INTRIN
-#include <xmmintrin.h>
+#ifdef ENABLE_SSE2
 #include <emmintrin.h>
 #endif
 
-extern "C" {
-
 struct MatrixStack
 {
 	MatrixStack(int size);
@@ -48,42 +45,15 @@ struct MatrixStack
 
 void MatrixInit (float *matrix);
 
-#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
-#define MATRIXFASTCALL __fastcall
-#else
-#define MATRIXFASTCALL
-#endif
-
-//In order to conditionally use these asm optimized functions in visual studio
-//without having to make new build types to exclude the assembly files.
-//a bit sloppy, but there aint much to it
-#ifndef NOSSE2
-#define SSE2_FUNC(X) _sse2_##X
-#define MatrixMultVec4x4 _sse2_MatrixMultVec4x4
-#define MatrixMultVec3x3 _sse2_MatrixMultVec3x3
-#define MatrixMultiply _sse2_MatrixMultiply
-#define MatrixTranslate _sse2_MatrixTranslate
-#define MatrixScale _sse2_MatrixScale
-void MATRIXFASTCALL _sse2_fix2float_16 (float* matrix, float* divizor_mask);
-void MATRIXFASTCALL _sse2_fix2float_12 (float* matrix, float* divizor_mask);
-void MATRIXFASTCALL _sse2_MatrixMultVec4x4_M2 (const float * matrix, float * vecPtr); // mode 2
-#else
-#define SSE2_FUNC(X) X
-#endif
-
-void MATRIXFASTCALL SSE2_FUNC(MatrixMultVec3x3) (const float * matrix, float * vecPtr);
-void MATRIXFASTCALL SSE2_FUNC(MatrixMultVec4x4) (const float * matrix, float * vecPtr);
-void MATRIXFASTCALL SSE2_FUNC(MatrixMultiply) (float * matrix, const float * rightMatrix);
-void MATRIXFASTCALL SSE2_FUNC(MatrixTranslate) (float *matrix, const float *ptr);
-void MATRIXFASTCALL SSE2_FUNC(MatrixScale) (float * matrix, const float * ptr);
-
-
-
-float MATRIXFASTCALL MatrixGetMultipliedIndex (int index, float *matrix, float *rightMatrix);
-void MATRIXFASTCALL MatrixSet (float *matrix, int x, int y, float value);
-void MATRIXFASTCALL MatrixCopy (float * matrixDST, const float * matrixSRC);
-int MATRIXFASTCALL MatrixCompare (const float * matrixDST, const float * matrixSRC);
-void MATRIXFASTCALL MatrixIdentity (float *matrix);
+float MatrixGetMultipliedIndex (int index, float *matrix, float *rightMatrix);
+void MatrixSet (float *matrix, int x, int y, float value);
+void MatrixCopy (float * matrixDST, const float * matrixSRC);
+int MatrixCompare (const float * matrixDST, const float * matrixSRC);
+void MatrixIdentity (float *matrix);
 
 void MatrixTranspose (float *matrix);
 void MatrixStackInit (MatrixStack *stack);
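With MATRIXFASTCALL and the _sse2_ name-swapping macros gone, callers use one set of names and the headers decide per build what they bind to. A sketch of the effect at a call site (the vector values are made up; ALIGN(16) is the codebase's alignment macro, needed because the SSE path uses aligned loads):

    float ALIGN(16) vec[4] = { 1.f, 2.f, 3.f, 1.f };
    MatrixMultVec4x4(mtxCurrent[0], vec);
    // under ENABLE_SSE this is the FORCEINLINE intrinsic version from matrix.h
    // and can inline into the caller; otherwise it is the plain C++ function
    // compiled in matrix.cpp. Either way the call site is identical.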
@@ -112,27 +82,21 @@ void Vector3Normalize(float *dst);
 
 void Vector4Copy(float *dst, const float *src);
 
-} //extern "C"
-
 //these functions are an unreliable, inaccurate floor.
 //it should only be used for positive numbers
-//this isnt as fast as it could be if we used a visual c++ intrinsic, but those appear not to be universally available
 FORCEINLINE u32 u32floor(float f)
 {
-#if defined(SSE2_INTRIN)
-	return (u32)_mm_cvttss_si32(_mm_set_ss(f));
-#elif !defined(NOSSE2)
-	__asm cvttss2si eax, f;
+#ifdef ENABLE_SSE2
+	return (u32)_mm_cvtt_ss2si(_mm_set_ss(f));
 #else
 	return (u32)f;
 #endif
 }
 FORCEINLINE u32 u32floor(double d)
 {
-#if defined(SSE2_INTRIN)
+#ifdef ENABLE_SSE2
 	return (u32)_mm_cvttsd_si32(_mm_set_sd(d));
-#elif !defined(NOSSE2)
-	__asm cvttsd2si eax, d;
 #else
 	return (u32)d;
 #endif
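The "positive numbers only" warning exists because the cvtt conversions truncate toward zero, while floor rounds toward negative infinity; the two agree only for non-negative values.

    u32floor( 2.7f);	// 2, same as floorf
    u32floor(-2.7f);	// truncates to -2 (then wraps as u32); floorf gives -3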
@@ -142,66 +106,212 @@ FORCEINLINE u32 u32floor(double d)
 //be sure that the results are the same thing as floorf!
 FORCEINLINE s32 s32floor(float f)
 {
-#if defined(SSE2_INTRIN)
+#ifdef ENABLE_SSE2
 	return _mm_cvtss_si32( _mm_add_ss(_mm_set_ss(-0.5f),_mm_add_ss(_mm_set_ss(f), _mm_set_ss(f))) ) >> 1;
-#elif !defined(NOSSE2)
-	static const float c = -0.5f;
-	__asm
-	{
-		movss xmm0, f;
-		addss xmm0, xmm0;
-		addss xmm0, c;
-		cvtss2si eax, xmm0
-		sar eax, 1
-	}
 #else
 	return (s32)floorf(f);
 #endif
 }
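s32floor leans on the conversion's default round-to-nearest-even mode: it rounds 2f - 0.5 to an integer and arithmetic-shifts right by one, which yields floor(f) in every case, including negatives and exact halves.

    //  f =  2.6  ->  2*2.6 - 0.5  =  4.7 -> rounds to  5 ->  5>>1 =  2
    //  f = -2.4  -> 2*(-2.4) - 0.5 = -5.3 -> rounds to -5 -> -5>>1 = -3
    //  f =  2.5  ->  4.5 -> rounds to 4 (ties to even)    ->  4>>1 =  2
    // the arithmetic shift keeps negative results correct, which is why this
    // one, unlike u32floor, matches floorf for negative inputs.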
 
-//now comes some sse2 functions coded solely with intrinsics.
-//let's wait and see how many people this upsets.
-//they can always #define SSE2_NOINTRIN in their userconfig.h....
-
-#ifdef SSE2_INTRIN
+//switched SSE2 functions
+//-------------
+#ifdef ENABLE_SSE
 
 template<int NUM>
-static FORCEINLINE void memset_u16_le(void* dst, u16 val)
+FORCEINLINE void memset_u16_le(void* dst, u16 val)
 {
 	u32 u32val;
 	//just for the endian safety
 	T1WriteWord((u8*)&u32val,0,val);
 	T1WriteWord((u8*)&u32val,2,val);
-	const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
-	MACRODO_N(NUM/8,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), temp));
+	////const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
+	__m128 temp; temp.m128_i32[0] = u32val;
+	//MACRODO_N(NUM/8,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), temp));
+	MACRODO_N(NUM/8,_mm_store_ps1((float*)((u8*)dst+(X)*16), temp));
 }
-#else
+
+#else //no sse2
 
 template<int NUM>
 static FORCEINLINE void memset_u16_le(void* dst, u16 val)
 {
 	for(int i=0;i<NUM;i++)
 		T1WriteWord((u8*)dst,i<<1,val);
 }
 
 #endif
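MACRODO_N(N, TODO) is the compile-time unroller from types.h: it repeats TODO N times with the iteration index available as X (as the call sites here use it), so the SSE path above expands to NUM/8 consecutive aligned 16-byte stores at constant offsets. A usage sketch; the call site is hypothetical, and NUM should be a multiple of 8 with dst 16-byte aligned, since _mm_store_ps1 is an aligned store:

    u16 ALIGN(16) line[256];
    memset_u16_le<256>(line, 0x7FFF);	// 256 u16s written as 32 16-byte stores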
 
+//---------------------------
+//switched SSE functions
+#ifdef ENABLE_SSE
+
+struct SSE_MATRIX
+{
+	SSE_MATRIX(const float *matrix)
+		: row0(_mm_load_ps(matrix))
+		, row1(_mm_load_ps(matrix+4))
+		, row2(_mm_load_ps(matrix+8))
+		, row3(_mm_load_ps(matrix+12))
+	{}
+
+	union {
+		__m128 rows[4];
+		struct { __m128 row0; __m128 row1; __m128 row2; __m128 row3; };
+	};
+};
+
+FORCEINLINE __m128 _util_MatrixMultVec4x4_(const SSE_MATRIX &mat, __m128 vec)
+{
+	__m128 xmm5 = _mm_shuffle_ps(vec, vec, B8(01010101));
+	__m128 xmm6 = _mm_shuffle_ps(vec, vec, B8(10101010));
+	__m128 xmm7 = _mm_shuffle_ps(vec, vec, B8(11111111));
+	__m128 xmm4 = _mm_shuffle_ps(vec, vec, B8(00000000));
+
+	xmm4 = _mm_mul_ps(xmm4,mat.row0);
+	xmm5 = _mm_mul_ps(xmm5,mat.row1);
+	xmm6 = _mm_mul_ps(xmm6,mat.row2);
+	xmm7 = _mm_mul_ps(xmm7,mat.row3);
+	xmm4 = _mm_add_ps(xmm4,xmm5);
+	xmm4 = _mm_add_ps(xmm4,xmm6);
+	xmm4 = _mm_add_ps(xmm4,xmm7);
+	return xmm4;
+}
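_util_MatrixMultVec4x4_ broadcasts each component of vec across a register (the B8 shuffle masks), scales one row-of-four with each, and sums. With the column-major storage used throughout, that is exactly the scalar math from matrix.cpp:

    // out[j] = v[0]*m[j] + v[1]*m[4+j] + v[2]*m[8+j] + v[3]*m[12+j]   for j = 0..3
    // i.e. result = x*column0 + y*column1 + z*column2 + w*column3,
    // matching the plain MatrixMultVec4x4 in the matrix.cpp hunk above.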
+
+FORCEINLINE void MatrixMultiply(float * matrix, const float * rightMatrix)
+{
+	//this seems to generate larger code, including many movaps, but maybe it is less harsh on the registers than the
+	//more hand-tailored approach
+	__m128 row0 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix));
+	__m128 row1 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix+4));
+	__m128 row2 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix+8));
+	__m128 row3 = _util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(rightMatrix+12));
+	_mm_store_ps(matrix,row0);
+	_mm_store_ps(matrix+4,row1);
+	_mm_store_ps(matrix+8,row2);
+	_mm_store_ps(matrix+12,row3);
+}
+
+FORCEINLINE void MatrixMultVec4x4(const float *matrix, float *vecPtr)
+{
+	_mm_store_ps(vecPtr,_util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(vecPtr)));
+}
+
+FORCEINLINE void MatrixMultVec4x4_M2(const float *matrix, float *vecPtr)
+{
+	//there are hardly any gains from merging these manually
+	MatrixMultVec4x4(matrix+16,vecPtr);
+	MatrixMultVec4x4(matrix,vecPtr);
+}
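MatrixMultVec4x4_M2 treats matrix as two 4x4 matrices stored back to back and applies matrix+16 first. The gfx3d.cpp call sites pass mtxCurrent[0] (projection), so the multiply order works out only if mtxCurrent[1] (modelview) is stored contiguously right after it; that layout is an inference from the call sites in this diff, not spelled out here:

    // MatrixMultVec4x4_M2(mtxCurrent[0], coord) computes
    //   coord = projection * (modelview * coord)
    // assuming mtxCurrent is a contiguous array of float[16] blocks with the
    // modelview matrix at mtxCurrent[0]+16, i.e. mtxCurrent[1].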
+
+FORCEINLINE void MatrixMultVec3x3(const float * matrix, float * vecPtr)
+{
+	const __m128 vec = _mm_load_ps(vecPtr);
+
+	__m128 xmm5 = _mm_shuffle_ps(vec, vec, B8(01010101));
+	__m128 xmm6 = _mm_shuffle_ps(vec, vec, B8(10101010));
+	__m128 xmm4 = _mm_shuffle_ps(vec, vec, B8(00000000));
+
+	const SSE_MATRIX mat(matrix);
+
+	xmm4 = _mm_mul_ps(xmm4,mat.row0);
+	xmm5 = _mm_mul_ps(xmm5,mat.row1);
+	xmm6 = _mm_mul_ps(xmm6,mat.row2);
+	xmm4 = _mm_add_ps(xmm4,xmm5);
+	xmm4 = _mm_add_ps(xmm4,xmm6);
+
+	_mm_store_ps(vecPtr,xmm4);
+}
+
+FORCEINLINE void MatrixTranslate(float *matrix, const float *ptr)
+{
+	__m128 xmm4 = _mm_load_ps(ptr);
+	__m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101));
+	__m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010));
+	xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000));
+
+	xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(matrix));
+	xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(matrix+4));
+	xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(matrix+8));
+	xmm4 = _mm_add_ps(xmm4,xmm5);
+	xmm4 = _mm_add_ps(xmm4,xmm6);
+	xmm4 = _mm_add_ps(xmm4,_mm_load_ps(matrix+12));
+	_mm_store_ps(matrix+12,xmm4);
+}
+
+FORCEINLINE void MatrixScale(float *matrix, const float *ptr)
+{
+	__m128 xmm4 = _mm_load_ps(ptr);
+	__m128 xmm5 = _mm_shuffle_ps(xmm4, xmm4, B8(01010101));
+	__m128 xmm6 = _mm_shuffle_ps(xmm4, xmm4, B8(10101010));
+	xmm4 = _mm_shuffle_ps(xmm4, xmm4, B8(00000000));
+
+	xmm4 = _mm_mul_ps(xmm4,_mm_load_ps(matrix));
+	xmm5 = _mm_mul_ps(xmm5,_mm_load_ps(matrix+4));
+	xmm6 = _mm_mul_ps(xmm6,_mm_load_ps(matrix+8));
+	_mm_store_ps(matrix,xmm4);
+	_mm_store_ps(matrix+4,xmm5);
+	_mm_store_ps(matrix+8,xmm6);
+}
+
+template<int NUM_ROWS>
+FORCEINLINE void vector_fix2float(float* matrix, const float divisor)
+{
+	CTASSERT(NUM_ROWS==3 || NUM_ROWS==4);
+
+	const __m128 val = _mm_set_ps1(divisor);
+
+	_mm_store_ps(matrix,_mm_div_ps(_mm_load_ps(matrix),val));
+	_mm_store_ps(matrix+4,_mm_div_ps(_mm_load_ps(matrix+4),val));
+	_mm_store_ps(matrix+8,_mm_div_ps(_mm_load_ps(matrix+8),val));
+	if(NUM_ROWS==4)
+		_mm_store_ps(matrix+12,_mm_div_ps(_mm_load_ps(matrix+12),val));
+}
+
 //WARNING: I do not think this is as fast as a memset, for some reason.
 //at least in vc2005 with sse enabled. better figure out why before using it
-#ifdef SSE2_INTRIN
 template<int NUM>
 static FORCEINLINE void memset_u8(void* _dst, u8 val)
 {
-	const u8* dst = (u8*)_dst;
-	u32 u32val = (val<<24)|(val<<16)|(val<<8)|val;
-	const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
-	MACRODO_N(NUM/16,_mm_store_si128((__m128i*)(dst+(X)*16), temp));
+	memset(_dst,val,NUM);
+	//const u8* dst = (u8*)_dst;
+	//u32 u32val = (val<<24)|(val<<16)|(val<<8)|val;
+	//const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
+	//MACRODO_N(NUM/16,_mm_store_si128((__m128i*)(dst+(X)*16), temp));
 }
-#else
+
+#else //no sse
+
+void MatrixMultVec4x4 (const float *matrix, float *vecPtr);
+void MatrixMultVec3x3(const float * matrix, float * vecPtr);
+void MatrixMultiply(float * matrix, const float * rightMatrix);
+void MatrixTranslate(float *matrix, const float *ptr);
+void MatrixScale(float * matrix, const float * ptr);
+
+FORCEINLINE void MatrixMultVec4x4_M2(const float *matrix, float *vecPtr)
+{
+	//there are hardly any gains from merging these manually
+	MatrixMultVec4x4(matrix+16,vecPtr);
+	MatrixMultVec4x4(matrix,vecPtr);
+}
+
+template<int NUM_ROWS>
+FORCEINLINE void vector_fix2float(float* matrix, const float divisor)
+{
+	for(int i=0;i<NUM_ROWS*4;i++)
+		matrix[i] /= divisor;
+}
 
 template<int NUM>
 static FORCEINLINE void memset_u8(void* dst, u8 val)
 {
 	memset(dst,val,NUM);
 }
-#endif
 
+#endif //switched SSE functions
+
 
 #endif

@@ -1,182 +0,0 @@
-;
-; Copyright (C) 2006 yopyop
-; Copyright (C) 2008 CrazyMax
-;
-; This file is part of DeSmuME
-;
-; DeSmuME is free software; you can redistribute it and/or modify
-; it under the terms of the GNU General Public License as published by
-; the Free Software Foundation; either version 2 of the License, or
-; (at your option) any later version.
-;
-; DeSmuME is distributed in the hope that it will be useful,
-; but WITHOUT ANY WARRANTY; without even the implied warranty of
-; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-; GNU General Public License for more details.
-;
-; You should have received a copy of the GNU General Public License
-; along with DeSmuME; if not, write to the Free Software
-; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-	TITLE	matrix_sse2-x64.asm
-.code
-
-_sse2_MatrixMultVec4x4 PROC PUBLIC
-	movaps xmm0, XMMWORD PTR [rcx]
-	movaps xmm1, XMMWORD PTR [rcx+16]
-	movaps xmm2, XMMWORD PTR [rcx+32]
-	movaps xmm3, XMMWORD PTR [rcx+48]
-	movaps xmm4, XMMWORD PTR [rdx]
-	movaps xmm5, xmm4
-	movaps xmm6, xmm4
-	movaps xmm7, xmm4
-	shufps xmm4, xmm4, 00000000b
-	shufps xmm5, xmm5, 01010101b
-	shufps xmm6, xmm6, 10101010b
-	shufps xmm7, xmm7, 11111111b
-	mulps xmm4, xmm0
-	mulps xmm5, xmm1
-	mulps xmm6, xmm2
-	mulps xmm7, xmm3
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	addps xmm4, xmm7
-	movaps XMMWORD PTR [rdx], xmm4
-	ret 0
-_sse2_MatrixMultVec4x4 ENDP
-
-_sse2_MatrixMultVec3x3 PROC PUBLIC
-	movaps xmm0, XMMWORD PTR [rcx]
-	movaps xmm1, XMMWORD PTR [rcx+16]
-	movaps xmm2, XMMWORD PTR [rcx+32]
-	movaps xmm4, XMMWORD PTR [rdx]
-	movaps xmm5, xmm4
-	movaps xmm6, xmm4
-	movaps xmm7, xmm4
-	shufps xmm4, xmm4, 00000000b
-	shufps xmm5, xmm5, 01010101b
-	shufps xmm6, xmm6, 10101010b
-	mulps xmm4, xmm0
-	mulps xmm5, xmm1
-	mulps xmm6, xmm2
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	movaps XMMWORD PTR [rdx], xmm4
-	ret 0
-_sse2_MatrixMultVec3x3 ENDP
-
-_sse2_MatrixMultiply PROC PUBLIC
-	movaps xmm0, XMMWORD PTR [rcx]
-	movaps xmm1, XMMWORD PTR [rcx+16]
-	movaps xmm2, XMMWORD PTR [rcx+32]
-	movaps xmm3, XMMWORD PTR [rcx+48]
-	movaps xmm4, XMMWORD PTR [rdx]		; r00, r01, r02, r03
-	movaps xmm8, XMMWORD PTR [rdx+16]	; r04, r05, r06, r07
-	movaps xmm5,xmm4
-	movaps xmm6,xmm4
-	movaps xmm7,xmm4
-	movaps xmm9,xmm8	;
-	movaps xmm10,xmm8
-	movaps xmm11,xmm8
-	shufps xmm4,xmm4,00000000b
-	shufps xmm5,xmm5,01010101b
-	shufps xmm6,xmm6,10101010b
-	shufps xmm7,xmm7,11111111b
-	shufps xmm8, xmm8, 00000000b	;
-	shufps xmm9, xmm9, 01010101b
-	shufps xmm10,xmm10,10101010b
-	shufps xmm11,xmm11,11111111b
-	mulps xmm4,xmm0
-	mulps xmm5,xmm1
-	mulps xmm6,xmm2
-	mulps xmm7,xmm3
-	mulps xmm8, xmm0	;
-	mulps xmm9, xmm1
-	mulps xmm10,xmm2
-	mulps xmm11,xmm3
-	addps xmm4,xmm5
-	addps xmm4,xmm6
-	addps xmm4,xmm7
-	addps xmm8,xmm9		;
-	addps xmm8,xmm10
-	addps xmm8,xmm11
-	movaps XMMWORD PTR [rcx],xmm4
-	movaps XMMWORD PTR [rcx+16],xmm8
-
-	movaps xmm4, XMMWORD PTR [rdx+32]	; r00, r01, r02, r03
-	movaps xmm8, XMMWORD PTR [rdx+48]	; r04, r05, r06, r07
-	movaps xmm5,xmm4
-	movaps xmm6,xmm4
-	movaps xmm7,xmm4
-	movaps xmm9,xmm8	;
-	movaps xmm10,xmm8
-	movaps xmm11,xmm8
-	shufps xmm4,xmm4,00000000b
-	shufps xmm5,xmm5,01010101b
-	shufps xmm6,xmm6,10101010b
-	shufps xmm7,xmm7,11111111b
-	shufps xmm8, xmm8, 00000000b	;
-	shufps xmm9, xmm9, 01010101b
-	shufps xmm10,xmm10,10101010b
-	shufps xmm11,xmm11,11111111b
-	mulps xmm4,xmm0
-	mulps xmm5,xmm1
-	mulps xmm6,xmm2
-	mulps xmm7,xmm3
-	mulps xmm8, xmm0	;
-	mulps xmm9, xmm1
-	mulps xmm10,xmm2
-	mulps xmm11,xmm3
-	addps xmm4,xmm5
-	addps xmm4,xmm6
-	addps xmm4,xmm7
-	addps xmm8,xmm9		;
-	addps xmm8,xmm10
-	addps xmm8,xmm11
-	movaps XMMWORD PTR [rcx+32],xmm4
-	movaps XMMWORD PTR [rcx+48],xmm8
-	ret 0
-_sse2_MatrixMultiply ENDP
-
-_sse2_MatrixTranslate PROC PUBLIC
-	movaps xmm0, XMMWORD PTR [rcx]
-	movaps xmm1, XMMWORD PTR [rcx+16]
-	movaps xmm2, XMMWORD PTR [rcx+32]
-	movaps xmm3, XMMWORD PTR [rcx+48]
-	movaps xmm4, XMMWORD PTR [rdx]
-	movaps xmm5, xmm4
-	movaps xmm6, xmm4
-	movaps xmm7, xmm4
-	shufps xmm4, xmm4, 00000000b
-	shufps xmm5, xmm5, 01010101b
-	shufps xmm6, xmm6, 10101010b
-	mulps xmm4, xmm0
-	mulps xmm5, xmm1
-	mulps xmm6, xmm2
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	addps xmm4, xmm3
-	movaps XMMWORD PTR [rcx+48], xmm4
-	ret 0
-_sse2_MatrixTranslate ENDP
-
-_sse2_MatrixScale PROC PUBLIC
-	movaps xmm0, XMMWORD PTR [rcx]
-	movaps xmm1, XMMWORD PTR [rcx+16]
-	movaps xmm2, XMMWORD PTR [rcx+32]
-	movaps xmm4, XMMWORD PTR [rdx]
-	movaps xmm5, xmm4
-	movaps xmm6, xmm4
-	shufps xmm4, xmm4, 00000000b
-	shufps xmm5, xmm5, 01010101b
-	shufps xmm6, xmm6, 10101010b
-	mulps xmm4, xmm0
-	mulps xmm5, xmm1
-	mulps xmm6, xmm2
-	movaps XMMWORD PTR [rcx],xmm4
-	movaps XMMWORD PTR [rcx+16],xmm5
-	movaps XMMWORD PTR [rcx+32],xmm6
-	ret 0
-_sse2_MatrixScale ENDP
-
-end

@@ -1,214 +0,0 @@
-;
-; Copyright (C) 2006 yopyop
-; Copyright (C) 2008 CrazyMax
-;
-; This file is part of DeSmuME
-;
-; DeSmuME is free software; you can redistribute it and/or modify
-; it under the terms of the GNU General Public License as published by
-; the Free Software Foundation; either version 2 of the License, or
-; (at your option) any later version.
-;
-; DeSmuME is distributed in the hope that it will be useful,
-; but WITHOUT ANY WARRANTY; without even the implied warranty of
-; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-; GNU General Public License for more details.
-;
-; You should have received a copy of the GNU General Public License
-; along with DeSmuME; if not, write to the Free Software
-; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-	TITLE	matrix_sse2-x86.asm
-.686P
-.XMM
-.model flat
-.code
-
-@_sse2_MatrixMultVec4x4@8 PROC PUBLIC
-	movaps xmm4, XMMWORD PTR [edx]
-	pshufd xmm5, xmm4, 01010101b
-	pshufd xmm6, xmm4, 10101010b
-	pshufd xmm7, xmm4, 11111111b
-	shufps xmm4, xmm4, 00000000b
-	mulps xmm4, XMMWORD PTR [ecx]
-	mulps xmm5, XMMWORD PTR [ecx+16]
-	mulps xmm6, XMMWORD PTR [ecx+32]
-	mulps xmm7, XMMWORD PTR [ecx+48]
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	addps xmm4, xmm7
-	movaps XMMWORD PTR [edx], xmm4
-	ret 0
-@_sse2_MatrixMultVec4x4@8 ENDP
-
-@_sse2_MatrixMultVec4x4_M2@8 PROC PUBLIC
-	movaps xmm4, XMMWORD PTR [edx]
-	pshufd xmm5, xmm4, 01010101b
-	pshufd xmm6, xmm4, 10101010b
-	pshufd xmm7, xmm4, 11111111b
-	shufps xmm4, xmm4, 00000000b
-	mulps xmm4, XMMWORD PTR [ecx+64]
-	mulps xmm5, XMMWORD PTR [ecx+80]
-	mulps xmm6, XMMWORD PTR [ecx+96]
-	mulps xmm7, XMMWORD PTR [ecx+112]
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	addps xmm4, xmm7
-	pshufd xmm5, xmm4, 01010101b
-	pshufd xmm6, xmm4, 10101010b
-	pshufd xmm7, xmm4, 11111111b
-	shufps xmm4, xmm4, 00000000b
-	mulps xmm4, XMMWORD PTR [ecx]
-	mulps xmm5, XMMWORD PTR [ecx+16]
-	mulps xmm6, XMMWORD PTR [ecx+32]
-	mulps xmm7, XMMWORD PTR [ecx+48]
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	addps xmm4, xmm7
-	movaps XMMWORD PTR [edx], xmm4
-	ret 0
-@_sse2_MatrixMultVec4x4_M2@8 ENDP
-
-@_sse2_MatrixMultVec3x3@8 PROC PUBLIC
-	movaps xmm4, XMMWORD PTR [edx]
-	pshufd xmm5, xmm4, 01010101b
-	pshufd xmm6, xmm4, 10101010b
-	shufps xmm4, xmm4, 00000000b
-	mulps xmm4, XMMWORD PTR [ecx]
-	mulps xmm5, XMMWORD PTR [ecx+16]
-	mulps xmm6, XMMWORD PTR [ecx+32]
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	movaps XMMWORD PTR [edx], xmm4
-	ret 0
-@_sse2_MatrixMultVec3x3@8 ENDP
-
-@_sse2_MatrixMultiply@8 PROC PUBLIC
-	movaps xmm0, XMMWORD PTR [ecx]
-	movaps xmm1, XMMWORD PTR [ecx+16]
-	movaps xmm2, XMMWORD PTR [ecx+32]
-	movaps xmm3, XMMWORD PTR [ecx+48]
-	movaps xmm4, XMMWORD PTR [edx]		; r00, r01, r02, r03
-	pshufd xmm5, xmm4, 01010101b
-	pshufd xmm6, xmm4, 10101010b
-	pshufd xmm7, xmm4, 11111111b
-	shufps xmm4, xmm4, 00000000b
-	mulps xmm4,xmm0
-	mulps xmm5,xmm1
-	mulps xmm6,xmm2
-	mulps xmm7,xmm3
-	addps xmm4,xmm5
-	addps xmm4,xmm6
-	addps xmm4,xmm7
-	movaps XMMWORD PTR [ecx],xmm4
-
-	movaps xmm4, XMMWORD PTR [edx+16]	; r04, r05, r06, r07
-	pshufd xmm5, xmm4, 01010101b
-	pshufd xmm6, xmm4, 10101010b
-	pshufd xmm7, xmm4, 11111111b
-	shufps xmm4, xmm4, 00000000b
-	mulps xmm4,xmm0
-	mulps xmm5,xmm1
-	mulps xmm6,xmm2
-	mulps xmm7,xmm3
-	addps xmm4,xmm5
-	addps xmm4,xmm6
-	addps xmm4,xmm7
-	movaps XMMWORD PTR [ecx+16],xmm4
-
-	movaps xmm4, XMMWORD PTR [edx+32]	; r08, r09, r10, r11
-	pshufd xmm5, xmm4, 01010101b
-	pshufd xmm6, xmm4, 10101010b
-	pshufd xmm7, xmm4, 11111111b
-	shufps xmm4, xmm4, 00000000b
-	mulps xmm4,xmm0
-	mulps xmm5,xmm1
-	mulps xmm6,xmm2
-	mulps xmm7,xmm3
-	addps xmm4,xmm5
-	addps xmm4,xmm6
-	addps xmm4,xmm7
-	movaps XMMWORD PTR [ecx+32],xmm4
-
-	movaps xmm4, XMMWORD PTR [edx+48]	; r12, r13, r14, r15
-	pshufd xmm5, xmm4, 01010101b
-	pshufd xmm6, xmm4, 10101010b
-	pshufd xmm7, xmm4, 11111111b
-	shufps xmm4, xmm4, 00000000b
-	mulps xmm4,xmm0
-	mulps xmm5,xmm1
-	mulps xmm6,xmm2
-	mulps xmm7,xmm3
-	addps xmm4,xmm5
-	addps xmm4,xmm6
-	addps xmm4,xmm7
-	movaps XMMWORD PTR [ecx+48],xmm4
-
-	ret 0
-@_sse2_MatrixMultiply@8 ENDP
-
-@_sse2_MatrixTranslate@8 PROC PUBLIC
-	movaps xmm4, XMMWORD PTR [edx]
-	pshufd xmm5, xmm4, 01010101b
-	pshufd xmm6, xmm4, 10101010b
-	shufps xmm4, xmm4, 00000000b
-	mulps xmm4, XMMWORD PTR [ecx]
-	mulps xmm5, XMMWORD PTR [ecx+16]
-	mulps xmm6, XMMWORD PTR [ecx+32]
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	addps xmm4, XMMWORD PTR [ecx+48]
-	movaps XMMWORD PTR [ecx+48], xmm4
-	ret 0
-@_sse2_MatrixTranslate@8 ENDP
-
-@_sse2_MatrixScale@8 PROC PUBLIC
-	movaps xmm4, XMMWORD PTR [edx]
-	pshufd xmm5, xmm4, 01010101b
-	pshufd xmm6, xmm4, 10101010b
-	shufps xmm4, xmm4, 00000000b
-	mulps xmm4, XMMWORD PTR [ecx]
-	mulps xmm5, XMMWORD PTR [ecx+16]
-	mulps xmm6, XMMWORD PTR [ecx+32]
-	movaps XMMWORD PTR [ecx], xmm4
-	movaps XMMWORD PTR [ecx+16], xmm5
-	movaps XMMWORD PTR [ecx+32], xmm6
-	ret 0
-@_sse2_MatrixScale@8 ENDP
-
-@_sse2_fix2float_12@8 PROC PUBLIC
-	movaps xmm0, XMMWORD PTR[ecx]
-	movaps xmm1, XMMWORD PTR[ecx+16]
-	movaps xmm2, XMMWORD PTR[ecx+32]
-	movaps xmm4, XMMWORD PTR [edx]
-	;prefetchnta [ecx+64]
-	divps xmm0, xmm4
-	divps xmm1, xmm4
-	divps xmm2, xmm4
-	movaps XMMWORD PTR[ecx], xmm0
-	movaps XMMWORD PTR[ecx+16],xmm1
-	movaps XMMWORD PTR[ecx+32],xmm2
-	ret 0
-@_sse2_fix2float_12@8 ENDP
-
-@_sse2_fix2float_16@8 PROC PUBLIC
-	movaps xmm0, XMMWORD PTR[ecx]
-	movaps xmm1, XMMWORD PTR[ecx+16]
-	movaps xmm2, XMMWORD PTR[ecx+32]
-	movaps xmm3, XMMWORD PTR[ecx+48]
-	movaps xmm4, XMMWORD PTR [edx]
-	;prefetchnta [ecx+64]
-	divps xmm0, xmm4
-	divps xmm1, xmm4
-	divps xmm2, xmm4
-	divps xmm3, xmm4
-	movaps XMMWORD PTR[ecx], xmm0
-	movaps XMMWORD PTR[ecx+16],xmm1
-	movaps XMMWORD PTR[ecx+32],xmm2
-	movaps XMMWORD PTR[ecx+48],xmm3
-	ret 0
-@_sse2_fix2float_16@8 ENDP
-
-end

@@ -1022,12 +1022,10 @@ static void SoftRastConvertFramebuffer()
 
 	for(int i=0,y=0;y<192;y++)
 	{
-#ifndef NOSSE2
-		u8* wanx = (u8*)&src[i];
-		#define ASS(X,Y) __asm { prefetchnta [wanx+32*0x##X##Y] }
-		#define PUNK(X) ASS(X,0) ASS(X,1) ASS(X,2) ASS(X,3) ASS(X,4) ASS(X,5) ASS(X,6) ASS(X,7) ASS(X,8) ASS(X,9) ASS(X,A) ASS(X,B) ASS(X,C) ASS(X,D) ASS(X,E) ASS(X,F)
-		PUNK(0); PUNK(1);
-#endif
+//		u8* wanx = (u8*)&src[i];
+//		#define ASS(X,Y) __asm { prefetchnta [wanx+32*0x##X##Y] }
+//		#define PUNK(X) ASS(X,0) ASS(X,1) ASS(X,2) ASS(X,3) ASS(X,4) ASS(X,5) ASS(X,6) ASS(X,7) ASS(X,8) ASS(X,9) ASS(X,A) ASS(X,B) ASS(X,C) ASS(X,D) ASS(X,E) ASS(X,F)
+//		PUNK(0); PUNK(1);
 
 		for(int x=0;x<256;x++,i++)
 		{

@@ -10,11 +10,9 @@ enum TexCache_TexFormat
 };
 
 #define MAX_TEXTURE 500
-#ifndef NOSSE2
-struct ALIGN(16) TextureCache
-#else
-struct ALIGN(8) TextureCache
-#endif
+
+struct CACHE_ALIGN TextureCache
 {
 	u32 id;
 	u32 frm;
@@ -33,7 +31,6 @@ struct ALIGN(8) TextureCache
 
 	//set if this texture is suspected be invalid due to a vram reconfigure
 	bool suspectedInvalid;
-
 };
 
 extern TextureCache *texcache;

@@ -26,13 +26,17 @@
 #include "config.h"
 #endif
 
-#ifndef _MSC_VER
-#define NOSSE2
+#ifdef _MSC_VER
+#define ENABLE_SSE
+#define ENABLE_SSE2
 #endif
 
-//if theres no sse2, also enforce no intrinsics
-#if defined(NOSSE2)
-#define SSE2_NOINTRIN
+#ifdef NOSSE
+#undef ENABLE_SSE
 #endif
 
+#ifdef NOSSE2
+#undef ENABLE_SSE2
+#endif
+
 #ifdef _WIN32
@@ -92,20 +96,6 @@
 #endif
 #endif
 
-//#ifndef _PREFETCH
-//#if (defined(_MSC_VER) || defined(__INTEL_COMPILER)) && !defined(NOSSE2)
-//#include <xmmintrin.h>
-//#include <intrin.h>
-//#define _PREFETCH(X) _mm_prefetch((char*)(X),_MM_HINT_T0);
-//#define _PREFETCHNTA(X) _mm_prefetch((char*)(X),_MM_HINT_NTA);
-//#else
 #define _PREFETCH(X) {}
 #define _PREFETCHNTA(X) {}
-//#endif
-//#endif
-
 
 #if defined(__LP64__)
 typedef unsigned char u8;
 typedef unsigned short u16;
@@ -360,7 +350,45 @@ char (*BLAHBLAHBLAH( UNALIGNED T (&)[N] ))[N];
 		if((N)&0x001) MACRODO1((N)&(0x100|0x080|0x040|0x020|0x010|0x008|0x004|0x002),TODO); \
 	}
 
+//---------------------------
+//Binary constant generator macro By Tom Torfs - donated to the public domain
+
+//turn a numeric literal into a hex constant
+//(avoids problems with leading zeroes)
+//8-bit constants max value 0x11111111, always fits in unsigned long
+#define HEX__(n) 0x##n##LU
+
+//8-bit conversion function
+#define B8__(x) ((x&0x0000000FLU)?1:0) \
+	+((x&0x000000F0LU)?2:0) \
+	+((x&0x00000F00LU)?4:0) \
+	+((x&0x0000F000LU)?8:0) \
+	+((x&0x000F0000LU)?16:0) \
+	+((x&0x00F00000LU)?32:0) \
+	+((x&0x0F000000LU)?64:0) \
+	+((x&0xF0000000LU)?128:0)
+
+//for upto 8-bit binary constants
+#define B8(d) ((unsigned char)B8__(HEX__(d)))
+
+// for upto 16-bit binary constants, MSB first
+#define B16(dmsb,dlsb) (((unsigned short)B8(dmsb)<<8) \
+	+ B8(dlsb))
+
+// for upto 32-bit binary constants, MSB first */
+#define B32(dmsb,db2,db3,dlsb) (((unsigned long)B8(dmsb)<<24) \
+	+ ((unsigned long)B8(db2)<<16) \
+	+ ((unsigned long)B8(db3)<<8) \
+	+ B8(dlsb))
+
+//Sample usage:
+//B8(01010101) = 85
+//B16(10101010,01010101) = 43605
+//B32(10000000,11111111,10101010,01010101) = 2164238933
+//---------------------------
+
+#ifndef CTASSERT
+#define CTASSERT(x)	typedef char __assert ## y[(x) ? 1 : -1]
+#endif
+
 #endif
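CTASSERT is the compile-time assert that this commit moves here from OGLRender.cpp: a false condition declares a char array of size -1, which no compiler accepts, while a true one is a harmless typedef.

    CTASSERT(sizeof(u16) == 2);	// ok: expands to typedef char __asserty[1];
    //CTASSERT(sizeof(u16) == 4);	// would fail to compile: array size -1
    // vector_fix2float<NUM_ROWS> in matrix.h uses it to reject row counts
    // other than 3 or 4. The ## y paste ignores x, so every assertion yields
    // the same name __asserty; that is fine, since a true assertion always
    // redeclares the identical char[1] typedef.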

@@ -41,8 +41,12 @@
 #endif
 #endif
 
-#ifdef NOSSE2
-#define DESMUME_CPUEXT_STRING " NOSSE2"
+#ifndef ENABLE_SSE2
+#ifndef ENABLE_SSE
+#define DESMUME_CPUEXT_STRING " NOSSE"
+#else
+#define DESMUME_CPUEXT_STRING " NOSSE2"
+#endif
 #else
 #define DESMUME_CPUEXT_STRING ""
 #endif
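The nested tests make the version string report the strongest missing tier. A truth table of what each build gets:

    // ENABLE_SSE  ENABLE_SSE2   DESMUME_CPUEXT_STRING
    // no          no            " NOSSE"
    // yes         no            " NOSSE2"
    // yes         yes           ""
    // note a build that defines NOSSE but not NOSSE2 keeps ENABLE_SSE2 set,
    // which this test then reports as "" (no suffix).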

@@ -710,42 +710,6 @@
 			</File>
 		</Filter>
 	</Filter>
-	<Filter
-		Name="asm"
-		>
-		<File
-			RelativePath="..\matrix_sse2-x64.asm"
-			>
-			<FileConfiguration
-				Name="Debug|Win32"
-				ExcludedFromBuild="true"
-				>
-				<Tool
-					Name="MASM"
-				/>
-			</FileConfiguration>
-			<FileConfiguration
-				Name="Release|Win32"
-				ExcludedFromBuild="true"
-				>
-				<Tool
-					Name="MASM"
-				/>
-			</FileConfiguration>
-			<FileConfiguration
-				Name="Release FastBuild|Win32"
-				ExcludedFromBuild="true"
-				>
-				<Tool
-					Name="MASM"
-				/>
-			</FileConfiguration>
-		</File>
-		<File
-			RelativePath="..\matrix_sse2-x86.asm"
-			>
-		</File>
-	</Filter>
 	<Filter
 		Name="addons"
 		>

@@ -521,142 +521,6 @@
 			</File>
 		</Filter>
 	</Filter>
-	<Filter
-		Name="asm"
-		>
-		<File
-			RelativePath="..\matrix_sse2-x64.asm"
-			>
-			<FileConfiguration
-				Name="Debug|Win32"
-				ExcludedFromBuild="true"
-				>
-				<Tool
-					Name="VCCustomBuildTool"
-					Description="Assembling x64..."
-					CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
-				/>
-			</FileConfiguration>
-			<FileConfiguration
-				Name="Release|Win32"
-				ExcludedFromBuild="true"
-				>
-				<Tool
-					Name="VCCustomBuildTool"
-					Description="Assembling x64..."
-					CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
-					Outputs="$(IntDir)\$(InputName).obj"
-				/>
-			</FileConfiguration>
-			<FileConfiguration
-				Name="Release FastBuild|Win32"
-				ExcludedFromBuild="true"
-				>
-				<Tool
-					Name="VCCustomBuildTool"
-					Description="Assembling x64..."
-					CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
-					Outputs="$(IntDir)\$(InputName).obj"
-				/>
-			</FileConfiguration>
-			<FileConfiguration
-				Name="Debug|x64"
-				ExcludedFromBuild="true"
-				>
-				<Tool
-					Name="VCCustomBuildTool"
-					Description="Assembling x64..."
-					CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
-				/>
-			</FileConfiguration>
-			<FileConfiguration
-				Name="Interim SSE2|x64"
-				>
-				<Tool
-					Name="VCCustomBuildTool"
-					Description="Assembling x64..."
-					CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
-					Outputs="$(IntDir)\$(InputName).obj"
-				/>
-			</FileConfiguration>
-			<FileConfiguration
-				Name="Interim|x64"
-				ExcludedFromBuild="true"
-				>
-				<Tool
-					Name="VCCustomBuildTool"
-					Description="Assembling x64..."
-					CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
-					Outputs="$(IntDir)\$(InputName).obj"
-				/>
-			</FileConfiguration>
-			<FileConfiguration
-				Name="Release (public)|x64"
-				ExcludedFromBuild="true"
-				>
-				<Tool
-					Name="VCCustomBuildTool"
-					Description="Assembling x64..."
-					CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
-					Outputs="$(IntDir)\$(InputName).obj"
-				/>
-			</FileConfiguration>
-			<FileConfiguration
-				Name="Release SSE2 (public)|x64"
-				>
-				<Tool
-					Name="VCCustomBuildTool"
-					Description="Assembling x64..."
-					CommandLine="ml64 /nologo /c /Zi /Fo&quot;$(IntDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
-					Outputs="$(IntDir)\$(InputName).obj"
-				/>
-			</FileConfiguration>
-		</File>
-		<File
-			RelativePath="..\matrix_sse2-x86.asm"
-			>
-			<FileConfiguration
-				Name="Debug|x64"
-				ExcludedFromBuild="true"
-				>
-				<Tool
-					Name="MASM"
-				/>
-			</FileConfiguration>
-			<FileConfiguration
-				Name="Interim SSE2|x64"
-				ExcludedFromBuild="true"
-				>
-				<Tool
-					Name="MASM"
-				/>
-			</FileConfiguration>
-			<FileConfiguration
-				Name="Interim|x64"
-				ExcludedFromBuild="true"
-				>
-				<Tool
-					Name="MASM"
-				/>
-			</FileConfiguration>
-			<FileConfiguration
-				Name="Release (public)|x64"
-				ExcludedFromBuild="true"
-				>
-				<Tool
-					Name="MASM"
-				/>
-			</FileConfiguration>
-			<FileConfiguration
-				Name="Release SSE2 (public)|x64"
-				ExcludedFromBuild="true"
-				>
-				<Tool
-					Name="MASM"
-				/>
-			</FileConfiguration>
-		</File>
-	</Filter>
 	<Filter
 		Name="addons"
 		>

@@ -5,10 +5,13 @@
 //to customize your build, place a customized copy in the userconfig directory
 //(alongside this defaultconfig directory)
 
-//#define NOSSE2 //disables SSE2 optimizations (better change it in the vc++ codegen options too)
+//disables SSE and SSE2 optimizations (better change it in the vc++ codegen options too)
+//note that you may have to use this if your compiler doesn't support standard SSE intrinsics
+//#define NOSSE
+//#define NOSSE2
 
 //#define DEVELOPER //enables dev+ features
 //#define GDB_STUB //enables the gdb stub. for some reason this is separate from dev+ for now
-//#define SSE2_NOINTRIN //indicates that you have a crippled compiler with no sse2 intrinsics (only relevant for SSE2 builds)
 
 
 #endif //_USERCONFIG_H
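A sketch of the override these comments describe, as a hypothetical userconfig/userconfig.h that keeps plain SSE but opts out of SSE2 (file placement per the comments above; the contents are illustrative):

    //#define NOSSE	// leave SSE enabled
    #define NOSSE2	// types.h will then #undef ENABLE_SSE2, keeping the plain-SSE paths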