Extract the texture cache from OGLRender so that it can be reused by the rasterizer. Add an interpolator for material color and texture to the rasterizer.

2009-02-03 02:03:49 +00:00 · 2009-02-03 02:03:49 +00:00 · 64feb0117c
parent 49365a2630
commit 64feb0117c
6 changed files with 870 additions and 739 deletions
--- a/desmume/src/OGLRender.cpp
+++ b/desmume/src/OGLRender.cpp
@ -69,123 +69,8 @@ static void ENDGL() {
 #include "gfx3d.h"

 #include "shaders.h"
+#include "texcache.h"

-//This class represents a number of regions of memory which should be viewed as contiguous
-class MemSpan
-{
-public:
-	static const int MAXSIZE = 8;
-
-	MemSpan() 
-		: numItems(0)
-	{}
-
-	int numItems;
-
-	struct Item {
-		u32 start;
-		u32 len;
-		u8* ptr;
-		u32 ofs; //offset within the memspan
-	} items[MAXSIZE];
-
-	int size;
-
-	//this MemSpan shall be considered the first argument to a standard memcmp
-	//the length shall be as specified in this MemSpan, unless you specify otherwise
-	int memcmp(void* buf2, int size=-1)
-	{
-		if(size==-1) size = this->size;
-		size = std::min(this->size,size);
-		for(int i=0;i<numItems;i++)
-		{
-			Item &item = items[i];
-			int todo = std::min((int)item.len,size);
-			size -= todo;
-			int temp = ::memcmp(item.ptr,((u8*)buf2)+item.ofs,todo);
-			if(temp) return temp;
-			if(size == 0) break;
-		}
-		return 0;
-	}
-
-	//dumps the memspan to the specified buffer
-	//you may set size to limit the size to be copied
-	int dump(void* buf, int size=-1)
-	{
-		if(size==-1) size = this->size;
-		size = std::min(this->size,size);
-		u8* bufptr = (u8*)buf;
-		int done = 0;
-		for(int i=0;i<numItems;i++)
-		{
-			Item item = items[i];
-			int todo = std::min((int)item.len,size);
-			size -= todo;
-			done += todo;
-			memcpy(bufptr,item.ptr,todo);
-			bufptr += todo;
-			if(size==0) return done;
-		}
-		return done;
-	}
-};
-
-//creates a MemSpan in texture memory
-static MemSpan MemSpan_TexMem(u32 ofs, u32 len) 
-{
-	MemSpan ret;
-	ret.size = len;
-	u32 currofs = 0;
-	while(len) {
-		MemSpan::Item &curr = ret.items[ret.numItems++];
-		curr.start = ofs&0x1FFFF;
-		u32 slot = (ofs>>17)&3; //slots will wrap around
-		curr.len = std::min(len,0x20000-curr.start);
-		curr.ofs = currofs;
-		len -= curr.len;
-		ofs += curr.len;
-		currofs += curr.len;
-		u8* ptr = ARM9Mem.textureSlotAddr[slot];
-		
-		if(ptr == ARM9Mem.blank_memory) {
-			PROGINFO("Tried to reference unmapped texture memory: slot %d\n",slot);
-		}
-		curr.ptr = ptr + curr.start;
-	}
-	return ret;
-}
-
-//creates a MemSpan in texture palette memory
-static MemSpan MemSpan_TexPalette(u32 ofs, u32 len) 
-{
-	MemSpan ret;
-	ret.size = len;
-	u32 currofs = 0;
-	while(len) {
-		MemSpan::Item &curr = ret.items[ret.numItems++];
-		curr.start = ofs&0x3FFF;
-		u32 slot = (ofs>>14)&7; //this masks to 8 slots, but there are really only 6
-		if(slot>5) {
-			PROGINFO("Texture palette overruns texture memory. Wrapping at palette slot 0.\n");
-			slot -= 5;
-		}
-		curr.len = std::min(len,0x4000-curr.start);
-		curr.ofs = currofs;
-		len -= curr.len;
-		ofs += curr.len;
-		//if(len != 0) 
-			//here is an actual test case of bank spanning
-		currofs += curr.len;
-		u8* ptr = ARM9Mem.texPalSlot[slot];
-		
-		if(ptr == ARM9Mem.blank_memory) {
-			PROGINFO("Tried to reference unmapped texture palette memory: 16k slot #%d\n",slot);
-		}
-		curr.ptr = ptr + curr.start;
-	}
-	return ret;
-}


 #ifndef CTASSERT
@ -199,14 +84,12 @@ static const unsigned short map3d_cull[4] = {GL_FRONT_AND_BACK, GL_FRONT, GL_BAC
 static const int texEnv[4] = { GL_MODULATE, GL_DECAL, GL_MODULATE, GL_MODULATE };
 static const int depthFunc[2] = { GL_LESS, GL_EQUAL };
 static bool needRefreshFramebuffer = false;
-static unsigned char texMAP[1024*2048*4]; 
-static unsigned int textureMode=TEXMODE_NONE;
+

 float clearAlpha;


-//raw ds format poly attributes, installed from the display list
-static u32 textureFormat=0, texturePalette=0;
+

 //derived values extracted from polyattr etc
 static bool wireframe=false, alpha31=false;
@ -219,6 +102,8 @@ static bool alphaDepthWrite;
 static unsigned int lightMask=0;
 static bool isTranslucent;

+static u32 textureFormat=0, texturePalette=0;
+
 //------------------------------------------------------------

 #define OGLEXT(x,y) x y = 0;
@ -335,37 +220,6 @@ static void _xglDisable(GLenum cap) {
 	_xglDisable(cap); }


-//================================================= Textures
-#define MAX_TEXTURE 500
-#ifdef SSE2
-struct ALIGN(16) TextureCache
-#else
-struct ALIGN(8) TextureCache
-#endif
-{
-	GLenum				id;
-	u32					frm;
-	u32					mode;
-	u32					pal;
-	u32					sizeX;
-	u32					sizeY;
-	float				invSizeX;
-	float				invSizeY;
-	int					textureSize, indexSize;
-	u8					texture[128*1024]; // 128Kb texture slot
-	u8					palette[256*2];
-
-	//set if this texture is suspected be invalid due to a vram reconfigure
-	bool				suspectedInvalid;
-
-};
-
-TextureCache	texcache[MAX_TEXTURE+1];
-u32				texcache_count;
-
-u32				texcache_start;
-u32				texcache_stop;
-//u32				texcache_last;

 GLenum			oglTempTextureID[MAX_TEXTURE];
 GLenum			oglToonTableTextureID;
@ -418,6 +272,7 @@ GLuint shaderProgram;

 static GLuint hasTexLoc;
 static GLuint texBlendLoc;
+static bool hasTexture = false;

 /* Shaders init */

@ -479,24 +334,38 @@ static void OGLReset()
 	int i;

 	//reset the texture cache
-	memset(&texcache,0,sizeof(texcache));
-	texcache_count=0;
+	TexCache_Reset();
 	for (i = 0; i < MAX_TEXTURE; i++)
 		texcache[i].id=oglTempTextureID[i];
-	texcache_start=0;
-	texcache_stop=MAX_TEXTURE<<1;

-	for(i=0;i<MAX_TEXTURE+1;i++)
-		texcache[i].suspectedInvalid = true;
-
-	//clear the framebuffers
 //	memset(GPU_screenStencil,0,sizeof(GPU_screenStencil));
 	memset(GPU_screen3D,0,sizeof(GPU_screen3D));
 	needRefreshFramebuffer = false;
-	memset(texMAP, 0, sizeof(texMAP)); 
-	textureMode=TEXMODE_NONE;
 }

+static void BindTexture(u32 tx) //makes texcache[tx] the current GL_TEXTURE_2D texture and applies its texture matrix, filter and wrap state
+{
+	glBindTexture(GL_TEXTURE_2D,(GLuint)texcache[tx].id);
+	glMatrixMode (GL_TEXTURE); //note: GL_TEXTURE is left as the current matrix mode on return
+	glLoadIdentity ();
+	glScaled (texcache[tx].invSizeX, texcache[tx].invSizeY, 1.0f); //scale texcoords by the cached inverse size (presumably texel units -> 0..1; confirm against texcache)
+
+	glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); //point sampling when minifying
+	glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); //point sampling when magnifying
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (BIT16(texcache[tx].frm) ? (BIT18(texcache[tx].frm)?GL_MIRRORED_REPEAT:GL_REPEAT) : GL_CLAMP)); //S axis: frm bit16 set -> repeat (mirrored if bit18 is also set), otherwise clamp
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (BIT17(texcache[tx].frm) ? (BIT19(texcache[tx].frm)?GL_MIRRORED_REPEAT:GL_REPEAT) : GL_CLAMP)); //T axis: frm bit17 set -> repeat (mirrored if bit19 is also set), otherwise clamp
+}
+
+static void BindTextureData(u32 tx, u8* data) //binds texcache[tx] via BindTexture, then uploads data as its mip level 0 image
+{
+	BindTexture(tx); //also sets the texture matrix, filter and wrap state for this entry
+
+	glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, 
+		texcache[tx].sizeX, texcache[tx].sizeY, 0, 
+		GL_RGBA, GL_UNSIGNED_BYTE, data); //data is read as sizeX*sizeY RGBA pixels, one byte per channel
+}
+
+
 static char OGLInit(void)
 {
 	GLuint loc;
@ -509,6 +378,9 @@ static char OGLInit(void)
 	if(!BEGINGL())
 		return 0;

+	TexCache_BindTexture = BindTexture;
+	TexCache_BindTextureData = BindTextureData;
+
 	glPixelStorei(GL_PACK_ALIGNMENT,8);

 	xglEnable		(GL_NORMALIZE);
@ -628,60 +500,22 @@ static void OGLClose()
 	ENDGL();
 }

-//todo - make all color conversions go through a properly spread table!!
-
-#if defined (DEBUG_DUMP_TEXTURE) && defined (WIN32)
-static void DebugDumpTexture(int which)
-{
-	char fname[100];
-	sprintf(fname,"c:\\dump\\%d.bmp", which);
-
-	glBindTexture(GL_TEXTURE_2D,texcache[which].id);
-	  glGetTexImage( GL_TEXTURE_2D ,
-			      0,
-			    GL_BGRA_EXT,
-			      GL_UNSIGNED_BYTE,
-			      texMAP);
-
-	NDS_WriteBMP_32bppBuffer(texcache[which].sizeX,texcache[which].sizeY,texMAP,fname);
-}
-#else
-#define DebugDumpTexture(which) do { (void)which; } while (0)
-#endif
-
-//================================================================================
-static int lastTexture = -1;
-static bool hasTexture = false;
 static void setTexture(unsigned int format, unsigned int texpal)
 {
-	//for each texformat, number of palette entries
-	const int palSizes[] = {0, 32, 4, 16, 256, 0, 8, 0};
-
-	//for each texformat, multiplier from numtexels to numbytes (fixed point 30.2)
-	const int texSizes[] = {0, 4, 1, 2, 4, 1, 4, 8};
-
-	//used to hold a copy of the palette specified for this texture
-	u16 pal[256];
-
-	u32 *dwdst = (u32*)texMAP;
-	
-	textureMode = (unsigned short)((format>>26)&0x07);
-	unsigned int sizeX=(8 << ((format>>20)&0x07));
-	unsigned int sizeY=(8 << ((format>>23)&0x07));
-	unsigned int imageSize = sizeX*sizeY;
-
-	u8 *adr;
+	textureFormat = format;
+	texturePalette = texpal;

+	u32 textureMode = (unsigned short)((format>>26)&0x07);

 	if (format==0)
 	{
-		texcache_count=-1;
+//		texcache_count=-1;
 		if(hasShaders && hasTexture) { glUniform1i(hasTexLoc, 0); hasTexture = false; }
 		return;
 	}
 	if (textureMode==0)
 	{
-		texcache_count=-1;
+//		texcache_count=-1;
 		if(hasShaders && hasTexture) { glUniform1i(hasTexLoc, 0); hasTexture = false; }
 		return;
 	}
@ -692,400 +526,8 @@ static void setTexture(unsigned int format, unsigned int texpal)
 		glActiveTexture(GL_TEXTURE0);
 	}

-	u32 paletteAddress;

-	switch (textureMode)
-	{
-	case TEXMODE_I2:
-		paletteAddress = texturePalette<<3;
-		break;
-	case TEXMODE_A3I5: //a3i5
-	case TEXMODE_I4: //i4
-	case TEXMODE_I8: //i8
-	case TEXMODE_A5I3: //a5i3
-	case TEXMODE_16BPP: //16bpp
-	case TEXMODE_4X4: //4x4
-	default:
-		paletteAddress = texturePalette<<4;
-		break;
-	}
-
-	//analyze the texture memory mapping and the specifications of this texture
-	int palSize = palSizes[textureMode];
-	int texSize = (imageSize*texSizes[textureMode])>>2; //shifted because the texSizes multiplier is fixed point
-	MemSpan ms = MemSpan_TexMem((format&0xFFFF)<<3,texSize);
-	MemSpan mspal = MemSpan_TexPalette(paletteAddress,palSize*2);
-
-	//determine the location for 4x4 index data
-	u32 indexBase;
-	if((format & 0xc000) == 0x8000) indexBase = 0x30000;
-	else indexBase = 0x20000;
-
-	u32 indexOffset = (format&0x3FFF)<<2;
-
-	int indexSize = 0;
-	MemSpan msIndex;
-	if(textureMode == TEXMODE_4X4)
-	{
-		indexSize = imageSize>>3;
-		msIndex = MemSpan_TexMem(indexOffset+indexBase,indexSize);
-	}
-
-
-	//dump the palette to a temp buffer, so that we don't have to worry about memory mapping.
-	//this isnt such a problem with texture memory, because we read sequentially from it.
-	//however, we read randomly from palette memory, so the mapping is more costly.
-	mspal.dump(pal);
-
-
-	u32 tx=texcache_start;
-
-	//if(false)
-	while (TRUE)
-	{
-		//conditions where we give up and regenerate the texture:
-		if (texcache_stop == tx) break;
-		if (texcache[tx].frm == 0) break;
-
-		//conditions where we reject matches:
-		//when the teximage or texpal params dont match 
-		//(this is our key for identifying palettes in the cache)
-		if (texcache[tx].frm != format) goto REJECT;
-		if (texcache[tx].pal != texpal) goto REJECT;
-
-		//the texture matches params, but isnt suspected invalid. accept it.
-		if (!texcache[tx].suspectedInvalid) goto ACCEPT;
-
-		//if we couldnt cache this entire texture due to it being too large, then reject it
-		if (texSize+indexSize > (int)sizeof(texcache[tx].texture)) goto REJECT;
-
-		//when the palettes dont match:
-		//note that we are considering 4x4 textures to have a palette size of 0.
-		//they really have a potentially HUGE palette, too big for us to handle like a normal palette,
-		//so they go through a different system
-		if (mspal.size != 0 && memcmp(texcache[tx].palette,pal,mspal.size)) goto REJECT;
-
-		//when the texture data doesn't match
-		if(ms.memcmp(texcache[tx].texture,sizeof(texcache[tx].texture))) goto REJECT;
-
-		//if the texture is 4x4 then the index data must match
-		if(textureMode == TEXMODE_4X4)
-		{
-			if(msIndex.memcmp(texcache[tx].texture + texcache[tx].textureSize,texcache[tx].indexSize)) goto REJECT; 
-		}
-
-
-ACCEPT:
-		texcache[tx].suspectedInvalid = false;
-		texcache_count = tx;
-		if(lastTexture == -1 || (int)tx != lastTexture)
-		{
-			lastTexture = tx;
-			glBindTexture(GL_TEXTURE_2D,texcache[tx].id);
-			glMatrixMode (GL_TEXTURE);
-			glLoadIdentity ();
-			glScaled (texcache[tx].invSizeX, texcache[tx].invSizeY, 1.0f);
-		}
-		return;
- 
-REJECT:
-		tx++;
-		if ( tx > MAX_TEXTURE )
-		{
-			texcache_stop=texcache_start;
-			texcache[texcache_stop].frm=0;
-			texcache_start++;
-			if (texcache_start>MAX_TEXTURE) 
-			{
-				texcache_start=0;
-				texcache_stop=MAX_TEXTURE<<1;
-			}
-			tx=0;
-		}
-	}
-
-	lastTexture = tx;
-	glBindTexture(GL_TEXTURE_2D, texcache[tx].id);
-
-	texcache[tx].suspectedInvalid = false;
-	texcache[tx].frm=format;
-	texcache[tx].mode=textureMode;
-	texcache[tx].pal=texpal;
-	texcache[tx].sizeX=sizeX;
-	texcache[tx].sizeY=sizeY;
-	texcache[tx].invSizeX=1.0f/((float)(sizeX));
-	texcache[tx].invSizeY=1.0f/((float)(sizeY));
-	texcache[tx].textureSize = ms.dump(texcache[tx].texture,sizeof(texcache[tx].texture));
-
-	//dump palette data for cache keying
-	if ( palSize )
-	{
-		memcpy(texcache[tx].palette, pal, palSize*2);
-	}
-	//dump 4x4 index data for cache keying
-	texcache[tx].indexSize = 0;
-	if(textureMode == TEXMODE_4X4)
-	{
-		texcache[tx].indexSize = std::min(msIndex.size,(int)sizeof(texcache[tx].texture) - texcache[tx].textureSize);
-		msIndex.dump(texcache[tx].texture+texcache[tx].textureSize,texcache[tx].indexSize);
-	}
-
-
-	glMatrixMode (GL_TEXTURE);
-	glLoadIdentity ();
-	glScaled (texcache[tx].invSizeX, texcache[tx].invSizeY, 1.0f);
-
-
-	//INFO("Texture %03i - format=%08X; pal=%04X (mode %X, width %04i, height %04i)\n",i, texcache[i].frm, texcache[i].pal, texcache[i].mode, sizeX, sizeY);
-
-	//============================================================================ Texture conversion
-	u32 palZeroTransparent = (1-((format>>29)&1))*255;						// shash: CONVERT THIS TO A TABLE :)
-
-	switch (texcache[tx].mode)
-	{
-	case TEXMODE_A3I5:
-		{
-			for(int j=0;j<ms.numItems;j++) {
-				adr = ms.items[j].ptr;
-				for(u32 x = 0; x < ms.items[j].len; x++)
-				{
-					u16 c = pal[*adr&31];
-					u8 alpha = *adr>>5;
-					*dwdst++ = RGB15TO32(c,material_3bit_to_8bit[alpha]);
-					adr++;
-				}
-			}
-
-			break;
-		}
-	case TEXMODE_I2:
-		{
-			for(int j=0;j<ms.numItems;j++) {
-				adr = ms.items[j].ptr;
-				for(u32 x = 0; x < ms.items[j].len; x++)
-				{
-					u8 bits;
-					u16 c;
-
-					bits = (*adr)&0x3;
-					c = pal[bits];
-					*dwdst++ = RGB15TO32(c,(bits == 0) ? palZeroTransparent : 255);
-
-					bits = ((*adr)>>2)&0x3;
-					c = pal[bits];
-					*dwdst++ = RGB15TO32(c,(bits == 0) ? palZeroTransparent : 255);
-
-					bits = ((*adr)>>4)&0x3;
-					c = pal[bits];
-					*dwdst++ = RGB15TO32(c,(bits == 0) ? palZeroTransparent : 255);
-
-					bits = ((*adr)>>6)&0x3;
-					c = pal[bits];
-					*dwdst++ = RGB15TO32(c,(bits == 0) ? palZeroTransparent : 255);
-
-					adr++;
-				}
-			}
-			break;
-		}
-	case TEXMODE_I4:
-		{
-			for(int j=0;j<ms.numItems;j++) {
-				adr = ms.items[j].ptr;
-				for(u32 x = 0; x < ms.items[j].len; x++)
-				{
-					u8 bits;
-					u16 c;
-
-					bits = (*adr)&0xF;
-					c = pal[bits];
-					*dwdst++ = RGB15TO32(c,(bits == 0) ? palZeroTransparent : 255);
-
-					bits = ((*adr)>>4);
-					c = pal[bits];
-					*dwdst++ = RGB15TO32(c,(bits == 0) ? palZeroTransparent : 255);
-
-					adr++;
-				}
-			}
-			break;
-		}
-	case TEXMODE_I8:
-		{
-			for(int j=0;j<ms.numItems;j++) {
-				adr = ms.items[j].ptr;
-				for(u32 x = 0; x < ms.items[j].len; ++x)
-				{
-					u16 c = pal[*adr];
-					*dwdst++ = RGB15TO32(c,(*adr == 0) ? palZeroTransparent : 255);
-					adr++;
-				}
-			}
-		}
-		break;
-	case TEXMODE_4X4:
-		{
-			//RGB16TO32 is used here because the other conversion macros result in broken interpolation logic
-
-			if(ms.numItems != 1) {
-				PROGINFO("Your 4x4 texture has overrun its texture slot.\n");
-			}
-			//this check isnt necessary since the addressing is tied to the texture data which will also run out:
-			//if(msIndex.numItems != 1) PROGINFO("Your 4x4 texture index has overrun its slot.\n");
-
-#define PAL4X4(offset) ( *(u16*)( ARM9Mem.texPalSlot[((paletteAddress + (offset)*2)>>14)] + ((paletteAddress + (offset)*2)&0x3FFF) ) )
-
-			u16* slot1;
-			u32* map = (u32*)ms.items[0].ptr;
-			u32 limit = ms.items[0].len<<2;
-			u32 d = 0;
-			if ( (texcache[tx].frm & 0xc000) == 0x8000)
-				// texel are in slot 2
-				slot1=(u16*)&ARM9Mem.textureSlotAddr[1][((texcache[tx].frm & 0x3FFF)<<2)+0x010000];
-			else 
-				slot1=(u16*)&ARM9Mem.textureSlotAddr[1][(texcache[tx].frm & 0x3FFF)<<2];
-
-			u16 yTmpSize = (texcache[tx].sizeY>>2);
-			u16 xTmpSize = (texcache[tx].sizeX>>2);
-
-			//this is flagged whenever a 4x4 overruns its slot.
-			//i am guessing we just generate black in that case
-			bool dead = false;
-
-			for (int y = 0; y < yTmpSize; y ++)
-			{
-				u32 tmpPos[4]={(y<<2)*texcache[tx].sizeX,((y<<2)+1)*texcache[tx].sizeX,
-					((y<<2)+2)*texcache[tx].sizeX,((y<<2)+3)*texcache[tx].sizeX};
-				for (int x = 0; x < xTmpSize; x ++, d++)
-				{
-					if(d >= limit)
-						dead = true;
-
-					if(dead) {
-						for (int sy = 0; sy < 4; sy++)
-						{
-							u32 currentPos = (x<<2) + tmpPos[sy];
-							dwdst[currentPos] = dwdst[currentPos+1] = dwdst[currentPos+2] = dwdst[currentPos+3] = 0;
-						}
-						continue;
-					}
-
-					u32 currBlock	= map[d];
-					u16 pal1		= slot1[d];
-					u16 pal1offset	= (pal1 & 0x3FFF)<<1;
-					u8  mode		= pal1>>14;
-					u32 tmp_col[4];
-
-					tmp_col[0]=RGB16TO32(PAL4X4(pal1offset),255);
-					tmp_col[1]=RGB16TO32(PAL4X4(pal1offset+1),255);
-
-					switch (mode) 
-					{
-					case 0:
-						tmp_col[2]=RGB16TO32(PAL4X4(pal1offset+2),255);
-						tmp_col[3]=RGB16TO32(0x7FFF,0);
-						break;
-					case 1:
-						tmp_col[2]=(((tmp_col[0]&0xFF)+(tmp_col[1]&0xff))>>1)|
-							(((tmp_col[0]&(0xFF<<8))+(tmp_col[1]&(0xFF<<8)))>>1)|
-							(((tmp_col[0]&(0xFF<<16))+(tmp_col[1]&(0xFF<<16)))>>1)|
-							(0xff<<24);
-						tmp_col[3]=RGB16TO32(0x7FFF,0);
-						break;
-					case 2:
-						tmp_col[2]=RGB16TO32(PAL4X4(pal1offset+2),255);
-						tmp_col[3]=RGB16TO32(PAL4X4(pal1offset+3),255);
-						break;
-					case 3: 
-						{
-							u32 red1, red2;
-							u32 green1, green2;
-							u32 blue1, blue2;
-							u16 tmp1, tmp2;
-
-							red1=tmp_col[0]&0xff;
-							green1=(tmp_col[0]>>8)&0xff;
-							blue1=(tmp_col[0]>>16)&0xff;
-							red2=tmp_col[1]&0xff;
-							green2=(tmp_col[1]>>8)&0xff;
-							blue2=(tmp_col[1]>>16)&0xff;
-
-							tmp1=((red1*5+red2*3)>>6)|
-								(((green1*5+green2*3)>>6)<<5)|
-								(((blue1*5+blue2*3)>>6)<<10);
-							tmp2=((red2*5+red1*3)>>6)|
-								(((green2*5+green1*3)>>6)<<5)|
-								(((blue2*5+blue1*3)>>6)<<10);
-
-							tmp_col[2]=RGB16TO32(tmp1,255);
-							tmp_col[3]=RGB16TO32(tmp2,255);
-							break;
-						}
-					}
-
-					//set all 16 texels
-					for (int sy = 0; sy < 4; sy++)
-					{
-						// Texture offset
-						u32 currentPos = (x<<2) + tmpPos[sy];
-						u8 currRow = (u8)((currBlock>>(sy<<3))&0xFF);
-
-						dwdst[currentPos] = tmp_col[currRow&3];
-						dwdst[currentPos+1] = tmp_col[(currRow>>2)&3];
-						dwdst[currentPos+2] = tmp_col[(currRow>>4)&3];
-						dwdst[currentPos+3] = tmp_col[(currRow>>6)&3];
-					}
-
-
-				}
-			}
-
-
-			break;
-		}
-	case TEXMODE_A5I3:
-		{
-			for(int j=0;j<ms.numItems;j++) {
-				adr = ms.items[j].ptr;
-				for(u32 x = 0; x < ms.items[j].len; ++x)
-				{
-					u16 c = pal[*adr&0x07];
-					u8 alpha = (*adr>>3);
-					*dwdst++ = RGB15TO32(c,material_5bit_to_8bit[alpha]);
-					adr++;
-				}
-			}
-			break;
-		}
-	case TEXMODE_16BPP:
-		{
-			for(int j=0;j<ms.numItems;j++) {
-				u16* map = (u16*)ms.items[j].ptr;
-				for(u32 x = 0; x < ms.items[j].len; ++x)
-				{
-					u16 c = map[x];
-					int alpha = ((c&0x8000)?255:0);
-					*dwdst++ = RGB15TO32(c&0x7FFF,alpha);
-				}
-			}
-			break;
-		}
-	}
-
-	glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, 
-		texcache[tx].sizeX, texcache[tx].sizeY, 0, 
-		GL_RGBA, GL_UNSIGNED_BYTE, texMAP);
-
-	DebugDumpTexture(tx);
-
-	//============================================================================================
-
-	texcache_count=tx;
-
-	glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-	glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (BIT16(texcache[tx].frm) ? (BIT18(texcache[tx].frm)?GL_MIRRORED_REPEAT:GL_REPEAT) : GL_CLAMP));
-	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (BIT17(texcache[tx].frm) ? (BIT19(texcache[tx].frm)?GL_MIRRORED_REPEAT:GL_REPEAT) : GL_CLAMP));
+	TexCache_SetTexture(format, texpal);
 }


--- a/desmume/src/gfx3d.h
+++ b/desmume/src/gfx3d.h
@ -27,6 +27,22 @@
 #include "types.h"
 #include <iosfwd>

+//produce a 32bpp color from a DS RGB16: each 5bit channel lands in the top 5 bits of its byte (low 3 bits stay 0), alpha goes in bits 24-31
+#define RGB16TO32(col,alpha) (((alpha)<<24) | ((((col) & 0x7C00)>>7)<<16) | ((((col) & 0x3E0)>>2)<<8) | (((col) & 0x1F)<<3))
+
+//produce a 32bpp color from a ds RGB15 plus an 8bit alpha, using a table. NOTE(review): col is not parenthesized in the expansion, so pass a simple expression
+#define RGB15TO32(col,alpha8) ( ((alpha8)<<24) | color_15bit_to_24bit[col&0x7FFF] )
+
+//produce a 24bpp color from a ds RGB15, using a table. NOTE(review): col is not parenthesized in the expansion, so pass a simple expression
+#define RGB15TO24_REVERSE(col) ( color_15bit_to_24bit_reverse[col&0x7FFF] )
+
+//produce a 32bpp color from a ds RGB15 plus an 8bit alpha, not using the 15bit table (each 5bit channel goes through material_5bit_to_8bit instead)
+#define RGB15TO32_DIRECT(col,alpha8) ( ((alpha8)<<24) | (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] )
+
+//produce a 15bpp color from individual 5bit components: r in bits 0-4, g in bits 5-9, b in bits 10-14 (components are not masked here)
+#define R5G5B5TORGB15(r,g,b) ((r)|((g)<<5)|((b)<<10))
+
+
 #define TEXMODE_NONE 0
 #define TEXMODE_A3I5 1
 #define TEXMODE_I2 2
@ -141,18 +157,6 @@ extern GFX3D gfx3d;

 //---------------------

-//produce a 32bpp color from a DS RGB16
-#define RGB16TO32(col,alpha) (((alpha)<<24) | ((((col) & 0x7C00)>>7)<<16) | ((((col) & 0x3E0)>>2)<<8) | (((col) & 0x1F)<<3))
-
-//produce a 32bpp color from a ds RGB15 plus an 8bit alpha, using a table
-#define RGB15TO32(col,alpha8) ( ((alpha8)<<24) | color_15bit_to_24bit[col&0x7FFF] )
-
-//produce a 24bpp color from a ds RGB15, using a table
-#define RGB15TO24_REVERSE(col) ( color_15bit_to_24bit_reverse[col&0x7FFF] )
-
-//produce a 32bpp color from a ds RGB15 plus an 8bit alpha, not using a table (but using other tables)
-#define RGB15TO32_DIRECT(col,alpha8) ( ((alpha8)<<24) | (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] )
-
 extern CACHE_ALIGN u32 color_15bit_to_24bit[32768];
 extern CACHE_ALIGN u32 color_15bit_to_24bit_reverse[32768];
 extern CACHE_ALIGN u8 mixTable555[32][32][32];
--- a/desmume/src/rasterize.cpp
+++ b/desmume/src/rasterize.cpp
@ -22,10 +22,13 @@
 */

 #include "Rasterize.h"
+
+#include <algorithm>
+
 #include "common.h"
 #include "render3D.h"
 #include "gfx3d.h"
-#include <algorithm>
+#include "texcache.h"

 using std::min;
 using std::max;
@ -33,9 +36,13 @@ using std::max;
 template<typename T> T min(T a, T b, T c) { return min(min(a,b),c); }
 template<typename T> T max(T a, T b, T c) { return max(max(a,b),c); }

-
 static u16 screen[256*192];

+static struct //dimensions of the texture the rasterizer is currently sampling
+{
+	int width, height; //NOTE(review): the visible code only assigns width (from TexCache_Curr()->sizeX); height appears never assigned
+} Texture;
+
 void set_pixel(int x, int y, u16 color)
 {
 	if(x<0 || y<0 || x>=256 || y>=192) return;
@ -50,31 +57,74 @@ void hline(int x, int y, int xe, u16 color)

 //http://www.devmaster.net/forums/showthread.php?t=1884

+#if defined(_MSC_VER)
+inline int iround(float x) //float -> int conversion done on the x87 FPU through MSVC inline asm
+{
+  int t;
+
+  __asm
+  {
+    fld  x
+    fistp t
+  }
+
+  return t; //fistp converts using the FPU's current rounding mode
+}
+#else
 int iround(float f) {
 	return (int)f; //lol - plain cast, truncates toward zero. NOTE(review): results can differ from the MSVC path above
 }
+#endif

-void triangle_from_devmaster(int x1, int y1, int x2, int y2, int x3, int y3, u16 color)
+struct Interpolator //linearly interpolates one scalar attribute (z) over a triangle, as a plane through its three (x,y,z) points
 {
-	int desty = 0;
-	struct Vertex {
-		int x,y;
-	} v1, v2, v3;
-	v1.x = x1;
-	v1.y = y1;
-	v2.x = x2;
-	v2.y = y2;
-	v3.x = x3;
-	v3.y = y3;
+	int A,B,C; //components of the cross product of the edges (p2-p1) x (p3-p1), i.e. the plane normal
+	float dx, dy; //change in the interpolated value per unit step in x and in y
+	float Z, pZ; //current interpolated value, and the copy saved by push()

-    // 28.4 fixed-point coordinates
-    const int Y1 = iround(16.0f * v1.y);
-    const int Y2 = iround(16.0f * v2.y);
-    const int Y3 = iround(16.0f * v3.y);
+	struct {
+		int x,y,z;
+	} point0; //reference point (the first vertex) that init() evaluates the plane from
+	
+	Interpolator(int x1, int x2, int x3, int y1, int y2, int y3, int z1, int z2, int z3) //NOTE(review): parameters are int; triangle_from_devmaster passes float coordinates, which get truncated here
+	{
+		A = (z3 - z1) * (y2 - y1) - (z2 - z1) * (y3 - y1);
+		B = (x3 - x1) * (z2 - z1) - (x2 - x1) * (z3 - z1);
+		C = (x2 - x1) * (y3 - y1) - (x3 - x1) * (y2 - y1);
+		dx = -(float)A / C; //from the plane equation A*(x-x1) + B*(y-y1) + C*(z-z1) = 0. NOTE(review): C is 0 when the three (x,y) points are collinear; there is no guard
+		dy = -(float)B / C;
+		point0.x = x1;
+		point0.y = y1;
+		point0.z = z1;
+	}

-    const int X1 = iround(16.0f * v1.x);
-    const int X2 = iround(16.0f * v2.x);
-    const int X3 = iround(16.0f * v3.x);
+	void init(int x, int y) //sets the current value to the plane's value at (x,y)
+	{
+		Z = point0.z + dx * (x-point0.x) + dy * (y-point0.y);
+	}
+
+	FORCEINLINE int cur() { return iround(Z); } //current value converted to int through iround
+	
+	FORCEINLINE void push() { pZ = Z; } //save the current value
+	FORCEINLINE void pop() { Z = pZ; } //restore the value saved by push()
+	FORCEINLINE void incy() { Z += dy; } //advance one step in y
+	FORCEINLINE void incx() { Z += dx; } //advance one step in x
+};
+
+//http://www.devmaster.net/forums/showthread.php?t=1884&page=1
+//todo - change to the tile-based renderer and try to apply some optimizations from that thread
+void triangle_from_devmaster(VERT** verts)
+{
+	u16 color =0x7FFF;
+
+	// 28.4 fixed-point coordinates
+    const int Y1 = iround(16.0f * verts[0]->coord[1]);
+    const int Y2 = iround(16.0f * verts[1]->coord[1]);
+    const int Y3 = iround(16.0f * verts[2]->coord[1]);
+
+    const int X1 = iround(16.0f * verts[0]->coord[0]);
+    const int X2 = iround(16.0f * verts[1]->coord[0]);
+    const int X3 = iround(16.0f * verts[2]->coord[0]);

    // Deltas
    const int DX12 = X1 - X2;
@ -100,15 +150,7 @@ void triangle_from_devmaster(int x1, int y1, int x2, int y2, int x3, int y3, u16
    int miny = (min(Y1, Y2, Y3) + 0xF) >> 4;
    int maxy = (max(Y1, Y2, Y3) + 0xF) >> 4;

-    // Block size, standard 8x8 (must be power of two)
-    const int q = 8;
-
-    // Start in corner of 8x8 block
-    minx &= ~(q - 1);
-    miny &= ~(q - 1);
-
-    //(char*&)colorBuffer += miny * stride;
-	desty = miny;
+	int desty = miny;

    // Half-edge constants
    int C1 = DY12 * X1 - DX12 * Y1;
@ -120,95 +162,87 @@ void triangle_from_devmaster(int x1, int y1, int x2, int y2, int x3, int y3, u16
    if(DY23 < 0 || (DY23 == 0 && DX23 > 0)) C2++;
    if(DY31 < 0 || (DY31 == 0 && DX31 > 0)) C3++;

-    // Loop through blocks
-    for(int y = miny; y < maxy; y += q)
+    int CY1 = C1 + DX12 * (miny << 4) - DY12 * (minx << 4);
+    int CY2 = C2 + DX23 * (miny << 4) - DY23 * (minx << 4);
+    int CY3 = C3 + DX31 * (miny << 4) - DY31 * (minx << 4);
+
+	float fx1 = verts[0]->coord[0], fy1 = verts[0]->coord[1];
+	float fx2 = verts[1]->coord[0], fy2 = verts[1]->coord[1];
+	float fx3 = verts[2]->coord[0], fy3 = verts[2]->coord[1];
+	u8 r1 = verts[0]->color[0], g1 = verts[0]->color[1], b1 = verts[0]->color[2];
+	u8 r2 = verts[1]->color[0], g2 = verts[1]->color[1], b2 = verts[1]->color[2];
+	u8 r3 = verts[2]->color[0], g3 = verts[2]->color[1], b3 = verts[2]->color[2];
+	int u1 = verts[0]->texcoord[0], v1 = verts[0]->texcoord[1];
+	int u2 = verts[1]->texcoord[0], v2 = verts[1]->texcoord[1];
+	int u3 = verts[2]->texcoord[0], v3 = verts[2]->texcoord[1];
+
+	Interpolator i_color_r(fx1,fx2,fx3,fy1,fy2,fy3,r1,r2,r3);
+	Interpolator i_color_g(fx1,fx2,fx3,fy1,fy2,fy3,g1,g2,g3);
+	Interpolator i_color_b(fx1,fx2,fx3,fy1,fy2,fy3,b1,b2,b3);
+	Interpolator i_tex_u(fx1,fx2,fx3,fy1,fy2,fy3,u1,u2,u3);
+	Interpolator i_tex_v(fx1,fx2,fx3,fy1,fy2,fy3,v1,v2,v3);
+	
+
+	i_color_r.init(minx,miny);
+	i_color_g.init(minx,miny);
+	i_color_b.init(minx,miny);
+	i_tex_u.init(minx,miny);
+	i_tex_v.init(minx,miny);
+
+    for(int y = miny; y < maxy; y++)
    {
-        for(int x = minx; x < maxx; x += q)
+        int CX1 = CY1;
+        int CX2 = CY2;
+        int CX3 = CY3;
+
+		bool done = false;
+		i_color_r.push(); i_color_g.push(); i_color_b.push(); 
+		i_tex_u.push(); i_tex_v.push();
+        for(int x = minx; x < maxx; x++)
        {
-            // Corners of block
-            int x0 = x << 4;
-            int x1 = (x + q - 1) << 4;
-            int y0 = y << 4;
-            int y1 = (y + q - 1) << 4;
-
-            // Evaluate half-space functions
-            bool a00 = C1 + DX12 * y0 - DY12 * x0 > 0;
-            bool a10 = C1 + DX12 * y0 - DY12 * x1 > 0;
-            bool a01 = C1 + DX12 * y1 - DY12 * x0 > 0;
-            bool a11 = C1 + DX12 * y1 - DY12 * x1 > 0;
-            int a = (a00 << 0) | (a10 << 1) | (a01 << 2) | (a11 << 3);
-    
-            bool b00 = C2 + DX23 * y0 - DY23 * x0 > 0;
-            bool b10 = C2 + DX23 * y0 - DY23 * x1 > 0;
-            bool b01 = C2 + DX23 * y1 - DY23 * x0 > 0;
-            bool b11 = C2 + DX23 * y1 - DY23 * x1 > 0;
-            int b = (b00 << 0) | (b10 << 1) | (b01 << 2) | (b11 << 3);
-    
-            bool c00 = C3 + DX31 * y0 - DY31 * x0 > 0;
-            bool c10 = C3 + DX31 * y0 - DY31 * x1 > 0;
-            bool c01 = C3 + DX31 * y1 - DY31 * x0 > 0;
-            bool c11 = C3 + DX31 * y1 - DY31 * x1 > 0;
-            int c = (c00 << 0) | (c10 << 1) | (c01 << 2) | (c11 << 3);
-
-            // Skip block when outside an edge
-            if(a == 0x0 || b == 0x0 || c == 0x0) continue;
-
-            //unsigned int *buffer = colorBuffer;
-			int _desty = desty;
-
-            // Accept whole block when totally covered
-            if(a == 0xF && b == 0xF && c == 0xF)
+            if(CX1 > 0 && CX2 > 0 && CX3 > 0)
            {
-                for(int iy = 0; iy < q; iy++)
-                {
-                    for(int ix = x; ix < x + q; ix++)
-                    {
-                        //buffer[ix] = 0x00007F00;<< // Green
-						set_pixel(ix,_desty,color);
-                    }
+				//material color
+				//color = R5G5B5TORGB15(i_color_r.cur(),i_color_g.cur(),i_color_b.cur());
+				
+				//texture
+				int u = i_tex_u.cur();
+				int v = i_tex_v.cur();
+				if(u<0) u = 0;
+				if(v<0) v = 0;
+				u32 color32 = ((u32*)TexCache_texMAP)[v*Texture.width+u];
+				color32>>=3;
+				color32 &= 0x1F1F1F1F;
+				u8* color8 = (u8*)&color32;
+				color = (color8[0] | (color8[1] << 5) | (color8[2] << 10));
+	
+				//hack: for testing, dont render non-opaque textures
+				if(color8[3] < 0x1F) return;
+				
+				set_pixel(x,desty,color);

-                    //(char*&)buffer += stride;
-					_desty++;
+				done = true;
+            } else if(done) break;

-                }
-            }
-            else // Partially covered block
-            {
-                int CY1 = C1 + DX12 * y0 - DY12 * x0;
-                int CY2 = C2 + DX23 * y0 - DY23 * x0;
-                int CY3 = C3 + DX31 * y0 - DY31 * x0;
+			i_color_r.incx(); i_color_g.incx(); i_color_b.incx();
+			i_tex_u.incx(); i_tex_v.incx();

-                for(int iy = y; iy < y + q; iy++)
-                {
-                    int CX1 = CY1;
-                    int CX2 = CY2;
-                    int CX3 = CY3;
-
-                    for(int ix = x; ix < x + q; ix++)
-                    {
-                        if(CX1 > 0 && CX2 > 0 && CX3 > 0)
-                        {
-                            //buffer[ix] = 0x0000007F;<< // Blue
-							set_pixel(ix,_desty,color);
-                        }
-
-                        CX1 -= FDY12;
-                        CX2 -= FDY23;
-                        CX3 -= FDY31;
-                    }
-
-                    CY1 += FDX12;
-                    CY2 += FDX23;
-                    CY3 += FDX31;
-
-                    //(char*&)buffer += stride;
-					_desty ++;
-                }
-            }
+            CX1 -= FDY12;
+            CX2 -= FDY23;
+            CX3 -= FDY31;
        }
+		i_color_r.pop(); i_color_r.incy();
+		i_color_g.pop(); i_color_g.incy();
+		i_color_b.pop(); i_color_b.incy();
+		i_tex_u.pop(); i_tex_u.incy();
+		i_tex_v.pop(); i_tex_v.incy();

-        //(char*&)colorBuffer += q * stride;
-		desty += q;
+
+        CY1 += FDX12;
+        CY2 += FDX23;
+        CY3 += FDX31;
+
+		desty++;
    }
 }

@ -245,20 +279,9 @@ static void Render()
 	{
 		VERT &vert = gfx3d.vertlist->list[i];

-		////perspective division
-		//vert.coord[0] = (vert.coord[0] + vert.coord[3]) / 2 / vert.coord[3];
-		//vert.coord[1] = (vert.coord[1] + vert.coord[3]) / 2 / vert.coord[3];
-		//vert.coord[2] = (vert.coord[2] + vert.coord[3]) / 2 / vert.coord[3];
-		//vert.coord[3] = 1;
-
-		////transform to viewport. this is badly broken
-		//vert.coord[0] = (vert.coord[0])*128;
-		//vert.coord[1] = (vert.coord[1])*96;
-
+		//perspective division and viewport transform
 		vert.coord[0] = (vert.coord[0]+vert.coord[3])*256 / (2*vert.coord[3]) + 0;
 		vert.coord[1] = (vert.coord[1]+vert.coord[3])*192 / (2*vert.coord[3]) + 0;
-
-		int zzz=9;
 	}


@ -268,21 +291,40 @@ static void Render()
 		POLY *poly = &gfx3d.polylist->list[gfx3d.indexlist[i]];
 		int type = poly->type;

+		TexCache_SetTexture(poly->texParam,poly->texPalette);
+		if(TexCache_Curr())
+			Texture.width = TexCache_Curr()->sizeX;

-		if(type == 3) {
-			VERT* vert[3] = {
+		//note that when we build our triangle vert lists, we reorder them for our renderer.
+		//we should probably fix the renderer so we dont have to do this;
+		//but then again, what does it matter?
+		if(type == 4) {
+
+			VERT* vertA[3] = {
 				&gfx3d.vertlist->list[poly->vertIndexes[0]],
+				&gfx3d.vertlist->list[poly->vertIndexes[2]],
 				&gfx3d.vertlist->list[poly->vertIndexes[1]],
+			};
+
+			triangle_from_devmaster(vertA);
+
+			VERT* vertB[3] = {
+				&gfx3d.vertlist->list[poly->vertIndexes[0]],
+				&gfx3d.vertlist->list[poly->vertIndexes[3]],
 				&gfx3d.vertlist->list[poly->vertIndexes[2]],
 			};
-			u16 color = vert[0]->color[0] | (vert[0]->color[1]<<5) | (vert[0]->color[2]<<10);

+			triangle_from_devmaster(vertB);

-			triangle_from_devmaster(
-				vert[0]->coord[0],vert[0]->coord[1],
-				vert[1]->coord[0],vert[1]->coord[1],
-				vert[2]->coord[0],vert[2]->coord[1],
-				color);
+		}
+		if(type == 3) {
+			VERT* vert[3] = {
+				&gfx3d.vertlist->list[poly->vertIndexes[2]],
+				&gfx3d.vertlist->list[poly->vertIndexes[1]],
+				&gfx3d.vertlist->list[poly->vertIndexes[0]],
+			};
+
+			triangle_from_devmaster(vert);
 		}

 	}
--- a/desmume/src/texcache.cpp
+++ b/desmume/src/texcache.cpp
@ -0,0 +1,594 @@
+#include "texcache.h"
+
+#include <algorithm>
+
+#include "bits.h"
+#include "common.h"
+#include "debug.h"
+#include "gfx3d.h"
+#include "NDSSystem.h"
+
+using std::min;
+using std::max;
+
+//This class represents a number of regions of memory which should be viewed as contiguous
+//Each Item describes one physically contiguous chunk: ptr/len locate the chunk itself,
+//ofs is where the chunk sits inside the logical span. size is the span's total length in bytes.
+class MemSpan
+{
+public:
+	static const int MAXSIZE = 8;
+
+	//start out empty. size is zeroed as well so that a default-constructed span
+	//can be compared or dumped without reading an indeterminate value
+	MemSpan()
+		: numItems(0)
+		, size(0)
+	{}
+
+	int numItems;
+
+	struct Item {
+		u32 start;
+		u32 len;
+		u8* ptr;
+		u32 ofs; //offset within the memspan
+	} items[MAXSIZE];
+
+	int size;
+
+	//this MemSpan shall be considered the first argument to a standard memcmp
+	//the length shall be as specified in this MemSpan, unless you specify otherwise
+	//(size==-1 means "use the whole span"; the length is always clamped to the span's size)
+	int memcmp(void* buf2, int size=-1)
+	{
+		if(size==-1) size = this->size;
+		size = min(this->size,size);
+		//nothing to compare. this also keeps a negative length from reaching ::memcmp as a huge size_t
+		if(size <= 0) return 0;
+		for(int i=0;i<numItems;i++)
+		{
+			const Item &item = items[i];
+			int todo = min((int)item.len,size);
+			size -= todo;
+			int temp = ::memcmp(item.ptr,((u8*)buf2)+item.ofs,todo);
+			if(temp) return temp;
+			if(size == 0) break;
+		}
+		return 0;
+	}
+
+	//dumps the memspan to the specified buffer
+	//you may set size to limit the size to be copied
+	//returns the number of bytes actually copied
+	int dump(void* buf, int size=-1)
+	{
+		if(size==-1) size = this->size;
+		size = min(this->size,size);
+		//nothing to copy. this also keeps a negative length from reaching memcpy as a huge size_t
+		if(size <= 0) return 0;
+		u8* bufptr = (u8*)buf;
+		int done = 0;
+		for(int i=0;i<numItems;i++)
+		{
+			const Item &item = items[i]; //by reference: no need to copy the descriptor
+			int todo = min((int)item.len,size);
+			size -= todo;
+			done += todo;
+			memcpy(bufptr,item.ptr,todo);
+			bufptr += todo;
+			if(size==0) return done;
+		}
+		return done;
+	}
+};
+
+//creates a MemSpan in texture memory
+//ofs is a byte offset into texture memory and len is a byte count.
+//the range is cut at 128KB (0x20000) slot boundaries and each piece is resolved through ARM9Mem.textureSlotAddr.
+//at most MemSpan::MAXSIZE pieces are produced; the returned span's size is the number of bytes actually covered.
+static MemSpan MemSpan_TexMem(u32 ofs, u32 len) 
+{
+	MemSpan ret;
+	u32 currofs = 0;
+	while(len) {
+		//never write past the fixed-size items array.
+		//(a 1024x1024 16bpp texture is 2MB, which alone would need 16 pieces)
+		if(ret.numItems == MemSpan::MAXSIZE) {
+			PROGINFO("Texture spans too many texture slots. Truncating.\n");
+			break;
+		}
+		MemSpan::Item &curr = ret.items[ret.numItems++];
+		curr.start = ofs&0x1FFFF;
+		u32 slot = (ofs>>17)&3; //slots will wrap around
+		curr.len = min(len,0x20000-curr.start);
+		curr.ofs = currofs;
+		len -= curr.len;
+		ofs += curr.len;
+		currofs += curr.len;
+		u8* ptr = ARM9Mem.textureSlotAddr[slot];
+		
+		if(ptr == ARM9Mem.blank_memory) {
+			PROGINFO("Tried to reference unmapped texture memory: slot %d\n",slot);
+		}
+		curr.ptr = ptr + curr.start;
+	}
+	//equals the requested length unless the span had to be truncated above
+	ret.size = currofs;
+	return ret;
+}
+
+//creates a MemSpan in texture palette memory
+//ofs is a byte offset into texture palette memory and len is a byte count.
+//the range is cut at 16KB (0x4000) slot boundaries and each piece is resolved through ARM9Mem.texPalSlot.
+//at most MemSpan::MAXSIZE pieces are produced; the returned span's size is the number of bytes actually covered.
+static MemSpan MemSpan_TexPalette(u32 ofs, u32 len) 
+{
+	MemSpan ret;
+	u32 currofs = 0;
+	while(len) {
+		//never write past the fixed-size items array
+		if(ret.numItems == MemSpan::MAXSIZE) {
+			PROGINFO("Texture palette spans too many palette slots. Truncating.\n");
+			break;
+		}
+		MemSpan::Item &curr = ret.items[ret.numItems++];
+		curr.start = ofs&0x3FFF;
+		u32 slot = (ofs>>14)&7; //this masks to 8 slots, but there are really only 6
+		if(slot>5) {
+			PROGINFO("Texture palette overruns texture memory. Wrapping at palette slot 0.\n");
+			//slots 6 and 7 wrap onto slots 0 and 1, which is what the message above promises
+			slot -= 6;
+		}
+		curr.len = min(len,0x4000-curr.start);
+		curr.ofs = currofs;
+		len -= curr.len;
+		ofs += curr.len;
+		//if(len != 0) 
+			//here is an actual test case of bank spanning
+		currofs += curr.len;
+		u8* ptr = ARM9Mem.texPalSlot[slot];
+		
+		if(ptr == ARM9Mem.blank_memory) {
+			PROGINFO("Tried to reference unmapped texture palette memory: 16k slot #%d\n",slot);
+		}
+		curr.ptr = ptr + curr.start;
+	}
+	//equals the requested length unless the span had to be truncated above
+	ret.size = currofs;
+	return ret;
+}
+
+//================================================= Textures
+//the cache entries themselves; TexCache_SetTexture searches and fills this array
+TextureCache	texcache[MAX_TEXTURE+1];
+//u32				texcache_count;
+//index at which TexCache_SetTexture begins its search
+u32				texcache_start;
+//index at which the search gives up and regenerates the texture.
+//TexCache_Reset sets this to MAX_TEXTURE<<1, a value the search index never reaches
+u32				texcache_stop;
+
+
+//scratch buffer that receives the texture converted to 32 bit texels;
+//it is what gets handed to TexCache_BindTextureData
+u8 TexCache_texMAP[1024*2048*4]; 
+//raw ds format poly attributes
+
+//todo - make all color conversions go through a properly spread table!!
+
+//debugging aid, only compiled when both DEBUG_DUMP_TEXTURE and WIN32 are defined:
+//reads the GL texture of cache entry 'which' back into TexCache_texMAP as BGRA bytes
+//and writes it out as <which>.bmp in the c:/dump directory.
+//note that this overwrites whatever was in TexCache_texMAP
+#if defined (DEBUG_DUMP_TEXTURE) && defined (WIN32)
+static void DebugDumpTexture(int which)
+{
+	char fname[100];
+	sprintf(fname,"c:\\dump\\%d.bmp", which);
+
+	glBindTexture(GL_TEXTURE_2D,texcache[which].id);
+	  glGetTexImage( GL_TEXTURE_2D ,
+			      0,
+			    GL_BGRA_EXT,
+			      GL_UNSIGNED_BYTE,
+			      TexCache_texMAP);
+
+	NDS_WriteBMP_32bppBuffer(texcache[which].sizeX,texcache[which].sizeY,TexCache_texMAP,fname);
+}
+#else
+//in all other builds this is a no-op; the argument is still referenced through a void cast
+#define DebugDumpTexture(which) do { (void)which; } while (0)
+#endif
+
+
+//index into texcache of the entry most recently selected by TexCache_SetTexture; -1 until the first call
+static int lastTexture = -1;
+
+//Makes the texture described by the raw texture parameters the current cache entry.
+//format: raw texture parameter word. the fields decoded here are:
+//        bits 0-15 texture address (in 8 byte units), bits 20-22 and 23-25 width and height (8<<n),
+//        bits 26-28 texture mode, bit 29 "palette colour 0 is transparent"
+//texpal: raw palette base; shifted left by 3 for TEXMODE_I2 and by 4 for every other mode
+//If a matching entry is found (and, when it is suspected invalid, its cached palette/texture/index bytes
+//still match memory) it becomes current and TexCache_BindTexture, if set, is called when the selection changed.
+//Otherwise the texture is converted to 32 bit texels in TexCache_texMAP, recorded in the cache,
+//and handed to TexCache_BindTextureData, if set.
+void TexCache_SetTexture(unsigned int format, unsigned int texpal)
+{
+	//for each texformat, number of palette entries
+	const int palSizes[] = {0, 32, 4, 16, 256, 0, 8, 0};
+
+	//for each texformat, multiplier from numtexels to numbytes (fixed point 30.2)
+	const int texSizes[] = {0, 4, 1, 2, 4, 1, 4, 8};
+
+	//used to hold a copy of the palette specified for this texture
+	u16 pal[256];
+
+	u32 *dwdst = (u32*)TexCache_texMAP;
+	
+	u32 textureMode = (unsigned short)((format>>26)&0x07);
+	unsigned int sizeX=(8 << ((format>>20)&0x07));
+	unsigned int sizeY=(8 << ((format>>23)&0x07));
+	unsigned int imageSize = sizeX*sizeY;
+
+	u8 *adr;
+
+	u32 paletteAddress;
+
+	switch (textureMode)
+	{
+	case TEXMODE_I2:
+		paletteAddress = texpal<<3;
+		break;
+	case TEXMODE_A3I5: //a3i5
+	case TEXMODE_I4: //i4
+	case TEXMODE_I8: //i8
+	case TEXMODE_A5I3: //a5i3
+	case TEXMODE_16BPP: //16bpp
+	case TEXMODE_4X4: //4x4
+	default:
+		paletteAddress = texpal<<4;
+		break;
+	}
+
+	//analyze the texture memory mapping and the specifications of this texture
+	int palSize = palSizes[textureMode];
+	int texSize = (imageSize*texSizes[textureMode])>>2; //shifted because the texSizes multiplier is fixed point
+	MemSpan ms = MemSpan_TexMem((format&0xFFFF)<<3,texSize);
+	MemSpan mspal = MemSpan_TexPalette(paletteAddress,palSize*2);
+
+	//determine the location for 4x4 index data
+	u32 indexBase;
+	if((format & 0xc000) == 0x8000) indexBase = 0x30000;
+	else indexBase = 0x20000;
+
+	u32 indexOffset = (format&0x3FFF)<<2;
+
+	int indexSize = 0;
+	MemSpan msIndex;
+	if(textureMode == TEXMODE_4X4)
+	{
+		indexSize = imageSize>>3;
+		msIndex = MemSpan_TexMem(indexOffset+indexBase,indexSize);
+	}
+
+
+	//dump the palette to a temp buffer, so that we don't have to worry about memory mapping.
+	//this isnt such a problem with texture memory, because we read sequentially from it.
+	//however, we read randomly from palette memory, so the mapping is more costly.
+	mspal.dump(pal);
+
+
+	u32 tx=texcache_start;
+
+	//if(false)
+	while (TRUE)
+	{
+		//conditions where we give up and regenerate the texture:
+		if (texcache_stop == tx) break;
+		if (texcache[tx].frm == 0) break;
+
+		//conditions where we reject matches:
+		//when the teximage or texpal params dont match 
+		//(this is our key for identifying palettes in the cache)
+		if (texcache[tx].frm != format) goto REJECT;
+		if (texcache[tx].pal != texpal) goto REJECT;
+
+		//the texture matches params, but isnt suspected invalid. accept it.
+		if (!texcache[tx].suspectedInvalid) goto ACCEPT;
+
+		//if we couldnt cache this entire texture due to it being too large, then reject it
+		if (texSize+indexSize > (int)sizeof(texcache[tx].texture)) goto REJECT;
+
+		//when the palettes dont match:
+		//note that we are considering 4x4 textures to have a palette size of 0.
+		//they really have a potentially HUGE palette, too big for us to handle like a normal palette,
+		//so they go through a different system
+		if (mspal.size != 0 && memcmp(texcache[tx].palette,pal,mspal.size)) goto REJECT;
+
+		//when the texture data doesn't match
+		if(ms.memcmp(texcache[tx].texture,sizeof(texcache[tx].texture))) goto REJECT;
+
+		//if the texture is 4x4 then the index data must match
+		if(textureMode == TEXMODE_4X4)
+		{
+			if(msIndex.memcmp(texcache[tx].texture + texcache[tx].textureSize,texcache[tx].indexSize)) goto REJECT; 
+		}
+
+
+ACCEPT:
+		texcache[tx].suspectedInvalid = false;
+//		texcache_count = tx;
+		//only notify the renderer when the selected entry actually changed
+		if(lastTexture == -1 || (int)tx != lastTexture)
+		{
+			lastTexture = tx;
+			if(TexCache_BindTexture)
+				TexCache_BindTexture(tx);
+		}
+		return;
+ 
+REJECT:
+		tx++;
+		if ( tx > MAX_TEXTURE )
+		{
+			//ran off the end of the array: retire the entry at texcache_start, advance the start, and resume at entry 0
+			texcache_stop=texcache_start;
+			texcache[texcache_stop].frm=0;
+			texcache_start++;
+			if (texcache_start>MAX_TEXTURE) 
+			{
+				texcache_start=0;
+				texcache_stop=MAX_TEXTURE<<1;
+			}
+			tx=0;
+		}
+	}
+
+	//no usable entry was found: (re)generate the texture into entry tx
+	lastTexture = tx;
+	//glBindTexture(GL_TEXTURE_2D, texcache[tx].id);
+
+	texcache[tx].suspectedInvalid = false;
+	texcache[tx].frm=format;
+	texcache[tx].mode=textureMode;
+	texcache[tx].pal=texpal;
+	texcache[tx].sizeX=sizeX;
+	texcache[tx].sizeY=sizeY;
+	texcache[tx].invSizeX=1.0f/((float)(sizeX));
+	texcache[tx].invSizeY=1.0f/((float)(sizeY));
+	texcache[tx].textureSize = ms.dump(texcache[tx].texture,sizeof(texcache[tx].texture));
+
+	//dump palette data for cache keying
+	if ( palSize )
+	{
+		memcpy(texcache[tx].palette, pal, palSize*2);
+	}
+	//dump 4x4 index data for cache keying
+	texcache[tx].indexSize = 0;
+	if(textureMode == TEXMODE_4X4)
+	{
+		texcache[tx].indexSize = min(msIndex.size,(int)sizeof(texcache[tx].texture) - texcache[tx].textureSize);
+		msIndex.dump(texcache[tx].texture+texcache[tx].textureSize,texcache[tx].indexSize);
+	}
+
+
+	//glMatrixMode (GL_TEXTURE);
+	//glLoadIdentity ();
+	//glScaled (texcache[tx].invSizeX, texcache[tx].invSizeY, 1.0f);
+
+
+	//INFO("Texture %03i - format=%08X; pal=%04X (mode %X, width %04i, height %04i)\n",i, texcache[i].frm, texcache[i].pal, texcache[i].mode, sizeX, sizeY);
+
+	//============================================================================ Texture conversion
+	//alpha given to palette index 0 in the I2/I4/I8 modes: 0 when format bit 29 is set, 255 otherwise
+	u32 palZeroTransparent = (1-((format>>29)&1))*255;						// shash: CONVERT THIS TO A TABLE :)
+
+	switch (texcache[tx].mode)
+	{
+	case TEXMODE_A3I5:
+		{
+			//one texel per byte: low 5 bits palette index, high 3 bits alpha
+			for(int j=0;j<ms.numItems;j++) {
+				adr = ms.items[j].ptr;
+				for(u32 x = 0; x < ms.items[j].len; x++)
+				{
+					u16 c = pal[*adr&31];
+					u8 alpha = *adr>>5;
+					*dwdst++ = RGB15TO32(c,material_3bit_to_8bit[alpha]);
+					adr++;
+				}
+			}
+
+			break;
+		}
+	case TEXMODE_I2:
+		{
+			//four texels per byte, 2 bits each, lowest bits first
+			for(int j=0;j<ms.numItems;j++) {
+				adr = ms.items[j].ptr;
+				for(u32 x = 0; x < ms.items[j].len; x++)
+				{
+					u8 bits;
+					u16 c;
+
+					bits = (*adr)&0x3;
+					c = pal[bits];
+					*dwdst++ = RGB15TO32(c,(bits == 0) ? palZeroTransparent : 255);
+
+					bits = ((*adr)>>2)&0x3;
+					c = pal[bits];
+					*dwdst++ = RGB15TO32(c,(bits == 0) ? palZeroTransparent : 255);
+
+					bits = ((*adr)>>4)&0x3;
+					c = pal[bits];
+					*dwdst++ = RGB15TO32(c,(bits == 0) ? palZeroTransparent : 255);
+
+					bits = ((*adr)>>6)&0x3;
+					c = pal[bits];
+					*dwdst++ = RGB15TO32(c,(bits == 0) ? palZeroTransparent : 255);
+
+					adr++;
+				}
+			}
+			break;
+		}
+	case TEXMODE_I4:
+		{
+			//two texels per byte, 4 bits each, low nibble first
+			for(int j=0;j<ms.numItems;j++) {
+				adr = ms.items[j].ptr;
+				for(u32 x = 0; x < ms.items[j].len; x++)
+				{
+					u8 bits;
+					u16 c;
+
+					bits = (*adr)&0xF;
+					c = pal[bits];
+					*dwdst++ = RGB15TO32(c,(bits == 0) ? palZeroTransparent : 255);
+
+					bits = ((*adr)>>4);
+					c = pal[bits];
+					*dwdst++ = RGB15TO32(c,(bits == 0) ? palZeroTransparent : 255);
+
+					adr++;
+				}
+			}
+			break;
+		}
+	case TEXMODE_I8:
+		{
+			//one texel per byte, the byte is the palette index
+			for(int j=0;j<ms.numItems;j++) {
+				adr = ms.items[j].ptr;
+				for(u32 x = 0; x < ms.items[j].len; ++x)
+				{
+					u16 c = pal[*adr];
+					*dwdst++ = RGB15TO32(c,(*adr == 0) ? palZeroTransparent : 255);
+					adr++;
+				}
+			}
+		}
+		break;
+	case TEXMODE_4X4:
+		{
+			//RGB16TO32 is used here because the other conversion macros result in broken interpolation logic
+
+			if(ms.numItems != 1) {
+				PROGINFO("Your 4x4 texture has overrun its texture slot.\n");
+			}
+			//this check isnt necessary since the addressing is tied to the texture data which will also run out:
+			//if(msIndex.numItems != 1) PROGINFO("Your 4x4 texture index has overrun its slot.\n");
+
+#define PAL4X4(offset) ( *(u16*)( ARM9Mem.texPalSlot[((paletteAddress + (offset)*2)>>14)] + ((paletteAddress + (offset)*2)&0x3FFF) ) )
+
+			u16* slot1;
+			u32* map = (u32*)ms.items[0].ptr;
+			//NOTE(review): d below counts 32 bit blocks while len is in bytes, so len>>2 looks like
+			//the intended bound here - left unchanged, confirm before touching
+			u32 limit = ms.items[0].len<<2;
+			u32 d = 0;
+			if ( (texcache[tx].frm & 0xc000) == 0x8000)
+				// texel are in slot 2
+				slot1=(u16*)&ARM9Mem.textureSlotAddr[1][((texcache[tx].frm & 0x3FFF)<<2)+0x010000];
+			else 
+				slot1=(u16*)&ARM9Mem.textureSlotAddr[1][(texcache[tx].frm & 0x3FFF)<<2];
+
+			u16 yTmpSize = (texcache[tx].sizeY>>2);
+			u16 xTmpSize = (texcache[tx].sizeX>>2);
+
+			//this is flagged whenever a 4x4 overruns its slot.
+			//i am guessing we just generate black in that case
+			bool dead = false;
+
+			for (int y = 0; y < yTmpSize; y ++)
+			{
+				//destination offsets of the four texel rows covered by this row of blocks
+				u32 tmpPos[4]={(y<<2)*texcache[tx].sizeX,((y<<2)+1)*texcache[tx].sizeX,
+					((y<<2)+2)*texcache[tx].sizeX,((y<<2)+3)*texcache[tx].sizeX};
+				for (int x = 0; x < xTmpSize; x ++, d++)
+				{
+					if(d >= limit)
+						dead = true;
+
+					if(dead) {
+						for (int sy = 0; sy < 4; sy++)
+						{
+							u32 currentPos = (x<<2) + tmpPos[sy];
+							dwdst[currentPos] = dwdst[currentPos+1] = dwdst[currentPos+2] = dwdst[currentPos+3] = 0;
+						}
+						continue;
+					}
+
+					u32 currBlock	= map[d];
+					u16 pal1		= slot1[d];
+					u16 pal1offset	= (pal1 & 0x3FFF)<<1;
+					u8  mode		= pal1>>14;
+					u32 tmp_col[4];
+
+					tmp_col[0]=RGB16TO32(PAL4X4(pal1offset),255);
+					tmp_col[1]=RGB16TO32(PAL4X4(pal1offset+1),255);
+
+					//the top two bits of the index word pick how colours 2 and 3 are produced
+					switch (mode) 
+					{
+					case 0:
+						tmp_col[2]=RGB16TO32(PAL4X4(pal1offset+2),255);
+						tmp_col[3]=RGB16TO32(0x7FFF,0);
+						break;
+					case 1:
+						tmp_col[2]=(((tmp_col[0]&0xFF)+(tmp_col[1]&0xff))>>1)|
+							(((tmp_col[0]&(0xFF<<8))+(tmp_col[1]&(0xFF<<8)))>>1)|
+							(((tmp_col[0]&(0xFF<<16))+(tmp_col[1]&(0xFF<<16)))>>1)|
+							(0xff<<24);
+						tmp_col[3]=RGB16TO32(0x7FFF,0);
+						break;
+					case 2:
+						tmp_col[2]=RGB16TO32(PAL4X4(pal1offset+2),255);
+						tmp_col[3]=RGB16TO32(PAL4X4(pal1offset+3),255);
+						break;
+					case 3: 
+						{
+							u32 red1, red2;
+							u32 green1, green2;
+							u32 blue1, blue2;
+							u16 tmp1, tmp2;
+
+							red1=tmp_col[0]&0xff;
+							green1=(tmp_col[0]>>8)&0xff;
+							blue1=(tmp_col[0]>>16)&0xff;
+							red2=tmp_col[1]&0xff;
+							green2=(tmp_col[1]>>8)&0xff;
+							blue2=(tmp_col[1]>>16)&0xff;
+
+							tmp1=((red1*5+red2*3)>>6)|
+								(((green1*5+green2*3)>>6)<<5)|
+								(((blue1*5+blue2*3)>>6)<<10);
+							tmp2=((red2*5+red1*3)>>6)|
+								(((green2*5+green1*3)>>6)<<5)|
+								(((blue2*5+blue1*3)>>6)<<10);
+
+							tmp_col[2]=RGB16TO32(tmp1,255);
+							tmp_col[3]=RGB16TO32(tmp2,255);
+							break;
+						}
+					}
+
+					//set all 16 texels
+					for (int sy = 0; sy < 4; sy++)
+					{
+						// Texture offset
+						u32 currentPos = (x<<2) + tmpPos[sy];
+						u8 currRow = (u8)((currBlock>>(sy<<3))&0xFF);
+
+						dwdst[currentPos] = tmp_col[currRow&3];
+						dwdst[currentPos+1] = tmp_col[(currRow>>2)&3];
+						dwdst[currentPos+2] = tmp_col[(currRow>>4)&3];
+						dwdst[currentPos+3] = tmp_col[(currRow>>6)&3];
+					}
+
+
+				}
+			}
+
+
+			break;
+		}
+	case TEXMODE_A5I3:
+		{
+			//one texel per byte: low 3 bits palette index, high 5 bits alpha
+			for(int j=0;j<ms.numItems;j++) {
+				adr = ms.items[j].ptr;
+				for(u32 x = 0; x < ms.items[j].len; ++x)
+				{
+					u16 c = pal[*adr&0x07];
+					u8 alpha = (*adr>>3);
+					*dwdst++ = RGB15TO32(c,material_5bit_to_8bit[alpha]);
+					adr++;
+				}
+			}
+			break;
+		}
+	case TEXMODE_16BPP:
+		{
+			for(int j=0;j<ms.numItems;j++) {
+				u16* map = (u16*)ms.items[j].ptr;
+				//len counts bytes but every texel is a 16 bit word, so there are len/2 texels in this piece.
+				//looping up to len would read twice the mapped region and emit twice as many texels
+				u32 numTexels = ms.items[j].len>>1;
+				for(u32 x = 0; x < numTexels; ++x)
+				{
+					u16 c = map[x];
+					//bit 15 is the alpha bit
+					int alpha = ((c&0x8000)?255:0);
+					*dwdst++ = RGB15TO32(c&0x7FFF,alpha);
+				}
+			}
+			break;
+		}
+	}
+
+	if(TexCache_BindTextureData != 0)
+		TexCache_BindTextureData(tx,TexCache_texMAP);
+
+	DebugDumpTexture(tx);
+
+	//============================================================================================
+
+//	texcache_count=tx;
+
+
+}
+
+//wipes every cache entry, rewinds the search window to its initial state,
+//and flags all entries as suspected invalid
+void TexCache_Reset()
+{
+	memset(&texcache,0,sizeof(texcache));
+
+	//texcache_count=0;
+	texcache_stop=MAX_TEXTURE<<1;
+	texcache_start=0;
+
+	for(TextureCache* entry=texcache; entry!=texcache+(MAX_TEXTURE+1); ++entry)
+	{
+		entry->suspectedInvalid = true;
+	}
+}
+
+//returns the entry most recently selected by TexCache_SetTexture,
+//or NULL when no texture has been selected yet
+TextureCache* TexCache_Curr()
+{
+	if(lastTexture != -1)
+	{
+		return &texcache[lastTexture];
+	}
+	return NULL;
+}
+
+//optional renderer hooks; both start out unset and TexCache_SetTexture checks them before calling.
+//TexCache_BindTexture is called when a different, already cached entry becomes current
+void (*TexCache_BindTexture)(u32 texnum) = NULL;
+//TexCache_BindTextureData is called with the freshly converted 32 bit texels of a regenerated entry
+void (*TexCache_BindTextureData)(u32 texnum, u8* data) = NULL;
--- a/desmume/src/texcache.h
+++ b/desmume/src/texcache.h
@ -0,0 +1,41 @@
+#ifndef _TEXCACHE_H_
+#define _TEXCACHE_H_
+
+#include "common.h"
+
+//highest valid index into the texcache array (the array holds MAX_TEXTURE+1 entries)
+#define MAX_TEXTURE 500
+//one cached texture: the parameters it was created from plus copies of the raw bytes used to revalidate it
+#ifdef SSE2
+struct ALIGN(16) TextureCache
+#else
+struct ALIGN(8) TextureCache
+#endif
+{
+	//texture id used by the renderer (the debug dump path passes it to glBindTexture)
+	u32					id;
+	//raw texture parameter word this entry was built from; 0 marks an unused entry
+	u32					frm;
+	//texture mode decoded from frm (one of the TEXMODE_ values)
+	u32					mode;
+	//raw palette base this entry was built from
+	u32					pal;
+	//dimensions in texels
+	u32					sizeX;
+	u32					sizeY;
+	//reciprocals of sizeX and sizeY
+	float				invSizeX;
+	float				invSizeY;
+	//number of bytes of texture data, and of 4x4 index data, held in texture[]
+	int					textureSize, indexSize;
+	//copy of the raw texture bytes, followed by the 4x4 index bytes when the mode is 4x4
+	u8					texture[128*1024]; // 128Kb texture slot
+	//copy of the palette the texture was converted with
+	u8					palette[256*2];
+
+	//set if this texture is suspected be invalid due to a vram reconfigure
+	bool				suspectedInvalid;
+
+};
+
+//the cache entries; valid indexes are 0..MAX_TEXTURE
+extern TextureCache	texcache[MAX_TEXTURE+1];
+
+//optional renderer hooks, checked against null before TexCache_SetTexture calls them:
+//TexCache_BindTexture - a different, already cached entry became current
+//TexCache_BindTextureData - an entry was regenerated; data points at its converted 32 bit texels
+extern void (*TexCache_BindTexture)(u32 texnum);
+extern void (*TexCache_BindTextureData)(u32 texnum, u8* data);
+
+//clears every entry and flags all of them as suspected invalid
+void TexCache_Reset();
+//makes the texture described by the raw texture parameter word and palette base the current entry
+void TexCache_SetTexture(unsigned int format, unsigned int texpal);
+
+//scratch buffer holding the most recently converted texture as 32 bit texels
+extern u8 TexCache_texMAP[1024*2048*4]; 
+//returns the current entry, or NULL if no texture has been selected yet
+TextureCache* TexCache_Curr();
+
+#endif
--- a/desmume/src/windows/DeSmuME_2005.vcproj
+++ b/desmume/src/windows/DeSmuME_2005.vcproj
@ -1160,6 +1160,14 @@
 			RelativePath="..\SPU.h"
 			>
 		</File>
+		<File
+			RelativePath="..\texcache.cpp"
+			>
+		</File>
+		<File
+			RelativePath="..\texcache.h"
+			>
+		</File>
 		<File
 			RelativePath="..\thumb_instructions.cpp"
 			>