rewrite texture cache, change commandline from --single-core to --num-cores=N, add multithreading to rasterizer, add toggles to 3d config to disable fog+edgemarking for little speedups in games that use them.

2009-10-28 09:39:52 +00:00 · 2009-10-28 09:39:52 +00:00 · 8934925019
parent dd117dc47f
commit 8934925019
21 changed files with 1749 additions and 1192 deletions
--- a/desmume/src/GPU.cpp
+++ b/desmume/src/GPU.cpp
@ -55,9 +55,6 @@ GPU::MosaicLookup GPU::mosaicLookup;
 //#define DEBUG_TRI

 CACHE_ALIGN u8 GPU_screen[4*256*192];
-u8 *GPU_tempScanline;
-CACHE_ALIGN u16 GPU_tempScanlineBuffer[256];
-
 CACHE_ALIGN u8 sprWin[256];


@ -2237,7 +2234,7 @@ template<bool SKIP> static void GPU_RenderLine_DispCapture(u16 l)
 									//INFO("Capture screen (BG + OBJ + 3D)\n");

 									u8 *src;
-									src = (u8*)(GPU_tempScanline);
+									src = (u8*)(gpu->tempScanline);
 									CAPCOPY(src,cap_dst);
 								}
 							break;
@ -2279,7 +2276,7 @@ template<bool SKIP> static void GPU_RenderLine_DispCapture(u16 l)
 						if (gpu->dispCapCnt.srcA == 0)
 						{
 							// Capture screen (BG + OBJ + 3D)
-							srcA = (u16*)(GPU_tempScanline);
+							srcA = (u16*)(gpu->tempScanline);
 						}
 						else
 						{
@ -2579,10 +2576,10 @@ void GPU_RenderLine(NDS_Screen * screen, u16 l, bool skip)
 	//generate the 2d engine output
 	if(gpu->dispMode == 1) {
 		//optimization: render straight to the output buffer when thats what we are going to end up displaying anyway
-		GPU_tempScanline = screen->gpu->currDst = (u8 *)(GPU_screen) + (screen->offset + l) * 512;
+		gpu->tempScanline = screen->gpu->currDst = (u8 *)(GPU_screen) + (screen->offset + l) * 512;
 	} else {
 		//otherwise, we need to go to a temp buffer
-		GPU_tempScanline = screen->gpu->currDst = (u8 *)GPU_tempScanlineBuffer;
+		gpu->tempScanline = screen->gpu->currDst = (u8 *)gpu->tempScanlineBuffer;
 	}

 	GPU_RenderLine_layer(screen, l);
--- a/desmume/src/GPU.h
+++ b/desmume/src/GPU.h
@ -736,6 +736,9 @@ struct GPU
 	u16 *currentFadeInColors, *currentFadeOutColors;
 	bool blend2[8];

+	CACHE_ALIGN u16 tempScanlineBuffer[256];
+	u8 *tempScanline;
+
 	u8	MasterBrightMode;
 	u32 MasterBrightFactor;

--- a/desmume/src/Makefile.am
+++ b/desmume/src/Makefile.am
@ -43,6 +43,7 @@ libdesmume_a_SOURCES = \
 	utils/md5.cpp utils/md5.h utils/valuearray.h utils/xstring.cpp utils/xstring.h \
 	utils/decrypt/crc.cpp utils/decrypt/crc.h utils/decrypt/decrypt.cpp \
 	utils/decrypt/decrypt.h utils/decrypt/header.cpp utils/decrypt/header.h \
+	utils/task.cpp utils/task.h \
        addons.cpp addons.h \
 	addons/compactFlash.cpp addons/gbagame.cpp addons/none.cpp addons/rumblepak.cpp addons/guitarGrip.cpp addons/expMemory.cpp fs.h \
 	cheatSystem.cpp cheatSystem.h \
--- a/desmume/src/NDSSystem.cpp
+++ b/desmume/src/NDSSystem.cpp
@ -1881,6 +1881,14 @@ void Sequencer::init()
 	#endif
 }

+//this isnt helping much right now. work on it later
+//#include "utils/task.h"
+//Task taskSubGpu(true);
+//void* renderSubScreen(void*)
+//{
+//	GPU_RenderLine(&SubScreen, nds.VCount, SkipCur2DFrame);
+//	return NULL;
+//}

 static void execHardware_hblank()
 {
@ -1907,8 +1915,10 @@ static void execHardware_hblank()
 		//in practice we need to be more forgiving, in case things have overrun the scanline start.
 		//this should be safe since games cannot do anything timing dependent until this next
 		//scanline begins, anyway (as this scanline was in the middle of drawing)
+		//taskSubGpu.execute(renderSubScreen,NULL);
 		GPU_RenderLine(&MainScreen, nds.VCount, SkipCur2DFrame);
 		GPU_RenderLine(&SubScreen, nds.VCount, SkipCur2DFrame);
+		//taskSubGpu.finish();

 		//trigger hblank dmas
 		//but notice, we do that just after we finished drawing the line
@ -1963,12 +1973,12 @@ static void execHardware_hstart_vblankStart()
 static void execHardware_hstart_vcount()
 {
 	u16 vmatch = T1ReadWord(MMU.ARM9_REG, 4);
-	if(nds.VCount==((vmatch>>8)|((vmatch<<1)&(1<<8))))
+	vmatch = ((vmatch>>8)|((vmatch<<1)&(1<<8)));
+	if(nds.VCount==vmatch)
 	{
 		//arm9 vmatch
 		T1WriteWord(MMU.ARM9_REG, 4, T1ReadWord(MMU.ARM9_REG, 4) | 4);
 		if(T1ReadWord(MMU.ARM9_REG, 4) & 32) {
-			//printf("VMATCH FIRING! vc=%03d\n",nds.VCount);
 			NDS_makeARM9Int(2);
 		}
 	}
@ -1976,7 +1986,8 @@ static void execHardware_hstart_vcount()
 		T1WriteWord(MMU.ARM9_REG, 4, T1ReadWord(MMU.ARM9_REG, 4) & 0xFFFB);

 	vmatch = T1ReadWord(MMU.ARM7_REG, 4);
-	if(nds.VCount==((vmatch>>8)|((vmatch<<1)&(1<<8))))
+	vmatch = ((vmatch>>8)|((vmatch<<1)&(1<<8)));
+	if(nds.VCount==vmatch)
 	{
 		//arm7 vmatch
 		T1WriteWord(MMU.ARM7_REG, 4, T1ReadWord(MMU.ARM7_REG, 4) | 4);
--- a/desmume/src/NDSSystem.h
+++ b/desmume/src/NDSSystem.h
@ -421,17 +421,19 @@ int NDS_WriteBMP_32bppBuffer(int width, int height, const void* buf, const char

 extern struct TCommonSettings {
 	TCommonSettings() 
-		: HighResolutionInterpolateColor(true)
-		, UseExtBIOS(false)
+		: UseExtBIOS(false)
 		, SWIFromBIOS(false)
 		, UseExtFirmware(false)
 		, BootFromFirmware(false)
 		, DebugConsole(false)
-		, single_core(true)
+		, num_cores(1)
 		, spuInterpolationMode(SPUInterpolation_Linear)
 		//, gfx3d_flushMode(0)
 		, manualBackupType(0)
 		, micMode(InternalNoise)
+		, GFX3D_HighResolutionInterpolateColor(true)
+		, GFX3D_EdgeMark(true)
+		, GFX3D_Fog(true)
 	{
 		strcpy(ARM9BIOS, "biosnds9.bin");
 		strcpy(ARM7BIOS, "biosnds7.bin");
@ -443,7 +445,9 @@ extern struct TCommonSettings {
 		for(int i=0;i<16;i++)
 			spu_muteChannels[i] = false;
 	}
-	bool HighResolutionInterpolateColor;
+	bool GFX3D_HighResolutionInterpolateColor;
+	bool GFX3D_EdgeMark;
+	bool GFX3D_Fog;

 	bool UseExtBIOS;
 	char ARM9BIOS[256];
@ -456,7 +460,8 @@ extern struct TCommonSettings {

 	bool DebugConsole;

-	bool single_core;
+	int num_cores;
+	bool single_core() { return num_cores==1; }
 	
 	struct _Wifi {
 		int mode;
--- a/desmume/src/OGLRender.cpp
+++ b/desmume/src/OGLRender.cpp
@ -24,6 +24,8 @@
 //so, it doesnt composite to 2d correctly.
 //(re: new super mario brothers renders the stormclouds at the beginning)

+#include <queue>
+
 #include "OGLRender.h"
 #include "debug.h"

@ -208,9 +210,8 @@ static void _xglDisable(GLenum cap) {
 	CTASSERT((cap-0x0B00)<0x100); \
 	_xglDisable(cap); }

+static std::queue<GLuint> freeTextureIds;

-
-GLenum			oglTempTextureID[MAX_TEXTURE];
 GLenum			oglToonTableTextureID;

 #define NOSHADERS(s)					{ hasShaders = false; INFO("Shaders aren't supported on your system, using fixed pipeline\n(%s)\n", s); return; }
@ -252,17 +253,16 @@ GLenum			oglToonTableTextureID;

 bool hasShaders = false;

-/* Vertex shader */
 GLuint vertexShaderID;
-/* Fragment shader */
 GLuint fragmentShaderID;
-/* Shader program */
 GLuint shaderProgram;

 static GLuint hasTexLoc;
 static GLuint texBlendLoc;
 static bool hasTexture = false;

+static ADPCMCacheItem* currTexture = NULL;
+
 /* Shaders init */

 static void createShaders()
@ -337,45 +337,54 @@ static void OGLReset()
 	}

 	TexCache_Reset();
-
-	for (int i = 0; i < MAX_TEXTURE; i++)
-		texcache[i].id=oglTempTextureID[i];
+	currTexture = NULL;

 //	memset(GPU_screenStencil,0,sizeof(GPU_screenStencil));
 	memset(GPU_screen3D,0,sizeof(GPU_screen3D));
 }

+//static class OGLTexCacheUser : public ITexCacheUser
+//{
+//public:
+//	virtual void BindTexture(u32 tx)
+//	{
+//		glBindTexture(GL_TEXTURE_2D,(GLuint)texcache[tx].id);
+//		glMatrixMode (GL_TEXTURE);
+//		glLoadIdentity ();
+//		glScaled (texcache[tx].invSizeX, texcache[tx].invSizeY, 1.0f);
+//
+//		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+//		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+//
+//		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (BIT16(texcache[tx].frm) ? (BIT18(texcache[tx].frm)?GL_MIRRORED_REPEAT:GL_REPEAT) : GL_CLAMP));
+//		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (BIT17(texcache[tx].frm) ? (BIT19(texcache[tx].frm)?GL_MIRRORED_REPEAT:GL_REPEAT) : GL_CLAMP));
+//	}
+//
+//	virtual void BindTextureData(u32 tx, u8* data)
+//	{
+//		BindTexture(tx);
+//
+//	#if 0
+//		for (int i=0; i < texcache[tx].sizeX * texcache[tx].sizeY*4; i++)
+//			data[i] = 0xFF;
+//	#endif
+//		glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, 
+//			texcache[tx].sizeX, texcache[tx].sizeY, 0, 
+//			GL_RGBA, GL_UNSIGNED_BYTE, data);
+//	}
+//} textures;
+//
+//static TexCacheUnit texCacheUnit;

-
-
-static void BindTexture(u32 tx)
+static void expandFreeTextures()
 {
-	glBindTexture(GL_TEXTURE_2D,(GLuint)texcache[tx].id);
-	glMatrixMode (GL_TEXTURE);
-	glLoadIdentity ();
-	glScaled (texcache[tx].invSizeX, texcache[tx].invSizeY, 1.0f);
-
-	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-
-	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (BIT16(texcache[tx].frm) ? (BIT18(texcache[tx].frm)?GL_MIRRORED_REPEAT:GL_REPEAT) : GL_CLAMP));
-	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (BIT17(texcache[tx].frm) ? (BIT19(texcache[tx].frm)?GL_MIRRORED_REPEAT:GL_REPEAT) : GL_CLAMP));
+	const int kInitTextures = 128;
+	GLuint oglTempTextureID[kInitTextures];
+	glGenTextures(kInitTextures, &oglTempTextureID[0]);
+	for(int i=0;i<kInitTextures;i++)
+		freeTextureIds.push(oglTempTextureID[i]);
 }

-static void BindTextureData(u32 tx, u8* data)
-{
-	BindTexture(tx);
-
-#if 0
-	for (int i=0; i < texcache[tx].sizeX * texcache[tx].sizeY*4; i++)
-		data[i] = 0xFF;
-#endif
-	glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, 
-		texcache[tx].sizeX, texcache[tx].sizeY, 0, 
-		GL_RGBA, GL_UNSIGNED_BYTE, data);
-}
-
-
 static char OGLInit(void)
 {
 	GLuint loc = 0;
@ -388,9 +397,7 @@ static char OGLInit(void)
 	if(!BEGINGL())
 		return 0;

-	TexCache_BindTexture = BindTexture;
-	TexCache_BindTextureData = BindTextureData;
-	glGenTextures (MAX_TEXTURE, &oglTempTextureID[0]);
+	expandFreeTextures();

 	glPixelStorei(GL_PACK_ALIGNMENT,8);

@ -498,12 +505,28 @@ static void OGLClose()
 		hasShaders = false;
 	}

-	glDeleteTextures(MAX_TEXTURE, &oglTempTextureID[0]);
+	//kill the tex cache to free all the texture ids
+	TexCache_Reset();
+
+	while(!freeTextureIds.empty())
+	{
+		GLuint temp = freeTextureIds.front();
+		freeTextureIds.pop();
+		glDeleteTextures(1,&temp);
+	}
+	//glDeleteTextures(MAX_TEXTURE, &oglTempTextureID[0]);
 	glDeleteTextures(1, &oglToonTableTextureID);

 	ENDGL();
 }

+static void texDeleteCallback(ADPCMCacheItem* item) //installed by setTexture() as item->deleteCallback; presumably invoked by the texture cache when it discards the item (texcache code not visible here)
+{
+	freeTextureIds.push((GLuint)item->texid); //return the GL texture name to the free pool for reuse; nothing is deleted from GL here
+	if(currTexture == item) //forget the cached current-texture pointer so setTexture() cannot compare against a discarded item
+		currTexture = NULL;
+}
+
 static void setTexture(unsigned int format, unsigned int texpal)
 {
 	textureFormat = format;
@ -529,7 +552,43 @@ static void setTexture(unsigned int format, unsigned int texpal)
 	}


-	TexCache_SetTexture<TexFormat_32bpp>(format, texpal);
+//	texCacheUnit.TexCache_SetTexture<TexFormat_32bpp>(format, texpal);
+	ADPCMCacheItem* newTexture = TexCache_SetTexture(TexFormat_32bpp,format,texpal);
+	if(newTexture != currTexture)
+	{
+		currTexture = newTexture;
+		//has the ogl renderer initialized the texture?
+		if(!currTexture->deleteCallback)
+		{
+			currTexture->deleteCallback = texDeleteCallback;
+			if(freeTextureIds.empty()) expandFreeTextures();
+			currTexture->texid = (void*)freeTextureIds.front();
+			freeTextureIds.pop();
+
+			glBindTexture(GL_TEXTURE_2D,(GLuint)currTexture->texid);
+
+			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+
+			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (BIT16(currTexture->texformat) ? (BIT18(currTexture->texformat)?GL_MIRRORED_REPEAT:GL_REPEAT) : GL_CLAMP));
+			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (BIT17(currTexture->texformat) ? (BIT19(currTexture->texformat)?GL_MIRRORED_REPEAT:GL_REPEAT) : GL_CLAMP));
+
+			glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, 
+				currTexture->sizeX, currTexture->sizeY, 0, 
+				GL_RGBA, GL_UNSIGNED_BYTE, currTexture->decoded);
+		}
+		else
+		{
+			//otherwise, just bind it
+			glBindTexture(GL_TEXTURE_2D,(GLuint)currTexture->texid);
+		}
+
+		//in either case, we need to setup the tex mtx
+		glMatrixMode(GL_TEXTURE);
+		glLoadIdentity();
+		glScalef(currTexture->invSizeX, currTexture->invSizeY, 1.0f);
+
+	}
 }


@ -902,6 +961,9 @@ static void OGLRender()
 		}
 	}

+	//needs to happen before endgl because it could free some textureids for expired cache items
+	TexCache_EvictFrame();
+
 	ENDGL();

 	GL_ReadFramebuffer();
--- a/desmume/src/aggdraw.cpp
+++ b/desmume/src/aggdraw.cpp
@ -148,7 +148,7 @@ void Agg_init()
 	aggDraw.target = targets[0];

 	//if we're single core, we don't want to waste time compositing
-	if(CommonSettings.single_core)
+	if(CommonSettings.single_core())
 		aggDraw.hud = &agg_targetScreen;

 	//and the more clever compositing isnt supported in non-windows
--- a/desmume/src/commandline.cpp
+++ b/desmume/src/commandline.cpp
@ -40,8 +40,7 @@ CommandLine::CommandLine()
 , _record_movie_file(0)
 , _cflash_image(0)
 , _cflash_path(0)
-, _single_core(0)
-, _multi_core(0)
+, _num_cores(-1)
 , _bios_arm9(NULL)
 , _bios_arm7(NULL)
 , _bios_swi(0)
@ -74,8 +73,7 @@ void CommandLine::loadCommonOptions()
 		{ "bios-arm7", 0, 0, G_OPTION_ARG_FILENAME, &_bios_arm7, "Uses the arm7 bios provided at the specified path", "BIOS_ARM7_PATH"},
 		{ "bios-swi", 0, 0, G_OPTION_ARG_INT, &_bios_swi, "Uses SWI from the provided bios files", "BIOS_SWI"},
 #ifdef _MSC_VER
-		{ "single-core", 0, 0, G_OPTION_ARG_NONE, &_single_core, "Limit execution to use approximately only one core", "NUM_CORES"},
-		{ "multi-core", 0, 0, G_OPTION_ARG_NONE, &_multi_core, "Act as if multiple cores are present, even on a single-core machine", "MULTI_CORE"},
+		//takes an integer argument (--num-cores=N) parsed into the int _num_cores, which keeps its -1 default when the option is absent
+		{ "num-cores", 0, 0, G_OPTION_ARG_INT, &_num_cores, "Override numcores detection and use this many", "NUM_CORES"},
 		{ "scanline-filter-a", 0, 0, G_OPTION_ARG_INT, &scanline_filter_a, "Intensity of fadeout for scanlines filter (edge) (default 2)", "SCANLINE_FILTER_A"},
 		{ "scanline-filter-b", 0, 0, G_OPTION_ARG_INT, &scanline_filter_b, "Intensity of fadeout for scanlines filter (corner) (default 4)", "SCANLINE_FILTER_B"},
 #endif
@ -103,8 +101,7 @@ bool CommandLine::parse(int argc,char **argv)
 	if(_cflash_image) cflash_image = _cflash_image;
 	if(_cflash_path) cflash_path = _cflash_path;

-	if(_single_core) CommonSettings.single_core = true;
-	if(_multi_core) CommonSettings.single_core = false;
+	if(_num_cores != -1) CommonSettings.num_cores = _num_cores;

 	//TODO MAX PRIORITY! change ARM9BIOS etc to be a std::string
 	if(_bios_arm9) { CommonSettings.UseExtBIOS = true; strcpy(CommonSettings.ARM9BIOS,_bios_arm9); }
--- a/desmume/src/commandline.h
+++ b/desmume/src/commandline.h
@ -75,8 +75,7 @@ private:
 	char* _cflash_path;
 	char* _bios_arm9, *_bios_arm7;
 	int _bios_swi;
-	int _single_core;
-	int _multi_core;
+	int _num_cores;
 };

 #endif
--- a/desmume/src/gfx3d.cpp
+++ b/desmume/src/gfx3d.cpp
@ -2341,7 +2341,7 @@ static FORCEINLINE VERT clipPoint(VERT* inside, VERT* outside, int coord, int wh
 	INTERP(coord[0]); INTERP(coord[1]); INTERP(coord[2]); INTERP(coord[3]);
 	INTERP(texcoord[0]); INTERP(texcoord[1]);

-	if(CommonSettings.HighResolutionInterpolateColor)
+	if(CommonSettings.GFX3D_HighResolutionInterpolateColor)
 	{
 		INTERP(fcolor[0]); INTERP(fcolor[1]); INTERP(fcolor[2]);
 	}
--- a/desmume/src/rasterize.cpp
+++ b/desmume/src/rasterize.cpp
@ -1,8 +1,4 @@
-/*  Copyright (C) 2006 yopyop
-    yopyop156@ifrance.com
-    yopyop156.ifrance.com
-
-	Copyright 2009 DeSmuME team
+/*  Copyright 2009 DeSmuME team

    This file is part of DeSmuME

@ -46,6 +42,7 @@
 #include "gfx3d.h"
 #include "texcache.h"
 #include "NDSSystem.h"
+#include "utils/task.h"

 //#undef FORCEINLINE
 //#define FORCEINLINE
@ -63,8 +60,6 @@ template<typename T> T _max(T a, T b, T c, T d) { return max(_max(a,b,d),c); }

 static const int kUnsetTranslucentPolyID = 255;

-static int polynum;
-
 static u8 modulate_table[64][64];
 static u8 decal_table[32][64][64];
 static u8 index_lookup_table[65];
@ -72,6 +67,9 @@ static u8 index_start_table[8];

 static GFX3D_Clipper clipper;
 static GFX3D_Clipper::TClippedPoly *clippedPolys = NULL;
+static ADPCMCacheItem* polyTexKeys[POLYLIST_SIZE];
+static bool polyVisible[POLYLIST_SIZE];
+static bool polyBackfacing[POLYLIST_SIZE];
 static int clippedPolyCounter;


@ -118,65 +116,242 @@ static FORCEINLINE int fastFloor(float f)



-//----texture cache---

-//TODO - the texcache could ask for a buffer to generate into
-//that would avoid us ever having to buffercopy..
-struct TextureBuffers
+union FragmentColor { //one color value, accessible either as a packed word or as separate channels
+	u32 color; //all four channels viewed as a single 32-bit value
+	struct {
+		u8 r,g,b,a; //individual 8-bit channels overlaying the bytes of color, laid out in memory as r,g,b,a
+	};
+};
+
+struct Fragment
 {
-	static const int numTextures = MAX_TEXTURE+1;
-	u8* buffers[numTextures];
+	u32 depth;

-	void clear() { memset(buffers,0,sizeof(buffers)); }
+	struct {
+		u8 opaque, translucent;
+	} polyid;

-	TextureBuffers()
-	{
-		clear();
-	}
+	u8 stencil;

-	void free()
-	{
-		for(int i=0;i<numTextures;i++)
-			delete[] buffers[i];
-		clear();
-	}
+	struct {
+		u8 isTranslucentPoly:1;
+		u8 fogged:1;
+	};
+};

-	~TextureBuffers() {
-		free();
-	}
+//INLINE static void SubmitVertex(int vert_index, VERT& rawvert)
+//{
+//	verts[vert_index] = &rawvert;
+//}

-	void setCurrent(int num)
-	{
-		currentData = buffers[num];
-	}
+static Fragment screen[256*192];
+static FragmentColor screenColor[256*192];
+static FragmentColor toonTable[32];
+static u8 fogTable[32768];

-	void create(int num, u8* data)
-	{
-		delete[] buffers[num];
-		int size = texcache[num].sizeX * texcache[num].sizeY * 4;
-		buffers[num] = new u8[size];
-		setCurrent(num);
-		memcpy(currentData,data,size);
-	}
-	u8* currentData;
-} textures;
-
-//called from the texture cache to change the active texture
-static void BindTexture(u32 tx)
-{
-	textures.setCurrent(tx);
+FORCEINLINE int iround(float f) { //converts a float to int; despite the name, no round-to-nearest is performed
+	return (int)f; //lol (plain cast, i.e. the fractional part is simply dropped)
 }

-//caled from the texture cache to change to a new texture
-static void BindTextureData(u32 tx, u8* data)
+
+typedef int fixed28_4;
+
+static bool failure;
+
+// handle floor divides and mods correctly 
+//Floored division: for a positive Denominator, Floor receives floor(Numerator/Denominator) and
+//Mod the matching remainder in [0,Denominator), so that Numerator == Floor*Denominator + Mod.
+//A Denominator <= 0 sets the file-level failure flag; the outputs are then not meaningful.
+FORCEINLINE void FloorDivMod(long Numerator, long Denominator, long &Floor, long &Mod)
 {
-	textures.create(tx,data);
+	//These must be caused by invalid or degenerate shapes.. not sure yet.
+	//check it out in the mario face intro of SM64
+	//so, we have to take out the assert.
+	//I do know that we handle SOME invalid shapes without crashing,
+	//since I see them acting poppy in a way that doesnt happen in the HW.. so alas it is also incorrect.
+	//This particular incorrectness is not likely ever to get fixed!
+
+	//assert(Denominator > 0);		
+
+	//but we have to bail out since our handling for these cases currently steps scanlines 
+	//the wrong way and goes totally nuts (freezes)
+	if(Denominator<=0) 
+		failure = true;
+
+	//a zero denominator cannot be divided at all (integer division by zero is undefined and usually traps),
+	//so hand back defined zero outputs; failure is already set and tells the caller to discard them
+	if(Denominator==0) {
+		Floor = 0;
+		Mod = 0;
+		return;
+	}
+
+	if(Numerator >= 0) {
+		// positive case, C is okay
+		Floor = Numerator / Denominator;
+		Mod = Numerator % Denominator;
+	} else {
+		// Numerator is negative, do the right thing
+		Floor = -((-Numerator) / Denominator);
+		Mod = (-Numerator) % Denominator;
+		if(Mod) {
+			// there is a remainder
+			Floor--; Mod = Denominator - Mod;
+		}
+	}
 }

-//---------------
+FORCEINLINE fixed28_4 FloatToFixed28_4( float Value ) { //float -> 28.4 fixed point (4 fractional bits)
+	return (fixed28_4)(Value * 16); //scale by 2^4; the cast drops whatever fraction remains
+}
+FORCEINLINE float Fixed28_4ToFloat( fixed28_4 Value ) { //28.4 fixed point -> float
+	return Value / 16.0f; //undo the 2^4 scaling
+}
+//inline fixed16_16 FloatToFixed16_16( float Value ) {
+//	return (fixed16_6)(Value * 65536);
+//}
+//inline float Fixed16_16ToFloat( fixed16_16 Value ) {
+//	return Value / 65536.0;
+//}
+FORCEINLINE fixed28_4 Fixed28_4Mul( fixed28_4 A, fixed28_4 B ) { //multiplies two 28.4 fixed-point values and returns a 28.4 result
+	//the raw product carries 8 fractional bits and may not fit in 32 bits, so it is formed in 64 bits
+	//before rescaling (overflowing the int multiplication would be undefined behaviour)
+	return (fixed28_4)(((long long)A * B) / 16);	// 28.4 * 28.4 = 24.8 / 16 = 28.4
+}
+FORCEINLINE int Ceil28_4( fixed28_4 Value ) { //rounds a 28.4 fixed-point value up to the next whole number, returned as a plain int
+	int ReturnValue;
+	int Numerator = Value - 1 + 16; //adding 15 (one step short of 1.0) turns a floored divide by 16 into a ceiling
+	if(Numerator >= 0) {
+		ReturnValue = Numerator/16; //non-negative: integer division already floors
+	} else {
+		// deal with negative numerators correctly
+		ReturnValue = -((-Numerator)/16); //divide the magnitude, then negate...
+		ReturnValue -= ((-Numerator) % 16) ? 1 : 0; //...and step down once more if there was a remainder, giving a true floor
+	}
+	return ReturnValue;
+}

-struct PolyAttr
+struct edge_fx_fl { //one polygon edge walked a scanline at a time: x via an integer DDA, vertex attributes via float increments
+	edge_fx_fl() {} //empty body: no member is initialized
+	edge_fx_fl(int Top, int Bottom, VERT** verts); //sets up the edge running from verts[Top] to verts[Bottom]
+	FORCEINLINE int Step(); //advances one scanline and returns the remaining Height
+
+	VERT** verts; //vertex pointer array the constructor was given; stored as-is, not copied
+	long X, XStep, Numerator, Denominator;			// DDA info for x
+	long ErrorTerm; //accumulated remainder; Step() adds an extra +1 to X whenever it reaches Denominator
+	int Y, Height;					// current y and vertical count
+	
+	struct Interpolant { //one quantity interpolated along the edge
+		float curr, step, stepExtra; //current value, per-scanline increment, and increment applied on an extra x step
+		FORCEINLINE void doStep() { curr += step; }
+		FORCEINLINE void doStepExtra() { curr += stepExtra; }
+		FORCEINLINE void initialize(float top, float bottom, float dx, float dy, long XStep, float XPrestep, float YPrestep) {
+			dx = 0; //the dx argument is discarded here, so every x-dependent term below is zero and stepExtra is always 0
+			dy *= (bottom-top); //dy now holds the value change per unit of y (the constructor passes 1/edge-height)
+			curr = top + YPrestep * dy + XPrestep * dx;
+			step = XStep * dx + dy;
+			stepExtra = dx;
+		}
+	};
+	
+	static const int NUM_INTERPOLANTS = 7; //invw, z, u, v and the three color channels
+	union { //the named members and the flat array alias the same storage so they can all be stepped in one loop
+		struct {
+			Interpolant invw,z,u,v,color[3];
+		};
+		Interpolant interpolants[NUM_INTERPOLANTS];
+	};
+	void FORCEINLINE doStepInterpolants() { for(int i=0;i<NUM_INTERPOLANTS;i++) interpolants[i].doStep(); }
+	void FORCEINLINE doStepExtraInterpolants() { for(int i=0;i<NUM_INTERPOLANTS;i++) interpolants[i].doStepExtra(); }
+};
+
+FORCEINLINE edge_fx_fl::edge_fx_fl(int Top, int Bottom, VERT** verts) { //builds the stepping state for the edge verts[Top] -> verts[Bottom]
+	this->verts = verts;
+	Y = Ceil28_4((fixed28_4)verts[Top]->y); //first scanline of the edge; vertex x/y are interpreted as 28.4 fixed-point values
+	int YEnd = Ceil28_4((fixed28_4)verts[Bottom]->y);
+	Height = YEnd - Y; //number of scanlines this edge spans
+
+	if(Height) //a zero-height edge gets no further setup; the DDA and interpolant members are left unset
+	{
+		long dN = long(verts[Bottom]->y - verts[Top]->y); //edge extent in y
+		long dM = long(verts[Bottom]->x - verts[Top]->x); //edge extent in x
+	
+		long InitialNumerator = (long)(dM*16*Y - dM*verts[Top]->y + dN*verts[Top]->x - 1 + dN*16);
+		FloorDivMod(InitialNumerator,dN*16,X,ErrorTerm); //starting x and its error term; FloorDivMod flags failure when dN*16 <= 0
+		FloorDivMod(dM*16,dN*16,XStep,Numerator); //whole x step per scanline plus the leftover numerator
+		Denominator = dN*16;
+	
+		float YPrestep = Fixed28_4ToFloat((fixed28_4)(Y*16 - verts[Top]->y)); //offset from the top vertex to the first scanline
+		float XPrestep = Fixed28_4ToFloat((fixed28_4)(X*16 - verts[Top]->x)); //offset from the top vertex to the starting x
+
+		float dy = 1/Fixed28_4ToFloat(dN);
+		float dx = 1/Fixed28_4ToFloat(dM); //dM can be 0 (vertical edge); harmless because Interpolant::initialize overwrites dx with 0
+		
+		invw.initialize(1/verts[Top]->w,1/verts[Bottom]->w,dx,dy,XStep,XPrestep,YPrestep); //1/w is what gets interpolated, not w
+		u.initialize(verts[Top]->u,verts[Bottom]->u,dx,dy,XStep,XPrestep,YPrestep);
+		v.initialize(verts[Top]->v,verts[Bottom]->v,dx,dy,XStep,XPrestep,YPrestep);
+		z.initialize(verts[Top]->z,verts[Bottom]->z,dx,dy,XStep,XPrestep,YPrestep);
+		for(int i=0;i<3;i++)
+			color[i].initialize(verts[Top]->fcolor[i],verts[Bottom]->fcolor[i],dx,dy,XStep,XPrestep,YPrestep);
+	}
+}
+
+FORCEINLINE int edge_fx_fl::Step() { //moves the edge down one scanline and returns how many scanlines remain
+	X += XStep; Y++; Height--;
+	doStepInterpolants();
+
+	ErrorTerm += Numerator; //accumulate the fractional part of the x slope
+	if(ErrorTerm >= Denominator) { //a whole extra pixel has accumulated
+		X++;
+		ErrorTerm -= Denominator;
+		doStepExtraInterpolants();
+	}
+	return Height;
+}	
+
+
+
+static FORCEINLINE void alphaBlend(FragmentColor & dst, const FragmentColor & src) //composites src onto dst in place, depending on gfx3d.enableAlphaBlending
 {
+	if(gfx3d.enableAlphaBlending)
+	{
+		if(src.a == 0 || dst.a == 0)
+		{
+			dst = src; //either side has zero alpha: take src unchanged instead of mixing
+		}
+		else
+		{
+			u8 alpha = src.a+1; //the two weights sum to 32, matching the >>5 below
+			u8 invAlpha = 32 - alpha;
+			dst.r = (alpha*src.r + invAlpha*dst.r)>>5;
+			dst.g = (alpha*src.g + invAlpha*dst.g)>>5;
+			dst.b = (alpha*src.b + invAlpha*dst.b)>>5;
+		}
+
+		dst.a = max(src.a,dst.a); //the result keeps the larger of the two alphas
+	}
+	else
+	{
+		if(src.a == 0)
+		{
+			//do nothing; the fragment is totally transparent
+		}
+		else
+		{
+			dst = src; //blending disabled: any non-zero alpha overwrites dst outright
+		}
+	}
+}
+
+
+
+class RasterizerUnit
+{
+public:
+
+	int SLI_MASK, SLI_VALUE;
+
+	RasterizerUnit()
+		: sampler(*this)
+		, shader(sampler)
+	{
+	}
+
+	ADPCMCacheItem* lastTexKey;
+	
+	VERT* verts[MAX_CLIPPED_VERTS];
+
+	struct PolyAttr
+	{
 		u32 val;

 		bool decalMode;
@ -217,50 +392,17 @@ struct PolyAttr
 			fogged = BIT15(val);
 		}

-} polyAttr;
-
-union FragmentColor {
-	u32 color;
-	struct {
-		u8 r,g,b,a;
-	};
-};
-
-struct Fragment
-{
-	u32 depth;
-
-	struct {
-		u8 opaque, translucent;
-	} polyid;
-
-	u8 stencil;
-
-	struct {
-		u8 isTranslucentPoly:1;
-		u8 fogged:1;
-	};
-};
-
-static VERT* verts[MAX_CLIPPED_VERTS];
-
-//INLINE static void SubmitVertex(int vert_index, VERT& rawvert)
-//{
-//	verts[vert_index] = &rawvert;
-//}
-
-static Fragment screen[256*192];
-static FragmentColor screenColor[256*192];
-static FragmentColor toonTable[32];
-static u8 fogTable[32768];
-
-FORCEINLINE int iround(float f) {
-	return (int)f; //lol
-}
+	} polyAttr;


-static struct Sampler
-{
+	struct Sampler
+	{
+		Sampler(RasterizerUnit& _unit)
+			: unit(_unit)
+		{}
+
+		RasterizerUnit& unit;
+			
 		int width, height;
 		int wmask, hmask;
 		int wrap;
@ -332,14 +474,18 @@ static struct Sampler
 			dowrap(iu,iv);

 			FragmentColor color;
-		color.color = ((u32*)textures.currentData)[(iv<<wshift)+iu];
+			color.color = ((u32*)unit.lastTexKey->decoded)[(iv<<wshift)+iu];
 			return color;
 		}

-} sampler;
+	} sampler;

-struct Shader
-{
+	struct Shader
+	{
+		Shader(Sampler& _sampler)
+			:sampler(_sampler)
+		{}
+		Sampler& sampler;
 		u8 mode;
 		void setup(u32 polyattr)
 		{
@ -438,41 +584,10 @@ struct Shader
 			}
 		}

-} shader;
+	} shader;

-static FORCEINLINE void alphaBlend(FragmentColor & dst, const FragmentColor & src)
-{
-	if(gfx3d.enableAlphaBlending)
+	FORCEINLINE void pixel(int adr,float r, float g, float b, float invu, float invv, float w, float z)
 	{
-		if(src.a == 0 || dst.a == 0)
-		{
-			dst = src;
-		}
-		else
-		{
-			u8 alpha = src.a+1;
-			u8 invAlpha = 32 - alpha;
-			dst.r = (alpha*src.r + invAlpha*dst.r)>>5;
-			dst.g = (alpha*src.g + invAlpha*dst.g)>>5;
-			dst.b = (alpha*src.b + invAlpha*dst.b)>>5;
-		}
-
-		dst.a = max(src.a,dst.a);
-	}
-	else
-	{
-		if(src.a == 0)
-		{
-			//do nothing; the fragment is totally transparent
-		}
-		else
-		{
-			dst = src;
-		}
-	}
-}
-
-static FORCEINLINE void pixel(int adr,float r, float g, float b, float invu, float invv, float w, float z) {
 		Fragment &destFragment = screen[adr];
 		FragmentColor &destFragmentColor = screenColor[adr];

@ -601,153 +716,11 @@ static FORCEINLINE void pixel(int adr,float r, float g, float b, float invu, flo
 		depth_fail:
 		rejected_fragment:
 		;
-}
-
-
-typedef int fixed28_4;
-
-static bool failure;
-
-// handle floor divides and mods correctly 
-FORCEINLINE void FloorDivMod(long Numerator, long Denominator, long &Floor, long &Mod)
-{
-	//These must be caused by invalid or degenerate shapes.. not sure yet.
-	//check it out in the mario face intro of SM64
-	//so, we have to take out the assert.
-	//I do know that we handle SOME invalid shapes without crashing,
-	//since I see them acting poppy in a way that doesnt happen in the HW.. so alas it is also incorrect.
-	//This particular incorrectness is not likely ever to get fixed!
-
-	//assert(Denominator > 0);		
-
-	//but we have to bail out since our handling for these cases currently steps scanlines 
-	//the wrong way and goes totally nuts (freezes)
-	if(Denominator<=0) 
-		failure = true;
-
-	if(Numerator >= 0) {
-		// positive case, C is okay
-		Floor = Numerator / Denominator;
-		Mod = Numerator % Denominator;
-	} else {
-		// Numerator is negative, do the right thing
-		Floor = -((-Numerator) / Denominator);
-		Mod = (-Numerator) % Denominator;
-		if(Mod) {
-			// there is a remainder
-			Floor--; Mod = Denominator - Mod;
 	}
-	}
-}

-FORCEINLINE fixed28_4 FloatToFixed28_4( float Value ) {
-	return (fixed28_4)(Value * 16);
-}
-FORCEINLINE float Fixed28_4ToFloat( fixed28_4 Value ) {
-	return Value / 16.0f;
-}
-//inline fixed16_16 FloatToFixed16_16( float Value ) {
-//	return (fixed16_6)(Value * 65536);
-//}
-//inline float Fixed16_16ToFloat( fixed16_16 Value ) {
-//	return Value / 65536.0;
-//}
-FORCEINLINE fixed28_4 Fixed28_4Mul( fixed28_4 A, fixed28_4 B ) {
-	// could make this asm to prevent overflow
-	return (A * B) / 16;	// 28.4 * 28.4 = 24.8 / 16 = 28.4
-}
-FORCEINLINE int Ceil28_4( fixed28_4 Value ) {
-	int ReturnValue;
-	int Numerator = Value - 1 + 16;
-	if(Numerator >= 0) {
-		ReturnValue = Numerator/16;
-	} else {
-		// deal with negative numerators correctly
-		ReturnValue = -((-Numerator)/16);
-		ReturnValue -= ((-Numerator) % 16) ? 1 : 0;
-	}
-	return ReturnValue;
-}
-
-struct edge_fx_fl {
-	edge_fx_fl() {}
-	edge_fx_fl(int Top, int Bottom);
-	FORCEINLINE int Step();
-
-	long X, XStep, Numerator, Denominator;			// DDA info for x
-	long ErrorTerm;
-	int Y, Height;					// current y and vertical count
-	
-	struct Interpolant {
-		float curr, step, stepExtra;
-		FORCEINLINE void doStep() { curr += step; }
-		FORCEINLINE void doStepExtra() { curr += stepExtra; }
-		FORCEINLINE void initialize(float top, float bottom, float dx, float dy, long XStep, float XPrestep, float YPrestep) {
-			dx = 0;
-			dy *= (bottom-top);
-			curr = top + YPrestep * dy + XPrestep * dx;
-			step = XStep * dx + dy;
-			stepExtra = dx;
-		}
-	};
-	
-	static const int NUM_INTERPOLANTS = 7;
-	union {
-		struct {
-			Interpolant invw,z,u,v,color[3];
-		};
-		Interpolant interpolants[NUM_INTERPOLANTS];
-	};
-	void FORCEINLINE doStepInterpolants() { for(int i=0;i<NUM_INTERPOLANTS;i++) interpolants[i].doStep(); }
-	void FORCEINLINE doStepExtraInterpolants() { for(int i=0;i<NUM_INTERPOLANTS;i++) interpolants[i].doStepExtra(); }
-};
-
-FORCEINLINE edge_fx_fl::edge_fx_fl(int Top, int Bottom) {
-	Y = Ceil28_4((fixed28_4)verts[Top]->y);
-	int YEnd = Ceil28_4((fixed28_4)verts[Bottom]->y);
-	Height = YEnd - Y;
-
-	if(Height)
+	//draws a single scanline
+	FORCEINLINE void drawscanline(edge_fx_fl *pLeft, edge_fx_fl *pRight)
 	{
-		long dN = long(verts[Bottom]->y - verts[Top]->y);
-		long dM = long(verts[Bottom]->x - verts[Top]->x);
-	
-		long InitialNumerator = (long)(dM*16*Y - dM*verts[Top]->y + dN*verts[Top]->x - 1 + dN*16);
-		FloorDivMod(InitialNumerator,dN*16,X,ErrorTerm);
-		FloorDivMod(dM*16,dN*16,XStep,Numerator);
-		Denominator = dN*16;
-	
-		float YPrestep = Fixed28_4ToFloat((fixed28_4)(Y*16 - verts[Top]->y));
-		float XPrestep = Fixed28_4ToFloat((fixed28_4)(X*16 - verts[Top]->x));
-
-		float dy = 1/Fixed28_4ToFloat(dN);
-		float dx = 1/Fixed28_4ToFloat(dM);
-		
-		invw.initialize(1/verts[Top]->w,1/verts[Bottom]->w,dx,dy,XStep,XPrestep,YPrestep);
-		u.initialize(verts[Top]->u,verts[Bottom]->u,dx,dy,XStep,XPrestep,YPrestep);
-		v.initialize(verts[Top]->v,verts[Bottom]->v,dx,dy,XStep,XPrestep,YPrestep);
-		z.initialize(verts[Top]->z,verts[Bottom]->z,dx,dy,XStep,XPrestep,YPrestep);
-		for(int i=0;i<3;i++)
-			color[i].initialize(verts[Top]->fcolor[i],verts[Bottom]->fcolor[i],dx,dy,XStep,XPrestep,YPrestep);
-	}
-}
-
-FORCEINLINE int edge_fx_fl::Step() {
-	X += XStep; Y++; Height--;
-	doStepInterpolants();
-
-	ErrorTerm += Numerator;
-	if(ErrorTerm >= Denominator) {
-		X++;
-		ErrorTerm -= Denominator;
-		doStepExtraInterpolants();
-	}
-	return Height;
-}	
-
-//draws a single scanline
-FORCEINLINE static void drawscanline(edge_fx_fl *pLeft, edge_fx_fl *pRight)
-{
 		int XStart = pLeft->X;
 		int width = pRight->X - XStart;

@ -801,32 +774,35 @@ FORCEINLINE static void drawscanline(edge_fx_fl *pLeft, edge_fx_fl *pRight)
 			color[1] += dc_dx[1];
 			color[2] += dc_dx[2];
 		}
-}
+	}

-//runs several scanlines, until an edge is finished
-static void runscanlines(edge_fx_fl *left, edge_fx_fl *right)
-{
+	//runs several scanlines, until an edge is finished
+	template<bool SLI>
+	void runscanlines(edge_fx_fl *left, edge_fx_fl *right)
+	{
 		//do not overstep either of the edges
 		int Height = min(left->Height,right->Height);
 		while(Height--) {
+			if(!SLI || (left->Y & SLI_MASK) == SLI_VALUE)
 				drawscanline(left,right);
 			left->Step(); 
 			right->Step();
 		}
-}
+	}

-//rotates verts counterclockwise
-template<int type>
-INLINE static void rot_verts() {
+	
+	//rotates verts counterclockwise
+	template<int type>
+	INLINE void rot_verts() {
 		#define ROTSWAP(X) if(type>X) swap(verts[X-1],verts[X]);
 		ROTSWAP(1); ROTSWAP(2); ROTSWAP(3); ROTSWAP(4);
-	ROTSWAP(5); ROTSWAP(6); ROTSWAP(7);
-}
+		ROTSWAP(5); ROTSWAP(6); ROTSWAP(7); ROTSWAP(8); ROTSWAP(9);
+	}

-//rotate verts until vert0.y is minimum, and then vert0.x is minimum in case of ties
-//this is a necessary precondition for our shape engine
-template<int type>
-static void sort_verts(bool backwards) {
+	//rotate verts until vert0.y is minimum, and then vert0.x is minimum in case of ties
+	//this is a necessary precondition for our shape engine
+	template<int type>
+	void sort_verts(bool backwards) {
 		//if the verts are backwards, reorder them first
 		if(backwards)
 			for(int i=0;i<type/2;i++)
@ -837,7 +813,7 @@ static void sort_verts(bool backwards) {
 			//this was the only way we could get this to unroll
 			#define CHECKY(X) if(type>X) if(verts[0]->y > verts[X]->y) goto doswap;
 			CHECKY(1); CHECKY(2); CHECKY(3); CHECKY(4);
-		CHECKY(5); CHECKY(6); CHECKY(7);
+			CHECKY(5); CHECKY(6); CHECKY(7); CHECKY(8); CHECKY(9);
 			break;
 			
 		doswap:
@ -847,13 +823,14 @@ static void sort_verts(bool backwards) {
 		while(verts[0]->y == verts[1]->y && verts[0]->x > verts[1]->x)
 			rot_verts<type>();
 		
-}
+	}

-//This function can handle any convex N-gon up to octagons
-//verts must be clockwise.
-//I didnt reference anything for this algorithm but it seems like I've seen it somewhere before.
-static void shape_engine(int type, bool backwards)
-{
+	//This function can handle any convex N-gon up to decagons (10 verts)
+	//verts must be clockwise.
+	//I didnt reference anything for this algorithm but it seems like I've seen it somewhere before.
+	template<bool SLI>
+	void shape_engine(int type, bool backwards)
+	{
 		failure = false;

 		switch(type) {
@ -863,6 +840,8 @@ static void shape_engine(int type, bool backwards)
 			case 6: sort_verts<6>(backwards); break;
 			case 7: sort_verts<7>(backwards); break;
 			case 8: sort_verts<8>(backwards); break;
+			case 9: sort_verts<9>(backwards); break;
+			case 10: sort_verts<10>(backwards); break;
 			default: printf("skipping type %d\n",type); return;
 		}

@ -879,17 +858,23 @@ static void shape_engine(int type, bool backwards)
 			//so that they can be continued on down the shape
 			assert(rv != type);
 			int _lv = lv==type?0:lv; //make sure that we ask for vert 0 when the variable contains the starting value
-		if(step_left) left = edge_fx_fl(_lv,lv-1);
-		if(step_right) right = edge_fx_fl(rv,rv+1);
+			if(step_left) left = edge_fx_fl(_lv,lv-1,(VERT**)&verts);
+			if(step_right) right = edge_fx_fl(rv,rv+1,(VERT**)&verts);
 			step_left = step_right = false;

 			//handle a failure in the edge setup due to nutty polys
 			if(failure) 
 				return;
 			
+			if(left.Height<0 || right.Height<0)
+			{
+				//i have NO IDEA WHY THIS HAPPENS
+				//but i think it was corrupting things in a bad way
+				//which was only revealed by the multicored rasterizer
+				return;
+			}

-
-		runscanlines(&left,&right);
+			runscanlines<SLI>(&left,&right);

 			//if we ran out of an edge, step to the next one
 			if(right.Height == 0) {
@ -905,6 +890,72 @@ static void shape_engine(int type, bool backwards)
 			if(lv<=rv+1) break;
 		}

+	}
+
+	//Rasterizes every visible entry of clippedPolys[] using this unit's own state.
+	//SLI picks the scanline-interleaved path in runscanlines; it is true when several units share a frame.
+	template<bool SLI>
+	void mainLoop()
+	{
+		lastTexKey = NULL;
+
+		//written below but only read by the commented-out "skip setup when unchanged" tests
+		u32 lastPolyAttr = 0;
+		u32 lastTextureFormat = 0, lastTexturePalette = 0;
+
+		//iterate over polys
+		for(int i=0;i<clippedPolyCounter;i++)
+		{
+			//visibility was already decided for each poly before the units were started
+			if(!polyVisible[i]) continue;
+
+			GFX3D_Clipper::TClippedPoly &clippedPoly = clippedPolys[i];
+			POLY *poly = clippedPoly.poly;
+			int type = clippedPoly.type;
+
+			//if(i == 0 || lastPolyAttr != poly->polyAttr)
+			{
+				polyAttr.setup(poly->polyAttr);
+				polyAttr.translucent = poly->isTranslucent();
+				lastPolyAttr = poly->polyAttr;
+			}
+
+			//if(i == 0 || lastTextureFormat != poly->texParam || lastTexturePalette != poly->texPalette)
+			{
+				sampler.setup(poly->texParam);
+				lastTextureFormat = poly->texParam;
+				lastTexturePalette = poly->texPalette;
+			}
+
+			//the cache item that was looked up for this poly ahead of time
+			lastTexKey = polyTexKeys[i];
+
+			//hmm... shader gets setup every time because it depends on sampler which may have just changed
+			shader.setup(poly->polyAttr);
+
+			//point verts[] at this poly's clipped verts and null the unused slots
+			for(int j=0;j<type;j++)
+				this->verts[j] = &clippedPoly.clipVerts[j];
+			for(int j=type;j<MAX_CLIPPED_VERTS;j++)
+				this->verts[j] = NULL;
+			polyAttr.backfacing = polyBackfacing[i];
+
+			shape_engine<SLI>(type,!polyAttr.backfacing);
+		}
+	}
+
+
+}; //rasterizerUnit
+
+
+static Task rasterizerUnitTask[4];
+static RasterizerUnit rasterizerUnit[4];
+static int rasterizerCores;
+
+void* execRasterizerUnit(void* arg) //Task work function; arg is a rasterizerUnit[] index carried by value in the pointer
+{
+	s32 which = (s32)(size_t)arg; //narrow via size_t: a direct void*->s32 cast is rejected where pointers are 64 bits
+	rasterizerUnit[which].mainLoop<true>();
+	return 0;
 }

 static char SoftRastInit(void)
@ -912,6 +963,37 @@ static char SoftRastInit(void)
 	static bool tables_generated = false;
 	if(!tables_generated)
 	{
+		if(CommonSettings.num_cores>=4)
+		{
+			rasterizerCores = 4;
+			rasterizerUnit[0].SLI_MASK = 3;
+			rasterizerUnit[1].SLI_MASK = 3;
+			rasterizerUnit[2].SLI_MASK = 3;
+			rasterizerUnit[3].SLI_MASK = 3;
+			rasterizerUnit[0].SLI_VALUE = 0;
+			rasterizerUnit[1].SLI_VALUE = 1;
+			rasterizerUnit[2].SLI_VALUE = 2;
+			rasterizerUnit[3].SLI_VALUE = 3;
+			rasterizerUnitTask[0].start(false);
+			rasterizerUnitTask[1].start(false);
+			rasterizerUnitTask[2].start(false);
+			rasterizerUnitTask[3].start(false);
+		} else if(CommonSettings.num_cores>1)
+		{
+			rasterizerCores = 2;
+			rasterizerUnit[0].SLI_MASK = 1;
+			rasterizerUnit[1].SLI_MASK = 1;
+			rasterizerUnit[0].SLI_VALUE = 0;
+			rasterizerUnit[1].SLI_VALUE = 1;
+			rasterizerUnitTask[0].start(false);
+			rasterizerUnitTask[1].start(false);
+		} else {
+			rasterizerCores = 1;
+			rasterizerUnit[0].SLI_MASK = 0;
+			rasterizerUnit[0].SLI_VALUE = 0;
+		}
+
+
 		tables_generated = true;

 		clipper.clippedPolys = clippedPolys = new GFX3D_Clipper::TClippedPoly[POLYLIST_SIZE*2];
@ -942,8 +1024,6 @@ static char SoftRastInit(void)
 	}

 	TexCache_Reset();
-	TexCache_BindTexture = BindTexture;
-	TexCache_BindTextureData = BindTextureData;

 	printf("SoftRast Initialized\n");
 	return 1;
@ -969,7 +1049,7 @@ static void SoftRastFramebufferProcess()
 	// - the edges are completely sharp/opaque on the very brief title screen intro,
 	// - the level-start intro gets a pseudo-antialiasing effect around the silhouette,
 	// - the character edges in-level are clearly transparent, and also show well through shield powerups.
-	if(gfx3d.enableEdgeMarking)
+	if(gfx3d.enableEdgeMarking && CommonSettings.GFX3D_EdgeMark)
 	{ 
 		//TODO - need to test and find out whether these get grabbed at flush time, or at render time
 		//we can do this by rendering a 3d frame and then freezing the system, but only changing the edge mark colors
@ -1039,7 +1119,7 @@ static void SoftRastFramebufferProcess()
 		}
 	}

-	if(gfx3d.enableFog)
+	if(gfx3d.enableFog && CommonSettings.GFX3D_Fog)
 	{
 		u32 r = GFX3D_5TO6((gfx3d.fogColor)&0x1F);
 		u32 g = GFX3D_5TO6((gfx3d.fogColor>>5)&0x1F);
@ -1070,8 +1150,6 @@ static void SoftRastConvertFramebuffer()
 	memcpy(gfx3d_convertedScreen,screenColor,256*192*4);
 }

-
-
 static void SoftRastRender()
 {
 	Fragment clearFragment;
@ -1144,7 +1222,7 @@ static void SoftRastRender()
 	}

 	//setup fog variables (but only if fog is enabled)
-	if(gfx3d.enableFog)
+	if(gfx3d.enableFog && CommonSettings.GFX3D_Fog)
 	{
 		u8* fogDensity = MMU.MMU_MEM[ARMCPU_ARM9][0x40] + 0x360;
 #if 0
@ -1260,29 +1338,20 @@ static void SoftRastRender()
 		}
 	}

-	//a counter for how many polys got culled
-	int culled = 0;
-
-	u32 lastTextureFormat = 0, lastTexturePalette = 0, lastPolyAttr = 0;
-	
-	//iterate over polys
+	ADPCMCacheItem* lastTexKey = NULL;
+	u32 lastTextureFormat = 0, lastTexturePalette = 0;
 	bool needInitTexture = true;
 	for(int i=0;i<clippedPolyCounter;i++)
 	{
-		polynum = i;
-
 		GFX3D_Clipper::TClippedPoly &clippedPoly = clippedPolys[i];
 		POLY *poly = clippedPoly.poly;
 		int type = clippedPoly.type;

 		VERT* verts = &clippedPoly.clipVerts[0];

-		if(i == 0 || lastPolyAttr != poly->polyAttr)
-		{
+
+		RasterizerUnit::PolyAttr polyAttr;
 		polyAttr.setup(poly->polyAttr);
-			polyAttr.translucent = poly->isTranslucent();
-			lastPolyAttr = poly->polyAttr;
-		}

 		//HACK: backface culling
 		//this should be moved to gfx3d, but first we need to redo the way the lists are built
@ -1305,24 +1374,17 @@ static void SoftRastRender()
 		float facing = (verts[0].y + verts[n].y) * (verts[0].x - verts[n].x)
 					 + (verts[1].y + verts[0].y) * (verts[1].x - verts[0].x)
 					 + (verts[2].y + verts[1].y) * (verts[2].x - verts[1].x);
-		for(int i = 2; i < n; i++)
-			facing += (verts[i+1].y + verts[i].y) * (verts[i+1].x - verts[i].x);
-		polyAttr.backfacing = (facing < 0);
+		for(int j = 2; j < n; j++)
+			facing += (verts[j+1].y + verts[j].y) * (verts[j+1].x - verts[j].x);
+		polyBackfacing[i] = polyAttr.backfacing = (facing < 0);
 #endif

 		if(!polyAttr.isVisible(polyAttr.backfacing)) {
-			culled++;
+			polyVisible[i] = false;
 			continue;
 		}

-		if(needInitTexture || lastTextureFormat != poly->texParam || lastTexturePalette != poly->texPalette)
-		{
-			TexCache_SetTexture<TexFormat_15bpp>(poly->texParam,poly->texPalette);
-			sampler.setup(poly->texParam);
-			lastTextureFormat = poly->texParam;
-			lastTexturePalette = poly->texPalette;
-			needInitTexture = false;
-		}
+		polyVisible[i] = true;

 		//here is a hack which needs to be removed.
 		//at some point our shape engine needs these to be converted to "fixed point"
@ -1331,15 +1393,31 @@ static void SoftRastRender()
 			for(int k=0;k<2;k++)
 				verts[j].coord[k] = (float)iround(16.0f * verts[j].coord[k]);

-		//hmm... shader gets setup every time because it depends on sampler which may have just changed
-		shader.setup(poly->polyAttr);
-
-		for(int j=0;j<MAX_CLIPPED_VERTS;j++)
-			::verts[j] = &verts[j];
-
-		shape_engine(type,!polyAttr.backfacing);
+		//make sure all the textures we'll need are cached
+		if(needInitTexture || lastTextureFormat != poly->texParam || lastTexturePalette != poly->texPalette)
+		{
+			lastTexKey = TexCache_SetTexture(TexFormat_15bpp,poly->texParam,poly->texPalette);
+			lastTextureFormat = poly->texParam;
+			lastTexturePalette = poly->texPalette;
+			needInitTexture = false;
 		}

+		//printf("%08X %d\n",poly->texParam,rasterizerUnit[0].textures.currentNum);
+		polyTexKeys[i] = lastTexKey;
+	}
+
+	if(rasterizerCores==1)
+	{
+		rasterizerUnit[0].mainLoop<false>();
+	}
+	else
+	{
+		for(int i=0;i<rasterizerCores;i++) rasterizerUnitTask[i].execute(execRasterizerUnit,(void*)i);
+		for(int i=0;i<rasterizerCores;i++) rasterizerUnitTask[i].finish();
+	}
+
+	TexCache_EvictFrame();
+
 	SoftRastFramebufferProcess();

 	//	printf("rendered %d of %d polys after backface culling\n",gfx3d.polylist->count-culled,gfx3d.polylist->count);
@ -1354,3 +1432,4 @@ GPU3DInterface gpu3DRasterize = {
 	SoftRastRender,
 	SoftRastVramReconfigureSignal,
 };
+
--- a/desmume/src/rasterize.h
+++ b/desmume/src/rasterize.h
@ -1,8 +1,4 @@
-/*  Copyright (C) 2006 yopyop
-    yopyop156@ifrance.com
-    yopyop156.ifrance.com
-
-	Copyright 2009 DeSmuME team
+/*  Copyright 2009 DeSmuME team

    This file is part of DeSmuME

--- a/desmume/src/texcache.cpp
+++ b/desmume/src/texcache.cpp
@ -1,7 +1,8 @@
-#include "texcache.h"
-
 #include <string.h>
 #include <algorithm>
+#include <assert.h>
+
+#include "texcache.h"

 #include "bits.h"
 #include "common.h"
@ -15,6 +16,8 @@ using std::max;
 //only dump this from ogl renderer. for now, softrasterizer creates things in an incompatible pixel format
 //#define DEBUG_DUMP_TEXTURE

+#define CONVERT(color,alpha) ((TEXFORMAT == TexFormat_32bpp)?(RGB15TO32(color,alpha)):RGB15TO6665(color,alpha))
+
 //This class represents a number of regions of memory which should be viewed as contiguous
 class MemSpan
 {
@ -54,6 +57,8 @@ public:
 		return 0;
 	}

+	//TODO - get rid of duplication between these two methods.
+
 	//dumps the memspan to the specified buffer
 	//you may set size to limit the size to be copied
 	int dump(void* buf, int size=-1)
@ -160,12 +165,6 @@ static MemSpan MemSpan_TexPalette(u32 ofs, u32 len)
 	return ret;
 }

-TextureCache *texcache;
-u32 texcache_start;
-u32 texcache_stop;
-u8 *TexCache_texMAP = NULL;
-
-
 #if defined (DEBUG_DUMP_TEXTURE) && defined (WIN32)
 #define DO_DEBUG_DUMP_TEXTURE
 static void DebugDumpTexture(int which)
@ -178,28 +177,59 @@ static void DebugDumpTexture(int which)
 #endif


-static int lastTexture = -1;

-#define CONVERT(color,alpha) ((TEXFORMAT == TexFormat_32bpp)?(RGB15TO32(color,alpha)):RGB15TO6665(color,alpha))
-
-template<TexCache_TexFormat TEXFORMAT>
-void TexCache_SetTexture(u32 format, u32 texpal)
+//notes on the cache:
+//I am really unhappy with the ref counting. this needs to be automatic.
+//We could do something better than a linear search through cache items, but it may not be worth it.
+//Also we may need to rescan more often (every time a sample loops)
+class ADPCMCache
 {
+public:
+	ADPCMCache()
+		: list_front(NULL)
+		, list_back(NULL)
+		, cache_size(0)
+	{}
+
+	ADPCMCacheItem *list_front, *list_back;
+
+	//this ought to be enough for anyone
+	static const u32 kMaxCacheSize = 64*1024*1024; 
+	//this is not really precise, it is off by a constant factor
+	u32 cache_size;
+
+	void list_remove(ADPCMCacheItem* item) { //unlinks item; item's own next/prev are left as they were
+		if(list_back == item) list_back = item->prev;   //item was the tail
+		if(list_front == item) list_front = item->next; //item was the head
+		if(item->prev != NULL) item->prev->next = item->next;
+		if(item->next != NULL) item->next->prev = item->prev;
+	}
+
+	void list_push_front(ADPCMCacheItem* item)
+	{
+		ADPCMCacheItem* const oldHead = list_front;
+		if(oldHead != NULL) oldHead->prev = item; else list_back = item; //an empty list gets item as its tail too
+		item->next = oldHead;
+		item->prev = NULL;
+		list_front = item; //item is the new head
+	}
+
+	template<TexCache_TexFormat TEXFORMAT>
+	ADPCMCacheItem* scan(u32 format, u32 texpal)
+	{
 		//for each texformat, number of palette entries
-	const int palSizes[] = {0, 32, 4, 16, 256, 0, 8, 0};
+		static const int palSizes[] = {0, 32, 4, 16, 256, 0, 8, 0};

 		//for each texformat, multiplier from numtexels to numbytes (fixed point 30.2)
-	const int texSizes[] = {0, 4, 1, 2, 4, 1, 4, 8};
+		static const int texSizes[] = {0, 4, 1, 2, 4, 1, 4, 8};

 		//used to hold a copy of the palette specified for this texture
 		u16 pal[256];

-	u32 *dwdst = (u32*)TexCache_texMAP;
-	
 		u32 textureMode = (unsigned short)((format>>26)&0x07);
-	unsigned int sizeX=(8 << ((format>>20)&0x07));
-	unsigned int sizeY=(8 << ((format>>23)&0x07));
-	unsigned int imageSize = sizeX*sizeY;
+		u32 sizeX=(8 << ((format>>20)&0x07));
+		u32 sizeY=(8 << ((format>>23)&0x07));
+		u32 imageSize = sizeX*sizeY;

 		u8 *adr;

@ -246,110 +276,101 @@ void TexCache_SetTexture(u32 format, u32 texpal)
 		//dump the palette to a temp buffer, so that we don't have to worry about memory mapping.
 		//this isnt such a problem with texture memory, because we read sequentially from it.
 		//however, we read randomly from palette memory, so the mapping is more costly.
-#ifdef WORDS_BIGENDIAN
+		#ifdef WORDS_BIGENDIAN
 			mspal.dump16(pal);
-#else
+		#else
 			mspal.dump(pal);
-#endif
+		#endif

-
-	u32 tx=texcache_start;
-
-	//if(false)
-	while (TRUE)
+		for(ADPCMCacheItem* curr = list_front;curr;curr=curr->next)
 		{
-		//conditions where we give up and regenerate the texture:
-		if (texcache_stop == tx) break;
-		if (texcache[tx].frm == 0) break;
-
 			//conditions where we reject matches:
 			//when the teximage or texpal params dont match 
-		//(this is our key for identifying palettes in the cache)
-		if (texcache[tx].frm != format) goto REJECT;
-		if (texcache[tx].pal != texpal) goto REJECT;
+			//(this is our key for identifying textures in the cache)
+			if(curr->texformat != format) continue;
+			if(curr->texpal != texpal) continue;

+			//we're being asked for a different format than what we had cached.
+			if(curr->cacheFormat != TEXFORMAT) goto REJECT;
+
+			//not used anymore -- add another method to purge suspicious items from the cache
 			//the texture matches params, but isnt suspected invalid. accept it.
-		if (!texcache[tx].suspectedInvalid) goto ACCEPT;
-
-		//if we couldnt cache this entire texture due to it being too large, then reject it
-		if (texSize+indexSize > (int)sizeof(texcache[tx].dump.texture)) goto REJECT;
+			if (!curr->suspectedInvalid) return curr;

 			//when the palettes dont match:
 			//note that we are considering 4x4 textures to have a palette size of 0.
 			//they really have a potentially HUGE palette, too big for us to handle like a normal palette,
 			//so they go through a different system
-		if (mspal.size != 0 && memcmp(texcache[tx].dump.palette,pal,mspal.size)) goto REJECT;
+			if(mspal.size != 0 && memcmp(curr->dump.palette,pal,mspal.size)) goto REJECT;

 			//when the texture data doesn't match
-		if(ms.memcmp(texcache[tx].dump.texture,sizeof(texcache[tx].dump.texture))) goto REJECT;
+			if(ms.memcmp(curr->dump.texture,sizeof(curr->dump.texture))) goto REJECT;

 			//if the texture is 4x4 then the index data must match
 			if(textureMode == TEXMODE_4X4)
 			{
-			if(msIndex.memcmp(texcache[tx].dump.texture + texcache[tx].dump.textureSize,texcache[tx].dump.indexSize)) goto REJECT; 
+				if(msIndex.memcmp(curr->dump.texture + curr->dump.textureSize,curr->dump.indexSize)) goto REJECT; 
 			}

+			//we found a match. just return it
+			//curr->lock();
+			list_remove(curr);
+			list_push_front(curr);
+			return curr;

-ACCEPT:
-		texcache[tx].suspectedInvalid = false;
-		if(lastTexture == -1 || (int)tx != lastTexture)
-		{
-			lastTexture = tx;
-			if(TexCache_BindTexture)
-				TexCache_BindTexture(tx);
-		}
-		return;
- 
-REJECT:
-		tx++;
-		if ( tx > MAX_TEXTURE )
-		{
-			texcache_stop=texcache_start;
-			texcache[texcache_stop].frm=0;
-			texcache_start++;
-			if (texcache_start>MAX_TEXTURE) 
-			{
-				texcache_start=0;
-				texcache_stop=MAX_TEXTURE<<1;
-			}
-			tx=0;
-		}
+		REJECT:
+			//we found a cached item for the current address, but the data is stale.
+			//for a variety of complicated reasons, we need to throw it out right this instant.
+			list_remove(curr);
+			delete curr;
+			break;
 		}

-	lastTexture = tx;
-	//glBindTexture(GL_TEXTURE_2D, texcache[tx].id);
+		//item was not found. recruit an existing one (the oldest), or create a new one
+		//evict(); //reduce the size of the cache if necessary
+		//TODO - as a peculiarity of the texcache, eviction must happen after the entire 3d frame runs
+		//to support separate cache and read passes
+		ADPCMCacheItem* newitem = new ADPCMCacheItem();
+		list_push_front(newitem);
+		//newitem->lock();
+		newitem->suspectedInvalid = false;
+		newitem->texformat = format;
+		newitem->cacheFormat = TEXFORMAT;
+		newitem->texpal = texpal;
+		newitem->sizeX=sizeX;
+		newitem->sizeY=sizeY;
+		newitem->invSizeX=1.0f/((float)(sizeX));
+		newitem->invSizeY=1.0f/((float)(sizeY));
+		newitem->dump.textureSize = ms.dump(newitem->dump.texture,sizeof(newitem->dump.texture));
+		newitem->decode_len = sizeX*sizeY*4;
+		newitem->mode = textureMode;
+		cache_size += newitem->decode_len;
+		newitem->decoded = new u8[newitem->decode_len];
+		
+		u32 *dwdst = (u32*)newitem->decoded;

-	texcache[tx].suspectedInvalid = false;
-	texcache[tx].frm=format;
-	texcache[tx].mode=textureMode;
-	texcache[tx].pal=texpal;
-	texcache[tx].sizeX=sizeX;
-	texcache[tx].sizeY=sizeY;
-	texcache[tx].invSizeX=1.0f/((float)(sizeX));
-	texcache[tx].invSizeY=1.0f/((float)(sizeY));
-	texcache[tx].dump.textureSize = ms.dump(texcache[tx].dump.texture,sizeof(texcache[tx].dump.texture));
 		
 		//dump palette data for cache keying
-	if ( palSize )
+		if(palSize)
 		{
-		memcpy(texcache[tx].dump.palette, pal, palSize*2);
+			memcpy(newitem->dump.palette, pal, palSize*2);
 		}
 		//dump 4x4 index data for cache keying
-	texcache[tx].dump.indexSize = 0;
+		newitem->dump.indexSize = 0;
 		if(textureMode == TEXMODE_4X4)
 		{
-		texcache[tx].dump.indexSize = min(msIndex.size,(int)sizeof(texcache[tx].dump.texture) - texcache[tx].dump.textureSize);
-		msIndex.dump(texcache[tx].dump.texture+texcache[tx].dump.textureSize,texcache[tx].dump.indexSize);
+			newitem->dump.indexSize = min(msIndex.size,(int)sizeof(newitem->dump.texture) - newitem->dump.textureSize);
+			msIndex.dump(newitem->dump.texture+newitem->dump.textureSize,newitem->dump.indexSize);
 		}

+		//============================================================================ 
+		//Texture conversion
+		//============================================================================ 

-	//INFO("Texture %03i - format=%08X; pal=%04X (mode %X, width %04i, height %04i)\n",i, texcache[i].frm, texcache[i].pal, texcache[i].mode, sizeX, sizeY);
-
-	//============================================================================ Texture conversion
 		const u32 opaqueColor = TEXFORMAT==TexFormat_32bpp?255:31;
 		u32 palZeroTransparent = (1-((format>>29)&1))*opaqueColor;

-	switch (texcache[tx].mode)
+		switch (newitem->mode)
 		{
 		case TEXMODE_A3I5:
 			{
@ -366,9 +387,9 @@ REJECT:
 						adr++;
 					}
 				}
-
 				break;
 			}
+
 		case TEXMODE_I2:
 			{
 				for(int j=0;j<ms.numItems;j++) {
@ -443,20 +464,20 @@ REJECT:
 				//this check isnt necessary since the addressing is tied to the texture data which will also run out:
 				//if(msIndex.numItems != 1) PROGINFO("Your 4x4 texture index has overrun its slot.\n");

-#define PAL4X4(offset) ( *(u16*)( MMU.texInfo.texPalSlot[((paletteAddress + (offset)*2)>>14)] + ((paletteAddress + (offset)*2)&0x3FFF) ) )
+	#define PAL4X4(offset) ( *(u16*)( MMU.texInfo.texPalSlot[((paletteAddress + (offset)*2)>>14)] + ((paletteAddress + (offset)*2)&0x3FFF) ) )

 				u16* slot1;
 				u32* map = (u32*)ms.items[0].ptr;
 				u32 limit = ms.items[0].len<<2;
 				u32 d = 0;
-			if ( (texcache[tx].frm & 0xc000) == 0x8000)
+				if ( (format & 0xc000) == 0x8000)
 					// texel are in slot 2
-				slot1=(u16*)&MMU.texInfo.textureSlotAddr[1][((texcache[tx].frm & 0x3FFF)<<2)+0x010000];
+					slot1=(u16*)&MMU.texInfo.textureSlotAddr[1][((format & 0x3FFF)<<2)+0x010000];
 				else 
-				slot1=(u16*)&MMU.texInfo.textureSlotAddr[1][(texcache[tx].frm & 0x3FFF)<<2];
+					slot1=(u16*)&MMU.texInfo.textureSlotAddr[1][(format & 0x3FFF)<<2];

-			u16 yTmpSize = (texcache[tx].sizeY>>2);
-			u16 xTmpSize = (texcache[tx].sizeX>>2);
+				u16 yTmpSize = (sizeY>>2);
+				u16 xTmpSize = (sizeX>>2);

 				//this is flagged whenever a 4x4 overruns its slot.
 				//i am guessing we just generate black in that case
@ -464,8 +485,8 @@ REJECT:

 				for (int y = 0; y < yTmpSize; y ++)
 				{
-				u32 tmpPos[4]={(y<<2)*texcache[tx].sizeX,((y<<2)+1)*texcache[tx].sizeX,
-					((y<<2)+2)*texcache[tx].sizeX,((y<<2)+3)*texcache[tx].sizeX};
+					u32 tmpPos[4]={(y<<2)*sizeX,((y<<2)+1)*sizeX,
+						((y<<2)+2)*sizeX,((y<<2)+3)*sizeX};
 					for (int x = 0; x < xTmpSize; x ++, d++)
 					{
 						if(d >= limit)
@ -598,56 +619,85 @@ REJECT:
 				}
 				break;
 			}
-	}
+		} //switch(texture format)

-	if(TexCache_BindTextureData != 0)
-		TexCache_BindTextureData(tx,TexCache_texMAP);
+	/*if(user)
+		user->BindTextureData(tx,TexCache_texMAP);

 #ifdef DO_DEBUG_DUMP_TEXTURE
 	DebugDumpTexture(tx);
-#endif
+#endif*/

-}
+		return newitem;
+
+
+	} //scan()
+
+	void evict(const u32 target = kMaxCacheSize) {
+		//Deletes items from the back of the list until cache_size is no larger than target.
+		//The limit is only applied here, so the cache can overshoot it between calls.
+		//Items with lockCount>0 are never deleted.
+		while(target < cache_size)
+		{
+			//search from the tail towards the head for a victim nobody has locked
+			ADPCMCacheItem *victim = list_back;
+			for(; victim != NULL; victim = victim->prev)
+				if(victim->lockCount <= 0) break;
+
+			//everything that is left is locked (maybe we're leaking): stop trying
+			if(victim == NULL) return;
+
+			list_remove(victim);
+			cache_size -= victim->decode_len;
+			//printf("evicting! totalsize:%d\n",cache_size);
+			delete victim;
+		}
+	}
+} adpcmCache;

 void TexCache_Reset()
 {
-	if(TexCache_texMAP == NULL) TexCache_texMAP = new u8[1024*2048*4]; 
-	if(texcache == NULL) texcache = new TextureCache[MAX_TEXTURE+1];
+	//if(TexCache_texMAP == NULL) TexCache_texMAP = new u8[1024*2048*4]; 
+	//if(texcache == NULL) texcache = new TextureCache[MAX_TEXTURE+1];

-	memset(texcache,0,sizeof(TextureCache[MAX_TEXTURE+1]));
+	//memset(texcache,0,sizeof(TextureCache[MAX_TEXTURE+1]));

-	texcache_start=0;
-	texcache_stop=MAX_TEXTURE<<1;
-}
-
-TextureCache* TexCache_Curr()
-{
-	if(lastTexture == -1)
-		return NULL;
-	else return &texcache[lastTexture];
+	//texcache_start=0;
+	//texcache_stop=MAX_TEXTURE<<1;
+	adpcmCache.evict(0);
 }

 void TexCache_Invalidate()
 {
-	//well, this is a very blunt instrument.
-	//lets just flag all the textures as invalid.
-	for(int i=0;i<MAX_TEXTURE+1;i++) {
-		texcache[i].suspectedInvalid = true;
+	////well, this is a very blunt instrument.
+	////lets just flag all the textures as invalid.
+	//for(int i=0;i<MAX_TEXTURE+1;i++) {
+	//	texcache[i].suspectedInvalid = true;

-		//invalidate all 4x4 textures when texture palettes change mappings
-		//this is necessary because we arent tracking 4x4 texture palettes to look for changes.
-		//Although I concede this is a bit paranoid.. I think the odds of anyone changing 4x4 palette data
-		//without also changing the texture data is pretty much zero.
-		//
-		//TODO - move this to a separate signal: split into TexReconfigureSignal and TexPaletteReconfigureSignal
-		if(texcache[i].mode == TEXMODE_4X4)
-			texcache[i].frm = 0;
+	//	//invalidate all 4x4 textures when texture palettes change mappings
+	//	//this is necessary because we arent tracking 4x4 texture palettes to look for changes.
+	//	//Although I concede this is a bit paranoid.. I think the odds of anyone changing 4x4 palette data
+	//	//without also changing the texture data is pretty much zero.
+	//	//
+	//	//TODO - move this to a separate signal: split into TexReconfigureSignal and TexPaletteReconfigureSignal
+	//	if(texcache[i].mode == TEXMODE_4X4)
+	//		texcache[i].frm = 0;
+	//}
+	adpcmCache.evict(0);
+}
+
+ADPCMCacheItem* TexCache_SetTexture(TexCache_TexFormat TEXFORMAT, u32 format, u32 texpal) //maps the runtime TEXFORMAT onto the matching adpcmCache.scan<> instantiation
+{
+	switch(TEXFORMAT)
+	{
+	case TexFormat_32bpp: return adpcmCache.scan<TexFormat_32bpp>(format,texpal);
+	case TexFormat_15bpp: return adpcmCache.scan<TexFormat_15bpp>(format,texpal);
+	default: assert(false); return NULL; //any other value (e.g. TexFormat_None) is a caller error
 	}
 }

-void (*TexCache_BindTexture)(u32 texnum) = NULL;
-void (*TexCache_BindTextureData)(u32 texnum, u8* data);
-
-//these templates needed to be instantiated manually
-template void TexCache_SetTexture<TexFormat_32bpp>(u32 format, u32 texpal);
-template void TexCache_SetTexture<TexFormat_15bpp>(u32 format, u32 texpal);
+//trims the cache back to its size limit; the software rasterizer calls this after each frame's polys are drawn
+void TexCache_EvictFrame()
+{
+	adpcmCache.evict(ADPCMCache::kMaxCacheSize);
+}
--- a/desmume/src/texcache.h
+++ b/desmume/src/texcache.h
@ -5,47 +5,62 @@

 enum TexCache_TexFormat
 {
-	TexFormat_32bpp,
-	TexFormat_15bpp
+	TexFormat_None, //used when nothing yet is cached
+	TexFormat_32bpp, //used by ogl renderer
+	TexFormat_15bpp //used by rasterizer
 };

-#define MAX_TEXTURE 500
-
-
-struct CACHE_ALIGN TextureCache
+class ADPCMCacheItem
 {
-	u32					id;
-	u32					frm;
+public:
+	ADPCMCacheItem() 
+		: decoded(NULL)
+		, decode_len(0)
+		, next(NULL)
+		, prev(NULL)
+		, lockCount(0)
+		, cacheFormat(TexFormat_None)
+		, deleteCallback(NULL)
+		, suspectedInvalid(false)
+	{}
+	~ADPCMCacheItem() {
+		delete[] decoded;
+		if(deleteCallback) deleteCallback(this);
+	}
+	void unlock() { 
+		lockCount--;
+	}
+	void lock() { 
+		lockCount++;
+	}
+	u32 decode_len;
 	u32 mode;
-	u32					pal;
-	u32					sizeX;
-	u32					sizeY;
-	float				invSizeX;
-	float				invSizeY;
+	u8* decoded; //decoded texture data
+	ADPCMCacheItem *next, *prev; //double linked list
+	int lockCount;
+	bool suspectedInvalid;

+	u32 texformat, texpal;
+	u32 sizeX, sizeY;
+	float invSizeX, invSizeY;
+
+	void* texid; //used by ogl renderer for the texid
+	void (*deleteCallback)(ADPCMCacheItem*);
+
+	TexCache_TexFormat cacheFormat;
+
+	//TODO - this is a little wasteful
 	struct {
 		int					textureSize, indexSize;
 		u8					texture[128*1024]; // 128Kb texture slot
 		u8					palette[256*2];
 	} dump;
-
-	//set if this texture is suspected be invalid due to a vram reconfigure
-	bool				suspectedInvalid;
 };

-extern TextureCache	*texcache;
-
-extern void (*TexCache_BindTexture)(u32 texnum);
-extern void (*TexCache_BindTextureData)(u32 texnum, u8* data);
-
-void TexCache_Reset();
-
-template<TexCache_TexFormat>
-void TexCache_SetTexture(u32 format, u32 texpal);
-
 void TexCache_Invalidate();
+void TexCache_Reset();
+void TexCache_EvictFrame();

-extern u8 *TexCache_texMAP;
-TextureCache* TexCache_Curr();
+ADPCMCacheItem* TexCache_SetTexture(TexCache_TexFormat TEXFORMAT, u32 format, u32 texpal);

 #endif
--- a/desmume/src/utils/task.cpp
+++ b/desmume/src/utils/task.cpp
@ -0,0 +1,279 @@
+/*  Copyright 2009 DeSmuME team
+
+    This file is part of DeSmuME
+
+    DeSmuME is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    DeSmuME is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with DeSmuME; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA
+*/
+
+#include "task.h"
+
+#ifdef _WIN32
+
+#include <windows.h>
+#include <stdio.h>
+
+class Task::Impl {
+public:
+	Impl();
+	~Impl();
+	//true: both threads poll the b* flags with Sleep(0); false: they block on the events instead
+	bool spinlock;
+	//creates the two events and the worker thread
+	void start(bool spinlock);
+
+	//execute some work
+	void execute(const TWork &work, void* param);
+
+	//wait for the work to complete
+	void* finish();
+	//thread entry point handed to CreateThread, and the worker loop it forwards to
+	static DWORD __stdcall s_taskProc(void *ptr);
+	void taskProc();
+	void init(); //NOTE(review): declared, but no definition appears in this win32 section
+
+	//the work function that shall be executed
+	TWork work;
+	void* param; //argument for work; the worker overwrites it with work's return value
+	//events are waited on in blocking mode, the volatile flags in polling mode; bKill ends the worker loop
+	HANDLE incomingWork, workDone, hThread;
+	volatile bool bIncomingWork, bWorkDone, bKill;
+	bool bStarted; //set by start(); the destructor does nothing unless it is true
+};
+
+static void* killTask(void* task) //work function queued by ~Impl to make the worker loop exit
+{
+	static_cast<Task::Impl*>(task)->bKill = true;
+	return 0;
+}
+
+Task::Impl::~Impl()
+{
+	if(!bStarted) return; //start() was never called, so there is no thread or event to clean up
+
+	execute(killTask,this); //ask the worker to leave its loop
+	finish();
+	WaitForSingleObject(hThread,INFINITE); //the worker still touches our members after signalling completion
+	CloseHandle(incomingWork);
+	CloseHandle(workDone);
+	CloseHandle(hThread);
+}
+
+Task::Impl::Impl() //gives every member a defined value so nothing reads an indeterminate handle, flag or pointer
+	: spinlock(false)
+	, work(NULL), param(NULL)
+	, incomingWork(NULL), workDone(NULL), hThread(NULL)
+	, bIncomingWork(false), bWorkDone(true), bKill(false)
+	, bStarted(false)
+{
+}
+
+DWORD __stdcall Task::Impl::s_taskProc(void *ptr)
+{
+	//thread entry point: ptr is the Impl given to CreateThread, so just pass the buck to its worker loop
+	static_cast<Task::Impl*>(ptr)->taskProc();
+	return 0;
+}
+
+void Task::Impl::taskProc() //worker loop; runs on the thread created in start()
+{
+	for(;;) {
+		if(bKill) break; //set by killTask, which itself arrives as a work item
+		
+		//wait for a chunk of work
+		if(spinlock) while(!bIncomingWork) Sleep(0); 
+		else WaitForSingleObject(incomingWork,INFINITE); 
+		
+		bIncomingWork = false; 
+		//execute the work
+		param = work(param); //the result replaces param and is what finish() returns
+		//signal completion
+		if(!spinlock) SetEvent(workDone); 
+		bWorkDone = true;
+	}
+}
+
+void Task::Impl::start(bool spinlock) //creates the worker thread and the events it synchronises on
+{
+	Impl::spinlock = spinlock;
+	workDone = CreateEvent(NULL,FALSE,FALSE,NULL);     //auto-reset, initially unsignalled
+	incomingWork = CreateEvent(NULL,FALSE,FALSE,NULL); //auto-reset, initially unsignalled
+	hThread = CreateThread(NULL,0,&Impl::s_taskProc,this,0,NULL); //the worker receives this Impl as its argument
+	bStarted = true; //tells ~Impl there is a thread to shut down
+}
+
+void Task::Impl::execute(const TWork &work, void* param) //hands one work item to the worker; does not wait for it
+{
+	//setup the work
+	//NOTE(review): nothing checks that a previous item has finished; callers presumably pair each execute() with finish()
+	this->work = work;
+	this->param = param;
+	bWorkDone = false; //cleared before the worker is woken so finish() cannot see a stale completion
+	//signal it to start
+	if(!spinlock) SetEvent(incomingWork); 
+	bIncomingWork = true;
+}
+
+void* Task::Impl::finish()
+{
+	if(!spinlock) {
+		WaitForSingleObject(workDone,INFINITE); //block until the worker signals completion
+	} else {
+		while(!bWorkDone) Sleep(0); //poll the completion flag
+	}
+	return param; //the worker stored the work function's result here
+}
+
+#else
+
+//just a stub impl that doesnt actually do any threading.
+//somebody needs to update the pthread implementation below
+class Task::Impl {
+public:
+	Impl() : ret(0) {} //so finish() before any execute() returns null rather than an indeterminate pointer
+	~Impl() {}
+
+	void start(bool spinlock) {} //nothing to start: work runs on the caller's thread
+
+	void* ret; //result of the most recent work function
+	void execute(const TWork &work, void* param) { ret = work(param); } //runs the work synchronously
+	
+	void* finish() { return ret; } //the work already ran inside execute()
+};
+
+
+/*
+#include <pthread.h>
+
+class Task::Impl {
+public:
+	Impl();
+
+	//execute some work
+	void execute(const TWork &work, void* param);
+
+	//wait for the work to complete
+	void* finish();
+
+	pthread_t thread;
+	static void* s_taskProc(void *ptr);
+	void taskProc();
+	void init();
+
+	//the work function that shall be executed
+	TWork work;
+	void* param;
+
+	bool initialized;
+
+	struct WaitEvent
+	{
+		WaitEvent() 
+			: condition(PTHREAD_COND_INITIALIZER)
+			, mutex(PTHREAD_MUTEX_INITIALIZER)
+			, value(false)
+		{}
+		pthread_mutex_t mutex;
+		pthread_cond_t condition;
+		bool value;
+
+		//waits for the WaitEvent to be set
+		void waitAndClear()
+		{ 
+			lock();
+			if(!value)
+				pthread_cond_wait( &condition, &mutex );
+			value = false;
+			unlock();
+		}
+
+		//sets the WaitEvent
+		void signal()
+		{
+			lock();
+			if(!value) {
+				value = true;
+				pthread_cond_signal( &condition );
+			}
+			unlock();
+		}
+
+		//locks the condition's mutex
+		void lock() { pthread_mutex_lock(&mutex); }
+		
+		//unlocks the condition's mutex
+		void unlock() { pthread_mutex_unlock( &mutex ); }
+
+	} incomingWork, workDone;
+
+};
+
+Task::Impl::Impl()
+	: work(NULL)
+	, initialized(false)
+{
+}
+
+void* Task::Impl::s_taskProc(void *ptr)
+{
+	//just pass the buck to the instance method
+	((Task::Impl*)ptr)->taskProc();
+	return 0;
+}
+
+void Task::Impl::taskProc()
+{
+	for(;;) {
+		//wait for a chunk of work
+		incomingWork.waitAndClear();
+		//execute the work
+		param = work(param);
+		//signal completion
+		workDone.signal();
+	}
+}
+
+void Task::Impl::init()
+{
+	pthread_create( &thread, NULL, Task::Impl::s_taskProc, (void*)this );     
+	initialized = true;
+}
+
+void Task::Impl::execute(const TWork &work, void* param) 
+{
+	//initialization is deferred to the first execute to give win32 time to startup
+	if(!initialized) init();
+	//setup the work
+	this->work = work;
+	this->param = param;
+	//signal it to start
+	incomingWork.signal();
+}
+
+void* Task::Impl::finish()
+{
+	//just wait for the work to be done
+	workDone.waitAndClear();
+	return param;
+}
+*/
+
+#endif
+
+Task::Task() : impl(new Task::Impl()) {} //each Task owns exactly one Impl
+Task::~Task() { delete impl; }
+void Task::start(bool spinlock) { this->impl->start(spinlock); }
+void Task::execute(const TWork &work, void* param) { this->impl->execute(work,param); }
+void* Task::finish() { return this->impl->finish(); }
+
+
--- a/desmume/src/utils/task.h
+++ b/desmume/src/utils/task.h
@ -0,0 +1,46 @@
+/*  Copyright 2009 DeSmuME team
+
+    This file is part of DeSmuME
+
+    DeSmuME is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    DeSmuME is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with DeSmuME; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA
+*/
+
+#ifndef _TASK_H_
+#define _TASK_H_
+
+//Sort of like a single-thread thread pool.
+//You hand it a worker function and then call finish() to synch with its completion.
+//NOTE: a Task owns its Impl through a raw pointer (allocated by the constructor, deleted by
+//the destructor), so Task objects must not be copied.
+class Task
+{
+public:
+	Task();
+	~Task();
+	
+	//a unit of work: takes an opaque pointer argument and returns an opaque pointer result
+	typedef void * (*TWork)(void *);
+
+	//forwards the flag to the implementation's start()
+	//NOTE(review): presumably chooses between spinning and blocking waits -- confirm in task.cpp
+	void start(bool spinlock);
+
+	//execute some work: work is the function to run, param is the pointer passed along with it
+	void execute(const TWork &work, void* param);
+
+	//wait for the work to complete
+	//NOTE(review): the returned pointer is presumably the work function's result -- confirm in task.cpp
+	void* finish();
+
+	//opaque implementation that the methods above forward to
+	class Impl;
+	Impl *impl;
+
+};
+
+
+#endif
--- a/desmume/src/windows/DeSmuME_2005.vcproj
+++ b/desmume/src/windows/DeSmuME_2005.vcproj
@ -643,10 +643,6 @@
 				RelativePath=".\aviout.h"
 				>
 			</File>
-			<File
-				RelativePath=".\buildconfig.h"
-				>
-			</File>
 			<File
 				RelativePath=".\cheatsWin.cpp"
 				>
@ -971,6 +967,14 @@
 				RelativePath="..\utils\md5.h"
 				>
 			</File>
+			<File
+				RelativePath="..\utils\task.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\utils\task.h"
+				>
+			</File>
 			<File
 				RelativePath="..\utils\valuearray.h"
 				>
--- a/desmume/src/windows/DeSmuME_2008.vcproj
+++ b/desmume/src/windows/DeSmuME_2008.vcproj
@ -1019,6 +1019,14 @@
 					RelativePath="..\utils\md5.h"
 					>
 				</File>
+				<File
+					RelativePath="..\utils\task.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\utils\task.h"
+					>
+				</File>
 				<File
 					RelativePath="..\utils\valuearray.h"
 					>
--- a/desmume/src/windows/main.cpp
+++ b/desmume/src/windows/main.cpp
@ -1002,7 +1002,7 @@ static void DoDisplay(bool firstTime)
 		//on single core systems, draw straight to the screen
 		//we only do this once per emulated frame because we don't want to waste time redrawing
 		//on such lousy computers
-		if(CommonSettings.single_core)
+		if(CommonSettings.single_core())
 		{
 			aggDraw.hud->attach((u8*)video.buffer, 256, 384, 1024);
 			DoDisplay_DrawHud();
@ -1025,7 +1025,7 @@ static void DoDisplay(bool firstTime)
 	//apply user's filter
 	video.filter();

-	if(!CommonSettings.single_core)
+	if(!CommonSettings.single_core())
 	{
 		//draw and composite the OSD (but not if we are drawing osd straight to screen)
 		DoDisplay_DrawHud();
@ -1081,7 +1081,7 @@ void KillDisplay()

 void Display()
 {
-	if(CommonSettings.single_core)
+	if(CommonSettings.single_core())
 	{
 		video.srcBuffer = (u8*)GPU_screen;
 		DoDisplay(true);
@ -1229,7 +1229,7 @@ static void StepRunLoop_Paused()
 	Sleep(100);

 	// periodically update single-core OSD when paused and in the foreground
-	if(CommonSettings.single_core && GetActiveWindow() == mainLoopData.hwnd)
+	if(CommonSettings.single_core() && GetActiveWindow() == mainLoopData.hwnd)
 	{
 		video.srcBuffer = (u8*)GPU_screen;
 		DoDisplay(true);
@ -1718,7 +1718,7 @@ class WinDriver : public BaseDriver
 		// in multi-core mode now the display thread will probably
 		// wait for an invocation in this thread to happen,
 		// so handle that ASAP
-		if(!CommonSettings.single_core)
+		if(!CommonSettings.single_core())
 		{
 			ResetEvent(display_invoke_ready_event);
 			SetEvent(display_wakeup_event);
@ -1844,11 +1844,7 @@ int _main()
 	//this helps give a substantial speedup for singlecore users
 	SYSTEM_INFO systemInfo;
 	GetSystemInfo(&systemInfo);
-	if(systemInfo.dwNumberOfProcessors==1)
-		CommonSettings.single_core = true;
-	else
-		CommonSettings.single_core = false;
-
+	CommonSettings.num_cores = systemInfo.dwNumberOfProcessors;

 	char text[80];

@ -1948,7 +1944,7 @@ int _main()

 	//in case this isnt actually a singlecore system, but the user requested it
 	//then restrict ourselves to one core
-	if(CommonSettings.single_core)
+	if(CommonSettings.single_core())
 		SetProcessAffinityMask(GetCurrentProcess(),1);

 	MainWindow = new WINCLASS(CLASSNAME, hAppInst);
@ -2130,7 +2126,9 @@ int _main()
 	hKeyInputTimer = timeSetEvent (KeyInRepeatMSec, 0, KeyInputTimer, 0, TIME_PERIODIC);

 	cur3DCore = GetPrivateProfileInt("3D", "Renderer", GPU3D_OPENGL, IniName);
-	CommonSettings.HighResolutionInterpolateColor = GetPrivateProfileBool("3D", "HighResolutionInterpolateColor", 1, IniName);
+	CommonSettings.GFX3D_HighResolutionInterpolateColor = GetPrivateProfileBool("3D", "HighResolutionInterpolateColor", 1, IniName);
+	CommonSettings.GFX3D_EdgeMark = GetPrivateProfileBool("3D", "EnableEdgeMark", 1, IniName);
+	CommonSettings.GFX3D_Fog = GetPrivateProfileBool("3D", "EnableFog", 1, IniName);
 	//CommonSettings.gfx3d_flushMode = GetPrivateProfileInt("3D", "AlternateFlush", 0, IniName);
 	NDS_3D_ChangeCore(cur3DCore);

@ -3379,7 +3377,7 @@ LRESULT CALLBACK WindowProcedure (HWND hwnd, UINT message, WPARAM wParam, LPARAM
 			}
 			else
 			{
-				if(CommonSettings.single_core)
+				if(CommonSettings.single_core())
 				{
 					video.srcBuffer = (u8*)GPU_screen;
 					DoDisplay(true);
@ -4406,7 +4404,9 @@ LRESULT CALLBACK GFX3DSettingsDlgProc(HWND hw, UINT msg, WPARAM wp, LPARAM lp)
 		{
 			int i;

-			CheckDlgButton(hw,IDC_INTERPOLATECOLOR,CommonSettings.HighResolutionInterpolateColor?1:0);
+			CheckDlgButton(hw,IDC_INTERPOLATECOLOR,CommonSettings.GFX3D_HighResolutionInterpolateColor?1:0);
+			CheckDlgButton(hw,IDC_3DSETTINGS_EDGEMARK,CommonSettings.GFX3D_EdgeMark?1:0);
+			CheckDlgButton(hw,IDC_3DSETTINGS_FOG,CommonSettings.GFX3D_Fog?1:0);
 			//CheckDlgButton(hw,IDC_ALTERNATEFLUSH,CommonSettings.gfx3d_flushMode);

 			for(i = 0; core3DList[i] != NULL; i++)
@ -4423,10 +4423,14 @@ LRESULT CALLBACK GFX3DSettingsDlgProc(HWND hw, UINT msg, WPARAM wp, LPARAM lp)
 			{
 			case IDOK:
 				{
-					CommonSettings.HighResolutionInterpolateColor = IsDlgCheckboxChecked(hw,IDC_INTERPOLATECOLOR);
+					CommonSettings.GFX3D_HighResolutionInterpolateColor = IsDlgCheckboxChecked(hw,IDC_INTERPOLATECOLOR);
+					CommonSettings.GFX3D_EdgeMark = IsDlgCheckboxChecked(hw,IDC_3DSETTINGS_EDGEMARK);
+					CommonSettings.GFX3D_Fog = IsDlgCheckboxChecked(hw,IDC_3DSETTINGS_FOG);
 					NDS_3D_ChangeCore(ComboBox_GetCurSel(GetDlgItem(hw, IDC_3DCORE)));
 					WritePrivateProfileInt("3D", "Renderer", cur3DCore, IniName);
-					WritePrivateProfileInt("3D", "HighResolutionInterpolateColor", CommonSettings.HighResolutionInterpolateColor?1:0, IniName);
+					WritePrivateProfileInt("3D", "HighResolutionInterpolateColor", CommonSettings.GFX3D_HighResolutionInterpolateColor?1:0, IniName);
+					WritePrivateProfileInt("3D", "EnableEdgeMark", CommonSettings.GFX3D_EdgeMark?1:0, IniName);
+					WritePrivateProfileInt("3D", "EnableFog", CommonSettings.GFX3D_Fog?1:0, IniName);
 					//CommonSettings.gfx3d_flushMode = (IsDlgButtonChecked(hw,IDC_ALTERNATEFLUSH) == BST_CHECKED)?1:0;
 					//WritePrivateProfileInt("3D", "AlternateFlush", CommonSettings.gfx3d_flushMode, IniName);
 				}
--- a/desmume/src/windows/resource.h
+++ b/desmume/src/windows/resource.h
@ -277,7 +277,6 @@
 #define IDC_SOUNDCORECB                 1000
 #define IDC_USEEXTBIOS                  1000
 #define ID_BROWSE                       1000
-#define IDC_ALTERNATEFLUSH              1001
 #define IDC_BGMAP_BGXCNT                1001
 #define IDC_CHECKBOX_DEBUGGERMODE       1001
 #define IDC_EDIT01                      1001
@ -630,7 +629,9 @@
 #define IDC_GI_FATOFS                   4464
 #define IDC_INTERPOLATECOLOR            4464
 #define IDC_GI_FATSIZE                  4465
+#define IDC_3DSETTINGS_EDGEMARK         4465
 #define IDC_GI_ICONTITLEOFS             4466
+#define IDC_3DSETTINGS_FOG              4466
 #define IDC_GI_USEDROMSIZE              4467
 #define IDC_GI_ICON                     4469
 #define IDC_GI_TITLE                    4470
--- a/desmume/src/windows/resources.rc
+++ b/desmume/src/windows/resources.rc