From 89349250198f85970f79cecd07277e3430f00c6e Mon Sep 17 00:00:00 2001 From: zeromus Date: Wed, 28 Oct 2009 09:39:52 +0000 Subject: [PATCH] rewrite texture cache, change commandline from --single-core to --num-cores=N, add multithreading to rasterizer, add toggles to 3d config to disable fog+edgemarking for little speedups in games that use them. --- desmume/src/GPU.cpp | 11 +- desmume/src/GPU.h | 3 + desmume/src/Makefile.am | 3 +- desmume/src/NDSSystem.cpp | 17 +- desmume/src/NDSSystem.h | 15 +- desmume/src/OGLRender.cpp | 142 ++- desmume/src/aggdraw.cpp | 2 +- desmume/src/commandline.cpp | 9 +- desmume/src/commandline.h | 3 +- desmume/src/gfx3d.cpp | 2 +- desmume/src/rasterize.cpp | 1381 ++++++++++++----------- desmume/src/rasterize.h | 6 +- desmume/src/texcache.cpp | 884 ++++++++------- desmume/src/texcache.h | 79 +- desmume/src/utils/task.cpp | 279 +++++ desmume/src/utils/task.h | 46 + desmume/src/windows/DeSmuME_2005.vcproj | 12 +- desmume/src/windows/DeSmuME_2008.vcproj | 8 + desmume/src/windows/main.cpp | 36 +- desmume/src/windows/resource.h | 3 +- desmume/src/windows/resources.rc | Bin 441092 -> 438526 bytes 21 files changed, 1749 insertions(+), 1192 deletions(-) create mode 100644 desmume/src/utils/task.cpp create mode 100644 desmume/src/utils/task.h diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index 977dee62b..3b8708fb3 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -55,9 +55,6 @@ GPU::MosaicLookup GPU::mosaicLookup; //#define DEBUG_TRI CACHE_ALIGN u8 GPU_screen[4*256*192]; -u8 *GPU_tempScanline; -CACHE_ALIGN u16 GPU_tempScanlineBuffer[256]; - CACHE_ALIGN u8 sprWin[256]; @@ -2237,7 +2234,7 @@ template static void GPU_RenderLine_DispCapture(u16 l) //INFO("Capture screen (BG + OBJ + 3D)\n"); u8 *src; - src = (u8*)(GPU_tempScanline); + src = (u8*)(gpu->tempScanline); CAPCOPY(src,cap_dst); } break; @@ -2279,7 +2276,7 @@ template static void GPU_RenderLine_DispCapture(u16 l) if (gpu->dispCapCnt.srcA == 0) { // Capture screen (BG + OBJ 
+ 3D) - srcA = (u16*)(GPU_tempScanline); + srcA = (u16*)(gpu->tempScanline); } else { @@ -2579,10 +2576,10 @@ void GPU_RenderLine(NDS_Screen * screen, u16 l, bool skip) //generate the 2d engine output if(gpu->dispMode == 1) { //optimization: render straight to the output buffer when thats what we are going to end up displaying anyway - GPU_tempScanline = screen->gpu->currDst = (u8 *)(GPU_screen) + (screen->offset + l) * 512; + gpu->tempScanline = screen->gpu->currDst = (u8 *)(GPU_screen) + (screen->offset + l) * 512; } else { //otherwise, we need to go to a temp buffer - GPU_tempScanline = screen->gpu->currDst = (u8 *)GPU_tempScanlineBuffer; + gpu->tempScanline = screen->gpu->currDst = (u8 *)gpu->tempScanlineBuffer; } GPU_RenderLine_layer(screen, l); diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index dfd2b616b..aa18f9cba 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -736,6 +736,9 @@ struct GPU u16 *currentFadeInColors, *currentFadeOutColors; bool blend2[8]; + CACHE_ALIGN u16 tempScanlineBuffer[256]; + u8 *tempScanline; + u8 MasterBrightMode; u32 MasterBrightFactor; diff --git a/desmume/src/Makefile.am b/desmume/src/Makefile.am index 863141010..43ede26fc 100644 --- a/desmume/src/Makefile.am +++ b/desmume/src/Makefile.am @@ -43,7 +43,8 @@ libdesmume_a_SOURCES = \ utils/md5.cpp utils/md5.h utils/valuearray.h utils/xstring.cpp utils/xstring.h \ utils/decrypt/crc.cpp utils/decrypt/crc.h utils/decrypt/decrypt.cpp \ utils/decrypt/decrypt.h utils/decrypt/header.cpp utils/decrypt/header.h \ - addons.cpp addons.h \ + utils/task.cpp utils/task.h \ + addons.cpp addons.h \ addons/compactFlash.cpp addons/gbagame.cpp addons/none.cpp addons/rumblepak.cpp addons/guitarGrip.cpp addons/expMemory.cpp fs.h \ cheatSystem.cpp cheatSystem.h \ texcache.cpp texcache.h rasterize.cpp rasterize.h \ diff --git a/desmume/src/NDSSystem.cpp b/desmume/src/NDSSystem.cpp index 4951571e6..3478d1758 100644 --- a/desmume/src/NDSSystem.cpp +++ b/desmume/src/NDSSystem.cpp @@ -1881,6 
+1881,14 @@ void Sequencer::init() #endif } +//this isnt helping much right now. work on it later +//#include "utils/task.h" +//Task taskSubGpu(true); +//void* renderSubScreen(void*) +//{ +// GPU_RenderLine(&SubScreen, nds.VCount, SkipCur2DFrame); +// return NULL; +//} static void execHardware_hblank() { @@ -1907,8 +1915,10 @@ static void execHardware_hblank() //in practice we need to be more forgiving, in case things have overrun the scanline start. //this should be safe since games cannot do anything timing dependent until this next //scanline begins, anyway (as this scanline was in the middle of drawing) + //taskSubGpu.execute(renderSubScreen,NULL); GPU_RenderLine(&MainScreen, nds.VCount, SkipCur2DFrame); GPU_RenderLine(&SubScreen, nds.VCount, SkipCur2DFrame); + //taskSubGpu.finish(); //trigger hblank dmas //but notice, we do that just after we finished drawing the line @@ -1963,12 +1973,12 @@ static void execHardware_hstart_vblankStart() static void execHardware_hstart_vcount() { u16 vmatch = T1ReadWord(MMU.ARM9_REG, 4); - if(nds.VCount==((vmatch>>8)|((vmatch<<1)&(1<<8)))) + vmatch = ((vmatch>>8)|((vmatch<<1)&(1<<8))); + if(nds.VCount==vmatch) { //arm9 vmatch T1WriteWord(MMU.ARM9_REG, 4, T1ReadWord(MMU.ARM9_REG, 4) | 4); if(T1ReadWord(MMU.ARM9_REG, 4) & 32) { - //printf("VMATCH FIRING! 
vc=%03d\n",nds.VCount); NDS_makeARM9Int(2); } } @@ -1976,7 +1986,8 @@ static void execHardware_hstart_vcount() T1WriteWord(MMU.ARM9_REG, 4, T1ReadWord(MMU.ARM9_REG, 4) & 0xFFFB); vmatch = T1ReadWord(MMU.ARM7_REG, 4); - if(nds.VCount==((vmatch>>8)|((vmatch<<1)&(1<<8)))) + vmatch = ((vmatch>>8)|((vmatch<<1)&(1<<8))); + if(nds.VCount==vmatch) { //arm7 vmatch T1WriteWord(MMU.ARM7_REG, 4, T1ReadWord(MMU.ARM7_REG, 4) | 4); diff --git a/desmume/src/NDSSystem.h b/desmume/src/NDSSystem.h index 915ccb4d9..c6621c071 100644 --- a/desmume/src/NDSSystem.h +++ b/desmume/src/NDSSystem.h @@ -421,17 +421,19 @@ int NDS_WriteBMP_32bppBuffer(int width, int height, const void* buf, const char extern struct TCommonSettings { TCommonSettings() - : HighResolutionInterpolateColor(true) - , UseExtBIOS(false) + : UseExtBIOS(false) , SWIFromBIOS(false) , UseExtFirmware(false) , BootFromFirmware(false) , DebugConsole(false) - , single_core(true) + , num_cores(1) , spuInterpolationMode(SPUInterpolation_Linear) //, gfx3d_flushMode(0) , manualBackupType(0) , micMode(InternalNoise) + , GFX3D_HighResolutionInterpolateColor(true) + , GFX3D_EdgeMark(true) + , GFX3D_Fog(true) { strcpy(ARM9BIOS, "biosnds9.bin"); strcpy(ARM7BIOS, "biosnds7.bin"); @@ -443,7 +445,9 @@ extern struct TCommonSettings { for(int i=0;i<16;i++) spu_muteChannels[i] = false; } - bool HighResolutionInterpolateColor; + bool GFX3D_HighResolutionInterpolateColor; + bool GFX3D_EdgeMark; + bool GFX3D_Fog; bool UseExtBIOS; char ARM9BIOS[256]; @@ -456,7 +460,8 @@ extern struct TCommonSettings { bool DebugConsole; - bool single_core; + int num_cores; + bool single_core() { return num_cores==1; } struct _Wifi { int mode; diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index aec84606f..71542753a 100644 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -24,6 +24,8 @@ //so, it doesnt composite to 2d correctly. 
//(re: new super mario brothers renders the stormclouds at the beginning) +#include <queue> + #include "OGLRender.h" #include "debug.h" @@ -208,9 +210,8 @@ static void _xglDisable(GLenum cap) { CTASSERT((cap-0x0B00)<0x100); \ _xglDisable(cap); } +static std::queue<GLuint> freeTextureIds; - -GLenum oglTempTextureID[MAX_TEXTURE]; GLenum oglToonTableTextureID; #define NOSHADERS(s) { hasShaders = false; INFO("Shaders aren't supported on your system, using fixed pipeline\n(%s)\n", s); return; } @@ -252,17 +253,16 @@ GLenum oglToonTableTextureID; bool hasShaders = false; -/* Vertex shader */ GLuint vertexShaderID; -/* Fragment shader */ GLuint fragmentShaderID; -/* Shader program */ GLuint shaderProgram; static GLuint hasTexLoc; static GLuint texBlendLoc; static bool hasTexture = false; +static ADPCMCacheItem* currTexture = NULL; + /* Shaders init */ static void createShaders() @@ -337,45 +337,54 @@ static void OGLReset() } TexCache_Reset(); - - for (int i = 0; i < MAX_TEXTURE; i++) - texcache[i].id=oglTempTextureID[i]; + currTexture = NULL; // memset(GPU_screenStencil,0,sizeof(GPU_screenStencil)); memset(GPU_screen3D,0,sizeof(GPU_screen3D)); } +//static class OGLTexCacheUser : public ITexCacheUser +//{ +//public: +// virtual void BindTexture(u32 tx) +// { +// glBindTexture(GL_TEXTURE_2D,(GLuint)texcache[tx].id); +// glMatrixMode (GL_TEXTURE); +// glLoadIdentity (); +// glScaled (texcache[tx].invSizeX, texcache[tx].invSizeY, 1.0f); +// +// glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); +// glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); +// +// glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (BIT16(texcache[tx].frm) ? (BIT18(texcache[tx].frm)?GL_MIRRORED_REPEAT:GL_REPEAT) : GL_CLAMP)); +// glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (BIT17(texcache[tx].frm) ? 
(BIT19(texcache[tx].frm)?GL_MIRRORED_REPEAT:GL_REPEAT) : GL_CLAMP)); +// } +// +// virtual void BindTextureData(u32 tx, u8* data) +// { +// BindTexture(tx); +// +// #if 0 +// for (int i=0; i < texcache[tx].sizeX * texcache[tx].sizeY*4; i++) +// data[i] = 0xFF; +// #endif +// glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, +// texcache[tx].sizeX, texcache[tx].sizeY, 0, +// GL_RGBA, GL_UNSIGNED_BYTE, data); +// } +//} textures; +// +//static TexCacheUnit texCacheUnit; - - -static void BindTexture(u32 tx) +static void expandFreeTextures() { - glBindTexture(GL_TEXTURE_2D,(GLuint)texcache[tx].id); - glMatrixMode (GL_TEXTURE); - glLoadIdentity (); - glScaled (texcache[tx].invSizeX, texcache[tx].invSizeY, 1.0f); - - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (BIT16(texcache[tx].frm) ? (BIT18(texcache[tx].frm)?GL_MIRRORED_REPEAT:GL_REPEAT) : GL_CLAMP)); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (BIT17(texcache[tx].frm) ? (BIT19(texcache[tx].frm)?GL_MIRRORED_REPEAT:GL_REPEAT) : GL_CLAMP)); + const int kInitTextures = 128; + GLuint oglTempTextureID[kInitTextures]; + glGenTextures(kInitTextures, &oglTempTextureID[0]); + for(int i=0;itexid); + if(currTexture == item) + currTexture = NULL; +} + static void setTexture(unsigned int format, unsigned int texpal) { textureFormat = format; @@ -529,7 +552,43 @@ static void setTexture(unsigned int format, unsigned int texpal) } - TexCache_SetTexture(format, texpal); +// texCacheUnit.TexCache_SetTexture(format, texpal); + ADPCMCacheItem* newTexture = TexCache_SetTexture(TexFormat_32bpp,format,texpal); + if(newTexture != currTexture) + { + currTexture = newTexture; + //has the ogl renderer initialized the texture? 
+ if(!currTexture->deleteCallback) + { + currTexture->deleteCallback = texDeleteCallback; + if(freeTextureIds.empty()) expandFreeTextures(); + currTexture->texid = (void*)freeTextureIds.front(); + freeTextureIds.pop(); + + glBindTexture(GL_TEXTURE_2D,(GLuint)currTexture->texid); + + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, (BIT16(currTexture->texformat) ? (BIT18(currTexture->texformat)?GL_MIRRORED_REPEAT:GL_REPEAT) : GL_CLAMP)); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, (BIT17(currTexture->texformat) ? (BIT19(currTexture->texformat)?GL_MIRRORED_REPEAT:GL_REPEAT) : GL_CLAMP)); + + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, + currTexture->sizeX, currTexture->sizeY, 0, + GL_RGBA, GL_UNSIGNED_BYTE, currTexture->decoded); + } + else + { + //otherwise, just bind it + glBindTexture(GL_TEXTURE_2D,(GLuint)currTexture->texid); + } + + //in either case, we need to setup the tex mtx + glMatrixMode(GL_TEXTURE); + glLoadIdentity(); + glScalef(currTexture->invSizeX, currTexture->invSizeY, 1.0f); + + } } @@ -902,6 +961,9 @@ static void OGLRender() } } + //needs to happen before endgl because it could free some textureids for expired cache items + TexCache_EvictFrame(); + ENDGL(); GL_ReadFramebuffer(); diff --git a/desmume/src/aggdraw.cpp b/desmume/src/aggdraw.cpp index f30903e49..36c54bfa0 100644 --- a/desmume/src/aggdraw.cpp +++ b/desmume/src/aggdraw.cpp @@ -148,7 +148,7 @@ void Agg_init() aggDraw.target = targets[0]; //if we're single core, we don't want to waste time compositing - if(CommonSettings.single_core) + if(CommonSettings.single_core()) aggDraw.hud = &agg_targetScreen; //and the more clever compositing isnt supported in non-windows diff --git a/desmume/src/commandline.cpp b/desmume/src/commandline.cpp index 77a4b0937..32a6549d8 100644 --- a/desmume/src/commandline.cpp +++ b/desmume/src/commandline.cpp @@ -40,8 +40,7 
@@ CommandLine::CommandLine() , _record_movie_file(0) , _cflash_image(0) , _cflash_path(0) -, _single_core(0) -, _multi_core(0) +, _num_cores(-1) , _bios_arm9(NULL) , _bios_arm7(NULL) , _bios_swi(0) @@ -74,8 +73,7 @@ void CommandLine::loadCommonOptions() { "bios-arm7", 0, 0, G_OPTION_ARG_FILENAME, &_bios_arm7, "Uses the arm7 bios provided at the specified path", "BIOS_ARM7_PATH"}, { "bios-swi", 0, 0, G_OPTION_ARG_INT, &_bios_swi, "Uses SWI from the provided bios files", "BIOS_SWI"}, #ifdef _MSC_VER - { "single-core", 0, 0, G_OPTION_ARG_NONE, &_single_core, "Limit execution to use approximately only one core", "NUM_CORES"}, - { "multi-core", 0, 0, G_OPTION_ARG_NONE, &_multi_core, "Act as if multiple cores are present, even on a single-core machine", "MULTI_CORE"}, + { "num-cores", 0, 0, G_OPTION_ARG_INT, &_num_cores, "Override numcores detection and use this many", "NUM_CORES"}, { "scanline-filter-a", 0, 0, G_OPTION_ARG_INT, &scanline_filter_a, "Intensity of fadeout for scanlines filter (edge) (default 2)", "SCANLINE_FILTER_A"}, { "scanline-filter-b", 0, 0, G_OPTION_ARG_INT, &scanline_filter_b, "Intensity of fadeout for scanlines filter (corner) (default 4)", "SCANLINE_FILTER_B"}, #endif @@ -103,8 +101,7 @@ bool CommandLine::parse(int argc,char **argv) if(_cflash_image) cflash_image = _cflash_image; if(_cflash_path) cflash_path = _cflash_path; - if(_single_core) CommonSettings.single_core = true; - if(_multi_core) CommonSettings.single_core = false; + if(_num_cores != -1) CommonSettings.num_cores = _num_cores; //TODO MAX PRIORITY! 
change ARM9BIOS etc to be a std::string if(_bios_arm9) { CommonSettings.UseExtBIOS = true; strcpy(CommonSettings.ARM9BIOS,_bios_arm9); } diff --git a/desmume/src/commandline.h b/desmume/src/commandline.h index 81ae93a88..4436821c0 100644 --- a/desmume/src/commandline.h +++ b/desmume/src/commandline.h @@ -75,8 +75,7 @@ private: char* _cflash_path; char* _bios_arm9, *_bios_arm7; int _bios_swi; - int _single_core; - int _multi_core; + int _num_cores; }; #endif diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp index b415ed6c8..78f26f4d4 100644 --- a/desmume/src/gfx3d.cpp +++ b/desmume/src/gfx3d.cpp @@ -2341,7 +2341,7 @@ static FORCEINLINE VERT clipPoint(VERT* inside, VERT* outside, int coord, int wh INTERP(coord[0]); INTERP(coord[1]); INTERP(coord[2]); INTERP(coord[3]); INTERP(texcoord[0]); INTERP(texcoord[1]); - if(CommonSettings.HighResolutionInterpolateColor) + if(CommonSettings.GFX3D_HighResolutionInterpolateColor) { INTERP(fcolor[0]); INTERP(fcolor[1]); INTERP(fcolor[2]); } diff --git a/desmume/src/rasterize.cpp b/desmume/src/rasterize.cpp index 9ea3a9241..4918ca689 100644 --- a/desmume/src/rasterize.cpp +++ b/desmume/src/rasterize.cpp @@ -1,8 +1,4 @@ -/* Copyright (C) 2006 yopyop - yopyop156@ifrance.com - yopyop156.ifrance.com - - Copyright 2009 DeSmuME team +/* Copyright 2009 DeSmuME team This file is part of DeSmuME @@ -46,6 +42,7 @@ #include "gfx3d.h" #include "texcache.h" #include "NDSSystem.h" +#include "utils/task.h" //#undef FORCEINLINE //#define FORCEINLINE @@ -63,8 +60,6 @@ template T _max(T a, T b, T c, T d) { return max(_max(a,b,d),c); } static const int kUnsetTranslucentPolyID = 255; -static int polynum; - static u8 modulate_table[64][64]; static u8 decal_table[32][64][64]; static u8 index_lookup_table[65]; @@ -72,6 +67,9 @@ static u8 index_start_table[8]; static GFX3D_Clipper clipper; static GFX3D_Clipper::TClippedPoly *clippedPolys = NULL; +static ADPCMCacheItem* polyTexKeys[POLYLIST_SIZE]; +static bool polyVisible[POLYLIST_SIZE]; +static 
bool polyBackfacing[POLYLIST_SIZE]; static int clippedPolyCounter; @@ -118,106 +116,6 @@ static FORCEINLINE int fastFloor(float f) -//----texture cache--- - -//TODO - the texcache could ask for a buffer to generate into -//that would avoid us ever having to buffercopy.. -struct TextureBuffers -{ - static const int numTextures = MAX_TEXTURE+1; - u8* buffers[numTextures]; - - void clear() { memset(buffers,0,sizeof(buffers)); } - - TextureBuffers() - { - clear(); - } - - void free() - { - for(int i=0;i>4)&0x3; - if(mode==3 && polyid !=0) return !backfacing; - //another reasonable possibility is that we should be forcing back faces to draw (mariokart doesnt use them) - //and then only using a single bit buffer (but a cursory test of this doesnt actually work) - - switch((val>>6)&3) { - case 0: return false; - case 1: return backfacing; - case 2: return !backfacing; - case 3: return true; - default: assert(false); return false; - } - } - - void setup(u32 polyAttr) - { - val = polyAttr; - decalMode = BIT14(val); - translucentDepthWrite = BIT11(val); - polyid = (polyAttr>>24)&0x3F; - alpha = (polyAttr>>16)&0x1F; - drawBackPlaneIntersectingPolys = BIT12(val); - fogged = BIT15(val); - } - -} polyAttr; union FragmentColor { u32 color; @@ -242,8 +140,6 @@ struct Fragment }; }; -static VERT* verts[MAX_CLIPPED_VERTS]; - //INLINE static void SubmitVertex(int vert_index, VERT& rawvert) //{ // verts[vert_index] = &rawvert; @@ -259,351 +155,6 @@ FORCEINLINE int iround(float f) { } -static struct Sampler -{ - int width, height; - int wmask, hmask; - int wrap; - int wshift; - int texFormat; - void setup(u32 texParam) - { - texFormat = (texParam>>26)&7; - wshift = ((texParam>>20)&0x07) + 3; - width=(1 << wshift); - height=(8 << ((texParam>>23)&0x07)); - wmask = width-1; - hmask = height-1; - wrap = (texParam>>16)&0xF; - } - - FORCEINLINE void clamp(int &val, const int size, const int sizemask){ - if(val<0) val = 0; - if(val>sizemask) val = sizemask; - } - FORCEINLINE void hclamp(int 
&val) { clamp(val,width,wmask); } - FORCEINLINE void vclamp(int &val) { clamp(val,height,hmask); } - - FORCEINLINE void repeat(int &val, const int size, const int sizemask) { - val &= sizemask; - } - FORCEINLINE void hrepeat(int &val) { repeat(val,width,wmask); } - FORCEINLINE void vrepeat(int &val) { repeat(val,height,hmask); } - - FORCEINLINE void flip(int &val, const int size, const int sizemask) { - val &= ((size<<1)-1); - if(val>=size) val = (size<<1)-val-1; - } - FORCEINLINE void hflip(int &val) { flip(val,width,wmask); } - FORCEINLINE void vflip(int &val) { flip(val,height,hmask); } - - FORCEINLINE void dowrap(int& iu, int& iv) - { - switch(wrap) { - //flip none - case 0x0: hclamp(iu); vclamp(iv); break; - case 0x1: hrepeat(iu); vclamp(iv); break; - case 0x2: hclamp(iu); vrepeat(iv); break; - case 0x3: hrepeat(iu); vrepeat(iv); break; - //flip S - case 0x4: hclamp(iu); vclamp(iv); break; - case 0x5: hflip(iu); vclamp(iv); break; - case 0x6: hclamp(iu); vrepeat(iv); break; - case 0x7: hflip(iu); vrepeat(iv); break; - //flip T - case 0x8: hclamp(iu); vclamp(iv); break; - case 0x9: hrepeat(iu); vclamp(iv); break; - case 0xA: hclamp(iu); vflip(iv); break; - case 0xB: hrepeat(iu); vflip(iv); break; - //flip both - case 0xC: hclamp(iu); vclamp(iv); break; - case 0xD: hflip(iu); vclamp(iv); break; - case 0xE: hclamp(iu); vflip(iv); break; - case 0xF: hflip(iu); vflip(iv); break; - } - } - - FORCEINLINE FragmentColor sample(float u, float v) - { - //finally, we can use floor here. but, it is slower than we want. 
- //the best solution is probably to wait until the pipeline is full of fixed point - s32 iu = s32floor(u); - s32 iv = s32floor(v); - dowrap(iu,iv); - - FragmentColor color; - color.color = ((u32*)textures.currentData)[(iv<>4)&0x3; - //if there is no texture set, then set to the mode which doesnt even use a texture - //(no texture makes sense for toon/highlight mode) - if(sampler.texFormat == 0 && (mode == 0 || mode == 1)) - mode = 4; - } - - float invu, invv, w; - FragmentColor materialColor; - - FORCEINLINE void shade(FragmentColor& dst) - { - FragmentColor texColor; - float u,v; - - switch(mode) - { - case 0: //modulate - u = invu*w; - v = invv*w; - texColor = sampler.sample(u,v); - dst.r = modulate_table[texColor.r][materialColor.r]; - dst.g = modulate_table[texColor.g][materialColor.g]; - dst.b = modulate_table[texColor.b][materialColor.b]; - dst.a = modulate_table[GFX3D_5TO6(texColor.a)][GFX3D_5TO6(materialColor.a)]>>1; - //dst.color.components.a = 31; - //#ifdef _MSC_VER - //if(GetAsyncKeyState(VK_SHIFT)) { - // //debugging tricks - // dst = materialColor; - // if(GetAsyncKeyState(VK_TAB)) { - // u8 alpha = dst.a; - // dst.color = polynum*8+8; - // dst.a = alpha; - // } - //} - //#endif - break; - case 1: //decal - u = invu*w; - v = invv*w; - texColor = sampler.sample(u,v); - dst.r = decal_table[texColor.a][texColor.r][materialColor.r]; - dst.g = decal_table[texColor.a][texColor.g][materialColor.g]; - dst.b = decal_table[texColor.a][texColor.b][materialColor.b]; - dst.a = materialColor.a; - break; - case 2: //toon/highlight shading - { - u = invu*w; - v = invv*w; - texColor = sampler.sample(u,v); - FragmentColor toonColor = toonTable[materialColor.r>>1]; - if(sampler.texFormat == 0) - { - //if no texture is set then we dont need to modulate texture with toon - //but rather just use toon directly - dst = toonColor; - dst.a = materialColor.a; - } - else - { - if(gfx3d.shading == GFX3D::HIGHLIGHT) - { - dst.r = modulate_table[texColor.r][materialColor.r]; - 
dst.g = modulate_table[texColor.g][materialColor.r]; - dst.b = modulate_table[texColor.b][materialColor.r]; - dst.a = modulate_table[GFX3D_5TO6(texColor.a)][GFX3D_5TO6(materialColor.a)]>>1; - - dst.r = min(63, (dst.r + toonColor.r)); - dst.g = min(63, (dst.g + toonColor.g)); - dst.b = min(63, (dst.b + toonColor.b)); - } - else - { - dst.r = modulate_table[texColor.r][toonColor.r]; - dst.g = modulate_table[texColor.g][toonColor.g]; - dst.b = modulate_table[texColor.b][toonColor.b]; - dst.a = modulate_table[GFX3D_5TO6(texColor.a)][GFX3D_5TO6(materialColor.a)]>>1; - } - } - - } - break; - case 3: //shadows - //is this right? only with the material color? - dst = materialColor; - break; - case 4: //our own special mode which only uses the material color (for when texturing is disabled) - dst = materialColor; - break; - - } - } - -} shader; - -static FORCEINLINE void alphaBlend(FragmentColor & dst, const FragmentColor & src) -{ - if(gfx3d.enableAlphaBlending) - { - if(src.a == 0 || dst.a == 0) - { - dst = src; - } - else - { - u8 alpha = src.a+1; - u8 invAlpha = 32 - alpha; - dst.r = (alpha*src.r + invAlpha*dst.r)>>5; - dst.g = (alpha*src.g + invAlpha*dst.g)>>5; - dst.b = (alpha*src.b + invAlpha*dst.b)>>5; - } - - dst.a = max(src.a,dst.a); - } - else - { - if(src.a == 0) - { - //do nothing; the fragment is totally transparent - } - else - { - dst = src; - } - } -} - -static FORCEINLINE void pixel(int adr,float r, float g, float b, float invu, float invv, float w, float z) { - Fragment &destFragment = screen[adr]; - FragmentColor &destFragmentColor = screenColor[adr]; - - u32 depth; - if(gfx3d.wbuffer) - { - //not sure about this - //this value was chosen to make the skybox, castle window decals, and water level render correctly in SM64 - depth = u32floor(4096*w); - } - else - { - depth = u32floor(z*0x7FFF); - depth <<= 9; - } - - if(polyAttr.decalMode) - { - if(depth != destFragment.depth) - { - goto depth_fail; - } - } - else - { - if(depth>=destFragment.depth) - { - 
goto depth_fail; - } - } - - //handle shadow polys - if(shader.mode == 3) - { - if(polyAttr.polyid == 0) - { - //that's right! stencil buffers, despite reports to the contrary, must be more than 1 bit - //this is necessary to make these cases work all at once. - //1. sm64 (standing near signs and blocks) - //2. mariokart (no glitches in shadow shape in kart selector) - //3. mariokart (no junk beneath platform in kart selector / no shadow beneath grate floor in bowser stage) - //(specifically, the shadows in mario kart are complicated concave shapes) - destFragment.stencil++; - goto rejected_fragment; - } - else - { - if(destFragment.stencil) - { - destFragment.stencil--; - goto rejected_fragment; - } - - //shadow polys have a special check here to keep from self-shadowing when user - //has tried to prevent it from happening - //if this isnt here, then the vehicle select in mariokart will look terrible - if(destFragment.polyid.opaque == polyAttr.polyid) - goto rejected_fragment; - } - } - - shader.w = w; - shader.invu = invu; - shader.invv = invv; - - //perspective-correct the colors - r = (r * w) + 0.5f; - g = (g * w) + 0.5f; - b = (b * w) + 0.5f; - - //this is a HACK: - //we are being very sloppy with our interpolation precision right now - //and rather than fix it, i just want to clamp it - shader.materialColor.r = max(0U,min(63U,u32floor(r))); - shader.materialColor.g = max(0U,min(63U,u32floor(g))); - shader.materialColor.b = max(0U,min(63U,u32floor(b))); - - shader.materialColor.a = polyAttr.alpha; - - //pixel shader - FragmentColor shaderOutput; - shader.shade(shaderOutput); - - //we shouldnt do any of this if we generated a totally transparent pixel - if(shaderOutput.a != 0) - { - //alpha test (don't have any test cases for this...? is it in the right place...?) 
- if(gfx3d.enableAlphaTest) - { - if(shaderOutput.a < gfx3d.alphaTestRef) - goto rejected_fragment; - } - - //handle polyids - bool isOpaquePixel = shaderOutput.a == 31; - if(isOpaquePixel) - { - destFragment.polyid.opaque = polyAttr.polyid; - destFragment.isTranslucentPoly = polyAttr.translucent?1:0; - destFragment.fogged = polyAttr.fogged; - destFragmentColor = shaderOutput; - } - else - { - //dont overwrite pixels on translucent polys with the same polyids - if(destFragment.polyid.translucent == polyAttr.polyid) - goto rejected_fragment; - - //originally we were using a test case of shadows-behind-trees in sm64ds - //but, it looks bad in that game. this is actually correct - //if this isnt correct, then complex shape cart shadows in mario kart don't work right - destFragment.polyid.translucent = polyAttr.polyid; - - //alpha blending and write color - alphaBlend(destFragmentColor, shaderOutput); - - destFragment.fogged &= polyAttr.fogged; - } - - //depth writing - if(isOpaquePixel || polyAttr.translucentDepthWrite) - destFragment.depth = depth; - - } - - depth_fail: - rejected_fragment: - ; -} - - typedef int fixed28_4; static bool failure; @@ -671,9 +222,10 @@ FORCEINLINE int Ceil28_4( fixed28_4 Value ) { struct edge_fx_fl { edge_fx_fl() {} - edge_fx_fl(int Top, int Bottom); + edge_fx_fl(int Top, int Bottom, VERT** verts); FORCEINLINE int Step(); + VERT** verts; long X, XStep, Numerator, Denominator; // DDA info for x long ErrorTerm; int Y, Height; // current y and vertical count @@ -702,7 +254,8 @@ struct edge_fx_fl { void FORCEINLINE doStepExtraInterpolants() { for(int i=0;iverts = verts; Y = Ceil28_4((fixed28_4)verts[Top]->y); int YEnd = Ceil28_4((fixed28_4)verts[Bottom]->y); Height = YEnd - Y; @@ -745,166 +298,664 @@ FORCEINLINE int edge_fx_fl::Step() { return Height; } -//draws a single scanline -FORCEINLINE static void drawscanline(edge_fx_fl *pLeft, edge_fx_fl *pRight) + + +static FORCEINLINE void alphaBlend(FragmentColor & dst, const FragmentColor & src) 
{ - int XStart = pLeft->X; - int width = pRight->X - XStart; - - //these are the starting values, taken from the left edge - float invw = pLeft->invw.curr; - float u = pLeft->u.curr; - float v = pLeft->v.curr; - float z = pLeft->z.curr; - float color[3] = { - pLeft->color[0].curr, - pLeft->color[1].curr, - pLeft->color[2].curr }; - - //our dx values are taken from the steps up until the right edge - float invWidth = 1.0f / width; - float dinvw_dx = (pRight->invw.curr - invw) * invWidth; - float du_dx = (pRight->u.curr - u) * invWidth; - float dv_dx = (pRight->v.curr - v) * invWidth; - float dz_dx = (pRight->z.curr - z) * invWidth; - float dc_dx[3] = { - (pRight->color[0].curr - color[0]) * invWidth, - (pRight->color[1].curr - color[1]) * invWidth, - (pRight->color[2].curr - color[2]) * invWidth }; - - int adr = (pLeft->Y<<8)+XStart; - - //CONSIDER: in case some other math is wrong (shouldve been clipped OK), we might go out of bounds here. - //better check the Y value. - if(pLeft->Y<0 || pLeft->Y>191) { - printf("rasterizer rendering at y=%d! oops!\n",pLeft->Y); - return; - } - - int x = XStart; - - while(width-- > 0) + if(gfx3d.enableAlphaBlending) { - if(x<0 || x>255) { - printf("rasterizer rendering at x=%d! 
oops!\n",x); - return; + if(src.a == 0 || dst.a == 0) + { + dst = src; } - pixel(adr,color[0],color[1],color[2],u,v,1.0f/invw,z); - adr++; - x++; - - invw += dinvw_dx; - u += du_dx; - v += dv_dx; - z += dz_dx; - color[0] += dc_dx[0]; - color[1] += dc_dx[1]; - color[2] += dc_dx[2]; - } -} - -//runs several scanlines, until an edge is finished -static void runscanlines(edge_fx_fl *left, edge_fx_fl *right) -{ - //do not overstep either of the edges - int Height = min(left->Height,right->Height); - while(Height--) { - drawscanline(left,right); - left->Step(); - right->Step(); - } -} - -//rotates verts counterclockwise -template -INLINE static void rot_verts() { - #define ROTSWAP(X) if(type>X) swap(verts[X-1],verts[X]); - ROTSWAP(1); ROTSWAP(2); ROTSWAP(3); ROTSWAP(4); - ROTSWAP(5); ROTSWAP(6); ROTSWAP(7); -} - -//rotate verts until vert0.y is minimum, and then vert0.x is minimum in case of ties -//this is a necessary precondition for our shape engine -template -static void sort_verts(bool backwards) { - //if the verts are backwards, reorder them first - if(backwards) - for(int i=0;iX) if(verts[0]->y > verts[X]->y) goto doswap; - CHECKY(1); CHECKY(2); CHECKY(3); CHECKY(4); - CHECKY(5); CHECKY(6); CHECKY(7); - break; - - doswap: - rot_verts(); - } - - while(verts[0]->y == verts[1]->y && verts[0]->x > verts[1]->x) - rot_verts(); - -} - -//This function can handle any convex N-gon up to octagons -//verts must be clockwise. -//I didnt reference anything for this algorithm but it seems like I've seen it somewhere before. 
-static void shape_engine(int type, bool backwards) -{ - failure = false; - - switch(type) { - case 3: sort_verts<3>(backwards); break; - case 4: sort_verts<4>(backwards); break; - case 5: sort_verts<5>(backwards); break; - case 6: sort_verts<6>(backwards); break; - case 7: sort_verts<7>(backwards); break; - case 8: sort_verts<8>(backwards); break; - default: printf("skipping type %d\n",type); return; - } - - //we are going to step around the polygon in both directions starting from vert 0. - //right edges will be stepped over clockwise and left edges stepped over counterclockwise. - //these variables track that stepping, but in order to facilitate wrapping we start extra high - //for the counter we're decrementing. - int lv = type, rv = 0; - - edge_fx_fl left, right; - bool step_left = true, step_right = true; - for(;;) { - //generate new edges if necessary. we must avoid regenerating edges when they are incomplete - //so that they can be continued on down the shape - assert(rv != type); - int _lv = lv==type?0:lv; //make sure that we ask for vert 0 when the variable contains the starting value - if(step_left) left = edge_fx_fl(_lv,lv-1); - if(step_right) right = edge_fx_fl(rv,rv+1); - step_left = step_right = false; - - //handle a failure in the edge setup due to nutty polys - if(failure) - return; - - - - runscanlines(&left,&right); - - //if we ran out of an edge, step to the next one - if(right.Height == 0) { - step_right = true; - rv++; - } - if(left.Height == 0) { - step_left = true; - lv--; + else + { + u8 alpha = src.a+1; + u8 invAlpha = 32 - alpha; + dst.r = (alpha*src.r + invAlpha*dst.r)>>5; + dst.g = (alpha*src.g + invAlpha*dst.g)>>5; + dst.b = (alpha*src.b + invAlpha*dst.b)>>5; } - //this is our completion condition: when our stepped edges meet in the middle - if(lv<=rv+1) break; + dst.a = max(src.a,dst.a); + } + else + { + if(src.a == 0) + { + //do nothing; the fragment is totally transparent + } + else + { + dst = src; + } + } +} + + + +class 
RasterizerUnit +{ +public: + + int SLI_MASK, SLI_VALUE; + + RasterizerUnit() + : sampler(*this) + , shader(sampler) + { } + ADPCMCacheItem* lastTexKey; + + VERT* verts[MAX_CLIPPED_VERTS]; + + struct PolyAttr + { + u32 val; + + bool decalMode; + bool translucentDepthWrite; + bool drawBackPlaneIntersectingPolys; + u8 polyid; + u8 alpha; + bool backfacing; + bool translucent; + u8 fogged; + + bool isVisible(bool backfacing) + { + //this was added after adding multi-bit stencil buffer + //it seems that we also need to prevent drawing back faces of shadow polys for rendering + u32 mode = (val>>4)&0x3; + if(mode==3 && polyid !=0) return !backfacing; + //another reasonable possibility is that we should be forcing back faces to draw (mariokart doesnt use them) + //and then only using a single bit buffer (but a cursory test of this doesnt actually work) + + switch((val>>6)&3) { + case 0: return false; + case 1: return backfacing; + case 2: return !backfacing; + case 3: return true; + default: assert(false); return false; + } + } + + void setup(u32 polyAttr) + { + val = polyAttr; + decalMode = BIT14(val); + translucentDepthWrite = BIT11(val); + polyid = (polyAttr>>24)&0x3F; + alpha = (polyAttr>>16)&0x1F; + drawBackPlaneIntersectingPolys = BIT12(val); + fogged = BIT15(val); + } + + } polyAttr; + + + struct Sampler + { + Sampler(RasterizerUnit& _unit) + : unit(_unit) + {} + + RasterizerUnit& unit; + + int width, height; + int wmask, hmask; + int wrap; + int wshift; + int texFormat; + void setup(u32 texParam) + { + texFormat = (texParam>>26)&7; + wshift = ((texParam>>20)&0x07) + 3; + width=(1 << wshift); + height=(8 << ((texParam>>23)&0x07)); + wmask = width-1; + hmask = height-1; + wrap = (texParam>>16)&0xF; + } + + FORCEINLINE void clamp(int &val, const int size, const int sizemask){ + if(val<0) val = 0; + if(val>sizemask) val = sizemask; + } + FORCEINLINE void hclamp(int &val) { clamp(val,width,wmask); } + FORCEINLINE void vclamp(int &val) { clamp(val,height,hmask); } + + 
FORCEINLINE void repeat(int &val, const int size, const int sizemask) { + val &= sizemask; + } + FORCEINLINE void hrepeat(int &val) { repeat(val,width,wmask); } + FORCEINLINE void vrepeat(int &val) { repeat(val,height,hmask); } + + FORCEINLINE void flip(int &val, const int size, const int sizemask) { + val &= ((size<<1)-1); + if(val>=size) val = (size<<1)-val-1; + } + FORCEINLINE void hflip(int &val) { flip(val,width,wmask); } + FORCEINLINE void vflip(int &val) { flip(val,height,hmask); } + + FORCEINLINE void dowrap(int& iu, int& iv) + { + switch(wrap) { + //flip none + case 0x0: hclamp(iu); vclamp(iv); break; + case 0x1: hrepeat(iu); vclamp(iv); break; + case 0x2: hclamp(iu); vrepeat(iv); break; + case 0x3: hrepeat(iu); vrepeat(iv); break; + //flip S + case 0x4: hclamp(iu); vclamp(iv); break; + case 0x5: hflip(iu); vclamp(iv); break; + case 0x6: hclamp(iu); vrepeat(iv); break; + case 0x7: hflip(iu); vrepeat(iv); break; + //flip T + case 0x8: hclamp(iu); vclamp(iv); break; + case 0x9: hrepeat(iu); vclamp(iv); break; + case 0xA: hclamp(iu); vflip(iv); break; + case 0xB: hrepeat(iu); vflip(iv); break; + //flip both + case 0xC: hclamp(iu); vclamp(iv); break; + case 0xD: hflip(iu); vclamp(iv); break; + case 0xE: hclamp(iu); vflip(iv); break; + case 0xF: hflip(iu); vflip(iv); break; + } + } + + FORCEINLINE FragmentColor sample(float u, float v) + { + //finally, we can use floor here. but, it is slower than we want. 
+ //the best solution is probably to wait until the pipeline is full of fixed point + s32 iu = s32floor(u); + s32 iv = s32floor(v); + dowrap(iu,iv); + + FragmentColor color; + color.color = ((u32*)unit.lastTexKey->decoded)[(iv<>4)&0x3; + //if there is no texture set, then set to the mode which doesnt even use a texture + //(no texture makes sense for toon/highlight mode) + if(sampler.texFormat == 0 && (mode == 0 || mode == 1)) + mode = 4; + } + + float invu, invv, w; + FragmentColor materialColor; + + FORCEINLINE void shade(FragmentColor& dst) + { + FragmentColor texColor; + float u,v; + + switch(mode) + { + case 0: //modulate + u = invu*w; + v = invv*w; + texColor = sampler.sample(u,v); + dst.r = modulate_table[texColor.r][materialColor.r]; + dst.g = modulate_table[texColor.g][materialColor.g]; + dst.b = modulate_table[texColor.b][materialColor.b]; + dst.a = modulate_table[GFX3D_5TO6(texColor.a)][GFX3D_5TO6(materialColor.a)]>>1; + //dst.color.components.a = 31; + //#ifdef _MSC_VER + //if(GetAsyncKeyState(VK_SHIFT)) { + // //debugging tricks + // dst = materialColor; + // if(GetAsyncKeyState(VK_TAB)) { + // u8 alpha = dst.a; + // dst.color = polynum*8+8; + // dst.a = alpha; + // } + //} + //#endif + break; + case 1: //decal + u = invu*w; + v = invv*w; + texColor = sampler.sample(u,v); + dst.r = decal_table[texColor.a][texColor.r][materialColor.r]; + dst.g = decal_table[texColor.a][texColor.g][materialColor.g]; + dst.b = decal_table[texColor.a][texColor.b][materialColor.b]; + dst.a = materialColor.a; + break; + case 2: //toon/highlight shading + { + u = invu*w; + v = invv*w; + texColor = sampler.sample(u,v); + FragmentColor toonColor = toonTable[materialColor.r>>1]; + if(sampler.texFormat == 0) + { + //if no texture is set then we dont need to modulate texture with toon + //but rather just use toon directly + dst = toonColor; + dst.a = materialColor.a; + } + else + { + if(gfx3d.shading == GFX3D::HIGHLIGHT) + { + dst.r = modulate_table[texColor.r][materialColor.r]; + 
dst.g = modulate_table[texColor.g][materialColor.r]; + dst.b = modulate_table[texColor.b][materialColor.r]; + dst.a = modulate_table[GFX3D_5TO6(texColor.a)][GFX3D_5TO6(materialColor.a)]>>1; + + dst.r = min(63, (dst.r + toonColor.r)); + dst.g = min(63, (dst.g + toonColor.g)); + dst.b = min(63, (dst.b + toonColor.b)); + } + else + { + dst.r = modulate_table[texColor.r][toonColor.r]; + dst.g = modulate_table[texColor.g][toonColor.g]; + dst.b = modulate_table[texColor.b][toonColor.b]; + dst.a = modulate_table[GFX3D_5TO6(texColor.a)][GFX3D_5TO6(materialColor.a)]>>1; + } + } + + } + break; + case 3: //shadows + //is this right? only with the material color? + dst = materialColor; + break; + case 4: //our own special mode which only uses the material color (for when texturing is disabled) + dst = materialColor; + break; + + } + } + + } shader; + + FORCEINLINE void pixel(int adr,float r, float g, float b, float invu, float invv, float w, float z) + { + Fragment &destFragment = screen[adr]; + FragmentColor &destFragmentColor = screenColor[adr]; + + u32 depth; + if(gfx3d.wbuffer) + { + //not sure about this + //this value was chosen to make the skybox, castle window decals, and water level render correctly in SM64 + depth = u32floor(4096*w); + } + else + { + depth = u32floor(z*0x7FFF); + depth <<= 9; + } + + if(polyAttr.decalMode) + { + if(depth != destFragment.depth) + { + goto depth_fail; + } + } + else + { + if(depth>=destFragment.depth) + { + goto depth_fail; + } + } + + //handle shadow polys + if(shader.mode == 3) + { + if(polyAttr.polyid == 0) + { + //that's right! stencil buffers, despite reports to the contrary, must be more than 1 bit + //this is necessary to make these cases work all at once. + //1. sm64 (standing near signs and blocks) + //2. mariokart (no glitches in shadow shape in kart selector) + //3. 
mariokart (no junk beneath platform in kart selector / no shadow beneath grate floor in bowser stage) + //(specifically, the shadows in mario kart are complicated concave shapes) + destFragment.stencil++; + goto rejected_fragment; + } + else + { + if(destFragment.stencil) + { + destFragment.stencil--; + goto rejected_fragment; + } + + //shadow polys have a special check here to keep from self-shadowing when user + //has tried to prevent it from happening + //if this isnt here, then the vehicle select in mariokart will look terrible + if(destFragment.polyid.opaque == polyAttr.polyid) + goto rejected_fragment; + } + } + + shader.w = w; + shader.invu = invu; + shader.invv = invv; + + //perspective-correct the colors + r = (r * w) + 0.5f; + g = (g * w) + 0.5f; + b = (b * w) + 0.5f; + + //this is a HACK: + //we are being very sloppy with our interpolation precision right now + //and rather than fix it, i just want to clamp it + shader.materialColor.r = max(0U,min(63U,u32floor(r))); + shader.materialColor.g = max(0U,min(63U,u32floor(g))); + shader.materialColor.b = max(0U,min(63U,u32floor(b))); + + shader.materialColor.a = polyAttr.alpha; + + //pixel shader + FragmentColor shaderOutput; + shader.shade(shaderOutput); + + //we shouldnt do any of this if we generated a totally transparent pixel + if(shaderOutput.a != 0) + { + //alpha test (don't have any test cases for this...? is it in the right place...?) 
+ if(gfx3d.enableAlphaTest) + { + if(shaderOutput.a < gfx3d.alphaTestRef) + goto rejected_fragment; + } + + //handle polyids + bool isOpaquePixel = shaderOutput.a == 31; + if(isOpaquePixel) + { + destFragment.polyid.opaque = polyAttr.polyid; + destFragment.isTranslucentPoly = polyAttr.translucent?1:0; + destFragment.fogged = polyAttr.fogged; + destFragmentColor = shaderOutput; + } + else + { + //dont overwrite pixels on translucent polys with the same polyids + if(destFragment.polyid.translucent == polyAttr.polyid) + goto rejected_fragment; + + //originally we were using a test case of shadows-behind-trees in sm64ds + //but, it looks bad in that game. this is actually correct + //if this isnt correct, then complex shape cart shadows in mario kart don't work right + destFragment.polyid.translucent = polyAttr.polyid; + + //alpha blending and write color + alphaBlend(destFragmentColor, shaderOutput); + + destFragment.fogged &= polyAttr.fogged; + } + + //depth writing + if(isOpaquePixel || polyAttr.translucentDepthWrite) + destFragment.depth = depth; + + } + + depth_fail: + rejected_fragment: + ; + } + + //draws a single scanline + FORCEINLINE void drawscanline(edge_fx_fl *pLeft, edge_fx_fl *pRight) + { + int XStart = pLeft->X; + int width = pRight->X - XStart; + + //these are the starting values, taken from the left edge + float invw = pLeft->invw.curr; + float u = pLeft->u.curr; + float v = pLeft->v.curr; + float z = pLeft->z.curr; + float color[3] = { + pLeft->color[0].curr, + pLeft->color[1].curr, + pLeft->color[2].curr }; + + //our dx values are taken from the steps up until the right edge + float invWidth = 1.0f / width; + float dinvw_dx = (pRight->invw.curr - invw) * invWidth; + float du_dx = (pRight->u.curr - u) * invWidth; + float dv_dx = (pRight->v.curr - v) * invWidth; + float dz_dx = (pRight->z.curr - z) * invWidth; + float dc_dx[3] = { + (pRight->color[0].curr - color[0]) * invWidth, + (pRight->color[1].curr - color[1]) * invWidth, + (pRight->color[2].curr 
- color[2]) * invWidth }; + + int adr = (pLeft->Y<<8)+XStart; + + //CONSIDER: in case some other math is wrong (shouldve been clipped OK), we might go out of bounds here. + //better check the Y value. + if(pLeft->Y<0 || pLeft->Y>191) { + printf("rasterizer rendering at y=%d! oops!\n",pLeft->Y); + return; + } + + int x = XStart; + + while(width-- > 0) + { + if(x<0 || x>255) { + printf("rasterizer rendering at x=%d! oops!\n",x); + return; + } + pixel(adr,color[0],color[1],color[2],u,v,1.0f/invw,z); + adr++; + x++; + + invw += dinvw_dx; + u += du_dx; + v += dv_dx; + z += dz_dx; + color[0] += dc_dx[0]; + color[1] += dc_dx[1]; + color[2] += dc_dx[2]; + } + } + + //runs several scanlines, until an edge is finished + template + void runscanlines(edge_fx_fl *left, edge_fx_fl *right) + { + //do not overstep either of the edges + int Height = min(left->Height,right->Height); + while(Height--) { + if(!SLI || (left->Y & SLI_MASK) == SLI_VALUE) + drawscanline(left,right); + left->Step(); + right->Step(); + } + } + + + //rotates verts counterclockwise + template + INLINE void rot_verts() { + #define ROTSWAP(X) if(type>X) swap(verts[X-1],verts[X]); + ROTSWAP(1); ROTSWAP(2); ROTSWAP(3); ROTSWAP(4); + ROTSWAP(5); ROTSWAP(6); ROTSWAP(7); ROTSWAP(8); ROTSWAP(9); + } + + //rotate verts until vert0.y is minimum, and then vert0.x is minimum in case of ties + //this is a necessary precondition for our shape engine + template + void sort_verts(bool backwards) { + //if the verts are backwards, reorder them first + if(backwards) + for(int i=0;iX) if(verts[0]->y > verts[X]->y) goto doswap; + CHECKY(1); CHECKY(2); CHECKY(3); CHECKY(4); + CHECKY(5); CHECKY(6); CHECKY(7); CHECKY(8); CHECKY(9); + break; + + doswap: + rot_verts(); + } + + while(verts[0]->y == verts[1]->y && verts[0]->x > verts[1]->x) + rot_verts(); + + } + + //This function can handle any convex N-gon up to octagons + //verts must be clockwise. 
+ //I didnt reference anything for this algorithm but it seems like I've seen it somewhere before. + template + void shape_engine(int type, bool backwards) + { + failure = false; + + switch(type) { + case 3: sort_verts<3>(backwards); break; + case 4: sort_verts<4>(backwards); break; + case 5: sort_verts<5>(backwards); break; + case 6: sort_verts<6>(backwards); break; + case 7: sort_verts<7>(backwards); break; + case 8: sort_verts<8>(backwards); break; + case 9: sort_verts<9>(backwards); break; + case 10: sort_verts<10>(backwards); break; + default: printf("skipping type %d\n",type); return; + } + + //we are going to step around the polygon in both directions starting from vert 0. + //right edges will be stepped over clockwise and left edges stepped over counterclockwise. + //these variables track that stepping, but in order to facilitate wrapping we start extra high + //for the counter we're decrementing. + int lv = type, rv = 0; + + edge_fx_fl left, right; + bool step_left = true, step_right = true; + for(;;) { + //generate new edges if necessary. 
we must avoid regenerating edges when they are incomplete + //so that they can be continued on down the shape + assert(rv != type); + int _lv = lv==type?0:lv; //make sure that we ask for vert 0 when the variable contains the starting value + if(step_left) left = edge_fx_fl(_lv,lv-1,(VERT**)&verts); + if(step_right) right = edge_fx_fl(rv,rv+1,(VERT**)&verts); + step_left = step_right = false; + + //handle a failure in the edge setup due to nutty polys + if(failure) + return; + + if(left.Height<0 || right.Height<0) + { + //i have NO IDEA WHY THIS HAPPENS + //but i think it was corrupting things in a bad way + //which was only revealed by the multicored rasterizer + return; + } + + runscanlines(&left,&right); + + //if we ran out of an edge, step to the next one + if(right.Height == 0) { + step_right = true; + rv++; + } + if(left.Height == 0) { + step_left = true; + lv--; + } + + //this is our completion condition: when our stepped edges meet in the middle + if(lv<=rv+1) break; + } + + } + + template + void mainLoop() + { + lastTexKey = NULL; + + //a counter for how many polys got culled + int culled = 0; + + u32 lastPolyAttr = 0; + u32 lastTextureFormat = 0, lastTexturePalette = 0; + + //iterate over polys + for(int i=0;ipolyAttr) + { + polyAttr.setup(poly->polyAttr); + polyAttr.translucent = poly->isTranslucent(); + lastPolyAttr = poly->polyAttr; + } + + + //if(i == 0 || lastTextureFormat != poly->texParam || lastTexturePalette != poly->texPalette) + { + sampler.setup(poly->texParam); + lastTextureFormat = poly->texParam; + lastTexturePalette = poly->texPalette; + } + + lastTexKey = polyTexKeys[i]; + + //hmm... 
shader gets setup every time because it depends on sampler which may have just changed + shader.setup(poly->polyAttr); + + for(int j=0;jverts[j] = &clippedPoly.clipVerts[j]; + for(int j=type;jverts[j] = NULL; + + polyAttr.backfacing = polyBackfacing[i]; + + shape_engine(type,!polyAttr.backfacing); + } + } + + +}; //rasterizerUnit + + +static Task rasterizerUnitTask[4]; +static RasterizerUnit rasterizerUnit[4]; +static int rasterizerCores; + +void* execRasterizerUnit(void* arg) +{ + s32 which = (s32)arg; + rasterizerUnit[which].mainLoop(); + return 0; } static char SoftRastInit(void) @@ -912,6 +963,37 @@ static char SoftRastInit(void) static bool tables_generated = false; if(!tables_generated) { + if(CommonSettings.num_cores>=4) + { + rasterizerCores = 4; + rasterizerUnit[0].SLI_MASK = 3; + rasterizerUnit[1].SLI_MASK = 3; + rasterizerUnit[2].SLI_MASK = 3; + rasterizerUnit[3].SLI_MASK = 3; + rasterizerUnit[0].SLI_VALUE = 0; + rasterizerUnit[1].SLI_VALUE = 1; + rasterizerUnit[2].SLI_VALUE = 2; + rasterizerUnit[3].SLI_VALUE = 3; + rasterizerUnitTask[0].start(false); + rasterizerUnitTask[1].start(false); + rasterizerUnitTask[2].start(false); + rasterizerUnitTask[3].start(false); + } else if(CommonSettings.num_cores>1) + { + rasterizerCores = 2; + rasterizerUnit[0].SLI_MASK = 1; + rasterizerUnit[1].SLI_MASK = 1; + rasterizerUnit[0].SLI_VALUE = 0; + rasterizerUnit[1].SLI_VALUE = 1; + rasterizerUnitTask[0].start(false); + rasterizerUnitTask[1].start(false); + } else { + rasterizerCores = 1; + rasterizerUnit[0].SLI_MASK = 0; + rasterizerUnit[0].SLI_VALUE = 0; + } + + tables_generated = true; clipper.clippedPolys = clippedPolys = new GFX3D_Clipper::TClippedPoly[POLYLIST_SIZE*2]; @@ -942,8 +1024,6 @@ static char SoftRastInit(void) } TexCache_Reset(); - TexCache_BindTexture = BindTexture; - TexCache_BindTextureData = BindTextureData; printf("SoftRast Initialized\n"); return 1; @@ -969,7 +1049,7 @@ static void SoftRastFramebufferProcess() // - the edges are completely 
sharp/opaque on the very brief title screen intro, // - the level-start intro gets a pseudo-antialiasing effect around the silhouette, // - the character edges in-level are clearly transparent, and also show well through shield powerups. - if(gfx3d.enableEdgeMarking) + if(gfx3d.enableEdgeMarking && CommonSettings.GFX3D_EdgeMark) { //TODO - need to test and find out whether these get grabbed at flush time, or at render time //we can do this by rendering a 3d frame and then freezing the system, but only changing the edge mark colors @@ -1039,7 +1119,7 @@ static void SoftRastFramebufferProcess() } } - if(gfx3d.enableFog) + if(gfx3d.enableFog && CommonSettings.GFX3D_Fog) { u32 r = GFX3D_5TO6((gfx3d.fogColor)&0x1F); u32 g = GFX3D_5TO6((gfx3d.fogColor>>5)&0x1F); @@ -1070,8 +1150,6 @@ static void SoftRastConvertFramebuffer() memcpy(gfx3d_convertedScreen,screenColor,256*192*4); } - - static void SoftRastRender() { Fragment clearFragment; @@ -1144,7 +1222,7 @@ static void SoftRastRender() } //setup fog variables (but only if fog is enabled) - if(gfx3d.enableFog) + if(gfx3d.enableFog && CommonSettings.GFX3D_Fog) { u8* fogDensity = MMU.MMU_MEM[ARMCPU_ARM9][0x40] + 0x360; #if 0 @@ -1260,29 +1338,20 @@ static void SoftRastRender() } } - //a counter for how many polys got culled - int culled = 0; - - u32 lastTextureFormat = 0, lastTexturePalette = 0, lastPolyAttr = 0; - - //iterate over polys + ADPCMCacheItem* lastTexKey = NULL; + u32 lastTextureFormat = 0, lastTexturePalette = 0; bool needInitTexture = true; for(int i=0;ipolyAttr) - { - polyAttr.setup(poly->polyAttr); - polyAttr.translucent = poly->isTranslucent(); - lastPolyAttr = poly->polyAttr; - } + + RasterizerUnit::PolyAttr polyAttr; + polyAttr.setup(poly->polyAttr); //HACK: backface culling //this should be moved to gfx3d, but first we need to redo the way the lists are built @@ -1303,26 +1372,19 @@ static void SoftRastRender() // this version should handle those cases better. 
int n = type - 1; float facing = (verts[0].y + verts[n].y) * (verts[0].x - verts[n].x) - + (verts[1].y + verts[0].y) * (verts[1].x - verts[0].x) - + (verts[2].y + verts[1].y) * (verts[2].x - verts[1].x); - for(int i = 2; i < n; i++) - facing += (verts[i+1].y + verts[i].y) * (verts[i+1].x - verts[i].x); - polyAttr.backfacing = (facing < 0); + + (verts[1].y + verts[0].y) * (verts[1].x - verts[0].x) + + (verts[2].y + verts[1].y) * (verts[2].x - verts[1].x); + for(int j = 2; j < n; j++) + facing += (verts[j+1].y + verts[j].y) * (verts[j+1].x - verts[j].x); + polyBackfacing[i] = polyAttr.backfacing = (facing < 0); #endif if(!polyAttr.isVisible(polyAttr.backfacing)) { - culled++; + polyVisible[i] = false; continue; } - - if(needInitTexture || lastTextureFormat != poly->texParam || lastTexturePalette != poly->texPalette) - { - TexCache_SetTexture(poly->texParam,poly->texPalette); - sampler.setup(poly->texParam); - lastTextureFormat = poly->texParam; - lastTexturePalette = poly->texPalette; - needInitTexture = false; - } + + polyVisible[i] = true; //here is a hack which needs to be removed. //at some point our shape engine needs these to be converted to "fixed point" @@ -1331,15 +1393,31 @@ static void SoftRastRender() for(int k=0;k<2;k++) verts[j].coord[k] = (float)iround(16.0f * verts[j].coord[k]); - //hmm... 
shader gets setup every time because it depends on sampler which may have just changed - shader.setup(poly->polyAttr); + //make sure all the textures we'll need are cached + if(needInitTexture || lastTextureFormat != poly->texParam || lastTexturePalette != poly->texPalette) + { + lastTexKey = TexCache_SetTexture(TexFormat_15bpp,poly->texParam,poly->texPalette); + lastTextureFormat = poly->texParam; + lastTexturePalette = poly->texPalette; + needInitTexture = false; + } - for(int j=0;jtexParam,rasterizerUnit[0].textures.currentNum); + polyTexKeys[i] = lastTexKey; } + if(rasterizerCores==1) + { + rasterizerUnit[0].mainLoop(); + } + else + { + for(int i=0;icount-culled,gfx3d.polylist->count); @@ -1354,3 +1432,4 @@ GPU3DInterface gpu3DRasterize = { SoftRastRender, SoftRastVramReconfigureSignal, }; + diff --git a/desmume/src/rasterize.h b/desmume/src/rasterize.h index 9f24e2c04..3ab76d57f 100644 --- a/desmume/src/rasterize.h +++ b/desmume/src/rasterize.h @@ -1,8 +1,4 @@ -/* Copyright (C) 2006 yopyop - yopyop156@ifrance.com - yopyop156.ifrance.com - - Copyright 2009 DeSmuME team +/* Copyright 2009 DeSmuME team This file is part of DeSmuME diff --git a/desmume/src/texcache.cpp b/desmume/src/texcache.cpp index 18af01ed2..37fd51221 100644 --- a/desmume/src/texcache.cpp +++ b/desmume/src/texcache.cpp @@ -1,7 +1,8 @@ -#include "texcache.h" - #include #include +#include + +#include "texcache.h" #include "bits.h" #include "common.h" @@ -15,6 +16,8 @@ using std::max; //only dump this from ogl renderer. for now, softrasterizer creates things in an incompatible pixel format //#define DEBUG_DUMP_TEXTURE +#define CONVERT(color,alpha) ((TEXFORMAT == TexFormat_32bpp)?(RGB15TO32(color,alpha)):RGB15TO6665(color,alpha)) + //This class represents a number of regions of memory which should be viewed as contiguous class MemSpan { @@ -54,6 +57,8 @@ public: return 0; } + //TODO - get rid of duplication between these two methods. 
+ //dumps the memspan to the specified buffer //you may set size to limit the size to be copied int dump(void* buf, int size=-1) @@ -160,12 +165,6 @@ static MemSpan MemSpan_TexPalette(u32 ofs, u32 len) return ret; } -TextureCache *texcache; -u32 texcache_start; -u32 texcache_stop; -u8 *TexCache_texMAP = NULL; - - #if defined (DEBUG_DUMP_TEXTURE) && defined (WIN32) #define DO_DEBUG_DUMP_TEXTURE static void DebugDumpTexture(int which) @@ -178,476 +177,527 @@ static void DebugDumpTexture(int which) #endif -static int lastTexture = -1; -#define CONVERT(color,alpha) ((TEXFORMAT == TexFormat_32bpp)?(RGB15TO32(color,alpha)):RGB15TO6665(color,alpha)) - -template -void TexCache_SetTexture(u32 format, u32 texpal) +//notes on the cache: +//I am really unhappy with the ref counting. this needs to be automatic. +//We could do something better than a linear search through cache items, but it may not be worth it. +//Also we may need to rescan more often (every time a sample loops) +class ADPCMCache { - //for each texformat, number of palette entries - const int palSizes[] = {0, 32, 4, 16, 256, 0, 8, 0}; +public: + ADPCMCache() + : list_front(NULL) + , list_back(NULL) + , cache_size(0) + {} - //for each texformat, multiplier from numtexels to numbytes (fixed point 30.2) - const int texSizes[] = {0, 4, 1, 2, 4, 1, 4, 8}; + ADPCMCacheItem *list_front, *list_back; - //used to hold a copy of the palette specified for this texture - u16 pal[256]; + //this ought to be enough for anyone + static const u32 kMaxCacheSize = 64*1024*1024; + //this is not really precise, it is off by a constant factor + u32 cache_size; - u32 *dwdst = (u32*)TexCache_texMAP; - - u32 textureMode = (unsigned short)((format>>26)&0x07); - unsigned int sizeX=(8 << ((format>>20)&0x07)); - unsigned int sizeY=(8 << ((format>>23)&0x07)); - unsigned int imageSize = sizeX*sizeY; - - u8 *adr; - - u32 paletteAddress; - - switch (textureMode) - { - case TEXMODE_I2: - paletteAddress = texpal<<3; - break; - case TEXMODE_A3I5: 
//a3i5 - case TEXMODE_I4: //i4 - case TEXMODE_I8: //i8 - case TEXMODE_A5I3: //a5i3 - case TEXMODE_16BPP: //16bpp - case TEXMODE_4X4: //4x4 - default: - paletteAddress = texpal<<4; - break; + void list_remove(ADPCMCacheItem* item) { + if(item->next) item->next->prev = item->prev; + if(item->prev) item->prev->next = item->next; + if(item == list_front) list_front = item->next; + if(item == list_back) list_back = item->prev; } - //analyze the texture memory mapping and the specifications of this texture - int palSize = palSizes[textureMode]; - int texSize = (imageSize*texSizes[textureMode])>>2; //shifted because the texSizes multiplier is fixed point - MemSpan ms = MemSpan_TexMem((format&0xFFFF)<<3,texSize); - MemSpan mspal = MemSpan_TexPalette(paletteAddress,palSize*2); - - //determine the location for 4x4 index data - u32 indexBase; - if((format & 0xc000) == 0x8000) indexBase = 0x30000; - else indexBase = 0x20000; - - u32 indexOffset = (format&0x3FFF)<<2; - - int indexSize = 0; - MemSpan msIndex; - if(textureMode == TEXMODE_4X4) + void list_push_front(ADPCMCacheItem* item) { - indexSize = imageSize>>3; - msIndex = MemSpan_TexMem(indexOffset+indexBase,indexSize); + item->next = list_front; + if(list_front) list_front->prev = item; + else list_back = item; + item->prev = NULL; + list_front = item; } - - //dump the palette to a temp buffer, so that we don't have to worry about memory mapping. - //this isnt such a problem with texture memory, because we read sequentially from it. - //however, we read randomly from palette memory, so the mapping is more costly. 
-#ifdef WORDS_BIGENDIAN - mspal.dump16(pal); -#else - mspal.dump(pal); -#endif - - - u32 tx=texcache_start; - - //if(false) - while (TRUE) + template + ADPCMCacheItem* scan(u32 format, u32 texpal) { - //conditions where we give up and regenerate the texture: - if (texcache_stop == tx) break; - if (texcache[tx].frm == 0) break; + //for each texformat, number of palette entries + static const int palSizes[] = {0, 32, 4, 16, 256, 0, 8, 0}; - //conditions where we reject matches: - //when the teximage or texpal params dont match - //(this is our key for identifying palettes in the cache) - if (texcache[tx].frm != format) goto REJECT; - if (texcache[tx].pal != texpal) goto REJECT; + //for each texformat, multiplier from numtexels to numbytes (fixed point 30.2) + static const int texSizes[] = {0, 4, 1, 2, 4, 1, 4, 8}; - //the texture matches params, but isnt suspected invalid. accept it. - if (!texcache[tx].suspectedInvalid) goto ACCEPT; + //used to hold a copy of the palette specified for this texture + u16 pal[256]; - //if we couldnt cache this entire texture due to it being too large, then reject it - if (texSize+indexSize > (int)sizeof(texcache[tx].dump.texture)) goto REJECT; + u32 textureMode = (unsigned short)((format>>26)&0x07); + u32 sizeX=(8 << ((format>>20)&0x07)); + u32 sizeY=(8 << ((format>>23)&0x07)); + u32 imageSize = sizeX*sizeY; - //when the palettes dont match: - //note that we are considering 4x4 textures to have a palette size of 0. 
- //they really have a potentially HUGE palette, too big for us to handle like a normal palette, - //so they go through a different system - if (mspal.size != 0 && memcmp(texcache[tx].dump.palette,pal,mspal.size)) goto REJECT; + u8 *adr; - //when the texture data doesn't match - if(ms.memcmp(texcache[tx].dump.texture,sizeof(texcache[tx].dump.texture))) goto REJECT; + u32 paletteAddress; - //if the texture is 4x4 then the index data must match + switch (textureMode) + { + case TEXMODE_I2: + paletteAddress = texpal<<3; + break; + case TEXMODE_A3I5: //a3i5 + case TEXMODE_I4: //i4 + case TEXMODE_I8: //i8 + case TEXMODE_A5I3: //a5i3 + case TEXMODE_16BPP: //16bpp + case TEXMODE_4X4: //4x4 + default: + paletteAddress = texpal<<4; + break; + } + + //analyze the texture memory mapping and the specifications of this texture + int palSize = palSizes[textureMode]; + int texSize = (imageSize*texSizes[textureMode])>>2; //shifted because the texSizes multiplier is fixed point + MemSpan ms = MemSpan_TexMem((format&0xFFFF)<<3,texSize); + MemSpan mspal = MemSpan_TexPalette(paletteAddress,palSize*2); + + //determine the location for 4x4 index data + u32 indexBase; + if((format & 0xc000) == 0x8000) indexBase = 0x30000; + else indexBase = 0x20000; + + u32 indexOffset = (format&0x3FFF)<<2; + + int indexSize = 0; + MemSpan msIndex; if(textureMode == TEXMODE_4X4) { - if(msIndex.memcmp(texcache[tx].dump.texture + texcache[tx].dump.textureSize,texcache[tx].dump.indexSize)) goto REJECT; + indexSize = imageSize>>3; + msIndex = MemSpan_TexMem(indexOffset+indexBase,indexSize); } -ACCEPT: - texcache[tx].suspectedInvalid = false; - if(lastTexture == -1 || (int)tx != lastTexture) + //dump the palette to a temp buffer, so that we don't have to worry about memory mapping. + //this isnt such a problem with texture memory, because we read sequentially from it. + //however, we read randomly from palette memory, so the mapping is more costly. 
+ #ifdef WORDS_BIGENDIAN + mspal.dump16(pal); + #else + mspal.dump(pal); + #endif + + for(ADPCMCacheItem* curr = list_front;curr;curr=curr->next) { - lastTexture = tx; - if(TexCache_BindTexture) - TexCache_BindTexture(tx); - } - return; - -REJECT: - tx++; - if ( tx > MAX_TEXTURE ) - { - texcache_stop=texcache_start; - texcache[texcache_stop].frm=0; - texcache_start++; - if (texcache_start>MAX_TEXTURE) + //conditions where we reject matches: + //when the teximage or texpal params dont match + //(this is our key for identifying textures in the cache) + if(curr->texformat != format) continue; + if(curr->texpal != texpal) continue; + + //we're being asked for a different format than what we had cached. + if(curr->cacheFormat != TEXFORMAT) goto REJECT; + + //not used anymore -- add another method to purge suspicious items from the cache + //the texture matches params, but isnt suspected invalid. accept it. + if (!curr->suspectedInvalid) return curr; + + //when the palettes dont match: + //note that we are considering 4x4 textures to have a palette size of 0. 
+ //they really have a potentially HUGE palette, too big for us to handle like a normal palette, + //so they go through a different system + if(mspal.size != 0 && memcmp(curr->dump.palette,pal,mspal.size)) goto REJECT; + + //when the texture data doesn't match + if(ms.memcmp(curr->dump.texture,sizeof(curr->dump.texture))) goto REJECT; + + //if the texture is 4x4 then the index data must match + if(textureMode == TEXMODE_4X4) { - texcache_start=0; - texcache_stop=MAX_TEXTURE<<1; - } - tx=0; - } - } - - lastTexture = tx; - //glBindTexture(GL_TEXTURE_2D, texcache[tx].id); - - texcache[tx].suspectedInvalid = false; - texcache[tx].frm=format; - texcache[tx].mode=textureMode; - texcache[tx].pal=texpal; - texcache[tx].sizeX=sizeX; - texcache[tx].sizeY=sizeY; - texcache[tx].invSizeX=1.0f/((float)(sizeX)); - texcache[tx].invSizeY=1.0f/((float)(sizeY)); - texcache[tx].dump.textureSize = ms.dump(texcache[tx].dump.texture,sizeof(texcache[tx].dump.texture)); - - //dump palette data for cache keying - if ( palSize ) - { - memcpy(texcache[tx].dump.palette, pal, palSize*2); - } - //dump 4x4 index data for cache keying - texcache[tx].dump.indexSize = 0; - if(textureMode == TEXMODE_4X4) - { - texcache[tx].dump.indexSize = min(msIndex.size,(int)sizeof(texcache[tx].dump.texture) - texcache[tx].dump.textureSize); - msIndex.dump(texcache[tx].dump.texture+texcache[tx].dump.textureSize,texcache[tx].dump.indexSize); - } - - - //INFO("Texture %03i - format=%08X; pal=%04X (mode %X, width %04i, height %04i)\n",i, texcache[i].frm, texcache[i].pal, texcache[i].mode, sizeX, sizeY); - - //============================================================================ Texture conversion - const u32 opaqueColor = TEXFORMAT==TexFormat_32bpp?255:31; - u32 palZeroTransparent = (1-((format>>29)&1))*opaqueColor; - - switch (texcache[tx].mode) - { - case TEXMODE_A3I5: - { - for(int j=0;j>5; - if(TEXFORMAT == TexFormat_15bpp) - *dwdst++ = RGB15TO6665(c,material_3bit_to_5bit[alpha]); - else - *dwdst++ = 
RGB15TO32(c,material_3bit_to_8bit[alpha]); - adr++; - } + if(msIndex.memcmp(curr->dump.texture + curr->dump.textureSize,curr->dump.indexSize)) goto REJECT; } + //we found a match. just return it + //curr->lock(); + list_remove(curr); + list_push_front(curr); + return curr; + + REJECT: + //we found a cached item for the current address, but the data is stale. + //for a variety of complicated reasons, we need to throw it out right this instant. + list_remove(curr); + delete curr; break; } - case TEXMODE_I2: + + //item was not found. recruit an existing one (the oldest), or create a new one + //evict(); //reduce the size of the cache if necessary + //TODO - as a peculiarity of the texcache, eviction must happen after the entire 3d frame runs + //to support separate cache and read passes + ADPCMCacheItem* newitem = new ADPCMCacheItem(); + list_push_front(newitem); + //newitem->lock(); + newitem->suspectedInvalid = false; + newitem->texformat = format; + newitem->cacheFormat = TEXFORMAT; + newitem->texpal = texpal; + newitem->sizeX=sizeX; + newitem->sizeY=sizeY; + newitem->invSizeX=1.0f/((float)(sizeX)); + newitem->invSizeY=1.0f/((float)(sizeY)); + newitem->dump.textureSize = ms.dump(newitem->dump.texture,sizeof(newitem->dump.texture)); + newitem->decode_len = sizeX*sizeY*4; + newitem->mode = textureMode; + cache_size += newitem->decode_len; + newitem->decoded = new u8[newitem->decode_len]; + + u32 *dwdst = (u32*)newitem->decoded; + + + //dump palette data for cache keying + if(palSize) { - for(int j=0;j>2)&0x3; - c = pal[bits]; - *dwdst++ = CONVERT(c,(bits == 0) ? palZeroTransparent : opaqueColor); - - bits = ((*adr)>>4)&0x3; - c = pal[bits]; - *dwdst++ = CONVERT(c,(bits == 0) ? palZeroTransparent : opaqueColor); - - bits = ((*adr)>>6)&0x3; - c = pal[bits]; - *dwdst++ = CONVERT(c,(bits == 0) ? 
palZeroTransparent : opaqueColor); - - adr++; - } - } - break; + memcpy(newitem->dump.palette, pal, palSize*2); } - case TEXMODE_I4: + //dump 4x4 index data for cache keying + newitem->dump.indexSize = 0; + if(textureMode == TEXMODE_4X4) { - for(int j=0;j>4); - c = pal[bits]; - *dwdst++ = CONVERT(c,(bits == 0) ? palZeroTransparent : opaqueColor); - adr++; - } - } - break; + newitem->dump.indexSize = min(msIndex.size,(int)sizeof(newitem->dump.texture) - newitem->dump.textureSize); + msIndex.dump(newitem->dump.texture+newitem->dump.textureSize,newitem->dump.indexSize); } - case TEXMODE_I8: + + //============================================================================ + //Texture conversion + //============================================================================ + + const u32 opaqueColor = TEXFORMAT==TexFormat_32bpp?255:31; + u32 palZeroTransparent = (1-((format>>29)&1))*opaqueColor; + + switch (newitem->mode) { - for(int j=0;j>14)] + ((paletteAddress + (offset)*2)&0x3FFF) ) ) - - u16* slot1; - u32* map = (u32*)ms.items[0].ptr; - u32 limit = ms.items[0].len<<2; - u32 d = 0; - if ( (texcache[tx].frm & 0xc000) == 0x8000) - // texel are in slot 2 - slot1=(u16*)&MMU.texInfo.textureSlotAddr[1][((texcache[tx].frm & 0x3FFF)<<2)+0x010000]; - else - slot1=(u16*)&MMU.texInfo.textureSlotAddr[1][(texcache[tx].frm & 0x3FFF)<<2]; - - u16 yTmpSize = (texcache[tx].sizeY>>2); - u16 xTmpSize = (texcache[tx].sizeX>>2); - - //this is flagged whenever a 4x4 overruns its slot. 
- //i am guessing we just generate black in that case - bool dead = false; - - for (int y = 0; y < yTmpSize; y ++) + case TEXMODE_A3I5: { - u32 tmpPos[4]={(y<<2)*texcache[tx].sizeX,((y<<2)+1)*texcache[tx].sizeX, - ((y<<2)+2)*texcache[tx].sizeX,((y<<2)+3)*texcache[tx].sizeX}; - for (int x = 0; x < xTmpSize; x ++, d++) - { - if(d >= limit) - dead = true; + for(int j=0;j>5; + if(TEXFORMAT == TexFormat_15bpp) + *dwdst++ = RGB15TO6665(c,material_3bit_to_5bit[alpha]); + else + *dwdst++ = RGB15TO32(c,material_3bit_to_8bit[alpha]); + adr++; + } + } + break; + } - if(dead) { + case TEXMODE_I2: + { + for(int j=0;j>2)&0x3; + c = pal[bits]; + *dwdst++ = CONVERT(c,(bits == 0) ? palZeroTransparent : opaqueColor); + + bits = ((*adr)>>4)&0x3; + c = pal[bits]; + *dwdst++ = CONVERT(c,(bits == 0) ? palZeroTransparent : opaqueColor); + + bits = ((*adr)>>6)&0x3; + c = pal[bits]; + *dwdst++ = CONVERT(c,(bits == 0) ? palZeroTransparent : opaqueColor); + + adr++; + } + } + break; + } + case TEXMODE_I4: + { + for(int j=0;j>4); + c = pal[bits]; + *dwdst++ = CONVERT(c,(bits == 0) ? palZeroTransparent : opaqueColor); + adr++; + } + } + break; + } + case TEXMODE_I8: + { + for(int j=0;j>14)] + ((paletteAddress + (offset)*2)&0x3FFF) ) ) + + u16* slot1; + u32* map = (u32*)ms.items[0].ptr; + u32 limit = ms.items[0].len<<2; + u32 d = 0; + if ( (format & 0xc000) == 0x8000) + // texel are in slot 2 + slot1=(u16*)&MMU.texInfo.textureSlotAddr[1][((format & 0x3FFF)<<2)+0x010000]; + else + slot1=(u16*)&MMU.texInfo.textureSlotAddr[1][(format & 0x3FFF)<<2]; + + u16 yTmpSize = (sizeY>>2); + u16 xTmpSize = (sizeX>>2); + + //this is flagged whenever a 4x4 overruns its slot. 
+ //i am guessing we just generate black in that case + bool dead = false; + + for (int y = 0; y < yTmpSize; y ++) + { + u32 tmpPos[4]={(y<<2)*sizeX,((y<<2)+1)*sizeX, + ((y<<2)+2)*sizeX,((y<<2)+3)*sizeX}; + for (int x = 0; x < xTmpSize; x ++, d++) + { + if(d >= limit) + dead = true; + + if(dead) { + for (int sy = 0; sy < 4; sy++) + { + u32 currentPos = (x<<2) + tmpPos[sy]; + dwdst[currentPos] = dwdst[currentPos+1] = dwdst[currentPos+2] = dwdst[currentPos+3] = 0; + } + continue; + } + + u32 currBlock = map[d]; + u16 pal1 = slot1[d]; + u16 pal1offset = (pal1 & 0x3FFF)<<1; + u8 mode = pal1>>14; + u32 tmp_col[4]; + + tmp_col[0]=RGB16TO32(PAL4X4(pal1offset),255); + tmp_col[1]=RGB16TO32(PAL4X4(pal1offset+1),255); + + switch (mode) + { + case 0: + tmp_col[2]=RGB16TO32(PAL4X4(pal1offset+2),255); + tmp_col[3]=RGB16TO32(0x7FFF,0); + break; + case 1: + tmp_col[2]=(((tmp_col[0]&0xFF)+(tmp_col[1]&0xff))>>1)| + (((tmp_col[0]&(0xFF<<8))+(tmp_col[1]&(0xFF<<8)))>>1)| + (((tmp_col[0]&(0xFF<<16))+(tmp_col[1]&(0xFF<<16)))>>1)| + (0xff<<24); + tmp_col[3]=RGB16TO32(0x7FFF,0); + break; + case 2: + tmp_col[2]=RGB16TO32(PAL4X4(pal1offset+2),255); + tmp_col[3]=RGB16TO32(PAL4X4(pal1offset+3),255); + break; + case 3: + { + u32 red1, red2; + u32 green1, green2; + u32 blue1, blue2; + u16 tmp1, tmp2; + + red1=tmp_col[0]&0xff; + green1=(tmp_col[0]>>8)&0xff; + blue1=(tmp_col[0]>>16)&0xff; + red2=tmp_col[1]&0xff; + green2=(tmp_col[1]>>8)&0xff; + blue2=(tmp_col[1]>>16)&0xff; + + tmp1=((red1*5+red2*3)>>6)| + (((green1*5+green2*3)>>6)<<5)| + (((blue1*5+blue2*3)>>6)<<10); + tmp2=((red2*5+red1*3)>>6)| + (((green2*5+green1*3)>>6)<<5)| + (((blue2*5+blue1*3)>>6)<<10); + + tmp_col[2]=RGB16TO32(tmp1,255); + tmp_col[3]=RGB16TO32(tmp2,255); + break; + } + } + + if(TEXFORMAT==TexFormat_15bpp) + { + for(int i=0;i<4;i++) + { + tmp_col[i] >>= 2; + tmp_col[i] &= 0x3F3F3F3F; + u32 a = tmp_col[i]>>24; + tmp_col[i] &= 0x00FFFFFF; + tmp_col[i] |= (a>>1)<<24; + } + } + + //TODO - this could be more precise for 32bpp 
mode (run it through the color separation table) + + //set all 16 texels for (int sy = 0; sy < 4; sy++) { + // Texture offset u32 currentPos = (x<<2) + tmpPos[sy]; - dwdst[currentPos] = dwdst[currentPos+1] = dwdst[currentPos+2] = dwdst[currentPos+3] = 0; + u8 currRow = (u8)((currBlock>>(sy<<3))&0xFF); + + dwdst[currentPos] = tmp_col[currRow&3]; + dwdst[currentPos+1] = tmp_col[(currRow>>2)&3]; + dwdst[currentPos+2] = tmp_col[(currRow>>4)&3]; + dwdst[currentPos+3] = tmp_col[(currRow>>6)&3]; } - continue; + + } - - u32 currBlock = map[d]; - u16 pal1 = slot1[d]; - u16 pal1offset = (pal1 & 0x3FFF)<<1; - u8 mode = pal1>>14; - u32 tmp_col[4]; - - tmp_col[0]=RGB16TO32(PAL4X4(pal1offset),255); - tmp_col[1]=RGB16TO32(PAL4X4(pal1offset+1),255); - - switch (mode) - { - case 0: - tmp_col[2]=RGB16TO32(PAL4X4(pal1offset+2),255); - tmp_col[3]=RGB16TO32(0x7FFF,0); - break; - case 1: - tmp_col[2]=(((tmp_col[0]&0xFF)+(tmp_col[1]&0xff))>>1)| - (((tmp_col[0]&(0xFF<<8))+(tmp_col[1]&(0xFF<<8)))>>1)| - (((tmp_col[0]&(0xFF<<16))+(tmp_col[1]&(0xFF<<16)))>>1)| - (0xff<<24); - tmp_col[3]=RGB16TO32(0x7FFF,0); - break; - case 2: - tmp_col[2]=RGB16TO32(PAL4X4(pal1offset+2),255); - tmp_col[3]=RGB16TO32(PAL4X4(pal1offset+3),255); - break; - case 3: - { - u32 red1, red2; - u32 green1, green2; - u32 blue1, blue2; - u16 tmp1, tmp2; - - red1=tmp_col[0]&0xff; - green1=(tmp_col[0]>>8)&0xff; - blue1=(tmp_col[0]>>16)&0xff; - red2=tmp_col[1]&0xff; - green2=(tmp_col[1]>>8)&0xff; - blue2=(tmp_col[1]>>16)&0xff; - - tmp1=((red1*5+red2*3)>>6)| - (((green1*5+green2*3)>>6)<<5)| - (((blue1*5+blue2*3)>>6)<<10); - tmp2=((red2*5+red1*3)>>6)| - (((green2*5+green1*3)>>6)<<5)| - (((blue2*5+blue1*3)>>6)<<10); - - tmp_col[2]=RGB16TO32(tmp1,255); - tmp_col[3]=RGB16TO32(tmp2,255); - break; - } - } - - if(TEXFORMAT==TexFormat_15bpp) - { - for(int i=0;i<4;i++) - { - tmp_col[i] >>= 2; - tmp_col[i] &= 0x3F3F3F3F; - u32 a = tmp_col[i]>>24; - tmp_col[i] &= 0x00FFFFFF; - tmp_col[i] |= (a>>1)<<24; - } - } - - //TODO - this could be 
more precise for 32bpp mode (run it through the color separation table) - - //set all 16 texels - for (int sy = 0; sy < 4; sy++) - { - // Texture offset - u32 currentPos = (x<<2) + tmpPos[sy]; - u8 currRow = (u8)((currBlock>>(sy<<3))&0xFF); - - dwdst[currentPos] = tmp_col[currRow&3]; - dwdst[currentPos+1] = tmp_col[(currRow>>2)&3]; - dwdst[currentPos+2] = tmp_col[(currRow>>4)&3]; - dwdst[currentPos+3] = tmp_col[(currRow>>6)&3]; - } - - } + + + break; } - - - break; - } - case TEXMODE_A5I3: - { - for(int j=0;j>3); - if(TEXFORMAT == TexFormat_15bpp) - *dwdst++ = RGB15TO6665(c,alpha); - else - *dwdst++ = RGB15TO32(c,material_5bit_to_8bit[alpha]); - adr++; + case TEXMODE_A5I3: + { + for(int j=0;j>3); + if(TEXFORMAT == TexFormat_15bpp) + *dwdst++ = RGB15TO6665(c,alpha); + else + *dwdst++ = RGB15TO32(c,material_5bit_to_8bit[alpha]); + adr++; + } } + break; } - break; - } - case TEXMODE_16BPP: - { - for(int j=0;j>1; - for(int x = 0; x < len; ++x) - { - u16 c = map[x]; - int alpha = ((c&0x8000)?opaqueColor:0); - *dwdst++ = CONVERT(c&0x7FFF,alpha); + case TEXMODE_16BPP: + { + for(int j=0;j>1; + for(int x = 0; x < len; ++x) + { + u16 c = map[x]; + int alpha = ((c&0x8000)?opaqueColor:0); + *dwdst++ = CONVERT(c&0x7FFF,alpha); + } } + break; } - break; - } - } + } //switch(texture format) - if(TexCache_BindTextureData != 0) - TexCache_BindTextureData(tx,TexCache_texMAP); + /*if(user) + user->BindTextureData(tx,TexCache_texMAP); #ifdef DO_DEBUG_DUMP_TEXTURE DebugDumpTexture(tx); -#endif +#endif*/ -} + return newitem; + + + } //scan() + + void evict(const u32 target = kMaxCacheSize) { + //evicts old cache items until it is less than the max cache size + //this means we actually can exceed the cache by the size of the next item. 
+ //if we really wanted to hold ourselves to it, we could evict to kMaxCacheSize-nextItemSize + while(cache_size > target) + { + ADPCMCacheItem *oldest = list_back; + while(oldest && oldest->lockCount>0) oldest = oldest->prev; //find an unlocked one + if(!oldest) + { + //nothing we can do, everything in the cache is locked. maybe we're leaking. + //just quit trying to evict + return; + } + list_remove(oldest); + cache_size -= oldest->decode_len; + //printf("evicting! totalsize:%d\n",cache_size); + delete oldest; + } + } +} adpcmCache; void TexCache_Reset() { - if(TexCache_texMAP == NULL) TexCache_texMAP = new u8[1024*2048*4]; - if(texcache == NULL) texcache = new TextureCache[MAX_TEXTURE+1]; + //if(TexCache_texMAP == NULL) TexCache_texMAP = new u8[1024*2048*4]; + //if(texcache == NULL) texcache = new TextureCache[MAX_TEXTURE+1]; - memset(texcache,0,sizeof(TextureCache[MAX_TEXTURE+1])); + //memset(texcache,0,sizeof(TextureCache[MAX_TEXTURE+1])); - texcache_start=0; - texcache_stop=MAX_TEXTURE<<1; -} - -TextureCache* TexCache_Curr() -{ - if(lastTexture == -1) - return NULL; - else return &texcache[lastTexture]; + //texcache_start=0; + //texcache_stop=MAX_TEXTURE<<1; + adpcmCache.evict(0); } void TexCache_Invalidate() { - //well, this is a very blunt instrument. - //lets just flag all the textures as invalid. 
- for(int i=0;i(format,texpal); + case TexFormat_15bpp: return adpcmCache.scan(format,texpal); + default: assert(false); return NULL; } } -void (*TexCache_BindTexture)(u32 texnum) = NULL; -void (*TexCache_BindTextureData)(u32 texnum, u8* data); - -//these templates needed to be instantiated manually -template void TexCache_SetTexture(u32 format, u32 texpal); -template void TexCache_SetTexture(u32 format, u32 texpal); +//call this periodically to keep the tex cache clean +void TexCache_EvictFrame() +{ + adpcmCache.evict(); +} diff --git a/desmume/src/texcache.h b/desmume/src/texcache.h index d51de2f0f..8f1fe6b74 100644 --- a/desmume/src/texcache.h +++ b/desmume/src/texcache.h @@ -5,47 +5,62 @@ enum TexCache_TexFormat { - TexFormat_32bpp, - TexFormat_15bpp + TexFormat_None, //used when nothing yet is cached + TexFormat_32bpp, //used by ogl renderer + TexFormat_15bpp //used by rasterizer }; -#define MAX_TEXTURE 500 - - -struct CACHE_ALIGN TextureCache +class ADPCMCacheItem { - u32 id; - u32 frm; - u32 mode; - u32 pal; - u32 sizeX; - u32 sizeY; - float invSizeX; - float invSizeY; +public: + ADPCMCacheItem() + : decoded(NULL) + , decode_len(0) + , next(NULL) + , prev(NULL) + , lockCount(0) + , cacheFormat(TexFormat_None) + , deleteCallback(NULL) + , suspectedInvalid(false) + {} + ~ADPCMCacheItem() { + delete[] decoded; + if(deleteCallback) deleteCallback(this); + } + void unlock() { + lockCount--; + } + void lock() { + lockCount++; + } + u32 decode_len; + u32 mode; + u8* decoded; //decoded texture data + ADPCMCacheItem *next, *prev; //double linked list + int lockCount; + bool suspectedInvalid; + u32 texformat, texpal; + u32 sizeX, sizeY; + float invSizeX, invSizeY; + + void* texid; //used by ogl renderer for the texid + void (*deleteCallback)(ADPCMCacheItem*); + + TexCache_TexFormat cacheFormat; + + //TODO - this is a little wasteful struct { - int textureSize, indexSize; - u8 texture[128*1024]; // 128Kb texture slot - u8 palette[256*2]; + int textureSize, indexSize; + 
u8 texture[128*1024]; // 128Kb texture slot + u8 palette[256*2]; } dump; - - //set if this texture is suspected be invalid due to a vram reconfigure - bool suspectedInvalid; }; -extern TextureCache *texcache; - -extern void (*TexCache_BindTexture)(u32 texnum); -extern void (*TexCache_BindTextureData)(u32 texnum, u8* data); - -void TexCache_Reset(); - -template -void TexCache_SetTexture(u32 format, u32 texpal); - void TexCache_Invalidate(); +void TexCache_Reset(); +void TexCache_EvictFrame(); -extern u8 *TexCache_texMAP; -TextureCache* TexCache_Curr(); +ADPCMCacheItem* TexCache_SetTexture(TexCache_TexFormat TEXFORMAT, u32 format, u32 texpal); #endif diff --git a/desmume/src/utils/task.cpp b/desmume/src/utils/task.cpp new file mode 100644 index 000000000..b76b8ef27 --- /dev/null +++ b/desmume/src/utils/task.cpp @@ -0,0 +1,279 @@ +/* Copyright 2009 DeSmuME team + + This file is part of DeSmuME + + DeSmuME is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + DeSmuME is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with DeSmuME; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include "task.h" + +#ifdef _WIN32 + +#include +#include + +class Task::Impl { +public: + Impl(); + ~Impl(); + + bool spinlock; + + void start(bool spinlock); + + //execute some work + void execute(const TWork &work, void* param); + + //wait for the work to complete + void* finish(); + + static DWORD __stdcall s_taskProc(void *ptr); + void taskProc(); + void init(); + + //the work function that shall be executed + TWork work; + void* param; + + HANDLE incomingWork, workDone, hThread; + volatile bool bIncomingWork, bWorkDone, bKill; + bool bStarted; +}; + +static void* killTask(void* task) +{ + ((Task::Impl*)task)->bKill = true; + return 0; +} + +Task::Impl::~Impl() +{ + if(!bStarted) return; + + execute(killTask,this); + finish(); + + CloseHandle(incomingWork); + CloseHandle(workDone); + CloseHandle(hThread); +} + +Task::Impl::Impl() + : work(NULL) + , bIncomingWork(false) + , bWorkDone(true) + , bKill(false) + , bStarted(false) +{ +} + +DWORD __stdcall Task::Impl::s_taskProc(void *ptr) +{ + //just past the buck to the instance method + ((Task::Impl*)ptr)->taskProc(); + return 0; +} + +void Task::Impl::taskProc() +{ + for(;;) { + if(bKill) break; + + //wait for a chunk of work + if(spinlock) while(!bIncomingWork) Sleep(0); + else WaitForSingleObject(incomingWork,INFINITE); + + bIncomingWork = false; + //execute the work + param = work(param); + //signal completion + if(!spinlock) SetEvent(workDone); + bWorkDone = true; + } +} + +void Task::Impl::start(bool spinlock) +{ + bStarted = true; + this->spinlock = spinlock; + incomingWork = CreateEvent(NULL,FALSE,FALSE,NULL); + workDone = CreateEvent(NULL,FALSE,FALSE,NULL); + hThread = CreateThread(NULL,0,Task::Impl::s_taskProc,(void*)this, 0, NULL); +} + +void Task::Impl::execute(const TWork &work, void* param) 
+{ + //setup the work + this->work = work; + this->param = param; + bWorkDone = false; + //signal it to start + if(!spinlock) SetEvent(incomingWork); + bIncomingWork = true; +} + +void* Task::Impl::finish() +{ + //just wait for the work to be done + if(spinlock) + while(!bWorkDone) + Sleep(0); + else WaitForSingleObject(workDone,INFINITE); + return param; +} + +#else + +//just a stub impl that doesnt actually do any threading. +//somebody needs to update the pthread implementation below +class Task::Impl { +public: + Impl() {} + ~Impl() {} + + void start(bool spinlock) {} + + void* ret; + void execute(const TWork &work, void* param) { ret = work(param); } + + void* finish() { return ret; } +}; + + +/* +#include + +class Task::Impl { +public: + Impl(); + + //execute some work + void execute(const TWork &work, void* param); + + //wait for the work to complete + void* finish(); + + pthread_t thread; + static void* s_taskProc(void *ptr); + void taskProc(); + void init(); + + //the work function that shall be executed + TWork work; + void* param; + + bool initialized; + + struct WaitEvent + { + WaitEvent() + : condition(PTHREAD_COND_INITIALIZER) + , mutex(PTHREAD_MUTEX_INITIALIZER) + , value(false) + {} + pthread_mutex_t mutex; + pthread_cond_t condition; + bool value; + + //waits for the WaitEvent to be set + void waitAndClear() + { + lock(); + if(!value) + pthread_cond_wait( &condition, &mutex ); + value = false; + unlock(); + } + + //sets the WaitEvent + void signal() + { + lock(); + if(!value) { + value = true; + pthread_cond_signal( &condition ); + } + unlock(); + } + + //locks the condition's mutex + void lock() { pthread_mutex_lock(&mutex); } + + //unlocks the condition's mutex + void unlock() { pthread_mutex_unlock( &mutex ); } + + } incomingWork, workDone; + +}; + +Task::Impl::Impl() + : work(NULL) + , initialized(false) +{ +} + +void* Task::Impl::s_taskProc(void *ptr) +{ + //just past the buck to the instance method + ((Task::Impl*)ptr)->taskProc(); + return 
0; +} + +void Task::Impl::taskProc() +{ + for(;;) { + //wait for a chunk of work + incomingWork.waitAndClear(); + //execute the work + param = work(param); + //signal completion + workDone.signal(); + } +} + +void Task::Impl::init() +{ + pthread_create( &thread, NULL, Task::Impl::s_taskProc, (void*)this ); + initialized = true; +} + +void Task::Impl::execute(const TWork &work, void* param) +{ + //initialization is deferred to the first execute to give win32 time to startup + if(!initialized) init(); + //setup the work + this->work = work; + this->param = param; + //signal it to start + incomingWork.signal(); +} + +void* Task::Impl::finish() +{ + //just wait for the work to be done + workDone.waitAndClear(); + return param; +} +*/ + +#endif + +void Task::start(bool spinlock) { impl->start(spinlock); } +Task::Task() : impl(new Task::Impl()) {} +Task::~Task() { delete impl; } +void Task::execute(const TWork &work, void* param) { impl->execute(work,param); } +void* Task::finish() { return impl->finish(); } + + diff --git a/desmume/src/utils/task.h b/desmume/src/utils/task.h new file mode 100644 index 000000000..d97874ec1 --- /dev/null +++ b/desmume/src/utils/task.h @@ -0,0 +1,46 @@ +/* Copyright 2009 DeSmuME team + + This file is part of DeSmuME + + DeSmuME is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + DeSmuME is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with DeSmuME; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#ifndef _TASK_H_ + +//Sort of like a single-thread thread pool. +//You hand it a worker function and then call finish() to synch with its completion +class Task +{ +public: + Task(); + ~Task(); + + typedef void * (*TWork)(void *); + + void start(bool spinlock); + + //execute some work + void execute(const TWork &work, void* param); + + //wait for the work to complete + void* finish(); + + class Impl; + Impl *impl; + +}; + + +#endif diff --git a/desmume/src/windows/DeSmuME_2005.vcproj b/desmume/src/windows/DeSmuME_2005.vcproj index 7fb3a08e5..81ad86d77 100644 --- a/desmume/src/windows/DeSmuME_2005.vcproj +++ b/desmume/src/windows/DeSmuME_2005.vcproj @@ -643,10 +643,6 @@ RelativePath=".\aviout.h" > - - @@ -971,6 +967,14 @@ RelativePath="..\utils\md5.h" > + + + + diff --git a/desmume/src/windows/DeSmuME_2008.vcproj b/desmume/src/windows/DeSmuME_2008.vcproj index 1d96d4ceb..119709723 100644 --- a/desmume/src/windows/DeSmuME_2008.vcproj +++ b/desmume/src/windows/DeSmuME_2008.vcproj @@ -1019,6 +1019,14 @@ RelativePath="..\utils\md5.h" > + + + + diff --git a/desmume/src/windows/main.cpp b/desmume/src/windows/main.cpp index 786c6a050..190777754 100644 --- a/desmume/src/windows/main.cpp +++ b/desmume/src/windows/main.cpp @@ -1002,7 +1002,7 @@ static void DoDisplay(bool firstTime) //on single core systems, draw straight to the screen //we only do this once per emulated frame because we don't want to waste time redrawing //on such lousy computers - if(CommonSettings.single_core) + if(CommonSettings.single_core()) { aggDraw.hud->attach((u8*)video.buffer, 256, 384, 1024); DoDisplay_DrawHud(); @@ -1025,7 +1025,7 @@ static void DoDisplay(bool firstTime) //apply user's filter video.filter(); - if(!CommonSettings.single_core) + if(!CommonSettings.single_core()) { //draw 
and composite the OSD (but not if we are drawing osd straight to screen) DoDisplay_DrawHud(); @@ -1081,7 +1081,7 @@ void KillDisplay() void Display() { - if(CommonSettings.single_core) + if(CommonSettings.single_core()) { video.srcBuffer = (u8*)GPU_screen; DoDisplay(true); @@ -1229,7 +1229,7 @@ static void StepRunLoop_Paused() Sleep(100); // periodically update single-core OSD when paused and in the foreground - if(CommonSettings.single_core && GetActiveWindow() == mainLoopData.hwnd) + if(CommonSettings.single_core() && GetActiveWindow() == mainLoopData.hwnd) { video.srcBuffer = (u8*)GPU_screen; DoDisplay(true); @@ -1718,7 +1718,7 @@ class WinDriver : public BaseDriver // in multi-core mode now the display thread will probably // wait for an invocation in this thread to happen, // so handle that ASAP - if(!CommonSettings.single_core) + if(!CommonSettings.single_core()) { ResetEvent(display_invoke_ready_event); SetEvent(display_wakeup_event); @@ -1844,11 +1844,7 @@ int _main() //this helps give a substantial speedup for singlecore users SYSTEM_INFO systemInfo; GetSystemInfo(&systemInfo); - if(systemInfo.dwNumberOfProcessors==1) - CommonSettings.single_core = true; - else - CommonSettings.single_core = false; - + CommonSettings.num_cores = systemInfo.dwNumberOfProcessors; char text[80]; @@ -1948,7 +1944,7 @@ int _main() //in case this isnt actually a singlecore system, but the user requested it //then restrict ourselves to one core - if(CommonSettings.single_core) + if(CommonSettings.single_core()) SetProcessAffinityMask(GetCurrentProcess(),1); MainWindow = new WINCLASS(CLASSNAME, hAppInst); @@ -2130,7 +2126,9 @@ int _main() hKeyInputTimer = timeSetEvent (KeyInRepeatMSec, 0, KeyInputTimer, 0, TIME_PERIODIC); cur3DCore = GetPrivateProfileInt("3D", "Renderer", GPU3D_OPENGL, IniName); - CommonSettings.HighResolutionInterpolateColor = GetPrivateProfileBool("3D", "HighResolutionInterpolateColor", 1, IniName); + CommonSettings.GFX3D_HighResolutionInterpolateColor = 
GetPrivateProfileBool("3D", "HighResolutionInterpolateColor", 1, IniName); + CommonSettings.GFX3D_EdgeMark = GetPrivateProfileBool("3D", "EnableEdgeMark", 1, IniName); + CommonSettings.GFX3D_Fog = GetPrivateProfileBool("3D", "EnableFog", 1, IniName); //CommonSettings.gfx3d_flushMode = GetPrivateProfileInt("3D", "AlternateFlush", 0, IniName); NDS_3D_ChangeCore(cur3DCore); @@ -3379,7 +3377,7 @@ LRESULT CALLBACK WindowProcedure (HWND hwnd, UINT message, WPARAM wParam, LPARAM } else { - if(CommonSettings.single_core) + if(CommonSettings.single_core()) { video.srcBuffer = (u8*)GPU_screen; DoDisplay(true); @@ -4406,7 +4404,9 @@ LRESULT CALLBACK GFX3DSettingsDlgProc(HWND hw, UINT msg, WPARAM wp, LPARAM lp) { int i; - CheckDlgButton(hw,IDC_INTERPOLATECOLOR,CommonSettings.HighResolutionInterpolateColor?1:0); + CheckDlgButton(hw,IDC_INTERPOLATECOLOR,CommonSettings.GFX3D_HighResolutionInterpolateColor?1:0); + CheckDlgButton(hw,IDC_3DSETTINGS_EDGEMARK,CommonSettings.GFX3D_EdgeMark?1:0); + CheckDlgButton(hw,IDC_3DSETTINGS_FOG,CommonSettings.GFX3D_Fog?1:0); //CheckDlgButton(hw,IDC_ALTERNATEFLUSH,CommonSettings.gfx3d_flushMode); for(i = 0; core3DList[i] != NULL; i++) @@ -4423,10 +4423,14 @@ LRESULT CALLBACK GFX3DSettingsDlgProc(HWND hw, UINT msg, WPARAM wp, LPARAM lp) { case IDOK: { - CommonSettings.HighResolutionInterpolateColor = IsDlgCheckboxChecked(hw,IDC_INTERPOLATECOLOR); + CommonSettings.GFX3D_HighResolutionInterpolateColor = IsDlgCheckboxChecked(hw,IDC_INTERPOLATECOLOR); + CommonSettings.GFX3D_EdgeMark = IsDlgCheckboxChecked(hw,IDC_3DSETTINGS_EDGEMARK); + CommonSettings.GFX3D_Fog = IsDlgCheckboxChecked(hw,IDC_3DSETTINGS_FOG); NDS_3D_ChangeCore(ComboBox_GetCurSel(GetDlgItem(hw, IDC_3DCORE))); WritePrivateProfileInt("3D", "Renderer", cur3DCore, IniName); - WritePrivateProfileInt("3D", "HighResolutionInterpolateColor", CommonSettings.HighResolutionInterpolateColor?1:0, IniName); + WritePrivateProfileInt("3D", "HighResolutionInterpolateColor", 
CommonSettings.GFX3D_HighResolutionInterpolateColor?1:0, IniName); + WritePrivateProfileInt("3D", "EnableEdgeMark", CommonSettings.GFX3D_EdgeMark?1:0, IniName); + WritePrivateProfileInt("3D", "EnableFog", CommonSettings.GFX3D_Fog?1:0, IniName); //CommonSettings.gfx3d_flushMode = (IsDlgButtonChecked(hw,IDC_ALTERNATEFLUSH) == BST_CHECKED)?1:0; //WritePrivateProfileInt("3D", "AlternateFlush", CommonSettings.gfx3d_flushMode, IniName); } diff --git a/desmume/src/windows/resource.h b/desmume/src/windows/resource.h index 6e71617e2..1b69b77ad 100644 --- a/desmume/src/windows/resource.h +++ b/desmume/src/windows/resource.h @@ -277,7 +277,6 @@ #define IDC_SOUNDCORECB 1000 #define IDC_USEEXTBIOS 1000 #define ID_BROWSE 1000 -#define IDC_ALTERNATEFLUSH 1001 #define IDC_BGMAP_BGXCNT 1001 #define IDC_CHECKBOX_DEBUGGERMODE 1001 #define IDC_EDIT01 1001 @@ -630,7 +629,9 @@ #define IDC_GI_FATOFS 4464 #define IDC_INTERPOLATECOLOR 4464 #define IDC_GI_FATSIZE 4465 +#define IDC_3DSETTINGS_EDGEMARK 4465 #define IDC_GI_ICONTITLEOFS 4466 +#define IDC_3DSETTINGS_FOG 4466 #define IDC_GI_USEDROMSIZE 4467 #define IDC_GI_ICON 4469 #define IDC_GI_TITLE 4470 diff --git a/desmume/src/windows/resources.rc b/desmume/src/windows/resources.rc index 82e09e898082e4bca16c59384b26cb067c2f809b..7a4896337d07ec7c00c0546fbc39ccab79f42416 100644 GIT binary patch delta 485 zcmZqqCiU;3)P`+w%UdQfGd3HgwHu`|0x{EeqcmnFHC{^w0|p%i1qMR~QwH5km&!$sacfPv6kX?8j-uV1y*JpoQ6DI?FFs8BRkK@qmkrTGKc6F`EGmFafGJ zo?d9eC_MeaK_->S`x0fCO@T(%Hpb}%%Nf}?&A_S+8O$bc+$uWx z!3Cx15B{*$O`kV~m5tjF=n#nArjs}R5uP64&!#jzW+Jl;uL6TVLjgkxLncE$L*Del za7Nk522CvdrcmYP3}!$ln1GyMsPD><$B@X7#E`>~3iPTgLkdGWnB@x;En>(9nvn+- zQ37i4WN=|{o<9FOtGHl1P@xNhJCO8caAXK#@Sd)?oH3LWVz32+$>fJMrn0Cey8+#T zZVr1qgB#GP?hxG;3>H9JfR3DQIGtIs9T+D-%(A^@66*wk$=l*Yrk`8JYBg;Ms}!># xgZcE0ZcO6S&n;nV_mo4(PIS$x}K)N1qUhNeA`UQqGa3MW&;Z|rtDy$$a1iRC ziN#mz8pdhG29xjQiEL@U9haN=V!;SjFknxZ9;Rln;?po4Dfo7*1s5hBox9xx- zyE3KMqjljIZRO7Aqj(k{T&AViv5DwVbB`;PPa?0fD2<8ofIAzPO1RB=;F1nBzH@zW 
zHP@E}uQ>dpD~{EBw_(p1$MhPLpDi|WnC8nP3y?lgdfMqZ}}d@ zJTI@wbSrJe+tq;xRY=}&wYWGUI8_NH+vJ&n3bmc(eGr6hEHtO&GK>k3N)r3`6CTIs zg{=Jl3D|}+bEE=w>~KymqY#V}fOdsPoS%>RL0R8bh=y63PX35Gfm~UR1x#_|_iKLICp38er4