From 11d989908f94f87f561f5223faad19e456209dc7 Mon Sep 17 00:00:00 2001 From: zeromus Date: Tue, 9 Sep 2008 08:50:00 +0000 Subject: [PATCH] - Reorganize 3d code to defer rendering to vblank. eliminates tearing, and maybe some texturing artifacts. also possibly helps performance a bit by letting the hardware pipeline work some more before blocking for framebuffer read. - Tweak optimization flags and change entire source code to use fastcall. - Add opengl state caching. This is of dubious performance assistance, but it is easy to take out so I am leaving it for now. - Add MMU->GPU signal for when vram mappings change, which allows it to assume textures are unchanged unless vram has changed. (big 3d speedup) --- desmume/ChangeLog | 6 + desmume/src/ARM9.h | 2 +- desmume/src/MMU.cpp | 2 + desmume/src/NDSSystem.cpp | 2 + desmume/src/render3D.h | 9 +- desmume/src/windows/DeSmuME_2005.vcproj | 34 +- desmume/src/windows/OGLRender.cpp | 576 ++++++++++++++--------- desmume/src/windows/zlib123/zconf.h | 2 +- desmume/src/windows/zziplib/zzip/_msvc.h | 2 + desmume/src/windows/zziplib/zzip/zzip.h | 82 ++-- 10 files changed, 454 insertions(+), 263 deletions(-) diff --git a/desmume/ChangeLog b/desmume/ChangeLog index af34a69c8..c168ed37f 100644 --- a/desmume/ChangeLog +++ b/desmume/ChangeLog @@ -44,6 +44,12 @@ - Defer rendering until after flush. This was a necessary architectural change, as it permits savestate for the display list, and allows us eventually to separate the GE emulation from the rendering [zeromus] - Fix the 2d/3d compositing well enough for NSMB to fix bugs, but it is still bad [zeromus] + - Reorganize 3d code to defer rendering to vblank. eliminates tearing, and maybe some texturing artifacts. + also possibly helps performance a bit by letting the hardware pipeline work some more before blocking for + framebuffer read. [zeromus] + - Tweak optimization flags and change entire source code to use fastcall [zeromus] + - Add opengl state caching. This is of dubious performance assistance, but it is easy to take out so I am leaving it for now. [zeromus] + - Add MMU->GPU signal for when vram mappings change, which allows it to assume textures are unchanged unless vram has changed [zeromus] 0.7.3 -> 0.8 diff --git a/desmume/src/ARM9.h b/desmume/src/ARM9.h index 4dae037d8..849f045c8 100644 --- a/desmume/src/ARM9.h +++ b/desmume/src/ARM9.h @@ -23,7 +23,7 @@ typedef struct { u8 * ObjExtPal[2][2]; u8 * texPalSlot[4]; - const u8 *textureSlotAddr[4]; + u8 *textureSlotAddr[4]; u8 *blank_memory[0x20000]; } ARM9_struct; diff --git a/desmume/src/MMU.cpp b/desmume/src/MMU.cpp index 70b22da8b..8ba794009 100644 --- a/desmume/src/MMU.cpp +++ b/desmume/src/MMU.cpp @@ -1060,6 +1060,8 @@ void FASTCALL MMU_write8(u32 proc, u32 adr, u8 val) ARM9Mem.textureSlotAddr[slot_index] = &ARM9Mem.ARM9_LCD[0x20000 * (adr - REG_VRAMCNTA)]; + + gpu3D->NDS_3D_VramReconfigureSignal(); } } MMU_VRAMReloadFromLCD(adr-REG_VRAMCNTA,val) ; diff --git a/desmume/src/NDSSystem.cpp b/desmume/src/NDSSystem.cpp index eeb889b2e..31f350d70 100644 --- a/desmume/src/NDSSystem.cpp +++ b/desmume/src/NDSSystem.cpp @@ -1022,6 +1022,8 @@ NDS_exec(s32 nb, BOOL force) nds.lignerendu = FALSE; if(nds.VCount==192) { + gpu3D->NDS_3D_VBlankSignal(); + T1WriteWord(ARM9Mem.ARM9_REG, 4, T1ReadWord(ARM9Mem.ARM9_REG, 4) | 1); T1WriteWord(MMU.ARM7_REG, 4, T1ReadWord(MMU.ARM7_REG, 4) | 1); NDS_ARM9VBlankInt(); diff --git a/desmume/src/render3D.h b/desmume/src/render3D.h index fac293c22..a598605d6 100644 --- a/desmume/src/render3D.h +++ b/desmume/src/render3D.h @@ -21,11 +21,8 @@ #ifndef GPU_3D #define GPU_3D -#ifdef _MSC_VER -#define CALL_CONVENTION __cdecl -#else +//not using this right now #define CALL_CONVENTION -#endif /* enum DRIVER_3D @@ -125,7 +122,9 @@ typedef struct GPU3DInterface long (CALL_CONVENTION* NDS_3D_GetPosRes) (unsigned int index); long (CALL_CONVENTION* NDS_3D_GetVecRes) (unsigned int index); - void (CALL_CONVENTION* NDS_3D_UpdateToonTable) (void* toonTable); + void (CALL_CONVENTION* NDS_3D_UpdateToonTable) (void* toonTable); + void (CALL_CONVENTION* NDS_3D_VBlankSignal) (); + void (CALL_CONVENTION* NDS_3D_VramReconfigureSignal) (); } GPU3DInterface; diff --git a/desmume/src/windows/DeSmuME_2005.vcproj b/desmume/src/windows/DeSmuME_2005.vcproj index 8f2a2f788..f8c3679a7 100644 --- a/desmume/src/windows/DeSmuME_2005.vcproj +++ b/desmume/src/windows/DeSmuME_2005.vcproj @@ -111,6 +111,7 @@ IntermediateDirectory="$(SolutionDir)\.VS2005\$(ConfigurationName)\$(PlatformName)" ConfigurationType="1" InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops" + WholeProgramOptimization="1" > + + + + + + + + count = 0; } +//------------------------------------------------------------ + + +#define OGLEXT(x,y) x y; +#define INITOGLEXT(x,y) y = (x)wglGetProcAddress(#y); + +OGLEXT(PFNGLCREATESHADERPROC,glCreateShader) +//zero: i dont understand this at all. my glext.h has the wrong thing declared here... so I have to do it myself +typedef void (APIENTRYP X_PFNGLGETSHADERSOURCEPROC) (GLuint shader, GLsizei bufSize, GLchar **source, GLsizei *length); +OGLEXT(X_PFNGLGETSHADERSOURCEPROC,glShaderSource) +OGLEXT(PFNGLCOMPILESHADERPROC,glCompileShader) +OGLEXT(PFNGLCREATEPROGRAMPROC,glCreateProgram) +OGLEXT(PFNGLATTACHSHADERPROC,glAttachShader) +OGLEXT(PFNGLLINKPROGRAMPROC,glLinkProgram) +OGLEXT(PFNGLUSEPROGRAMPROC,glUseProgram) +OGLEXT(PFNGLGETSHADERINFOLOGPROC,glGetShaderInfoLog) + +//opengl state caching: +//This is of dubious performance assistance, but it is easy to take out so I am leaving it for now. +//every function that is xgl* can be replaced with gl* if we decide to rip this out or if anyone else +//doesnt feel like sticking with it (or if it causes trouble) + +void xglDepthFunc(GLenum func) { + static GLenum oldfunc = -1; + if(oldfunc == func) return; + glDepthFunc(oldfunc=func); +} + +void xglPolygonMode(GLenum face,GLenum mode) { + static GLenum oldmodes[2] = {-1,-1}; + switch(face) { + case GL_FRONT: if(oldmodes[0]==mode) return; else glPolygonMode(GL_FRONT,oldmodes[0]=mode); return; + case GL_BACK: if(oldmodes[1]==mode) return; else glPolygonMode(GL_BACK,oldmodes[1]=mode); return; + case GL_FRONT_AND_BACK: if(oldmodes[0]==mode && oldmodes[1]==mode) return; else glPolygonMode(GL_FRONT_AND_BACK,oldmodes[0]=oldmodes[1]=mode); + } +} + +void xglUseProgram(GLuint program) { + if(!glUseProgram) return; + static GLuint oldprogram = -1; + if(oldprogram==program) return; + glUseProgram(oldprogram=program); +} + +void xglDepthMask (GLboolean flag) { + static GLboolean oldflag = -1; + if(oldflag==flag) return; + glDepthMask(oldflag=flag); +} + +struct GLCaps { + u8 caps[0x100]; + GLCaps() { + memset(caps,0xFF,sizeof(caps)); + } +}; +static GLCaps glcaps; + +void _xglEnable(GLenum cap) { + cap -= 0x0B00; + if(glcaps.caps[cap] == 0xFF || glcaps.caps[cap] == 0) { + glEnable(cap+0x0B00); + glcaps.caps[cap] = 1; + } +} + +void _xglDisable(GLenum cap) { + cap -= 0x0B00; + if(glcaps.caps[cap]) { + glDisable(cap+0x0B00); + glcaps.caps[cap] = 0; + } +} + +#define xglEnable(cap) { \ + CTASSERT((cap-0x0B00)<0x100); \ + _xglEnable(cap); } + +#define xglDisable(cap) {\ + CTASSERT((cap-0x0B00)<0x100); \ + _xglDisable(cap); } + + //================================================= Textures #define MAX_TEXTURE 500 -typedef struct +struct TextureCache { + TextureCache() + : suspectedInvalid(true) + {} + GLenum id; unsigned int frm; unsigned int mode; @@ -217,7 +309,10 @@ typedef struct float invSizeY; unsigned char texture[128*1024]; // 128Kb texture slot -} TextureCache; + //set if this texture is suspected be invalid due to a vram reconfigure + bool suspectedInvalid; + +} ; TextureCache texcache[MAX_TEXTURE+1]; u32 texcache_count; @@ -283,20 +378,6 @@ static void NDS_3D_UpdateToonTable(void* toonTable) { glTexImage1D(GL_TEXTURE_1D, 0, GL_RGB, 32, 0, GL_RGBA, GL_UNSIGNED_BYTE, rgbToonTable); } -#define OGLEXT(x,y) x y; -#define INITOGLEXT(x,y) y = (x)wglGetProcAddress(#y); - -OGLEXT(PFNGLCREATESHADERPROC,glCreateShader) -//zero: i dont understand this at all. my glext.h has the wrong thing declared here... so I have to do it myself -typedef void (APIENTRYP X_PFNGLGETSHADERSOURCEPROC) (GLuint shader, GLsizei bufSize, GLchar **source, GLsizei *length); -OGLEXT(X_PFNGLGETSHADERSOURCEPROC,glShaderSource) -OGLEXT(PFNGLCOMPILESHADERPROC,glCompileShader) -OGLEXT(PFNGLCREATEPROGRAMPROC,glCreateProgram) -OGLEXT(PFNGLATTACHSHADERPROC,glAttachShader) -OGLEXT(PFNGLLINKPROGRAMPROC,glLinkProgram) -OGLEXT(PFNGLUSEPROGRAMPROC,glUseProgram) -OGLEXT(PFNGLGETSHADERINFOLOGPROC,glGetShaderInfoLog) - char NDS_glInit(void) { int i; @@ -343,12 +424,14 @@ char NDS_glInit(void) #endif glClearColor (0.f, 0.f, 0.f, 1.f); - glEnable (GL_NORMALIZE); - glEnable (GL_DEPTH_TEST); + glPixelStorei(GL_PACK_ALIGNMENT,8); + + xglEnable (GL_NORMALIZE); + xglEnable (GL_DEPTH_TEST); glEnable (GL_TEXTURE_2D); glAlphaFunc (GL_GREATER, 0); - glEnable (GL_ALPHA_TEST); + xglEnable (GL_ALPHA_TEST); glGenTextures (MAX_TEXTURE, &oglTempTextureID[0]); @@ -710,6 +793,22 @@ __forceinline void NDS_glMultMatrix4x4(signed long v) //todo - make all color conversions go through a properly spread table!! +//I think this is slower than the regular memcmp.. doesnt make sense to me, but my +//asm optimization knowlege is 15 years old.. +__forceinline int memcmp_slow(const void* src, const void* dst, u32 count) { + int retval; + __asm { + mov [retval], 0; + mov ecx, [count]; + shr ecx, 2; + mov esi, [src]; + mov edi, [dst]; + repe cmpsd; + setc byte ptr [retval]; + } + return retval; +} + __forceinline void* memcpy_fast(void* dest, const void* src, size_t count) { size_t blockCnt = count / 64; @@ -781,6 +880,7 @@ static void DebugDumpTexture(int which) } //================================================================================ +static int lastTexture = -1; __forceinline void setTexture(unsigned int format, unsigned int texpal) { int palSize[7]={32,4,16,256,0,8,32768}; @@ -817,17 +917,27 @@ __forceinline void setTexture(unsigned int format, unsigned int texpal) i=texcache_start; - if(false) + //if(false) while (TRUE) { if (texcache_stop==i) break; if (texcache[i].frm==0) break; if ((texcache[i].frm==format)&&(texcache[i].pal==texpal)) { - if (!memcmp(adr,texcache[i].texture,imageSize)) + //TODO - we need to compare the palette also. + //TODO - this doesnt correctly span bank boundaries. in fact, it seems quite dangerous. + if (!texcache[i].suspectedInvalid || !memcmp(adr,texcache[i].texture,min(imageSize,sizeof(texcache[i].texture)))) { + texcache[i].suspectedInvalid = false; texcache_count=i; - glBindTexture(GL_TEXTURE_2D,texcache[i].id); + if(i != lastTexture) + { + lastTexture = i; + glBindTexture(GL_TEXTURE_2D,texcache[i].id); + glMatrixMode (GL_TEXTURE); + glLoadIdentity (); + glScaled (texcache[i].invSizeX, texcache[i].invSizeY, 1.0f); + } return; } } @@ -846,7 +956,10 @@ __forceinline void setTexture(unsigned int format, unsigned int texpal) } } + lastTexture = i; glBindTexture(GL_TEXTURE_2D, texcache[i].id); + + texcache[i].suspectedInvalid = false; texcache[i].mode=textureMode; texcache[i].pal=texpal; texcache[i].sizeX=sizeX; @@ -856,11 +969,15 @@ __forceinline void setTexture(unsigned int format, unsigned int texpal) texcache[i].invSizeY=1.0f/((float)sizeY*(1<<4)); texcache[i].texenv=envMode; //memcpy(texcache[i].texture,adr,imageSize); //======================= copy - memcpy_fast(texcache[i].texture,adr,imageSize); //======================= copy + memcpy_fast(texcache[i].texture,adr,min(imageSize,sizeof(texcache[i].texture))); //======================= copy texcache[i].numcolors=palSize[texcache[i].mode]; texcache[i].frm=format; + glMatrixMode (GL_TEXTURE); + glLoadIdentity (); + glScaled (texcache[i].invSizeX, texcache[i].invSizeY, 1.0f); + if(i==62 || textureMode==1) { int zzz=9; } @@ -1135,29 +1252,32 @@ __forceinline void NDS_glBegin(unsigned long v) tempVertList.count = 0; } +//controls states: +//glStencilFunc +//glStencilOp +//glColorMask +static u32 stencilStateSet = -1; + static void BeginRenderPoly() { int enableDepthWrite = 1; u32 tmp=0; - tempVertList.count = 0; - - glDepthFunc (depthFuncMode); + xglDepthFunc (depthFuncMode); // Cull face if (cullingMask != 0xC0) { - glEnable(GL_CULL_FACE); + xglEnable(GL_CULL_FACE); glCullFace(map3d_cull[cullingMask>>6]); } else - glDisable(GL_CULL_FACE); + xglDisable(GL_CULL_FACE); // Alpha value, actually not well handled, 0 should be wireframe if (colorAlpha > 0) { - glPolygonMode (GL_FRONT, GL_FILL); - glPolygonMode (GL_BACK, GL_FILL); + xglPolygonMode (GL_FRONT_AND_BACK, GL_FILL); //non-31 alpha polys are translucent if(colorAlpha != 0x7FFFFFFF) @@ -1165,21 +1285,10 @@ static void BeginRenderPoly() } else { - glPolygonMode (GL_FRONT, GL_LINE); - glPolygonMode (GL_BACK, GL_LINE); + xglPolygonMode (GL_FRONT_AND_BACK, GL_LINE); } - // texture environment setTexture(textureFormat, texturePalette); - //================= - if (texcache_count!=-1) - { - texCoordinateTransform = texcache[texcache_count].coord; - - glMatrixMode (GL_TEXTURE); - glLoadIdentity (); - glScaled (texcache[texcache_count].invSizeX, texcache[texcache_count].invSizeY, 1.0f); - } //a5i3 or a3i5 textures are translucent alphaDepthWrite = 0; //zero - as a hack, we are never going to write depth buffer for alpha values @@ -1194,43 +1303,48 @@ static void BeginRenderPoly() //handle shadow polys if(envMode == 3) { - glEnable(GL_STENCIL_TEST); + xglEnable(GL_STENCIL_TEST); if(polyID == 0) { - //when the polyID is zero, we are writing the shadow mask. - //set stencilbuf = 1 where the shadow volume is obstructed by geometry. - //do not write color or depth information. - glStencilFunc(GL_ALWAYS,2,255); - glStencilOp(GL_KEEP,GL_REPLACE,GL_KEEP); - glColorMask(GL_FALSE,GL_FALSE,GL_FALSE,GL_FALSE); enableDepthWrite = 1; + if(stencilStateSet!=0) { + stencilStateSet = 0; + //when the polyID is zero, we are writing the shadow mask. + //set stencilbuf = 1 where the shadow volume is obstructed by geometry. + //do not write color or depth information. + glStencilFunc(GL_ALWAYS,2,255); + glStencilOp(GL_KEEP,GL_REPLACE,GL_KEEP); + glColorMask(GL_FALSE,GL_FALSE,GL_FALSE,GL_FALSE); + } } else { - //when the polyid is nonzero, we are drawing the shadow poly. - //only draw the shadow poly where the stencilbuf==1. - //I am not sure whether to update the depth buffer here--so I chose not to. - glStencilFunc(GL_EQUAL,2,255); - glStencilOp(GL_KEEP,GL_KEEP,GL_KEEP); - glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_TRUE); enableDepthWrite = 0; + if(stencilStateSet!=1) { + stencilStateSet = 1; + //when the polyid is nonzero, we are drawing the shadow poly. + //only draw the shadow poly where the stencilbuf==1. + //I am not sure whether to update the depth buffer here--so I chose not to. + glStencilFunc(GL_EQUAL,2,255); + glStencilOp(GL_KEEP,GL_KEEP,GL_KEEP); + glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_TRUE); + } } } else { - glEnable(GL_STENCIL_TEST); - glStencilFunc(GL_ALWAYS,1,255); - glStencilOp(GL_REPLACE,GL_REPLACE,GL_REPLACE); - glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_TRUE); + xglEnable(GL_STENCIL_TEST); + if(stencilStateSet!=2) { + stencilStateSet=2; + glStencilFunc(GL_ALWAYS,1,255); + glStencilOp(GL_REPLACE,GL_REPLACE,GL_REPLACE); + glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_TRUE); + } } //handle toon rendering if(glUseProgram) { if(envMode == 2) { - glUseProgram(toonProgram); - } else glUseProgram(0); + xglUseProgram(toonProgram); + } else xglUseProgram(0); } - glDepthMask(enableDepthWrite?GL_TRUE:GL_FALSE); - - //just to be sure - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); + xglDepthMask(enableDepthWrite?GL_TRUE:GL_FALSE); } __forceinline void NDS_glEnd (void) @@ -1427,60 +1541,6 @@ __forceinline int NDS_glGetNumVertex (void) return 0; } -//NHerve mod3 - Fixed blending with 2D backgrounds (New Super Mario Bros looks better) -//zeromus post-mod3: fix even better -__forceinline void NDS_glGetLine (int line, unsigned short * dst) -{ - int i, t; - u8 *screen3D = (u8 *)&GPU_screen3D [(191-(line%192))*1024]; - u8 *screenStencil = (u8*)&GPU_screenStencil[(191-(line%192))*256]; - - //the renderer clears the stencil to 0 - //then it sets it to 1 whenever it renders a pixel that passes the alpha test - //(it also sets it to 2 under some circumstances when rendering shadow volumes) - //so, we COULD use a zero stencil value to indicate that nothing should get composited. - //in fact, we are going to do that to fix some problems. - //but beware that it i figure it might could CAUSE some problems - - //this alpha compositing blending logic isnt thought through at all - //someone needs to think about what bitdepth it should take place at and how to do it efficiently - - u32 a,r,g,b,stencil,oldcolor,oldr,oldg,oldb; - - for(i = 0, t=0; i < 256; i++) - { - stencil = screenStencil[i]; - - //you would use this if you wanted to use the stencil buffer to make decisions here - if(!stencil) continue; - - t=i*4; - r = screen3D[t+0]; - g = screen3D[t+1]; - b = screen3D[t+2]; - a = screen3D[t+3]; - - if(a != 0xFF && a != 0) { - int zzz=9; - } - - oldcolor = RGB15TO32(dst[i],0); - oldr = oldcolor&0xFF; - oldg = (oldcolor>>8)&0xFF; - oldb = (oldcolor>>16)&0xFF; - - r = (r*a + oldr*(255-a)) / 255; - g = (g*a + oldg*(255-a)) / 255; - b = (b*a + oldb*(255-a)) / 255; - - r=min(255,r); - g=min(255,g); - b=min(255,b); - - dst[i] = ((b>>3)<<10) | ((g>>3)<<5) | (r>>3); - } -} - static void InstallPolygonAttrib(unsigned long val) { // Light enable/disable @@ -1512,92 +1572,6 @@ __forceinline void NDS_glPolygonAttrib (unsigned long val) InstallPolygonAttrib(polyAttr); } -__forceinline void NDS_glFlush(unsigned long v) -{ - u32 wbuffer = v&1; - u32 sortmode = (v>>1)&1; - - // Set back some secure render states - glPolygonMode (GL_BACK, GL_FILL); - glPolygonMode (GL_FRONT, GL_FILL); - - glDepthMask (GL_TRUE); - glClear (GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT); - - //render display list - //TODO - properly doublebuffer the display lists - { - int i; - for(i=0;icount;i++) { - POLY *poly = &polylist->list[i]; - int type = poly->type; - int j; - InstallPolygonAttrib(poly->polyAttr); - textureFormat = poly->texParam; - texturePalette = poly->texPalette; - BeginRenderPoly(); - - //since we havent got the whole pipeline working yet, lets use opengl for the projection - glMatrixMode(GL_PROJECTION); - glLoadMatrixf(poly->projMatrix); - - glBegin(type==3?GL_TRIANGLES:GL_QUADS); - for(j=0;jlist[poly->vertIndexes[j]]; - - //float tempCoord[4]; - //Vector4Copy(tempCoord,vert->coord); - //we havent got the whole pipeline working yet, so we cant do this - ////convert from ds device coords to opengl - //tempCoord[0] *= 2; - //tempCoord[1] *= 2; - //tempCoord[0] -= 1; - //tempCoord[1] -= 1; - - //todo - edge flag? - glTexCoord2fv(vert->texcoord); - glColor4iv(vert->color); - //glVertex3fv(tempCoord); - glVertex3fv(vert->coord); - } - glEnd(); - } - } - - twiddleLists(); - - //reset gpu state - clCmd = 0; - clInd = 0; - - //capture rendering results - glFlush(); - glReadPixels(0,0,256,192,GL_RGBA, GL_UNSIGNED_BYTE, GPU_screen3D); - glReadPixels(0,0,256,192,GL_STENCIL_INDEX, GL_UNSIGNED_BYTE, GPU_screenStencil); - - //debug: view depth buffer via color buffer for debugging - { - //int ctr=0; - //for(ctr=0;ctr<256*192;ctr++) { - // float zval = GPU_screen3Ddepth[ctr]; - // u8* colorPtr = GPU_screen3D+ctr*3; - // if(zval<0) { - // colorPtr[0] = 255; - // colorPtr[1] = 0; - // colorPtr[2] = 0; - // } else if(zval>1) { - // colorPtr[0] = 0; - // colorPtr[1] = 0; - // colorPtr[2] = 255; - // } else { - // colorPtr[0] = colorPtr[1] = colorPtr[2] = zval*255; - // //printlog("%f %f %d\n",zval, zval*255,colorPtr[0]); - // } - - //} - } -} - /* 0-4 Diffuse Reflection Red 5-9 Diffuse Reflection Green @@ -1734,6 +1708,7 @@ __forceinline void NDS_glLightColor (unsigned long v) __forceinline void NDS_glAlphaFunc(unsigned long v) { alphaTestRef = (v&31)/31.f; + glAlphaFunc (GL_GREATER, alphaTestBase); } __forceinline void NDS_glControl(unsigned long v) @@ -1749,11 +1724,11 @@ __forceinline void NDS_glControl(unsigned long v) if(v&(1<<2)) { - //glAlphaFunc (GL_GREATER, alphaTestBase); + glAlphaFunc (GL_GREATER, alphaTestBase); } else { - //glAlphaFunc (GL_GREATER, 0.1f); + glAlphaFunc (GL_GREATER, 0); } if(v&(1<<3)) @@ -1878,6 +1853,181 @@ __forceinline void NDS_glNormal(unsigned long v) } } +static bool flushPending = false; +static u32 flush_wbuffer; +static u32 flush_sortmode; + +void NDS_glFlush(unsigned long v) +{ + flushPending = true; + flush_wbuffer = v&1; + flush_sortmode = (v>>1)&1; +} + +void GL_Draw() +{ + xglDepthMask (GL_TRUE); + glClear (GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT); + + //render display list + //TODO - properly doublebuffer the display lists + { + + u32 lastTextureFormat, lastTexturePalette, lastPolyAttr; + + for(int i=0;icount;i++) { + POLY *poly = &polylist->list[i]; + int type = poly->type; + + //a very macro-level state caching approach: + //these are the only things which control the GPU rendering state. + if(i==0 || lastTextureFormat != poly->texParam || lastTexturePalette != poly->texPalette || lastPolyAttr != poly->polyAttr) + { + InstallPolygonAttrib(lastPolyAttr=poly->polyAttr); + lastTextureFormat = textureFormat = poly->texParam; + lastTexturePalette = texturePalette = poly->texPalette; + BeginRenderPoly(); + } + + //since we havent got the whole pipeline working yet, lets use opengl for the projection + glMatrixMode(GL_PROJECTION); + glLoadMatrixf(poly->projMatrix); + + glBegin(type==3?GL_TRIANGLES:GL_QUADS); + for(int j=0;jlist[poly->vertIndexes[j]]; + + //float tempCoord[4]; + //Vector4Copy(tempCoord,vert->coord); + //we havent got the whole pipeline working yet, so we cant do this + ////convert from ds device coords to opengl + //tempCoord[0] *= 2; + //tempCoord[1] *= 2; + //tempCoord[0] -= 1; + //tempCoord[1] -= 1; + + //todo - edge flag? + glTexCoord2fv(vert->texcoord); + glColor4iv(vert->color); + //glVertex3fv(tempCoord); + glVertex3fv(vert->coord); + } + glEnd(); + } + } + + //since we just redrew, we need to refresh the framebuffers + needRefreshFramebuffer = true; + + twiddleLists(); + + //reset GE state + clCmd = 0; + clInd = 0; +} + +void NDS_3D_VBlankSignal() +{ + //the 3d buffers are swapped when a vblank begins. + //so, if we have a redraw pending, now is a safe time to do it + if(!flushPending) return; + flushPending = false; + GL_Draw(); +} + +void NDS_3D_VramReconfigureSignal() +{ + //well, this is a very blunt instrument. + //lets just flag all the textures as invalid. + for(int i=0;i1) { + // colorPtr[0] = 0; + // colorPtr[1] = 0; + // colorPtr[2] = 255; + // } else { + // colorPtr[0] = colorPtr[1] = colorPtr[2] = zval*255; + // //printlog("%f %f %d\n",zval, zval*255,colorPtr[0]); + // } + + //} +} + +//NHerve mod3 - Fixed blending with 2D backgrounds (New Super Mario Bros looks better) +//zeromus post-mod3: fix even better +__forceinline void NDS_glGetLine (int line, unsigned short * dst) +{ + if(needRefreshFramebuffer) { + needRefreshFramebuffer = false; + GL_ReadFramebuffer(); + } + int i, t; + u8 *screen3D = (u8 *)&GPU_screen3D [(191-(line%192))*1024]; + u8 *screenStencil = (u8*)&GPU_screenStencil[(191-(line%192))*256]; + + //the renderer clears the stencil to 0 + //then it sets it to 1 whenever it renders a pixel that passes the alpha test + //(it also sets it to 2 under some circumstances when rendering shadow volumes) + //so, we COULD use a zero stencil value to indicate that nothing should get composited. + //in fact, we are going to do that to fix some problems. + //but beware that it i figure it might could CAUSE some problems + + //this alpha compositing blending logic isnt thought through at all + //someone needs to think about what bitdepth it should take place at and how to do it efficiently + + u32 a,r,g,b,stencil,oldcolor,oldr,oldg,oldb; + + for(i = 0, t=0; i < 256; i++) + { + stencil = screenStencil[i]; + + //you would use this if you wanted to use the stencil buffer to make decisions here + if(!stencil) continue; + + t=i*4; + r = screen3D[t+0]; + g = screen3D[t+1]; + b = screen3D[t+2]; + a = screen3D[t+3]; + + if(a != 0xFF && a != 0) { + int zzz=9; + } + + oldcolor = RGB15TO32(dst[i],0); + oldr = oldcolor&0xFF; + oldg = (oldcolor>>8)&0xFF; + oldb = (oldcolor>>16)&0xFF; + + r = (r*a + oldr*(255-a)) / 255; + g = (g*a + oldg*(255-a)) / 255; + b = (b*a + oldb*(255-a)) / 255; + + r=min(255,r); + g=min(255,g); + b=min(255,b); + + dst[i] = ((b>>3)<<10) | ((g>>3)<<5) | (r>>3); + } +} + __forceinline void NDS_glBoxTest(unsigned long v) { } @@ -2405,7 +2555,9 @@ GPU3DInterface gpu3Dgl = { NDS_glInit, NDS_glVecTest, NDS_glGetPosRes, NDS_glGetVecRes, - NDS_3D_UpdateToonTable + NDS_3D_UpdateToonTable, + NDS_3D_VBlankSignal, + NDS_3D_VramReconfigureSignal, }; diff --git a/desmume/src/windows/zlib123/zconf.h b/desmume/src/windows/zlib123/zconf.h index ccbf3751e..ec69ca940 100644 --- a/desmume/src/windows/zlib123/zconf.h +++ b/desmume/src/windows/zlib123/zconf.h @@ -247,7 +247,7 @@ # define ZEXTERN extern #endif #ifndef ZEXPORT -# define ZEXPORT +# define ZEXPORT __cdecl #endif #ifndef ZEXPORTVA # define ZEXPORTVA diff --git a/desmume/src/windows/zziplib/zzip/_msvc.h b/desmume/src/windows/zziplib/zzip/_msvc.h index e85251344..81adb6a4a 100644 --- a/desmume/src/windows/zziplib/zzip/_msvc.h +++ b/desmume/src/windows/zziplib/zzip/_msvc.h @@ -1,5 +1,7 @@ #ifndef _ZZIP__MSVC_H #define _ZZIP__MSVC_H 1 + +#define _zzip_calltype __cdecl /* zzip/_msvc.h. Generated automatically at end of configure. */ /* config.h.in. Generated from configure.ac by autoheader. */ diff --git a/desmume/src/windows/zziplib/zzip/zzip.h b/desmume/src/windows/zziplib/zzip/zzip.h index 6f2c51690..80cfc0cec 100644 --- a/desmume/src/windows/zziplib/zzip/zzip.h +++ b/desmume/src/windows/zziplib/zzip/zzip.h @@ -94,9 +94,9 @@ struct zzip_dirent * zzip/err.c */ _zzip_export /* error in _opendir : */ -zzip_char_t* zzip_strerror(int errcode); +zzip_char_t* _zzip_calltype zzip_strerror(int errcode); _zzip_export /* error in other functions : */ -zzip_char_t* zzip_strerror_of(ZZIP_DIR * dir); +zzip_char_t* _zzip_calltype zzip_strerror_of(ZZIP_DIR * dir); _zzip_export /* error mapped to errno.h defines : */ int zzip_errno(int errcode); @@ -107,46 +107,46 @@ int zzip_errno(int errcode); * zzip/info.c */ _zzip_export -int zzip_error(ZZIP_DIR * dir); +int _zzip_calltype zzip_error(ZZIP_DIR * dir); _zzip_export -void zzip_seterror(ZZIP_DIR * dir, int errcode); +void _zzip_calltype zzip_seterror(ZZIP_DIR * dir, int errcode); _zzip_export zzip_char_t* zzip_compr_str(int compr); _zzip_export -ZZIP_DIR * zzip_dirhandle(ZZIP_FILE * fp); +ZZIP_DIR * _zzip_calltype zzip_dirhandle(ZZIP_FILE * fp); _zzip_export -int zzip_dirfd(ZZIP_DIR * dir); +int _zzip_calltype zzip_dirfd(ZZIP_DIR * dir); _zzip_export -int zzip_dir_real(ZZIP_DIR * dir); +int _zzip_calltype zzip_dir_real(ZZIP_DIR * dir); _zzip_export -int zzip_file_real(ZZIP_FILE * fp); +int _zzip_calltype zzip_file_real(ZZIP_FILE * fp); _zzip_export -void* zzip_realdir(ZZIP_DIR * dir); +void* _zzip_calltype zzip_realdir(ZZIP_DIR * dir); _zzip_export -int zzip_realfd(ZZIP_FILE * fp); +int _zzip_calltype zzip_realfd(ZZIP_FILE * fp); /* * zip handle management * zzip/zip.c */ _zzip_export -ZZIP_DIR * zzip_dir_alloc(zzip_strings_t* fileext); +ZZIP_DIR * _zzip_calltype zzip_dir_alloc(zzip_strings_t* fileext); _zzip_export -int zzip_dir_free(ZZIP_DIR *); +int _zzip_calltype zzip_dir_free(ZZIP_DIR *); /* * Opening/closing a zip archive * zzip-zip.c */ _zzip_export -ZZIP_DIR * zzip_dir_fdopen(int fd, zzip_error_t * errcode_p); +ZZIP_DIR * _zzip_calltype zzip_dir_fdopen(int fd, zzip_error_t * errcode_p); _zzip_export -ZZIP_DIR * zzip_dir_open(zzip_char_t* filename, zzip_error_t * errcode_p); +ZZIP_DIR * _zzip_calltype zzip_dir_open(zzip_char_t* filename, zzip_error_t * errcode_p); _zzip_export -int zzip_dir_close(ZZIP_DIR * dir); +int _zzip_calltype zzip_dir_close(ZZIP_DIR * dir); _zzip_export -int zzip_dir_read(ZZIP_DIR * dir, ZZIP_DIRENT * dirent); +int _zzip_calltype zzip_dir_read(ZZIP_DIR * dir, ZZIP_DIRENT * dirent); /* @@ -155,46 +155,46 @@ int zzip_dir_read(ZZIP_DIR * dir, ZZIP_DIRENT * dirent); * zzip/zip.c */ _zzip_export -ZZIP_DIR * zzip_opendir(zzip_char_t* filename); +ZZIP_DIR * _zzip_calltype zzip_opendir(zzip_char_t* filename); _zzip_export -int zzip_closedir(ZZIP_DIR * dir); +int _zzip_calltype zzip_closedir(ZZIP_DIR * dir); _zzip_export -ZZIP_DIRENT * zzip_readdir(ZZIP_DIR * dir); +ZZIP_DIRENT * _zzip_calltype zzip_readdir(ZZIP_DIR * dir); _zzip_export -void zzip_rewinddir(ZZIP_DIR * dir); +void _zzip_calltype zzip_rewinddir(ZZIP_DIR * dir); _zzip_export -zzip_off_t zzip_telldir(ZZIP_DIR * dir); +zzip_off_t _zzip_calltype zzip_telldir(ZZIP_DIR * dir); _zzip_export -void zzip_seekdir(ZZIP_DIR * dir, zzip_off_t offset); +void _zzip_calltype zzip_seekdir(ZZIP_DIR * dir, zzip_off_t offset); /* * 'opening', 'closing' and reading invidual files in zip archive. * zzip/file.c */ _zzip_export -ZZIP_FILE * zzip_file_open(ZZIP_DIR * dir, zzip_char_t* name, int flags); +ZZIP_FILE * _zzip_calltype zzip_file_open(ZZIP_DIR * dir, zzip_char_t* name, int flags); _zzip_export -int zzip_file_close(ZZIP_FILE * fp); +int _zzip_calltype zzip_file_close(ZZIP_FILE * fp); _zzip_export -zzip_ssize_t zzip_file_read(ZZIP_FILE * fp, void* buf, zzip_size_t len); +zzip_ssize_t _zzip_calltype zzip_file_read(ZZIP_FILE * fp, void* buf, zzip_size_t len); _zzip_export -ZZIP_FILE * zzip_open(zzip_char_t* name, int flags); +ZZIP_FILE * _zzip_calltype zzip_open(zzip_char_t* name, int flags); _zzip_export -int zzip_close(ZZIP_FILE * fp); +int _zzip_calltype zzip_close(ZZIP_FILE * fp); _zzip_export -zzip_ssize_t zzip_read(ZZIP_FILE * fp, void * buf, zzip_size_t len); +zzip_ssize_t _zzip_calltype zzip_read(ZZIP_FILE * fp, void * buf, zzip_size_t len); /* * the stdc variant to open/read/close files. - Take note of the freopen() * call as it may reuse an existing preparsed copy of a zip central directory */ _zzip_export -ZZIP_FILE* zzip_freopen(zzip_char_t* name, zzip_char_t* mode, ZZIP_FILE*); +ZZIP_FILE* _zzip_calltype zzip_freopen(zzip_char_t* name, zzip_char_t* mode, ZZIP_FILE*); _zzip_export -ZZIP_FILE* zzip_fopen(zzip_char_t* name, zzip_char_t* mode); +ZZIP_FILE* _zzip_calltype zzip_fopen(zzip_char_t* name, zzip_char_t* mode); _zzip_export -zzip_size_t zzip_fread(void *ptr, zzip_size_t size, zzip_size_t nmemb, +zzip_size_t _zzip_calltype zzip_fread(void *ptr, zzip_size_t size, zzip_size_t nmemb, ZZIP_FILE * file); _zzip_export int zzip_fclose(ZZIP_FILE * fp); @@ -203,23 +203,23 @@ int zzip_fclose(ZZIP_FILE * fp); * seek and tell functions */ _zzip_export -int zzip_rewind(ZZIP_FILE *fp); +int _zzip_calltype zzip_rewind(ZZIP_FILE *fp); _zzip_export -zzip_off_t zzip_seek(ZZIP_FILE * fp, zzip_off_t offset, int whence); +zzip_off_t _zzip_calltype zzip_seek(ZZIP_FILE * fp, zzip_off_t offset, int whence); _zzip_export -zzip_off_t zzip_tell(ZZIP_FILE * fp); +zzip_off_t _zzip_calltype zzip_tell(ZZIP_FILE * fp); /* * reading info of a single file * zzip/stat.c */ _zzip_export -int zzip_dir_stat(ZZIP_DIR * dir, zzip_char_t* name, +int _zzip_calltype zzip_dir_stat(ZZIP_DIR * dir, zzip_char_t* name, ZZIP_STAT * zs, int flags); _zzip_export -int zzip_file_stat(ZZIP_FILE * fp, ZZIP_STAT * zs); +int _zzip_calltype zzip_file_stat(ZZIP_FILE * fp, ZZIP_STAT * zs); _zzip_export -int zzip_fstat(ZZIP_FILE * fp, ZZIP_STAT * zs); +int _zzip_calltype zzip_fstat(ZZIP_FILE * fp, ZZIP_STAT * zs); #ifdef ZZIP_LARGEFILE_RENAME #define zzip_open_shared_io zzip_open_shared_io64 @@ -236,20 +236,20 @@ int zzip_fstat(ZZIP_FILE * fp, ZZIP_STAT * zs); typedef union _zzip_plugin_io _zzip_const * zzip_plugin_io_t; _zzip_export -ZZIP_FILE * zzip_open_shared_io(ZZIP_FILE* stream, +ZZIP_FILE * _zzip_calltype zzip_open_shared_io(ZZIP_FILE* stream, zzip_char_t* name, int o_flags, int o_modes, zzip_strings_t* ext, zzip_plugin_io_t io); _zzip_export -ZZIP_FILE * zzip_open_ext_io(zzip_char_t* name, int o_flags, int o_modes, +ZZIP_FILE * _zzip_calltype zzip_open_ext_io(zzip_char_t* name, int o_flags, int o_modes, zzip_strings_t* ext, zzip_plugin_io_t io); _zzip_export -ZZIP_DIR * zzip_opendir_ext_io(zzip_char_t* name, int o_modes, +ZZIP_DIR * _zzip_calltype zzip_opendir_ext_io(zzip_char_t* name, int o_modes, zzip_strings_t* ext, zzip_plugin_io_t io); _zzip_export -ZZIP_DIR * zzip_dir_open_ext_io(zzip_char_t* filename, +ZZIP_DIR * _zzip_calltype zzip_dir_open_ext_io(zzip_char_t* filename, zzip_error_t* errcode_p, zzip_strings_t* ext, zzip_plugin_io_t io);