- Reorganize 3d code to defer rendering to vblank. This eliminates tearing, and maybe some texturing artifacts. It also possibly helps performance a bit by letting the hardware pipeline work some more before blocking for the framebuffer read.

- Tweak optimization flags and change entire source code to use fastcall.
 - Add opengl state caching. This is of dubious performance assistance, but it is easy to take out so I am leaving it for now.
 - Add MMU->GPU signal for when vram mappings change, which allows it to assume textures are unchanged unless vram has changed. (big 3d speedup)
This commit is contained in:
zeromus 2008-09-09 08:50:00 +00:00
parent f11b11b18e
commit 11d989908f
10 changed files with 454 additions and 263 deletions

View File

@ -44,6 +44,12 @@
- Defer rendering until after flush. This was a necessary architectural change, as it permits savestate
for the display list, and allows us eventually to separate the GE emulation from the rendering [zeromus]
- Fix the 2d/3d compositing well enough for NSMB to fix bugs, but it is still bad [zeromus]
- Reorganize 3d code to defer rendering to vblank. This eliminates tearing, and maybe some texturing artifacts.
It also possibly helps performance a bit by letting the hardware pipeline work some more before blocking for
the framebuffer read. [zeromus]
- Tweak optimization flags and change entire source code to use fastcall [zeromus]
- Add opengl state caching. This is of dubious performance assistance, but it is easy to take out so I am leaving it for now. [zeromus]
- Add MMU->GPU signal for when vram mappings change, which allows it to assume textures are unchanged unless vram has changed [zeromus]
0.7.3 -> 0.8

View File

@ -23,7 +23,7 @@ typedef struct {
u8 * ObjExtPal[2][2];
u8 * texPalSlot[4];
const u8 *textureSlotAddr[4];
u8 *textureSlotAddr[4];
u8 *blank_memory[0x20000];
} ARM9_struct;

View File

@ -1060,6 +1060,8 @@ void FASTCALL MMU_write8(u32 proc, u32 adr, u8 val)
ARM9Mem.textureSlotAddr[slot_index] =
&ARM9Mem.ARM9_LCD[0x20000 * (adr - REG_VRAMCNTA)];
gpu3D->NDS_3D_VramReconfigureSignal();
}
}
MMU_VRAMReloadFromLCD(adr-REG_VRAMCNTA,val) ;

View File

@ -1022,6 +1022,8 @@ NDS_exec(s32 nb, BOOL force)
nds.lignerendu = FALSE;
if(nds.VCount==192)
{
gpu3D->NDS_3D_VBlankSignal();
T1WriteWord(ARM9Mem.ARM9_REG, 4, T1ReadWord(ARM9Mem.ARM9_REG, 4) | 1);
T1WriteWord(MMU.ARM7_REG, 4, T1ReadWord(MMU.ARM7_REG, 4) | 1);
NDS_ARM9VBlankInt();

View File

@ -21,11 +21,8 @@
#ifndef GPU_3D
#define GPU_3D
#ifdef _MSC_VER
#define CALL_CONVENTION __cdecl
#else
//not using this right now
#define CALL_CONVENTION
#endif
/*
enum DRIVER_3D
@ -126,6 +123,8 @@ typedef struct GPU3DInterface
long (CALL_CONVENTION* NDS_3D_GetVecRes) (unsigned int index);
void (CALL_CONVENTION* NDS_3D_UpdateToonTable) (void* toonTable);
void (CALL_CONVENTION* NDS_3D_VBlankSignal) ();
void (CALL_CONVENTION* NDS_3D_VramReconfigureSignal) ();
} GPU3DInterface;

View File

@ -111,6 +111,7 @@
IntermediateDirectory="$(SolutionDir)\.VS2005\$(ConfigurationName)\$(PlatformName)"
ConfigurationType="1"
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
WholeProgramOptimization="1"
>
<Tool
Name="VCPreBuildEventTool"
@ -135,16 +136,19 @@
InlineFunctionExpansion="2"
EnableIntrinsicFunctions="true"
FavorSizeOrSpeed="1"
OmitFramePointers="true"
EnableFiberSafeOptimizations="true"
WholeProgramOptimization="true"
AdditionalIncludeDirectories="..;.\zlib123;.\zziplib"
PreprocessorDefinitions="_CRT_SECURE_NO_DEPRECATE;VERSION=\&quot;0.8.0b2\&quot;;WIN32;HAVE_LIBZ;HAVE_LIBZZIP;BETA_VERSION"
StringPooling="true"
ExceptionHandling="0"
BufferSecurityCheck="false"
EnableEnhancedInstructionSet="0"
FloatingPointModel="2"
WarningLevel="1"
DebugInformationFormat="3"
CallingConvention="0"
CallingConvention="1"
CompileAs="0"
/>
<Tool
@ -195,6 +199,7 @@
IntermediateDirectory="$(SolutionDir)\.VS2005\$(ConfigurationName)\$(PlatformName)"
ConfigurationType="1"
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
WholeProgramOptimization="1"
>
<Tool
Name="VCPreBuildEventTool"
@ -219,16 +224,19 @@
InlineFunctionExpansion="2"
EnableIntrinsicFunctions="true"
FavorSizeOrSpeed="1"
OmitFramePointers="true"
EnableFiberSafeOptimizations="true"
WholeProgramOptimization="true"
AdditionalIncludeDirectories="..;.\zlib123;.\zziplib"
PreprocessorDefinitions="_CRT_SECURE_NO_DEPRECATE;VERSION=\&quot;0.8.0b2 SSE2\&quot;;WIN32;HAVE_LIBZ;HAVE_LIBZZIP;SSE2;BETA_VERSION"
StringPooling="true"
ExceptionHandling="0"
BufferSecurityCheck="false"
EnableEnhancedInstructionSet="0"
EnableEnhancedInstructionSet="2"
FloatingPointModel="2"
WarningLevel="1"
DebugInformationFormat="3"
CallingConvention="0"
CallingConvention="1"
CompileAs="0"
/>
<Tool
@ -807,10 +815,26 @@
<File
RelativePath="..\ROMReader.cpp"
>
<FileConfiguration
Name="Release (SSE2)|Win32"
>
<Tool
Name="VCCLCompilerTool"
CallingConvention="1"
/>
</FileConfiguration>
</File>
<File
RelativePath="..\saves.cpp"
>
<FileConfiguration
Name="Release (SSE2)|Win32"
>
<Tool
Name="VCCLCompilerTool"
CallingConvention="1"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\snddx.cpp"
@ -832,6 +856,10 @@
RelativePath="..\wifi.cpp"
>
</File>
<Filter
Name="windows"
>
</Filter>
</Filter>
<Filter
Name="Header Files"

View File

@ -48,12 +48,15 @@
#include "../NDSSystem.h"
#include "OGLRender.h"
#ifndef CTASSERT
//compile-time assert: declares an array typedef whose size is -1 (a compile error)
//whenever x is false. x must be an integer constant expression.
//the typedef name is fixed and not reserved; repeating an identical typedef in one
//scope is legal in C++ (and C11), so several passing asserts may share a scope.
#define CTASSERT(x) typedef char ctassert_failed_[(x) ? 1 : -1]
#endif
#define fix2float(v) (((float)((s32)(v))) / (float)(1<<12))
#define fix10_2float(v) (((float)((s32)(v))) / (float)(1<<9))
static unsigned char GPU_screen3D [256*256*4]={0};
static unsigned char GPU_screenStencil[256*256]={0};
static __declspec(align(16)) unsigned char GPU_screen3D [256*256*4]={0};
static __declspec(align(16)) unsigned char GPU_screenStencil[256*256]={0};
// Acceleration tables
static float* float16table = NULL;
@ -89,6 +92,8 @@ static const unsigned short map3d_cull[4] = {GL_FRONT_AND_BACK, GL_FRONT, GL_BAC
static const int texEnv[4] = { GL_MODULATE, GL_DECAL, GL_MODULATE, GL_MODULATE };
static const int depthFunc[2] = { GL_LESS, GL_EQUAL };
static bool needRefreshFramebuffer = false;
//is this a crazy idea? this table spreads 5 bits evenly over 31 from exactly 0 to INT_MAX
static const int material_5bit_to_31bit[] = {
0x00000000, 0x04210842, 0x08421084, 0x0C6318C6,
@ -133,7 +138,7 @@ static float fogColor[4] = {0.f};
static float fogOffset = 0.f;
static float alphaTestRef = 0.01f;
static float alphaTestBase = 0.1f;
static float alphaTestBase = 0;
static unsigned long clCmd = 0;
static unsigned long clInd = 0;
static unsigned long clInd2 = 0;
@ -200,10 +205,97 @@ static void twiddleLists() {
vertlist->count = 0;
}
//------------------------------------------------------------
#define OGLEXT(x,y) x y;
#define INITOGLEXT(x,y) y = (x)wglGetProcAddress(#y);
OGLEXT(PFNGLCREATESHADERPROC,glCreateShader)
//zero: i dont understand this at all. my glext.h has the wrong thing declared here... so I have to do it myself
typedef void (APIENTRYP X_PFNGLGETSHADERSOURCEPROC) (GLuint shader, GLsizei bufSize, GLchar **source, GLsizei *length);
OGLEXT(X_PFNGLGETSHADERSOURCEPROC,glShaderSource)
OGLEXT(PFNGLCOMPILESHADERPROC,glCompileShader)
OGLEXT(PFNGLCREATEPROGRAMPROC,glCreateProgram)
OGLEXT(PFNGLATTACHSHADERPROC,glAttachShader)
OGLEXT(PFNGLLINKPROGRAMPROC,glLinkProgram)
OGLEXT(PFNGLUSEPROGRAMPROC,glUseProgram)
OGLEXT(PFNGLGETSHADERINFOLOGPROC,glGetShaderInfoLog)
//opengl state caching:
//This is of dubious performance benefit, but it is easy to take out, so I am leaving it in for now.
//Every function named xgl* can be replaced with the matching gl* call if we decide to rip this out,
//or if anyone else doesn't feel like sticking with it (or if it causes trouble).
//cached front end for glDepthFunc: the call is forwarded to GL only when func
//differs from the value forwarded last time.
void xglDepthFunc(GLenum func) {
	//-1 is the "nothing cached yet" sentinel (presumably never a real depth func)
	static GLenum cachedFunc = -1;
	if(cachedFunc != func) {
		cachedFunc = func;
		glDepthFunc(func);
	}
}
//cached front end for glPolygonMode. The last mode forwarded for the front and back
//faces is remembered separately; GL is only called when the request changes one of them.
//A face other than GL_FRONT, GL_BACK or GL_FRONT_AND_BACK is ignored.
void xglPolygonMode(GLenum face,GLenum mode) {
	//[0] = front face, [1] = back face; -1 means nothing has been forwarded yet
	static GLenum cachedModes[2] = {-1,-1};

	if(face == GL_FRONT) {
		if(cachedModes[0] != mode) {
			cachedModes[0] = mode;
			glPolygonMode(GL_FRONT,mode);
		}
	} else if(face == GL_BACK) {
		if(cachedModes[1] != mode) {
			cachedModes[1] = mode;
			glPolygonMode(GL_BACK,mode);
		}
	} else if(face == GL_FRONT_AND_BACK) {
		//skip only if BOTH faces already have this mode
		if(cachedModes[0] != mode || cachedModes[1] != mode) {
			cachedModes[0] = cachedModes[1] = mode;
			glPolygonMode(GL_FRONT_AND_BACK,mode);
		}
	}
}
//cached front end for glUseProgram. glUseProgram is an extension entry point held in a
//function pointer; when that pointer is null this does nothing at all. Otherwise the
//program is bound only if it differs from the one bound by the previous forwarded call.
void xglUseProgram(GLuint program) {
	static GLuint cachedProgram = -1; //-1: nothing forwarded yet
	if(glUseProgram && cachedProgram != program) {
		cachedProgram = program;
		glUseProgram(program);
	}
}
//cached front end for glDepthMask: forwards to GL only when flag differs from the
//value forwarded last time.
void xglDepthMask (GLboolean flag) {
	static GLboolean cachedFlag = -1; //-1: nothing forwarded yet
	if(cachedFlag != flag) {
		cachedFlag = flag;
		glDepthMask(flag);
	}
}
//Shadow copy of GL enable/disable state used by _xglEnable/_xglDisable.
//caps[] is indexed by (capability enum - 0x0B00). Each entry holds
//0xFF (state unknown), 0 (disabled) or 1 (enabled).
struct GLCaps {
	u8 caps[0x100];

	//every capability starts out as "unknown"
	GLCaps() {
		for(unsigned int n = 0; n < sizeof(caps)/sizeof(caps[0]); n++)
			caps[n] = 0xFF;
	}
};
static GLCaps glcaps;
//Cached glEnable. cap is a GL capability enum; capabilities in the range
//[0x0B00, 0x0B00+sizeof(glcaps.caps)) are tracked in glcaps so that a capability that
//is already known to be enabled is not enabled again.
//Capabilities outside that range cannot be tracked (they would index past the end of
//the table), so they are passed straight through to glEnable.
void _xglEnable(GLenum cap) {
	const GLenum slot = cap - 0x0B00; //wraps to a huge value when cap < 0x0B00
	if(slot >= sizeof(glcaps.caps)) {
		glEnable(cap);
		return;
	}
	//0xFF (unknown) and 0 (disabled) both require the real call
	if(glcaps.caps[slot] != 1) {
		glEnable(cap);
		glcaps.caps[slot] = 1;
	}
}
//Cached glDisable. cap is a GL capability enum; capabilities in the range
//[0x0B00, 0x0B00+sizeof(glcaps.caps)) are tracked in glcaps so that a capability that
//is already known to be disabled is not disabled again.
//Capabilities outside that range cannot be tracked (they would index past the end of
//the table), so they are passed straight through to glDisable.
void _xglDisable(GLenum cap) {
	const GLenum slot = cap - 0x0B00; //wraps to a huge value when cap < 0x0B00
	if(slot >= sizeof(glcaps.caps)) {
		glDisable(cap);
		return;
	}
	//0xFF (unknown) and 1 (enabled) both require the real call
	if(glcaps.caps[slot] != 0) {
		glDisable(cap);
		glcaps.caps[slot] = 0;
	}
}
//Statement macros for the cached enable/disable calls.
//cap must be a compile-time constant in [0x0B00, 0x0BFF]; anything else is rejected at
//compile time by CTASSERT. Use plain glEnable/glDisable for capabilities outside that range.
//The do { } while(0) wrapper makes each macro behave as a single statement, so it is
//safe in an unbraced if/else.
#define xglEnable(cap) do { \
	CTASSERT((cap) >= 0x0B00 && ((cap)-0x0B00) < 0x100); \
	_xglEnable(cap); } while(0)
#define xglDisable(cap) do { \
	CTASSERT((cap) >= 0x0B00 && ((cap)-0x0B00) < 0x100); \
	_xglDisable(cap); } while(0)
//================================================= Textures
#define MAX_TEXTURE 500
typedef struct
struct TextureCache
{
TextureCache()
: suspectedInvalid(true)
{}
GLenum id;
unsigned int frm;
unsigned int mode;
@ -217,7 +309,10 @@ typedef struct
float invSizeY;
unsigned char texture[128*1024]; // 128Kb texture slot
} TextureCache;
//set if this texture is suspected to be invalid due to a vram reconfigure
bool suspectedInvalid;
} ;
TextureCache texcache[MAX_TEXTURE+1];
u32 texcache_count;
@ -283,20 +378,6 @@ static void NDS_3D_UpdateToonTable(void* toonTable) {
glTexImage1D(GL_TEXTURE_1D, 0, GL_RGB, 32, 0, GL_RGBA, GL_UNSIGNED_BYTE, rgbToonTable);
}
#define OGLEXT(x,y) x y;
#define INITOGLEXT(x,y) y = (x)wglGetProcAddress(#y);
OGLEXT(PFNGLCREATESHADERPROC,glCreateShader)
//zero: i dont understand this at all. my glext.h has the wrong thing declared here... so I have to do it myself
typedef void (APIENTRYP X_PFNGLGETSHADERSOURCEPROC) (GLuint shader, GLsizei bufSize, GLchar **source, GLsizei *length);
OGLEXT(X_PFNGLGETSHADERSOURCEPROC,glShaderSource)
OGLEXT(PFNGLCOMPILESHADERPROC,glCompileShader)
OGLEXT(PFNGLCREATEPROGRAMPROC,glCreateProgram)
OGLEXT(PFNGLATTACHSHADERPROC,glAttachShader)
OGLEXT(PFNGLLINKPROGRAMPROC,glLinkProgram)
OGLEXT(PFNGLUSEPROGRAMPROC,glUseProgram)
OGLEXT(PFNGLGETSHADERINFOLOGPROC,glGetShaderInfoLog)
char NDS_glInit(void)
{
int i;
@ -343,12 +424,14 @@ char NDS_glInit(void)
#endif
glClearColor (0.f, 0.f, 0.f, 1.f);
glEnable (GL_NORMALIZE);
glEnable (GL_DEPTH_TEST);
glPixelStorei(GL_PACK_ALIGNMENT,8);
xglEnable (GL_NORMALIZE);
xglEnable (GL_DEPTH_TEST);
glEnable (GL_TEXTURE_2D);
glAlphaFunc (GL_GREATER, 0);
glEnable (GL_ALPHA_TEST);
xglEnable (GL_ALPHA_TEST);
glGenTextures (MAX_TEXTURE, &oglTempTextureID[0]);
@ -710,6 +793,22 @@ __forceinline void NDS_glMultMatrix4x4(signed long v)
//todo - make all color conversions go through a properly spread table!!
//Equality-only buffer compare.
//Compares count bytes at src and dst and returns 0 when they are identical, 1 otherwise.
//Unlike memcmp the result carries no ordering information.
//This defers to the C runtime's memcmp so that every byte (including a tail that is not
//a multiple of 4) is compared and a mismatch is reported regardless of which side is larger.
__forceinline int memcmp_slow(const void* src, const void* dst, u32 count) {
	return memcmp(src, dst, count) != 0 ? 1 : 0;
}
__forceinline void* memcpy_fast(void* dest, const void* src, size_t count)
{
size_t blockCnt = count / 64;
@ -781,6 +880,7 @@ static void DebugDumpTexture(int which)
}
//================================================================================
static int lastTexture = -1;
__forceinline void setTexture(unsigned int format, unsigned int texpal)
{
int palSize[7]={32,4,16,256,0,8,32768};
@ -817,17 +917,27 @@ __forceinline void setTexture(unsigned int format, unsigned int texpal)
i=texcache_start;
if(false)
//if(false)
while (TRUE)
{
if (texcache_stop==i) break;
if (texcache[i].frm==0) break;
if ((texcache[i].frm==format)&&(texcache[i].pal==texpal))
{
if (!memcmp(adr,texcache[i].texture,imageSize))
//TODO - we need to compare the palette also.
//TODO - this doesnt correctly span bank boundaries. in fact, it seems quite dangerous.
if (!texcache[i].suspectedInvalid || !memcmp(adr,texcache[i].texture,min(imageSize,sizeof(texcache[i].texture))))
{
texcache[i].suspectedInvalid = false;
texcache_count=i;
if(i != lastTexture)
{
lastTexture = i;
glBindTexture(GL_TEXTURE_2D,texcache[i].id);
glMatrixMode (GL_TEXTURE);
glLoadIdentity ();
glScaled (texcache[i].invSizeX, texcache[i].invSizeY, 1.0f);
}
return;
}
}
@ -846,7 +956,10 @@ __forceinline void setTexture(unsigned int format, unsigned int texpal)
}
}
lastTexture = i;
glBindTexture(GL_TEXTURE_2D, texcache[i].id);
texcache[i].suspectedInvalid = false;
texcache[i].mode=textureMode;
texcache[i].pal=texpal;
texcache[i].sizeX=sizeX;
@ -856,11 +969,15 @@ __forceinline void setTexture(unsigned int format, unsigned int texpal)
texcache[i].invSizeY=1.0f/((float)sizeY*(1<<4));
texcache[i].texenv=envMode;
//memcpy(texcache[i].texture,adr,imageSize); //======================= copy
memcpy_fast(texcache[i].texture,adr,imageSize); //======================= copy
memcpy_fast(texcache[i].texture,adr,min(imageSize,sizeof(texcache[i].texture))); //======================= copy
texcache[i].numcolors=palSize[texcache[i].mode];
texcache[i].frm=format;
glMatrixMode (GL_TEXTURE);
glLoadIdentity ();
glScaled (texcache[i].invSizeX, texcache[i].invSizeY, 1.0f);
if(i==62 || textureMode==1) {
int zzz=9;
}
@ -1135,29 +1252,32 @@ __forceinline void NDS_glBegin(unsigned long v)
tempVertList.count = 0;
}
//controls states:
//glStencilFunc
//glStencilOp
//glColorMask
static u32 stencilStateSet = -1;
static void BeginRenderPoly()
{
int enableDepthWrite = 1;
u32 tmp=0;
tempVertList.count = 0;
glDepthFunc (depthFuncMode);
xglDepthFunc (depthFuncMode);
// Cull face
if (cullingMask != 0xC0)
{
glEnable(GL_CULL_FACE);
xglEnable(GL_CULL_FACE);
glCullFace(map3d_cull[cullingMask>>6]);
}
else
glDisable(GL_CULL_FACE);
xglDisable(GL_CULL_FACE);
// Alpha value, actually not well handled, 0 should be wireframe
if (colorAlpha > 0)
{
glPolygonMode (GL_FRONT, GL_FILL);
glPolygonMode (GL_BACK, GL_FILL);
xglPolygonMode (GL_FRONT_AND_BACK, GL_FILL);
//non-31 alpha polys are translucent
if(colorAlpha != 0x7FFFFFFF)
@ -1165,21 +1285,10 @@ static void BeginRenderPoly()
}
else
{
glPolygonMode (GL_FRONT, GL_LINE);
glPolygonMode (GL_BACK, GL_LINE);
xglPolygonMode (GL_FRONT_AND_BACK, GL_LINE);
}
// texture environment
setTexture(textureFormat, texturePalette);
//=================
if (texcache_count!=-1)
{
texCoordinateTransform = texcache[texcache_count].coord;
glMatrixMode (GL_TEXTURE);
glLoadIdentity ();
glScaled (texcache[texcache_count].invSizeX, texcache[texcache_count].invSizeY, 1.0f);
}
//a5i3 or a3i5 textures are translucent
alphaDepthWrite = 0; //zero - as a hack, we are never going to write depth buffer for alpha values
@ -1194,43 +1303,48 @@ static void BeginRenderPoly()
//handle shadow polys
if(envMode == 3)
{
glEnable(GL_STENCIL_TEST);
xglEnable(GL_STENCIL_TEST);
if(polyID == 0) {
enableDepthWrite = 1;
if(stencilStateSet!=0) {
stencilStateSet = 0;
//when the polyID is zero, we are writing the shadow mask.
//set stencilbuf = 1 where the shadow volume is obstructed by geometry.
//do not write color or depth information.
glStencilFunc(GL_ALWAYS,2,255);
glStencilOp(GL_KEEP,GL_REPLACE,GL_KEEP);
glColorMask(GL_FALSE,GL_FALSE,GL_FALSE,GL_FALSE);
enableDepthWrite = 1;
}
} else {
enableDepthWrite = 0;
if(stencilStateSet!=1) {
stencilStateSet = 1;
//when the polyid is nonzero, we are drawing the shadow poly.
//only draw the shadow poly where the stencilbuf==1.
//I am not sure whether to update the depth buffer here--so I chose not to.
glStencilFunc(GL_EQUAL,2,255);
glStencilOp(GL_KEEP,GL_KEEP,GL_KEEP);
glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_TRUE);
enableDepthWrite = 0;
}
}
} else {
glEnable(GL_STENCIL_TEST);
xglEnable(GL_STENCIL_TEST);
if(stencilStateSet!=2) {
stencilStateSet=2;
glStencilFunc(GL_ALWAYS,1,255);
glStencilOp(GL_REPLACE,GL_REPLACE,GL_REPLACE);
glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_TRUE);
}
}
//handle toon rendering
if(glUseProgram) {
if(envMode == 2) {
glUseProgram(toonProgram);
} else glUseProgram(0);
xglUseProgram(toonProgram);
} else xglUseProgram(0);
}
glDepthMask(enableDepthWrite?GL_TRUE:GL_FALSE);
//just to be sure
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
xglDepthMask(enableDepthWrite?GL_TRUE:GL_FALSE);
}
__forceinline void NDS_glEnd (void)
@ -1427,60 +1541,6 @@ __forceinline int NDS_glGetNumVertex (void)
return 0;
}
//NHerve mod3 - Fixed blending with 2D backgrounds (New Super Mario Bros looks better)
//zeromus post-mod3: fix even better
//
//Alpha-blends one row of the captured 3d image (GPU_screen3D / GPU_screenStencil) over
//the 256 15-bit pixels in dst, in place.
// line - scanline number; it is reduced modulo 192 and the row index is flipped (191 - line%192)
// dst  - 256 pixels that are read as the background and overwritten with the blended result
__forceinline void NDS_glGetLine (int line, unsigned short * dst)
{
int i, t;
//color rows are 1024 bytes apart (256 pixels * 4 bytes), stencil rows 256 bytes apart
u8 *screen3D = (u8 *)&GPU_screen3D [(191-(line%192))*1024];
u8 *screenStencil = (u8*)&GPU_screenStencil[(191-(line%192))*256];
//the renderer clears the stencil to 0
//then it sets it to 1 whenever it renders a pixel that passes the alpha test
//(it also sets it to 2 under some circumstances when rendering shadow volumes)
//so, we COULD use a zero stencil value to indicate that nothing should get composited.
//in fact, we are going to do that to fix some problems.
//but beware that I figure it could CAUSE some problems
//this alpha compositing blending logic isnt thought through at all
//someone needs to think about what bitdepth it should take place at and how to do it efficiently
u32 a,r,g,b,stencil,oldcolor,oldr,oldg,oldb;
for(i = 0, t=0; i < 256; i++)
{
stencil = screenStencil[i];
//you would use this if you wanted to use the stencil buffer to make decisions here
//(pixels with a zero stencil value leave dst untouched)
if(!stencil) continue;
t=i*4;
r = screen3D[t+0];
g = screen3D[t+1];
b = screen3D[t+2];
a = screen3D[t+3];
//no-op branch: has no effect on the result (looks like a debugger breakpoint anchor)
if(a != 0xFF && a != 0) {
int zzz=9;
}
//expand the existing 15-bit pixel and split it into 8-bit channels
oldcolor = RGB15TO32(dst[i],0);
oldr = oldcolor&0xFF;
oldg = (oldcolor>>8)&0xFF;
oldb = (oldcolor>>16)&0xFF;
//standard "source over" blend with an 8-bit alpha
r = (r*a + oldr*(255-a)) / 255;
g = (g*a + oldg*(255-a)) / 255;
b = (b*a + oldb*(255-a)) / 255;
r=min(255,r);
g=min(255,g);
b=min(255,b);
//repack to 15 bits: red in bits 0-4, green in 5-9, blue in 10-14
dst[i] = ((b>>3)<<10) | ((g>>3)<<5) | (r>>3);
}
}
static void InstallPolygonAttrib(unsigned long val)
{
// Light enable/disable
@ -1512,92 +1572,6 @@ __forceinline void NDS_glPolygonAttrib (unsigned long val)
InstallPolygonAttrib(polyAttr);
}
//Renders the whole queued polygon list immediately and reads the result back.
//Steps: reset a few GL states, clear color/depth/stencil, draw every entry of polylist,
//swap the lists via twiddleLists(), reset clCmd/clInd, then copy the color and stencil
//buffers into GPU_screen3D / GPU_screenStencil.
// v - bit 0 and bit 1 are extracted into wbuffer / sortmode, which are not used further here
__forceinline void NDS_glFlush(unsigned long v)
{
u32 wbuffer = v&1;
u32 sortmode = (v>>1)&1;
// Set back some secure render states
glPolygonMode (GL_BACK, GL_FILL);
glPolygonMode (GL_FRONT, GL_FILL);
glDepthMask (GL_TRUE);
glClear (GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT);
//render display list
//TODO - properly doublebuffer the display lists
{
int i;
for(i=0;i<polylist->count;i++) {
POLY *poly = &polylist->list[i];
//type doubles as the vertex count: 3 draws a triangle, anything else a quad
int type = poly->type;
int j;
//install this polygon's attributes and texture selection before drawing it
InstallPolygonAttrib(poly->polyAttr);
textureFormat = poly->texParam;
texturePalette = poly->texPalette;
BeginRenderPoly();
//since we havent got the whole pipeline working yet, lets use opengl for the projection
glMatrixMode(GL_PROJECTION);
glLoadMatrixf(poly->projMatrix);
glBegin(type==3?GL_TRIANGLES:GL_QUADS);
for(j=0;j<type;j++) {
VERT* vert = &vertlist->list[poly->vertIndexes[j]];
//float tempCoord[4];
//Vector4Copy(tempCoord,vert->coord);
//we havent got the whole pipeline working yet, so we cant do this
////convert from ds device coords to opengl
//tempCoord[0] *= 2;
//tempCoord[1] *= 2;
//tempCoord[0] -= 1;
//tempCoord[1] -= 1;
//todo - edge flag?
glTexCoord2fv(vert->texcoord);
glColor4iv(vert->color);
//glVertex3fv(tempCoord);
glVertex3fv(vert->coord);
}
glEnd();
}
}
twiddleLists();
//reset gpu state
clCmd = 0;
clInd = 0;
//capture rendering results
glFlush();
//256x192 region: RGBA bytes into GPU_screen3D, stencil bytes into GPU_screenStencil
glReadPixels(0,0,256,192,GL_RGBA, GL_UNSIGNED_BYTE, GPU_screen3D);
glReadPixels(0,0,256,192,GL_STENCIL_INDEX, GL_UNSIGNED_BYTE, GPU_screenStencil);
//debug: view depth buffer via color buffer for debugging
{
//int ctr=0;
//for(ctr=0;ctr<256*192;ctr++) {
// float zval = GPU_screen3Ddepth[ctr];
// u8* colorPtr = GPU_screen3D+ctr*3;
// if(zval<0) {
// colorPtr[0] = 255;
// colorPtr[1] = 0;
// colorPtr[2] = 0;
// } else if(zval>1) {
// colorPtr[0] = 0;
// colorPtr[1] = 0;
// colorPtr[2] = 255;
// } else {
// colorPtr[0] = colorPtr[1] = colorPtr[2] = zval*255;
// //printlog("%f %f %d\n",zval, zval*255,colorPtr[0]);
// }
//}
}
}
/*
0-4 Diffuse Reflection Red
5-9 Diffuse Reflection Green
@ -1734,6 +1708,7 @@ __forceinline void NDS_glLightColor (unsigned long v)
//Records the alpha-test reference from the low 5 bits of v (scaled so that 31 maps to 1.0)
//in alphaTestRef, then re-issues the GL alpha function.
__forceinline void NDS_glAlphaFunc(unsigned long v)
{
	const unsigned long ref5 = v & 31;
	alphaTestRef = ref5 / 31.f;

	//NOTE(review): the threshold handed to GL is alphaTestBase, not the alphaTestRef
	//computed just above -- confirm this is intended
	glAlphaFunc(GL_GREATER, alphaTestBase);
}
__forceinline void NDS_glControl(unsigned long v)
@ -1749,11 +1724,11 @@ __forceinline void NDS_glControl(unsigned long v)
if(v&(1<<2))
{
//glAlphaFunc (GL_GREATER, alphaTestBase);
glAlphaFunc (GL_GREATER, alphaTestBase);
}
else
{
//glAlphaFunc (GL_GREATER, 0.1f);
glAlphaFunc (GL_GREATER, 0);
}
if(v&(1<<3))
@ -1878,6 +1853,181 @@ __forceinline void NDS_glNormal(unsigned long v)
}
}
static bool flushPending = false;
static u32 flush_wbuffer;
static u32 flush_sortmode;
//Does not render anything itself: it latches the two flag bits of v and marks a flush
//as pending. The actual draw happens later, from NDS_3D_VBlankSignal.
// v - bit 0 is stored in flush_wbuffer, bit 1 in flush_sortmode
void NDS_glFlush(unsigned long v)
{
	flush_wbuffer  = v & 1;
	flush_sortmode = (v >> 1) & 1;
	flushPending   = true;
}
void GL_Draw()
{
xglDepthMask (GL_TRUE);
glClear (GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT);
//render display list
//TODO - properly doublebuffer the display lists
{
u32 lastTextureFormat, lastTexturePalette, lastPolyAttr;
for(int i=0;i<polylist->count;i++) {
POLY *poly = &polylist->list[i];
int type = poly->type;
//a very macro-level state caching approach:
//these are the only things which control the GPU rendering state.
if(i==0 || lastTextureFormat != poly->texParam || lastTexturePalette != poly->texPalette || lastPolyAttr != poly->polyAttr)
{
InstallPolygonAttrib(lastPolyAttr=poly->polyAttr);
lastTextureFormat = textureFormat = poly->texParam;
lastTexturePalette = texturePalette = poly->texPalette;
BeginRenderPoly();
}
//since we havent got the whole pipeline working yet, lets use opengl for the projection
glMatrixMode(GL_PROJECTION);
glLoadMatrixf(poly->projMatrix);
glBegin(type==3?GL_TRIANGLES:GL_QUADS);
for(int j=0;j<type;j++) {
VERT* vert = &vertlist->list[poly->vertIndexes[j]];
//float tempCoord[4];
//Vector4Copy(tempCoord,vert->coord);
//we havent got the whole pipeline working yet, so we cant do this
////convert from ds device coords to opengl
//tempCoord[0] *= 2;
//tempCoord[1] *= 2;
//tempCoord[0] -= 1;
//tempCoord[1] -= 1;
//todo - edge flag?
glTexCoord2fv(vert->texcoord);
glColor4iv(vert->color);
//glVertex3fv(tempCoord);
glVertex3fv(vert->coord);
}
glEnd();
}
}
//since we just redrew, we need to refresh the framebuffers
needRefreshFramebuffer = true;
twiddleLists();
//reset GE state
clCmd = 0;
clInd = 0;
}
//Called when a vblank begins.
//the 3d buffers are swapped at that point, so if a flush was requested since the last
//draw, now is a safe time to actually render it. With no flush pending this is a no-op.
void NDS_3D_VBlankSignal()
{
	if(flushPending) {
		flushPending = false;
		GL_Draw();
	}
}
void NDS_3D_VramReconfigureSignal()
{
//well, this is a very blunt instrument.
//lets just flag all the textures as invalid.
for(int i=0;i<MAX_TEXTURE+1;i++)
texcache[i].suspectedInvalid = true;
}
void GL_ReadFramebuffer()
{
glFinish();
glReadPixels(0,0,256,192,GL_RGBA, GL_UNSIGNED_BYTE, GPU_screen3D);
glReadPixels(0,0,256,192,GL_STENCIL_INDEX, GL_UNSIGNED_BYTE, GPU_screenStencil);
//debug: view depth buffer via color buffer for debugging
//int ctr=0;
//for(ctr=0;ctr<256*192;ctr++) {
// float zval = GPU_screen3Ddepth[ctr];
// u8* colorPtr = GPU_screen3D+ctr*3;
// if(zval<0) {
// colorPtr[0] = 255;
// colorPtr[1] = 0;
// colorPtr[2] = 0;
// } else if(zval>1) {
// colorPtr[0] = 0;
// colorPtr[1] = 0;
// colorPtr[2] = 255;
// } else {
// colorPtr[0] = colorPtr[1] = colorPtr[2] = zval*255;
// //printlog("%f %f %d\n",zval, zval*255,colorPtr[0]);
// }
//}
}
//NHerve mod3 - Fixed blending with 2D backgrounds (New Super Mario Bros looks better)
//zeromus post-mod3: fix even better
//
//Alpha-blends one row of the captured 3d image over the 256 15-bit pixels in dst, in place.
// line - scanline number; reduced modulo 192, and the row index is flipped (191 - line%192)
// dst  - 256 pixels, read as the background and overwritten with the blended result
__forceinline void NDS_glGetLine (int line, unsigned short * dst)
{
	//the framebuffer copy is fetched lazily, the first time a line is requested after a redraw
	if(needRefreshFramebuffer) {
		needRefreshFramebuffer = false;
		GL_ReadFramebuffer();
	}

	const int row = 191-(line%192);
	u8 *colorRow   = (u8 *)&GPU_screen3D [row*1024];   //256 pixels * 4 bytes per row
	u8 *stencilRow = (u8*)&GPU_screenStencil[row*256];

	//the renderer clears the stencil to 0
	//then it sets it to 1 whenever it renders a pixel that passes the alpha test
	//(it also sets it to 2 under some circumstances when rendering shadow volumes)
	//so a zero stencil value is used here to mean "nothing to composite for this pixel".
	//beware that this could also CAUSE some problems:
	//this alpha compositing blending logic isnt thought through at all
	//someone needs to think about what bitdepth it should take place at and how to do it efficiently
	for(int x = 0; x < 256; x++)
	{
		const u32 stencil = stencilRow[x];
		if(stencil == 0) continue;

		const u8 *src = colorRow + x*4;
		u32 r = src[0];
		u32 g = src[1];
		u32 b = src[2];
		const u32 a = src[3];

		//expand the existing 15-bit pixel and split it into 8-bit channels
		const u32 under  = RGB15TO32(dst[x],0);
		const u32 underR = under&0xFF;
		const u32 underG = (under>>8)&0xFF;
		const u32 underB = (under>>16)&0xFF;

		//"source over" blend with an 8-bit alpha
		const u32 inv = 255-a;
		r = (r*a + underR*inv) / 255;
		g = (g*a + underG*inv) / 255;
		b = (b*a + underB*inv) / 255;

		r = min(255,r);
		g = min(255,g);
		b = min(255,b);

		//repack to 15 bits: red in bits 0-4, green in 5-9, blue in 10-14
		dst[x] = ((b>>3)<<10) | ((g>>3)<<5) | (r>>3);
	}
}
//Box test command handler. Currently an empty stub: v is ignored and no state is changed.
__forceinline void NDS_glBoxTest(unsigned long v)
{
}
@ -2405,7 +2555,9 @@ GPU3DInterface gpu3Dgl = { NDS_glInit,
NDS_glVecTest,
NDS_glGetPosRes,
NDS_glGetVecRes,
NDS_3D_UpdateToonTable
NDS_3D_UpdateToonTable,
NDS_3D_VBlankSignal,
NDS_3D_VramReconfigureSignal,
};

View File

@ -247,7 +247,7 @@
# define ZEXTERN extern
#endif
#ifndef ZEXPORT
# define ZEXPORT
# define ZEXPORT __cdecl
#endif
#ifndef ZEXPORTVA
# define ZEXPORTVA

View File

@ -1,6 +1,8 @@
#ifndef _ZZIP__MSVC_H
#define _ZZIP__MSVC_H 1
#define _zzip_calltype __cdecl
/* zzip/_msvc.h. Generated automatically at end of configure. */
/* config.h.in. Generated from configure.ac by autoheader. */

View File

@ -94,9 +94,9 @@ struct zzip_dirent
* zzip/err.c
*/
_zzip_export /* error in _opendir : */
zzip_char_t* zzip_strerror(int errcode);
zzip_char_t* _zzip_calltype zzip_strerror(int errcode);
_zzip_export /* error in other functions : */
zzip_char_t* zzip_strerror_of(ZZIP_DIR * dir);
zzip_char_t* _zzip_calltype zzip_strerror_of(ZZIP_DIR * dir);
_zzip_export /* error mapped to errno.h defines : */
int zzip_errno(int errcode);
@ -107,46 +107,46 @@ int zzip_errno(int errcode);
* zzip/info.c
*/
_zzip_export
int zzip_error(ZZIP_DIR * dir);
int _zzip_calltype zzip_error(ZZIP_DIR * dir);
_zzip_export
void zzip_seterror(ZZIP_DIR * dir, int errcode);
void _zzip_calltype zzip_seterror(ZZIP_DIR * dir, int errcode);
_zzip_export
zzip_char_t* zzip_compr_str(int compr);
_zzip_export
ZZIP_DIR * zzip_dirhandle(ZZIP_FILE * fp);
ZZIP_DIR * _zzip_calltype zzip_dirhandle(ZZIP_FILE * fp);
_zzip_export
int zzip_dirfd(ZZIP_DIR * dir);
int _zzip_calltype zzip_dirfd(ZZIP_DIR * dir);
_zzip_export
int zzip_dir_real(ZZIP_DIR * dir);
int _zzip_calltype zzip_dir_real(ZZIP_DIR * dir);
_zzip_export
int zzip_file_real(ZZIP_FILE * fp);
int _zzip_calltype zzip_file_real(ZZIP_FILE * fp);
_zzip_export
void* zzip_realdir(ZZIP_DIR * dir);
void* _zzip_calltype zzip_realdir(ZZIP_DIR * dir);
_zzip_export
int zzip_realfd(ZZIP_FILE * fp);
int _zzip_calltype zzip_realfd(ZZIP_FILE * fp);
/*
* zip handle management
* zzip/zip.c
*/
_zzip_export
ZZIP_DIR * zzip_dir_alloc(zzip_strings_t* fileext);
ZZIP_DIR * _zzip_calltype zzip_dir_alloc(zzip_strings_t* fileext);
_zzip_export
int zzip_dir_free(ZZIP_DIR *);
int _zzip_calltype zzip_dir_free(ZZIP_DIR *);
/*
* Opening/closing a zip archive
* zzip-zip.c
*/
_zzip_export
ZZIP_DIR * zzip_dir_fdopen(int fd, zzip_error_t * errcode_p);
ZZIP_DIR * _zzip_calltype zzip_dir_fdopen(int fd, zzip_error_t * errcode_p);
_zzip_export
ZZIP_DIR * zzip_dir_open(zzip_char_t* filename, zzip_error_t * errcode_p);
ZZIP_DIR * _zzip_calltype zzip_dir_open(zzip_char_t* filename, zzip_error_t * errcode_p);
_zzip_export
int zzip_dir_close(ZZIP_DIR * dir);
int _zzip_calltype zzip_dir_close(ZZIP_DIR * dir);
_zzip_export
int zzip_dir_read(ZZIP_DIR * dir, ZZIP_DIRENT * dirent);
int _zzip_calltype zzip_dir_read(ZZIP_DIR * dir, ZZIP_DIRENT * dirent);
/*
@ -155,46 +155,46 @@ int zzip_dir_read(ZZIP_DIR * dir, ZZIP_DIRENT * dirent);
* zzip/zip.c
*/
_zzip_export
ZZIP_DIR * zzip_opendir(zzip_char_t* filename);
ZZIP_DIR * _zzip_calltype zzip_opendir(zzip_char_t* filename);
_zzip_export
int zzip_closedir(ZZIP_DIR * dir);
int _zzip_calltype zzip_closedir(ZZIP_DIR * dir);
_zzip_export
ZZIP_DIRENT * zzip_readdir(ZZIP_DIR * dir);
ZZIP_DIRENT * _zzip_calltype zzip_readdir(ZZIP_DIR * dir);
_zzip_export
void zzip_rewinddir(ZZIP_DIR * dir);
void _zzip_calltype zzip_rewinddir(ZZIP_DIR * dir);
_zzip_export
zzip_off_t zzip_telldir(ZZIP_DIR * dir);
zzip_off_t _zzip_calltype zzip_telldir(ZZIP_DIR * dir);
_zzip_export
void zzip_seekdir(ZZIP_DIR * dir, zzip_off_t offset);
void _zzip_calltype zzip_seekdir(ZZIP_DIR * dir, zzip_off_t offset);
/*
* 'opening', 'closing' and reading invidual files in zip archive.
* zzip/file.c
*/
_zzip_export
ZZIP_FILE * zzip_file_open(ZZIP_DIR * dir, zzip_char_t* name, int flags);
ZZIP_FILE * _zzip_calltype zzip_file_open(ZZIP_DIR * dir, zzip_char_t* name, int flags);
_zzip_export
int zzip_file_close(ZZIP_FILE * fp);
int _zzip_calltype zzip_file_close(ZZIP_FILE * fp);
_zzip_export
zzip_ssize_t zzip_file_read(ZZIP_FILE * fp, void* buf, zzip_size_t len);
zzip_ssize_t _zzip_calltype zzip_file_read(ZZIP_FILE * fp, void* buf, zzip_size_t len);
_zzip_export
ZZIP_FILE * zzip_open(zzip_char_t* name, int flags);
ZZIP_FILE * _zzip_calltype zzip_open(zzip_char_t* name, int flags);
_zzip_export
int zzip_close(ZZIP_FILE * fp);
int _zzip_calltype zzip_close(ZZIP_FILE * fp);
_zzip_export
zzip_ssize_t zzip_read(ZZIP_FILE * fp, void * buf, zzip_size_t len);
zzip_ssize_t _zzip_calltype zzip_read(ZZIP_FILE * fp, void * buf, zzip_size_t len);
/*
* the stdc variant to open/read/close files. - Take note of the freopen()
* call as it may reuse an existing preparsed copy of a zip central directory
*/
_zzip_export
ZZIP_FILE* zzip_freopen(zzip_char_t* name, zzip_char_t* mode, ZZIP_FILE*);
ZZIP_FILE* _zzip_calltype zzip_freopen(zzip_char_t* name, zzip_char_t* mode, ZZIP_FILE*);
_zzip_export
ZZIP_FILE* zzip_fopen(zzip_char_t* name, zzip_char_t* mode);
ZZIP_FILE* _zzip_calltype zzip_fopen(zzip_char_t* name, zzip_char_t* mode);
_zzip_export
zzip_size_t zzip_fread(void *ptr, zzip_size_t size, zzip_size_t nmemb,
zzip_size_t _zzip_calltype zzip_fread(void *ptr, zzip_size_t size, zzip_size_t nmemb,
ZZIP_FILE * file);
_zzip_export
int zzip_fclose(ZZIP_FILE * fp);
@ -203,23 +203,23 @@ int zzip_fclose(ZZIP_FILE * fp);
* seek and tell functions
*/
_zzip_export
int zzip_rewind(ZZIP_FILE *fp);
int _zzip_calltype zzip_rewind(ZZIP_FILE *fp);
_zzip_export
zzip_off_t zzip_seek(ZZIP_FILE * fp, zzip_off_t offset, int whence);
zzip_off_t _zzip_calltype zzip_seek(ZZIP_FILE * fp, zzip_off_t offset, int whence);
_zzip_export
zzip_off_t zzip_tell(ZZIP_FILE * fp);
zzip_off_t _zzip_calltype zzip_tell(ZZIP_FILE * fp);
/*
* reading info of a single file
* zzip/stat.c
*/
_zzip_export
int zzip_dir_stat(ZZIP_DIR * dir, zzip_char_t* name,
int _zzip_calltype zzip_dir_stat(ZZIP_DIR * dir, zzip_char_t* name,
ZZIP_STAT * zs, int flags);
_zzip_export
int zzip_file_stat(ZZIP_FILE * fp, ZZIP_STAT * zs);
int _zzip_calltype zzip_file_stat(ZZIP_FILE * fp, ZZIP_STAT * zs);
_zzip_export
int zzip_fstat(ZZIP_FILE * fp, ZZIP_STAT * zs);
int _zzip_calltype zzip_fstat(ZZIP_FILE * fp, ZZIP_STAT * zs);
#ifdef ZZIP_LARGEFILE_RENAME
#define zzip_open_shared_io zzip_open_shared_io64
@ -236,20 +236,20 @@ int zzip_fstat(ZZIP_FILE * fp, ZZIP_STAT * zs);
typedef union _zzip_plugin_io _zzip_const * zzip_plugin_io_t;
_zzip_export
ZZIP_FILE * zzip_open_shared_io(ZZIP_FILE* stream,
ZZIP_FILE * _zzip_calltype zzip_open_shared_io(ZZIP_FILE* stream,
zzip_char_t* name, int o_flags, int o_modes,
zzip_strings_t* ext, zzip_plugin_io_t io);
_zzip_export
ZZIP_FILE * zzip_open_ext_io(zzip_char_t* name, int o_flags, int o_modes,
ZZIP_FILE * _zzip_calltype zzip_open_ext_io(zzip_char_t* name, int o_flags, int o_modes,
zzip_strings_t* ext, zzip_plugin_io_t io);
_zzip_export
ZZIP_DIR * zzip_opendir_ext_io(zzip_char_t* name, int o_modes,
ZZIP_DIR * _zzip_calltype zzip_opendir_ext_io(zzip_char_t* name, int o_modes,
zzip_strings_t* ext, zzip_plugin_io_t io);
_zzip_export
ZZIP_DIR * zzip_dir_open_ext_io(zzip_char_t* filename,
ZZIP_DIR * _zzip_calltype zzip_dir_open_ext_io(zzip_char_t* filename,
zzip_error_t* errcode_p,
zzip_strings_t* ext, zzip_plugin_io_t io);