From e7d63e6cd2b1c122327bf45583841f7dfcb8a72d Mon Sep 17 00:00:00 2001 From: rogerman Date: Wed, 4 Oct 2017 16:55:08 -0700 Subject: [PATCH] GFX3D: Reduce the memory requirement of the vertex lists to one-third of its previous size. - Has the positive side-effect of improving the OpenGL renderer's performance when many vertices are used. - Also fix the vertex list double-buffering so that it actually works as intended. --- desmume/src/OGLRender.cpp | 12 ++-- desmume/src/OGLRender.h | 1 - desmume/src/OGLRender_3_2.cpp | 4 +- desmume/src/gfx3d.cpp | 112 ++++++++++++++++++---------------- desmume/src/gfx3d.h | 100 +++++++++++++++++++----------- desmume/src/rasterize.cpp | 16 +++-- desmume/src/rasterize.h | 2 +- 7 files changed, 143 insertions(+), 104 deletions(-) diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index f224e4b6b..6c2b2760e 100755 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -3579,9 +3579,9 @@ Render3DError OpenGLRenderer_1_2::BeginRender(const GFX3D &engine) else { // If VBOs aren't supported, we need to use the client-side buffers here. - OGLRef.vtxPtrPosition = &engine.vertlist->list[0].coord; - OGLRef.vtxPtrTexCoord = &engine.vertlist->list[0].texcoord; - OGLRef.vtxPtrColor = (this->isShaderSupported) ? (GLvoid *)&engine.vertlist->list[0].color : OGLRef.color4fBuffer; + OGLRef.vtxPtrPosition = &engine.vertList[0].coord; + OGLRef.vtxPtrTexCoord = &engine.vertList[0].texcoord; + OGLRef.vtxPtrColor = (this->isShaderSupported) ? (GLvoid *)&engine.vertList[0].color : OGLRef.color4fBuffer; indexPtr = OGLRef.vertIndexBuffer; } @@ -3627,7 +3627,7 @@ Render3DError OpenGLRenderer_1_2::BeginRender(const GFX3D &engine) // Consolidate the vertex color and the poly alpha to our internal color buffer // so that OpenGL can use it. - const VERT *vert = &engine.vertlist->list[vertIndex]; + const VERT *vert = &engine.vertList[vertIndex]; OGLRef.color4fBuffer[colorIndex+0] = material_8bit_to_float[vert->color[0]]; OGLRef.color4fBuffer[colorIndex+1] = material_8bit_to_float[vert->color[1]]; OGLRef.color4fBuffer[colorIndex+2] = material_8bit_to_float[vert->color[2]]; @@ -3658,7 +3658,7 @@ Render3DError OpenGLRenderer_1_2::BeginRender(const GFX3D &engine) if (this->isVBOSupported) { glUnmapBufferARB(GL_ELEMENT_ARRAY_BUFFER_ARB); - glBufferSubDataARB(GL_ARRAY_BUFFER_ARB, 0, sizeof(VERT) * engine.vertlist->count, engine.vertlist); + glBufferSubDataARB(GL_ARRAY_BUFFER_ARB, 0, sizeof(VERT) * engine.vertListCount, engine.vertList); } glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); @@ -4661,7 +4661,7 @@ Render3DError OpenGLRenderer_2_0::BeginRender(const GFX3D &engine) } glUnmapBuffer(GL_ELEMENT_ARRAY_BUFFER); - glBufferSubData(GL_ARRAY_BUFFER, 0, sizeof(VERT) * engine.vertlist->count, engine.vertlist); + glBufferSubData(GL_ARRAY_BUFFER, 0, sizeof(VERT) * engine.vertListCount, engine.vertList); glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); glDepthMask(GL_TRUE); diff --git a/desmume/src/OGLRender.h b/desmume/src/OGLRender.h index ea4f5511a..d3fdc9414 100755 --- a/desmume/src/OGLRender.h +++ b/desmume/src/OGLRender.h @@ -545,7 +545,6 @@ struct OGLRenderRef }; struct GFX3D_State; -struct VERTLIST; struct POLYLIST; struct INDEXLIST; struct POLY; diff --git a/desmume/src/OGLRender_3_2.cpp b/desmume/src/OGLRender_3_2.cpp index e4af95641..2476e9e4f 100644 --- a/desmume/src/OGLRender_3_2.cpp +++ b/desmume/src/OGLRender_3_2.cpp @@ -1658,9 +1658,9 @@ Render3DError OpenGLRenderer_3_2::BeginRender(const GFX3D &engine) glUnmapBuffer(GL_ELEMENT_ARRAY_BUFFER); glUnmapBuffer(GL_TEXTURE_BUFFER); - const size_t vtxBufferSize = sizeof(VERT) * engine.vertlist->count; + const size_t vtxBufferSize = sizeof(VERT) * engine.vertListCount; VERT *vtxPtr = (VERT *)glMapBufferRange(GL_ARRAY_BUFFER, 0, vtxBufferSize, GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_RANGE_BIT | GL_MAP_UNSYNCHRONIZED_BIT); - memcpy(vtxPtr, engine.vertlist, vtxBufferSize); + memcpy(vtxPtr, engine.vertList, vtxBufferSize); glUnmapBuffer(GL_ARRAY_BUFFER); glUseProgram(OGLRef.programGeometryID); diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp index a49111e6b..eff866171 100644 --- a/desmume/src/gfx3d.cpp +++ b/desmume/src/gfx3d.cpp @@ -367,9 +367,11 @@ static CACHE_ALIGN s32 cacheHalfVector[4][4]; //-------------poly and vertex lists and such things POLYLIST* polylists = NULL; POLYLIST* polylist = NULL; -VERTLIST* vertlists = NULL; -VERTLIST* vertlist = NULL; -int polygonListCompleted = 0; +VERT *vertLists = NULL; +VERT *vertList = NULL; + +size_t vertListCount[2] = {0, 0}; +int polygonListCompleted = 0; static int listTwiddle = 1; static u8 triStripToggle; @@ -391,10 +393,10 @@ static void twiddleLists() listTwiddle++; listTwiddle &= 1; polylist = &polylists[listTwiddle]; - vertlist = &vertlists[listTwiddle]; + vertList = vertLists + (VERTLIST_SIZE * listTwiddle); polylist->count = 0; polylist->opaqueCount = 0; - vertlist->count = 0; + vertListCount[listTwiddle] = 0; } static BOOL drawPending = FALSE; @@ -515,18 +517,21 @@ void gfx3d_init() // Use malloc() instead of new because, for some unknown reason, GCC 4.9 has a bug // that causes a std::bad_alloc exception on certain memory allocations. Right now, - // POLYLIST and VERTLIST are POD-style structs, so malloc() can substitute for new + // POLYLIST and VERT are POD-style structs, so malloc() can substitute for new // in this case. - if(polylists == NULL) + if (polylists == NULL) { polylists = (POLYLIST *)malloc(sizeof(POLYLIST)*2); polylist = &polylists[0]; } - if(vertlists == NULL) + if (vertLists == NULL) { - vertlists = (VERTLIST *)malloc(sizeof(VERTLIST)*2); - vertlist = &vertlists[0]; + vertLists = (VERT *)malloc_alignedPage(VERTLIST_SIZE * sizeof(VERT) * 2); + vertList = vertLists; + + vertListCount[0] = 0; + vertListCount[1] = 0; } gfx3d.state.savedDISP3DCNT.value = 0; @@ -547,9 +552,9 @@ void gfx3d_deinit() polylists = NULL; polylist = NULL; - free(vertlists); - vertlists = NULL; - vertlist = NULL; + free_aligned(vertLists); + vertLists = NULL; + vertList = NULL; } void gfx3d_reset() @@ -571,12 +576,13 @@ void gfx3d_reset() drawPending = FALSE; memset(polylists, 0, sizeof(POLYLIST)*2); - memset(vertlists, 0, sizeof(VERTLIST)*2); + memset(vertLists, 0, VERTLIST_SIZE * sizeof(VERT) * 2); gfx3d.state.invalidateToon = true; listTwiddle = 1; twiddleLists(); gfx3d.polylist = polylist; - gfx3d.vertlist = vertlist; + gfx3d.vertList = vertList; + gfx3d.vertListCount = vertListCount[listTwiddle]; polyAttr = 0; textureFormat = 0; @@ -726,7 +732,7 @@ static void SetVertex() //refuse to do anything if we have too many verts or polys polygonListCompleted = 0; - if(vertlist->count >= VERTLIST_SIZE) + if(vertListCount[listTwiddle] >= VERTLIST_SIZE) return; if(polylist->count >= POLYLIST_SIZE) return; @@ -745,13 +751,13 @@ static void SetVertex() //record the vertex //VERT &vert = tempVertList.list[tempVertList.count]; - const size_t vertIndex = vertlist->count + tempVertInfo.count - continuation; + const size_t vertIndex = vertListCount[listTwiddle] + tempVertInfo.count - continuation; if (vertIndex >= VERTLIST_SIZE) { printf("wtf\n"); } - VERT &vert = vertlist->list[vertIndex]; + VERT &vert = vertList[vertIndex]; //printf("%f %f %f\n",coordTransformed[0],coordTransformed[1],coordTransformed[2]); //if(coordTransformed[1] > 20) @@ -775,7 +781,7 @@ static void SetVertex() vert.color[1] = GFX3D_5TO6_LOOKUP(colorRGB[1]); vert.color[2] = GFX3D_5TO6_LOOKUP(colorRGB[2]); vert.color_to_float(); - tempVertInfo.map[tempVertInfo.count] = vertlist->count + tempVertInfo.count - continuation; + tempVertInfo.map[tempVertInfo.count] = vertListCount[listTwiddle] + tempVertInfo.count - continuation; tempVertInfo.count++; //possibly complete a polygon @@ -791,7 +797,7 @@ static void SetVertex() SUBMITVERTEX(0,0); SUBMITVERTEX(1,1); SUBMITVERTEX(2,2); - vertlist->count+=3; + vertListCount[listTwiddle] += 3; polylist->list[polylist->count].type = POLYGON_TYPE_TRIANGLE; tempVertInfo.count = 0; break; @@ -804,7 +810,7 @@ static void SetVertex() SUBMITVERTEX(1,1); SUBMITVERTEX(2,2); SUBMITVERTEX(3,3); - vertlist->count+=4; + vertListCount[listTwiddle] += 4; polylist->list[polylist->count].type = POLYGON_TYPE_QUAD; tempVertInfo.count = 0; break; @@ -819,14 +825,14 @@ static void SetVertex() polylist->list[polylist->count].type = POLYGON_TYPE_TRIANGLE; if(triStripToggle) - tempVertInfo.map[1] = vertlist->count+2-continuation; + tempVertInfo.map[1] = vertListCount[listTwiddle]+2-continuation; else - tempVertInfo.map[0] = vertlist->count+2-continuation; + tempVertInfo.map[0] = vertListCount[listTwiddle]+2-continuation; if(tempVertInfo.first) - vertlist->count+=3; + vertListCount[listTwiddle] += 3; else - vertlist->count+=1; + vertListCount[listTwiddle] += 1; triStripToggle ^= 1; tempVertInfo.first = false; @@ -842,11 +848,11 @@ static void SetVertex() SUBMITVERTEX(2,3); SUBMITVERTEX(3,2); polylist->list[polylist->count].type = POLYGON_TYPE_QUAD; - tempVertInfo.map[0] = vertlist->count+2-continuation; - tempVertInfo.map[1] = vertlist->count+3-continuation; + tempVertInfo.map[0] = vertListCount[listTwiddle]+2-continuation; + tempVertInfo.map[1] = vertListCount[listTwiddle]+3-continuation; if(tempVertInfo.first) - vertlist->count+=4; - else vertlist->count+=2; + vertListCount[listTwiddle] += 4; + else vertListCount[listTwiddle] += 2; tempVertInfo.first = false; tempVertInfo.count = 2; break; @@ -866,9 +872,9 @@ static void SetVertex() if (!(textureFormat & (7 << 26))) // no texture { bool duplicated = false; - const VERT &vert0 = vertlist->list[poly.vertIndexes[0]]; - const VERT &vert1 = vertlist->list[poly.vertIndexes[1]]; - const VERT &vert2 = vertlist->list[poly.vertIndexes[2]]; + const VERT &vert0 = vertList[poly.vertIndexes[0]]; + const VERT &vert1 = vertList[poly.vertIndexes[1]]; + const VERT &vert2 = vertList[poly.vertIndexes[2]]; if ( (vert0.x == vert1.x) && (vert0.y == vert1.y) ) duplicated = true; else if ( (vert1.x == vert2.x) && (vert1.y == vert2.y) ) duplicated = true; @@ -1823,7 +1829,7 @@ int gfx3d_GetNumPolys() int gfx3d_GetNumVertex() { //so is this in the currently-displayed or currently-built list? - return (vertlists[listTwiddle].count); + return (vertListCount[listTwiddle]); } void gfx3d_UpdateToonTable(u8 offset, u16 val) @@ -2226,7 +2232,8 @@ static void gfx3d_doFlush() //the renderer will get the lists we just built gfx3d.polylist = polylist; - gfx3d.vertlist = vertlist; + gfx3d.vertList = vertList; + gfx3d.vertListCount = vertListCount[listTwiddle]; //and also our current render state gfx3d.state.sortmode = BIT0(gfx3d.state.activeFlushCommand); @@ -2252,8 +2259,8 @@ static void gfx3d_doFlush() const size_t polycount = polylist->count; #ifdef _SHOW_VTX_COUNTERS max_polys = max((u32)polycount, max_polys); - max_verts = max((u32)vertlist->count, max_verts); - osd->addFixed(180, 20, "%i/%i", polycount, vertlist->count); // current + max_verts = max((u32)vertListCount[listTwiddle], max_verts); + osd->addFixed(180, 20, "%i/%i", polycount, vertListCount[listTwiddle]); // current osd->addFixed(180, 35, "%i/%i", max_polys, max_verts); // max #endif @@ -2269,15 +2276,15 @@ static void gfx3d_doFlush() // If both of these questions answer to yes, then how does the NDS handle a NaN? // For now, simply prevent w from being zero. POLY &poly = polylist->list[i]; - float verty = vertlist->list[poly.vertIndexes[0]].y; - float vertw = (vertlist->list[poly.vertIndexes[0]].w != 0.0f) ? vertlist->list[poly.vertIndexes[0]].w : 0.00000001f; + float verty = vertList[poly.vertIndexes[0]].y; + float vertw = (vertList[poly.vertIndexes[0]].w != 0.0f) ? vertList[poly.vertIndexes[0]].w : 0.00000001f; verty = 1.0f-(verty+vertw)/(2*vertw); poly.miny = poly.maxy = verty; for (size_t j = 1; j < poly.type; j++) { - verty = vertlist->list[poly.vertIndexes[j]].y; - vertw = (vertlist->list[poly.vertIndexes[j]].w != 0.0f) ? vertlist->list[poly.vertIndexes[j]].w : 0.00000001f; + verty = vertList[poly.vertIndexes[j]].y; + vertw = (vertList[poly.vertIndexes[j]].w != 0.0f) ? vertList[poly.vertIndexes[j]].w : 0.00000001f; verty = 1.0f-(verty+vertw)/(2*vertw); poly.miny = min(poly.miny, verty); poly.maxy = max(poly.maxy, verty); @@ -2330,8 +2337,11 @@ static void gfx3d_doFlush() viewer3d_state->frameNumber = currFrameCounter; viewer3d_state->state = gfx3d.state; viewer3d_state->polylist = *gfx3d.polylist; - viewer3d_state->vertlist = *gfx3d.vertlist; viewer3d_state->indexlist = gfx3d.indexlist; + viewer3d_state->vertListCount = gfx3d.vertListCount; + + memcpy(viewer3d_state->vertList, gfx3d.vertList, gfx3d.vertListCount * sizeof(VERT)); + driver->view3d->NewFrame(); } @@ -2620,9 +2630,9 @@ void gfx3d_savestate(EMUFILE &os) os.write_32LE(4); //dump the render lists - os.write_32LE((u32)vertlist->count); - for (size_t i = 0; i < vertlist->count; i++) - vertlist->list[i].save(os); + os.write_32LE((u32)vertListCount[listTwiddle]); + for (size_t i = 0; i < vertListCount[listTwiddle]; i++) + vertList[i].save(os); os.write_32LE((u32)polylist->count); for (size_t i = 0; i < polylist->count; i++) @@ -2676,8 +2686,8 @@ bool gfx3d_loadstate(EMUFILE &is, int size) //jiggle the lists. and also wipe them. this is clearly not the best thing to be doing. listTwiddle = 0; - polylist = &polylists[listTwiddle]; - vertlist = &vertlists[listTwiddle]; + polylist = &polylists[0]; + vertList = vertLists; gfx3d_parseCurrentDISP3DCNT(); @@ -2687,9 +2697,9 @@ bool gfx3d_loadstate(EMUFILE &is, int size) u32 polyListCount32 = 0; is.read_32LE(vertListCount32); - vertlist->count = vertListCount32; - for (size_t i = 0; i < vertlist->count; i++) - vertlist->list[i].load(is); + vertListCount[0] = vertListCount32; + for (size_t i = 0; i < vertListCount[0]; i++) + vertList[i].load(is); is.read_32LE(polyListCount32); polylist->count = polyListCount32; @@ -2714,9 +2724,9 @@ bool gfx3d_loadstate(EMUFILE &is, int size) } gfx3d.polylist = &polylists[listTwiddle^1]; - gfx3d.vertlist = &vertlists[listTwiddle^1]; + gfx3d.vertList = vertLists + VERTLIST_SIZE; gfx3d.polylist->count = 0; - gfx3d.vertlist->count = 0; + gfx3d.vertListCount = 0; if (version >= 4) { diff --git a/desmume/src/gfx3d.h b/desmume/src/gfx3d.h index 65571303a..be0dec9be 100644 --- a/desmume/src/gfx3d.h +++ b/desmume/src/gfx3d.h @@ -478,6 +478,8 @@ struct POLY { }; #define POLYLIST_SIZE 20000 +#define VERTLIST_SIZE (POLYLIST_SIZE * 4) + struct POLYLIST { POLY list[POLYLIST_SIZE]; size_t count; @@ -505,59 +507,86 @@ struct VERT_POS4f } }; -//dont use SSE optimized matrix instructions in here, things might not be aligned -//we havent padded this because the sheer bulk of data leaves things running faster without the extra bloat -struct VERT { - // Align to 16 for SSE instructions to work - union { +#include "PACKED.h" + +// This struct is padded in such a way so that each component can be accessed with a 16-byte alignment. +struct VERT +{ + union + { float coord[4]; - struct { - float x,y,z,w; + struct + { + float x, y, z, w; }; - } CACHE_ALIGN; - union { - float texcoord[2]; - struct { - float u,v; + }; + + union + { + float texcoord[4]; + struct + { + float u, v, tcPad2, tcPad3; }; - } CACHE_ALIGN; - void set_coord(float x, float y, float z, float w) { + }; + + union + { + float fcolor[4]; + struct + { + float rf, gf, bf, af; // The alpha value is unused and only exists for padding purposes. + }; + }; + + union + { + u32 color32; + u8 color[4]; + + struct + { + u8 r, g, b, a; // The alpha value is unused and only exists for padding purposes. + }; + }; + + u8 padFinal[12]; // Final padding to bring the struct to exactly 64 bytes. + + void set_coord(float x, float y, float z, float w) + { this->x = x; this->y = y; this->z = z; this->w = w; } - void set_coord(float* coords) { + + void set_coord(float* coords) + { x = coords[0]; y = coords[1]; z = coords[2]; w = coords[3]; } - float fcolor[3]; - u8 color[3]; - - - void color_to_float() { - fcolor[0] = color[0]; - fcolor[1] = color[1]; - fcolor[2] = color[2]; + + void color_to_float() + { + rf = (float)r; + gf = (float)g; + bf = (float)b; + af = (float)a; } + void save(EMUFILE &os); void load(EMUFILE &is); }; -#define VERTLIST_SIZE (POLYLIST_SIZE * 4) -struct VERTLIST { - VERT list[VERTLIST_SIZE]; - size_t count; -}; +#include "PACKED_END.h" #define INDEXLIST_SIZE (POLYLIST_SIZE * 4) struct INDEXLIST { int list[INDEXLIST_SIZE]; }; - struct VIEWPORT { u8 x, y; u16 width, height; @@ -660,9 +689,11 @@ struct Viewer3d_State { int frameNumber; GFX3D_State state; - VERTLIST vertlist; + VERT vertList[VERTLIST_SIZE]; POLYLIST polylist; INDEXLIST indexlist; + + size_t vertListCount; }; extern Viewer3d_State* viewer3d_state; @@ -670,8 +701,8 @@ extern Viewer3d_State* viewer3d_state; struct GFX3D { GFX3D() - : polylist(0) - , vertlist(0) + : polylist(NULL) + , vertList(NULL) , render3DFrameCount(0) { } @@ -681,10 +712,11 @@ struct GFX3D //values used for the currently-rendered frame (committed with each flush) GFX3D_State renderState; - POLYLIST* polylist; - VERTLIST* vertlist; + POLYLIST *polylist; + VERT *vertList; INDEXLIST indexlist; + size_t vertListCount; u32 render3DFrameCount; // Increments when gfx3d_doFlush() is called. Resets every 60 video frames. }; extern GFX3D gfx3d; diff --git a/desmume/src/rasterize.cpp b/desmume/src/rasterize.cpp index 75b0be436..5da48ad2b 100644 --- a/desmume/src/rasterize.cpp +++ b/desmume/src/rasterize.cpp @@ -1474,7 +1474,7 @@ Render3DError SoftRasterizerRenderer::InitTables() } template -size_t SoftRasterizerRenderer::performClipping(const VERTLIST *vertList, const POLYLIST *polyList, const INDEXLIST *indexList) +size_t SoftRasterizerRenderer::performClipping(const VERT *vertList, const POLYLIST *polyList, const INDEXLIST *indexList) { //submit all polys to clipper clipper.reset(); @@ -1482,12 +1482,10 @@ size_t SoftRasterizerRenderer::performClipping(const VERTLIST *vertList, const P { const POLY &poly = polyList->list[indexList->list[i]]; const VERT *clipVerts[4] = { - &vertList->list[poly.vertIndexes[0]], - &vertList->list[poly.vertIndexes[1]], - &vertList->list[poly.vertIndexes[2]], - poly.type==POLYGON_TYPE_QUAD - ?&vertList->list[poly.vertIndexes[3]] - :NULL + &vertList[poly.vertIndexes[0]], + &vertList[poly.vertIndexes[1]], + &vertList[poly.vertIndexes[2]], + (poly.type == POLYGON_TYPE_QUAD) ? &vertList[poly.vertIndexes[3]] : NULL }; clipper.clipPoly(poly, clipVerts); @@ -1651,11 +1649,11 @@ Render3DError SoftRasterizerRenderer::BeginRender(const GFX3D &engine) if (CommonSettings.GFX3D_HighResolutionInterpolateColor) { - this->_clippedPolyCount = this->performClipping(engine.vertlist, engine.polylist, &engine.indexlist); + this->_clippedPolyCount = this->performClipping(engine.vertList, engine.polylist, &engine.indexlist); } else { - this->_clippedPolyCount = this->performClipping(engine.vertlist, engine.polylist, &engine.indexlist); + this->_clippedPolyCount = this->performClipping(engine.vertList, engine.polylist, &engine.indexlist); } if (rasterizerCores >= 4) diff --git a/desmume/src/rasterize.h b/desmume/src/rasterize.h index 9b608bd7e..6c197a7ea 100644 --- a/desmume/src/rasterize.h +++ b/desmume/src/rasterize.h @@ -101,7 +101,7 @@ protected: // SoftRasterizer-specific methods virtual Render3DError InitTables(); - template size_t performClipping(const VERTLIST *vertList, const POLYLIST *polyList, const INDEXLIST *indexList); + template size_t performClipping(const VERT *vertList, const POLYLIST *polyList, const INDEXLIST *indexList); // Base rendering methods virtual Render3DError BeginRender(const GFX3D &engine);