diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp
index 55254aaa6..6b0815d75 100755
--- a/desmume/src/render3D.cpp
+++ b/desmume/src/render3D.cpp
@@ -488,6 +488,83 @@ Render3DError Render3D::UpdateToonTable(const u16 *toonTableBuffer)
 	return RENDER3DERROR_NOERR;
 }
 
+// Builds the clear image buffers from a scrolled source clear image.
+//
+// ISCOLORBLANK and ISDEPTHBLANK are compile-time hints that the matching source
+// image points to blank memory. A blank color image zero-fills outColor16, and a
+// blank depth image zero-fills outDepth24 and outFog, rather than copying pixels.
+// Otherwise, each destination pixel is read from the source at the coordinates
+// offset by xScroll and yScroll, where both coordinates wrap around at 256.
+// outPolyID is always filled with inPolyID.
+template <bool ISCOLORBLANK, bool ISDEPTHBLANK>
+void Render3D::_ClearImageScrolledLoop(const u8 xScroll, const u8 yScroll, const u16 *__restrict inColor16, const u16 *__restrict inDepth16, const u8 inPolyID,
+									   u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog, u8 *__restrict outPolyID)
+{
+	if (ISCOLORBLANK && ISDEPTHBLANK)
+	{
+		memset(outColor16, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * sizeof(u16));
+		memset(outDepth24, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * sizeof(u32));
+		memset(outFog, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * sizeof(u8));
+		memset(outPolyID, inPolyID, GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * sizeof(u8));
+	}
+	else
+	{
+		if (ISCOLORBLANK)
+		{
+			// Hint to when the clear color image pointer is pointing to blank memory.
+			// In this case, just do a simple zero fill for speed.
+			//
+			// Test cases:
+			// - Sonic Chronicles: The Dark Brotherhood
+			// - The Chronicles of Narnia: The Lion, the Witch and the Wardrobe
+			memset(outColor16, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * sizeof(u16));
+		}
+
+		if (ISDEPTHBLANK)
+		{
+			memset(outDepth24, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * sizeof(u32));
+			memset(outFog, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * sizeof(u8));
+		}
+
+		for (size_t dstIndex = 0, iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
+		{
+			const size_t y = ((iy + yScroll) & 0xFF) << 8;
+
+			for (size_t ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; dstIndex++, ix++)
+			{
+				const size_t x = (ix + xScroll) & 0xFF;
+				const size_t srcIndex = y | x;
+
+				// Clear image color buffer in RGBA5551 format.
+				//
+				// Test cases:
+				// - Harry Potter and the Order of Phoenix
+				// - Blazer Drive
+				if (!ISCOLORBLANK)
+				{
+					outColor16[dstIndex] = inColor16[srcIndex];
+				}
+
+				// Clear image depth buffer, where the first 15 bits are converted to
+				// 24-bit depth, and the remaining MSB is the fog flag.
+				//
+				// Test cases:
+				// - Harry Potter and the Order of Phoenix
+				// - Blazer Drive
+				// - Sonic Chronicles: The Dark Brotherhood
+				// - The Chronicles of Narnia: The Lion, the Witch and the Wardrobe
+				if (!ISDEPTHBLANK)
+				{
+					outDepth24[dstIndex] = DS_DEPTH15TO24(inDepth16[srcIndex]);
+					outFog[dstIndex] = BIT15(inDepth16[srcIndex]);
+				}
+
+				outPolyID[dstIndex] = inPolyID;
+			}
+		}
+	}
+}
+
 Render3DError Render3D::ClearFramebuffer(const GFX3D_State &renderState)
 {
 	Render3DError error = RENDER3DERROR_NOERR;
@@ -530,27 +607,30 @@ Render3DError Render3D::ClearFramebuffer(const GFX3D_State &renderState)
 		}
 		else
 		{
-			for (size_t dstIndex = 0, iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
+			const bool isClearColorBlank = (clearColorBuffer >= (u16 *)MMU.blank_memory);
+			const bool isClearDepthBlank = (clearDepthBuffer >= (u16 *)MMU.blank_memory);
+
+			// Select on both flags so that every combination, including both images
+			// being blank, reaches its own loop variant.
+			if (!isClearColorBlank && !isClearDepthBlank)
 			{
-				const size_t y = ((iy + yScroll) & 0xFF) << 8;
-
-				for (size_t ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; dstIndex++, ix++)
-				{
-					const size_t x = (ix + xScroll) & 0xFF;
-					const size_t srcIndex = y | x;
-
-					//this is tested by harry potter and the order of the phoenix.
-					//TODO (optimization) dont do this if we are mapped to blank memory (such as in sonic chronicles)
-					//(or use a special zero fill in the bulk clearing above)
-					this->clearImageColor16Buffer[dstIndex] = clearColorBuffer[srcIndex];
-
-					//this is tested quite well in the sonic chronicles main map mode
-					//where depth values are used for trees etc you can walk behind
-					this->clearImageDepthBuffer[dstIndex] = DS_DEPTH15TO24(clearDepthBuffer[srcIndex]);
-
-					this->clearImageFogBuffer[dstIndex] = BIT15(clearDepthBuffer[srcIndex]);
-					this->clearImagePolyIDBuffer[dstIndex] = clearFragment.opaquePolyID;
-				}
+				this->_ClearImageScrolledLoop<false, false>(xScroll, yScroll, clearColorBuffer, clearDepthBuffer, clearFragment.opaquePolyID,
+				                                            this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->clearImagePolyIDBuffer);
+			}
+			else if (isClearColorBlank && !isClearDepthBlank)
+			{
+				this->_ClearImageScrolledLoop< true, false>(xScroll, yScroll, clearColorBuffer, clearDepthBuffer, clearFragment.opaquePolyID,
+				                                            this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->clearImagePolyIDBuffer);
+			}
+			else if (!isClearColorBlank && isClearDepthBlank)
+			{
+				this->_ClearImageScrolledLoop<false,  true>(xScroll, yScroll, clearColorBuffer, clearDepthBuffer, clearFragment.opaquePolyID,
+				                                            this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->clearImagePolyIDBuffer);
+			}
+			else
+			{
+				this->_ClearImageScrolledLoop< true,  true>(xScroll, yScroll, clearColorBuffer, clearDepthBuffer, clearFragment.opaquePolyID,
+				                                            this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->clearImagePolyIDBuffer);
 			}
 		}
 
@@ -758,6 +838,11 @@ Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)
 		}
 		else
 		{
+			// FIXME: Fix SSE2 support for scrolled clear images.
+			// The depth-related code below doesn't actually work, and I don't know why
+			// this is, so just use the scalar version for now.
+			// - rogerman, 2018/09/19
+			/*
 			const size_t shiftCount = xScroll & 0x07;
 
 			for (size_t dstIndex = 0, iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
@@ -850,6 +935,32 @@ Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)
 					_mm_storel_epi64((__m128i *)(this->clearImagePolyIDBuffer + dstIndex), opaquePolyID_vec128);
 				}
 			}
+			*/
+			const bool isClearColorBlank = (clearColorBuffer >= (u16 *)MMU.blank_memory);
+			const bool isClearDepthBlank = (clearDepthBuffer >= (u16 *)MMU.blank_memory);
+
+			// Select on both flags so that every combination, including both images
+			// being blank, reaches its own loop variant.
+			if (!isClearColorBlank && !isClearDepthBlank)
+			{
+				this->_ClearImageScrolledLoop<false, false>(xScroll, yScroll, clearColorBuffer, clearDepthBuffer, clearFragment.opaquePolyID,
+				                                            this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->clearImagePolyIDBuffer);
+			}
+			else if (isClearColorBlank && !isClearDepthBlank)
+			{
+				this->_ClearImageScrolledLoop< true, false>(xScroll, yScroll, clearColorBuffer, clearDepthBuffer, clearFragment.opaquePolyID,
+				                                            this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->clearImagePolyIDBuffer);
+			}
+			else if (!isClearColorBlank && isClearDepthBlank)
+			{
+				this->_ClearImageScrolledLoop<false,  true>(xScroll, yScroll, clearColorBuffer, clearDepthBuffer, clearFragment.opaquePolyID,
+				                                            this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->clearImagePolyIDBuffer);
+			}
+			else
+			{
+				this->_ClearImageScrolledLoop< true,  true>(xScroll, yScroll, clearColorBuffer, clearDepthBuffer, clearFragment.opaquePolyID,
+				                                            this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->clearImagePolyIDBuffer);
+			}
 		}
 
 		error = this->ClearUsingImage(this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->clearImagePolyIDBuffer);
diff --git a/desmume/src/render3D.h b/desmume/src/render3D.h
index af19743a0..5982ad172 100644
--- a/desmume/src/render3D.h
+++ b/desmume/src/render3D.h
@@ -183,7 +183,13 @@ protected:
 	CACHE_ALIGN u16 clearImageColor16Buffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
 	CACHE_ALIGN u32 clearImageDepthBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
 	CACHE_ALIGN u8 clearImageFogBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
-	CACHE_ALIGN u8 clearImagePolyIDBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
+	CACHE_ALIGN u8 clearImagePolyIDBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
+
+	// Builds the clear image buffers from a scrolled source clear image. The template
+	// flags hint that the source color and/or depth image points to blank memory.
+	template<bool ISCOLORBLANK, bool ISDEPTHBLANK> void _ClearImageScrolledLoop(const u8 xScroll, const u8 yScroll, const u16 *__restrict inColor16, const u16 *__restrict inDepth16, const u8 inPolyID,
+																			   u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog, u8 *__restrict outPolyID);
+
 
 	virtual Render3DError BeginRender(const GFX3D &engine);
 	virtual Render3DError RenderGeometry(const GFX3D_State &renderState, const POLYLIST *polyList, const INDEXLIST *indexList);