From f68df5b976be6719f8264efcfb26958dbce43ff1 Mon Sep 17 00:00:00 2001 From: rogerman Date: Mon, 8 Feb 2016 21:20:13 +0000 Subject: [PATCH] Render3D: - Provide a means of specifying which specific framebuffers need to be flushed for each frame. --- desmume/src/GPU.cpp | 15 ++ desmume/src/GPU.h | 3 + desmume/src/OGLRender.cpp | 307 +++++++++++++++++++++++++------------- desmume/src/rasterize.cpp | 8 +- desmume/src/render3D.cpp | 178 ++++++++++++++++------ desmume/src/render3D.h | 6 + 6 files changed, 364 insertions(+), 153 deletions(-) diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index 34d6a1bb4..97a9c2099 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -4089,6 +4089,18 @@ void GPUEngineA::SetCustomFramebufferSize(size_t w, size_t h) free_aligned(oldColorRGBA5551Buffer); } + +bool GPUEngineA::Is3DRenderedLayerNeeded() +{ + return ( this->_enableLayer[GPULayerID_BG0] && (this->_IORegisterMap->DISPCNT.BG0_3D != 0) ); +} + +bool GPUEngineA::Is3DCapturingNeeded() +{ + const IOREG_DISPCAPCNT &DISPCAPCNT = this->_IORegisterMap->DISPCAPCNT; + return ( (DISPCAPCNT.CaptureEnable != 0) && (vramConfiguration.banks[DISPCAPCNT.VRAMWriteBlock].purpose == VramConfiguration::LCDC) && (DISPCAPCNT.SrcA != 0) ); +} + template void GPUEngineA::RenderLine(const u16 l) { @@ -5681,7 +5693,10 @@ void GPUSubsystem::RenderLine(const u16 l, bool isFrameSkipRequested) { if (l == 0) { + CurrentRenderer->SetFramebufferFlushStates(this->_engineMain->Is3DRenderedLayerNeeded(), this->_engineMain->Is3DCapturingNeeded()); CurrentRenderer->RenderFinish(); + CurrentRenderer->SetFramebufferFlushStates(true, true); + this->_event->DidFrameBegin(); this->UpdateVRAM3DUsageProperties(); diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index 343a56f3f..f0f3173b9 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -1403,6 +1403,9 @@ public: FragmentColor* Get3DFramebufferRGBA6665() const; u16* Get3DFramebufferRGBA5551() const; virtual void SetCustomFramebufferSize(size_t w, size_t h); + + bool Is3DRenderedLayerNeeded(); + bool Is3DCapturingNeeded(); template void RenderLine(const u16 l); void FramebufferPostprocess(); diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index ebd4e45b4..ab674ec0f 100644 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -895,7 +895,7 @@ void OpenGLRenderer::SetVersion(unsigned int major, unsigned int minor, unsigned Render3DError OpenGLRenderer::FlushFramebuffer(const FragmentColor *__restrict srcRGBA8888, FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551) { - if (srcRGBA8888 == NULL) + if ( srcRGBA8888 == NULL || ((dstRGBA6665 == NULL) && (dstRGBA5551 == NULL)) ) { return RENDER3DERROR_NOERR; } @@ -907,69 +907,152 @@ Render3DError OpenGLRenderer::FlushFramebuffer(const FragmentColor *__restrict s const size_t pixCount = this->_framebufferWidth; const size_t ssePixCount = pixCount - (pixCount % 4); - for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2)) + if ( (dstRGBA6665 != NULL) && (dstRGBA5551 != NULL) ) { - size_t x = 0; - + for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2)) + { + size_t x = 0; + #if defined(ENABLE_SSSE3) && defined(LOCAL_LE) - for (; x < ssePixCount; x += 4, ir += 4, iw += 4) - { - // Convert to RGBA6665 - __m128i color = _mm_load_si128((__m128i *)(srcRGBA8888 + ir)); - color = _mm_srli_epi32(color, 2); - - __m128i a = _mm_srli_epi32(color, 1); // Special handling for 5-bit alpha - a = _mm_and_si128(a, _mm_set1_epi32(0x1F000000)); - - color = _mm_and_si128(color, _mm_set1_epi32(0x003F3F3F)); - color = _mm_or_si128(color, a); - color = _mm_shuffle_epi8(color, _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2)); // Swizzle RGBA to BGRA - _mm_store_si128((__m128i *)(dstRGBA6665 + iw), color); - - // Convert to RGBA5551 - color = _mm_load_si128((__m128i *)(srcRGBA8888 + ir)); - - __m128i b = _mm_and_si128(color, _mm_set1_epi32(0x000000F8)); // Read from R - b = _mm_slli_epi32(b, 7); // Shift to B - - __m128i g = _mm_and_si128(color, _mm_set1_epi32(0x0000F800)); // Read from G - g = _mm_srli_epi32(g, 6); // Shift in G - - __m128i r = _mm_and_si128(color, _mm_set1_epi32(0x00F80000)); // Read from B - r = _mm_srli_epi32(r, 19); // Shift to R - - a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A - a = _mm_cmpeq_epi32(a, _mm_setzero_si128()); // Determine A - a = _mm_andnot_si128(a, _mm_set1_epi32(0x00008000)); // Mask to A - - color = _mm_or_si128(_mm_or_si128(_mm_or_si128(b, g), r), a); - - // All the colors are currently placed on 32 bit boundaries, so we need to swizzle them - // to the lower 64 bits of our vector before we store them back to memory. - // Note: Do not attempt to use packssdw here since packing with the 0x8000 bit set will - // result in values of 0x7FFF, which are incorrect values in this case. - color = _mm_shuffle_epi8(color, _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); - _mm_storel_epi64((__m128i *)(dstRGBA5551 + iw), color); - } + for (; x < ssePixCount; x += 4, ir += 4, iw += 4) + { + __m128i color; + + // Convert to RGBA6665 + color = _mm_load_si128((__m128i *)(srcRGBA8888 + ir)); + color = _mm_srli_epi32(color, 2); + + __m128i a = _mm_srli_epi32(color, 1); // Special handling for 5-bit alpha + a = _mm_and_si128(a, _mm_set1_epi32(0x1F000000)); + + color = _mm_and_si128(color, _mm_set1_epi32(0x003F3F3F)); + color = _mm_or_si128(color, a); + color = _mm_shuffle_epi8(color, _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2)); // Swizzle RGBA to BGRA + _mm_store_si128((__m128i *)(dstRGBA6665 + iw), color); + + // Convert to RGBA5551 + color = _mm_load_si128((__m128i *)(srcRGBA8888 + ir)); + + __m128i b = _mm_and_si128(color, _mm_set1_epi32(0x000000F8)); // Read from R + b = _mm_slli_epi32(b, 7); // Shift to B + + __m128i g = _mm_and_si128(color, _mm_set1_epi32(0x0000F800)); // Read from G + g = _mm_srli_epi32(g, 6); // Shift in G + + __m128i r = _mm_and_si128(color, _mm_set1_epi32(0x00F80000)); // Read from B + r = _mm_srli_epi32(r, 19); // Shift to R + + a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A + a = _mm_cmpeq_epi32(a, _mm_setzero_si128()); // Determine A + a = _mm_andnot_si128(a, _mm_set1_epi32(0x00008000)); // Mask to A + + color = _mm_or_si128(_mm_or_si128(_mm_or_si128(b, g), r), a); + + // All the colors are currently placed on 32 bit boundaries, so we need to swizzle them + // to the lower 64 bits of our vector before we store them back to memory. + // Note: Do not attempt to use packssdw here since packing with the 0x8000 bit set will + // result in values of 0x7FFF, which are incorrect values in this case. + color = _mm_shuffle_epi8(color, _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); + _mm_storel_epi64((__m128i *)(dstRGBA5551 + iw), color); + } #endif // defined(ENABLE_SSSE3) && defined(LOCAL_LE) - - for (; x < pixCount; x++, ir++, iw++) - { - // Use the correct endian format since OpenGL uses the native endian of - // the architecture it is running on. + + for (; x < pixCount; x++, ir++, iw++) + { + // Use the correct endian format since OpenGL uses the native endian of + // the architecture it is running on. #ifdef LOCAL_BE - dstRGBA6665[iw].color = BGRA8888_32_To_RGBA6665_32(srcRGBA8888[ir].color); - dstRGBA5551[iw] = R5G5B5TORGB15( (srcRGBA8888[ir].b >> 3) & 0x1F, - (srcRGBA8888[ir].g >> 3) & 0x1F, - (srcRGBA8888[ir].r >> 3) & 0x1F) | - ((this->_framebufferColor[ir].a == 0) ? 0x0000 : 0x8000); + dstRGBA6665[iw].color = BGRA8888_32_To_RGBA6665_32(srcRGBA8888[ir].color); + dstRGBA5551[iw] = R5G5B5TORGB15( (srcRGBA8888[ir].b >> 3) & 0x1F, + (srcRGBA8888[ir].g >> 3) & 0x1F, + (srcRGBA8888[ir].r >> 3) & 0x1F) | + ((this->_framebufferColor[ir].a == 0) ? 0x0000 : 0x8000); #else - dstRGBA6665[iw].color = BGRA8888_32Rev_To_RGBA6665_32Rev(srcRGBA8888[ir].color); - dstRGBA5551[iw] = R5G5B5TORGB15( (srcRGBA8888[ir].b >> 3) & 0x1F, - (srcRGBA8888[ir].g >> 3) & 0x1F, - (srcRGBA8888[ir].r >> 3) & 0x1F) | - ((srcRGBA8888[ir].a == 0) ? 0x0000 : 0x8000); + dstRGBA6665[iw].color = BGRA8888_32Rev_To_RGBA6665_32Rev(srcRGBA8888[ir].color); + dstRGBA5551[iw] = R5G5B5TORGB15( (srcRGBA8888[ir].b >> 3) & 0x1F, + (srcRGBA8888[ir].g >> 3) & 0x1F, + (srcRGBA8888[ir].r >> 3) & 0x1F) | + ((srcRGBA8888[ir].a == 0) ? 0x0000 : 0x8000); #endif + } + } + } + else if (dstRGBA6665 != NULL) + { + for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2)) + { + size_t x = 0; + +#if defined(ENABLE_SSSE3) && defined(LOCAL_LE) + for (; x < ssePixCount; x += 4, ir += 4, iw += 4) + { + __m128i color = _mm_load_si128((__m128i *)(srcRGBA8888 + ir)); + color = _mm_srli_epi32(color, 2); + + __m128i a = _mm_srli_epi32(color, 1); // Special handling for 5-bit alpha + a = _mm_and_si128(a, _mm_set1_epi32(0x1F000000)); + + color = _mm_and_si128(color, _mm_set1_epi32(0x003F3F3F)); + color = _mm_or_si128(color, a); + color = _mm_shuffle_epi8(color, _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2)); // Swizzle RGBA to BGRA + _mm_store_si128((__m128i *)(dstRGBA6665 + iw), color); + } +#endif // defined(ENABLE_SSSE3) && defined(LOCAL_LE) + + for (; x < pixCount; x++, ir++, iw++) + { +#ifdef LOCAL_BE + dstRGBA6665[iw].color = BGRA8888_32_To_RGBA6665_32(srcRGBA8888[ir].color); +#else + dstRGBA6665[iw].color = BGRA8888_32Rev_To_RGBA6665_32Rev(srcRGBA8888[ir].color); +#endif + } + } + } + else + { + for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2)) + { + size_t x = 0; + +#if defined(ENABLE_SSSE3) && defined(LOCAL_LE) + for (; x < ssePixCount; x += 4, ir += 4, iw += 4) + { + __m128i color = _mm_load_si128((__m128i *)(srcRGBA8888 + ir)); + + __m128i b = _mm_and_si128(color, _mm_set1_epi32(0x000000F8)); // Read from R + b = _mm_slli_epi32(b, 7); // Shift to B + + __m128i g = _mm_and_si128(color, _mm_set1_epi32(0x0000F800)); // Read from G + g = _mm_srli_epi32(g, 6); // Shift in G + + __m128i r = _mm_and_si128(color, _mm_set1_epi32(0x00F80000)); // Read from B + r = _mm_srli_epi32(r, 19); // Shift to R + + __m128i a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A + a = _mm_cmpeq_epi32(a, _mm_setzero_si128()); // Determine A + a = _mm_andnot_si128(a, _mm_set1_epi32(0x00008000)); // Mask to A + + color = _mm_or_si128(_mm_or_si128(_mm_or_si128(b, g), r), a); + color = _mm_shuffle_epi8(color, _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); + _mm_storel_epi64((__m128i *)(dstRGBA5551 + iw), color); + } +#endif // defined(ENABLE_SSSE3) && defined(LOCAL_LE) + + for (; x < pixCount; x++, ir++, iw++) + { +#ifdef LOCAL_BE + dstRGBA5551[iw] = R5G5B5TORGB15( (srcRGBA8888[ir].b >> 3) & 0x1F, + (srcRGBA8888[ir].g >> 3) & 0x1F, + (srcRGBA8888[ir].r >> 3) & 0x1F) | + ((this->_framebufferColor[ir].a == 0) ? 0x0000 : 0x8000); +#else + dstRGBA5551[iw] = R5G5B5TORGB15( (srcRGBA8888[ir].b >> 3) & 0x1F, + (srcRGBA8888[ir].g >> 3) & 0x1F, + (srcRGBA8888[ir].r >> 3) & 0x1F) | + ((srcRGBA8888[ir].a == 0) ? 0x0000 : 0x8000); +#endif + } } } @@ -2736,28 +2819,34 @@ Render3DError OpenGLRenderer_1_2::RenderFinish() return OGLERROR_NOERR; } - if(!BEGINGL()) - { - GPU->GetEventHandler()->DidRender3DEnd(); - return OGLERROR_BEGINGL_FAILED; - } + FragmentColor *framebufferRGBA6665 = (this->_willFlushFramebufferRGBA6665) ? GPU->GetEngineMain()->Get3DFramebufferRGBA6665() : NULL; + u16 *framebufferRGBA5551 = (this->_willFlushFramebufferRGBA5551) ? GPU->GetEngineMain()->Get3DFramebufferRGBA5551() : NULL; - if (this->isPBOSupported) + if ( (framebufferRGBA6665 != NULL) || (framebufferRGBA5551 != NULL) ) { - const FragmentColor *__restrict mappedBufferPtr = (FragmentColor *__restrict)glMapBufferARB(GL_PIXEL_PACK_BUFFER_ARB, GL_READ_ONLY_ARB); - if (mappedBufferPtr != NULL) + if(!BEGINGL()) { - this->FlushFramebuffer(mappedBufferPtr, GPU->GetEngineMain()->Get3DFramebufferRGBA6665(), GPU->GetEngineMain()->Get3DFramebufferRGBA5551()); - glUnmapBufferARB(GL_PIXEL_PACK_BUFFER_ARB); + GPU->GetEventHandler()->DidRender3DEnd(); + return OGLERROR_BEGINGL_FAILED; } + + if (this->isPBOSupported) + { + const FragmentColor *__restrict mappedBufferPtr = (FragmentColor *__restrict)glMapBufferARB(GL_PIXEL_PACK_BUFFER_ARB, GL_READ_ONLY_ARB); + if (mappedBufferPtr != NULL) + { + this->FlushFramebuffer(mappedBufferPtr, framebufferRGBA6665, framebufferRGBA5551); + glUnmapBufferARB(GL_PIXEL_PACK_BUFFER_ARB); + } + } + else + { + glReadPixels(0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_BGRA, GL_UNSIGNED_BYTE, this->_framebufferColor); + this->FlushFramebuffer(this->_framebufferColor, framebufferRGBA6665, framebufferRGBA5551); + } + + ENDGL(); } - else - { - glReadPixels(0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_BGRA, GL_UNSIGNED_BYTE, this->_framebufferColor); - this->FlushFramebuffer(this->_framebufferColor, GPU->GetEngineMain()->Get3DFramebufferRGBA6665(), GPU->GetEngineMain()->Get3DFramebufferRGBA5551()); - } - - ENDGL(); this->_pixelReadNeedsFinish = false; @@ -3314,28 +3403,34 @@ Render3DError OpenGLRenderer_1_5::RenderFinish() return OGLERROR_NOERR; } - if(!BEGINGL()) - { - GPU->GetEventHandler()->DidRender3DEnd(); - return OGLERROR_BEGINGL_FAILED; - } + FragmentColor *framebufferRGBA6665 = (this->_willFlushFramebufferRGBA6665) ? GPU->GetEngineMain()->Get3DFramebufferRGBA6665() : NULL; + u16 *framebufferRGBA5551 = (this->_willFlushFramebufferRGBA5551) ? GPU->GetEngineMain()->Get3DFramebufferRGBA5551() : NULL; - if (this->isPBOSupported) + if ( (framebufferRGBA6665 != NULL) || (framebufferRGBA5551 != NULL) ) { - const FragmentColor *__restrict mappedBufferPtr = (FragmentColor *__restrict)glMapBuffer(GL_PIXEL_PACK_BUFFER_ARB, GL_READ_ONLY); - if (mappedBufferPtr != NULL) + if(!BEGINGL()) { - this->FlushFramebuffer(mappedBufferPtr, GPU->GetEngineMain()->Get3DFramebufferRGBA6665(), GPU->GetEngineMain()->Get3DFramebufferRGBA5551()); - glUnmapBuffer(GL_PIXEL_PACK_BUFFER_ARB); + GPU->GetEventHandler()->DidRender3DEnd(); + return OGLERROR_BEGINGL_FAILED; } + + if (this->isPBOSupported) + { + const FragmentColor *__restrict mappedBufferPtr = (FragmentColor *__restrict)glMapBuffer(GL_PIXEL_PACK_BUFFER_ARB, GL_READ_ONLY); + if (mappedBufferPtr != NULL) + { + this->FlushFramebuffer(mappedBufferPtr, framebufferRGBA6665, framebufferRGBA5551); + glUnmapBuffer(GL_PIXEL_PACK_BUFFER_ARB); + } + } + else + { + glReadPixels(0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_BGRA, GL_UNSIGNED_BYTE, this->_framebufferColor); + this->FlushFramebuffer(this->_framebufferColor, framebufferRGBA6665, framebufferRGBA5551); + } + + ENDGL(); } - else - { - glReadPixels(0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_BGRA, GL_UNSIGNED_BYTE, this->_framebufferColor); - this->FlushFramebuffer(this->_framebufferColor, GPU->GetEngineMain()->Get3DFramebufferRGBA6665(), GPU->GetEngineMain()->Get3DFramebufferRGBA5551()); - } - - ENDGL(); this->_pixelReadNeedsFinish = false; @@ -4107,21 +4202,27 @@ Render3DError OpenGLRenderer_2_1::RenderFinish() return OGLERROR_NOERR; } - if(!BEGINGL()) - { - GPU->GetEventHandler()->DidRender3DEnd(); - return OGLERROR_BEGINGL_FAILED; - } + FragmentColor *framebufferRGBA6665 = (this->_willFlushFramebufferRGBA6665) ? GPU->GetEngineMain()->Get3DFramebufferRGBA6665() : NULL; + u16 *framebufferRGBA5551 = (this->_willFlushFramebufferRGBA5551) ? GPU->GetEngineMain()->Get3DFramebufferRGBA5551() : NULL; - const FragmentColor *__restrict mappedBufferPtr = (FragmentColor *__restrict)glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY); - if (mappedBufferPtr != NULL) + if ( (framebufferRGBA6665 != NULL) || (framebufferRGBA5551 != NULL) ) { - this->FlushFramebuffer(mappedBufferPtr, GPU->GetEngineMain()->Get3DFramebufferRGBA6665(), GPU->GetEngineMain()->Get3DFramebufferRGBA5551()); - glUnmapBuffer(GL_PIXEL_PACK_BUFFER); + if(!BEGINGL()) + { + GPU->GetEventHandler()->DidRender3DEnd(); + return OGLERROR_BEGINGL_FAILED; + } + + const FragmentColor *__restrict mappedBufferPtr = (FragmentColor *__restrict)glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY); + if (mappedBufferPtr != NULL) + { + this->FlushFramebuffer(mappedBufferPtr, framebufferRGBA6665, framebufferRGBA5551); + glUnmapBuffer(GL_PIXEL_PACK_BUFFER); + } + + ENDGL(); } - ENDGL(); - this->_pixelReadNeedsFinish = false; GPU->GetEventHandler()->DidRender3DEnd(); diff --git a/desmume/src/rasterize.cpp b/desmume/src/rasterize.cpp index 1e245525b..aab39031e 100644 --- a/desmume/src/rasterize.cpp +++ b/desmume/src/rasterize.cpp @@ -1961,7 +1961,9 @@ Render3DError SoftRasterizerRenderer::EndRender(const u64 frameCount) this->RenderEdgeMarkingAndFog(this->postprocessParam[0]); } - this->FlushFramebuffer(GPU->GetEngineMain()->Get3DFramebufferRGBA6665(), GPU->GetEngineMain()->Get3DFramebufferRGBA5551()); + FragmentColor *framebufferRGBA6665 = (this->_willFlushFramebufferRGBA6665) ? GPU->GetEngineMain()->Get3DFramebufferRGBA6665() : NULL; + u16 *framebufferRGBA5551 = (this->_willFlushFramebufferRGBA5551) ? GPU->GetEngineMain()->Get3DFramebufferRGBA5551() : NULL; + this->FlushFramebuffer(framebufferRGBA6665, framebufferRGBA5551); } return RENDER3DERROR_NOERR; @@ -2005,7 +2007,9 @@ Render3DError SoftRasterizerRenderer::RenderFinish() } } - this->FlushFramebuffer(GPU->GetEngineMain()->Get3DFramebufferRGBA6665(), GPU->GetEngineMain()->Get3DFramebufferRGBA5551()); + FragmentColor *framebufferRGBA6665 = (this->_willFlushFramebufferRGBA6665) ? GPU->GetEngineMain()->Get3DFramebufferRGBA6665() : NULL; + u16 *framebufferRGBA5551 = (this->_willFlushFramebufferRGBA5551) ? GPU->GetEngineMain()->Get3DFramebufferRGBA5551() : NULL; + this->FlushFramebuffer(framebufferRGBA6665, framebufferRGBA5551); GPU->GetEventHandler()->DidRender3DEnd(); return RENDER3DERROR_NOERR; diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp index 3430027c0..2084cb5eb 100644 --- a/desmume/src/render3D.cpp +++ b/desmume/src/render3D.cpp @@ -228,6 +228,9 @@ Render3D::Render3D() _framebufferColorSizeBytes = 0; _framebufferColor = NULL; + _willFlushFramebufferRGBA6665 = true; + _willFlushFramebufferRGBA5551 = true; + Reset(); } @@ -282,6 +285,18 @@ Render3DError Render3D::SetFramebufferSize(size_t w, size_t h) return RENDER3DERROR_NOERR; } +void Render3D::GetFramebufferFlushStates(bool &willFlushRGBA6665, bool &willFlushRGBA5551) +{ + willFlushRGBA6665 = this->_willFlushFramebufferRGBA6665; + willFlushRGBA5551 = this->_willFlushFramebufferRGBA5551; +} + +void Render3D::SetFramebufferFlushStates(bool willFlushRGBA6665, bool willFlushRGBA5551) +{ + this->_willFlushFramebufferRGBA6665 = willFlushRGBA6665; + this->_willFlushFramebufferRGBA5551 = willFlushRGBA5551; +} + Render3DError Render3D::BeginRender(const GFX3D &engine) { return RENDER3DERROR_NOERR; @@ -309,12 +324,17 @@ Render3DError Render3D::EndRender(const u64 frameCount) Render3DError Render3D::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551) { - memcpy(dstRGBA6665, this->_framebufferColor, this->_framebufferColorSizeBytes); - - // Convert to RGBA5551 - for (size_t i = 0; i < (this->_framebufferWidth * this->_framebufferHeight); i++) + if (dstRGBA6665 != NULL) { - dstRGBA5551[i] = R6G6B6TORGB15(this->_framebufferColor[i].r, this->_framebufferColor[i].g, this->_framebufferColor[i].b) | ((this->_framebufferColor[i].a == 0) ? 0x0000 : 0x8000); + memcpy(dstRGBA6665, this->_framebufferColor, this->_framebufferColorSizeBytes); + } + + if (dstRGBA5551 != NULL) + { + for (size_t i = 0; i < (this->_framebufferWidth * this->_framebufferHeight); i++) + { + dstRGBA5551[i] = R6G6B6TORGB15(this->_framebufferColor[i].r, this->_framebufferColor[i].g, this->_framebufferColor[i].b) | ((this->_framebufferColor[i].a == 0) ? 0x0000 : 0x8000); + } } return RENDER3DERROR_NOERR; @@ -453,6 +473,9 @@ Render3DError Render3D::Reset() memset(this->clearImagePolyIDBuffer, 0, sizeof(this->clearImagePolyIDBuffer)); memset(this->clearImageFogBuffer, 0, sizeof(this->clearImageFogBuffer)); + this->_willFlushFramebufferRGBA6665 = true; + this->_willFlushFramebufferRGBA5551 = true; + TexCache_Reset(); return RENDER3DERROR_NOERR; @@ -505,59 +528,118 @@ Render3DError Render3D::VramReconfigureSignal() Render3DError Render3D_SSE2::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551) { + if ( (dstRGBA6665 == NULL) && (dstRGBA5551 == NULL) ) + { + return RENDER3DERROR_NOERR; + } + const __m128i zero_vec128 = _mm_setzero_si128(); size_t i = 0; const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight; const size_t ssePixCount = pixCount - (pixCount % 4); - for (; i < ssePixCount; i += 4) + if ( (dstRGBA6665 != NULL) && (dstRGBA5551 != NULL) ) { - // Copy the framebufferColor buffer - __m128i color = _mm_load_si128((__m128i *)(this->_framebufferColor + i)); - _mm_store_si128((__m128i *)(dstRGBA6665 + i), color); + for (; i < ssePixCount; i += 4) + { + // Copy the framebufferColor buffer + __m128i color = _mm_load_si128((__m128i *)(this->_framebufferColor + i)); + _mm_store_si128((__m128i *)(dstRGBA6665 + i), color); + + // Convert to RGBA5551 + __m128i r = _mm_and_si128(color, _mm_set1_epi32(0x0000003E)); // Read from R + r = _mm_srli_epi32(r, 1); // Shift to R + + __m128i g = _mm_and_si128(color, _mm_set1_epi32(0x00003E00)); // Read from G + g = _mm_srli_epi32(g, 4); // Shift in G + + __m128i b = _mm_and_si128(color, _mm_set1_epi32(0x003E0000)); // Read from B + b = _mm_srli_epi32(b, 7); // Shift to B + + __m128i a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A + a = _mm_cmpeq_epi32(a, zero_vec128); // Determine A + + // From here on, we're going to do an SSE2 trick to pack 32-bit down to unsigned + // 16-bit. Since SSE2 only has packssdw (signed saturated 16-bit pack), using + // packssdw on the alpha bit (0x8000) will result in a value of 0x7FFF, which is + // incorrect. Now if we were to use SSE4.1's packusdw (unsigned saturated 16-bit + // pack), we wouldn't have to go through this hassle. But not everyone has an + // SSE4.1-capable CPU, so doing this the SSE2 way is more guaranteed to work for + // everyone's CPU. + // + // To use packssdw, we take a bit one position lower for the alpha bit, run + // packssdw, then shift the bit back to its original position. Then we por the + // alpha vector with the post-packed color vector to get the final color. + + a = _mm_andnot_si128(a, _mm_set1_epi32(0x00004000)); // Mask out the bit before A + a = _mm_packs_epi32(a, zero_vec128); // Pack 32-bit down to 16-bit + a = _mm_slli_epi16(a, 1); // Shift the A bit back to where it needs to be + + // Assemble the RGB colors, pack the 32-bit color into a signed 16-bit color, then por the alpha bit back in. + color = _mm_or_si128(_mm_or_si128(r, g), b); + color = _mm_packs_epi32(color, zero_vec128); + color = _mm_or_si128(color, a); + + _mm_storel_epi64((__m128i *)(dstRGBA5551 + i), color); + } - // Convert to RGBA5551 - __m128i r = _mm_and_si128(color, _mm_set1_epi32(0x0000003E)); // Read from R - r = _mm_srli_epi32(r, 1); // Shift to R - - __m128i g = _mm_and_si128(color, _mm_set1_epi32(0x00003E00)); // Read from G - g = _mm_srli_epi32(g, 4); // Shift in G - - __m128i b = _mm_and_si128(color, _mm_set1_epi32(0x003E0000)); // Read from B - b = _mm_srli_epi32(b, 7); // Shift to B - - __m128i a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A - a = _mm_cmpeq_epi32(a, zero_vec128); // Determine A - - // From here on, we're going to do an SSE2 trick to pack 32-bit down to unsigned - // 16-bit. Since SSE2 only has packssdw (signed saturated 16-bit pack), using - // packssdw on the alpha bit (0x8000) will result in a value of 0x7FFF, which is - // incorrect. Now if we were to use SSE4.1's packusdw (unsigned saturated 16-bit - // pack), we wouldn't have to go through this hassle. But not everyone has an - // SSE4.1-capable CPU, so doing this the SSE2 way is more guaranteed to work for - // everyone's CPU. - // - // To use packssdw, we take a bit one position lower for the alpha bit, run - // packssdw, then shift the bit back to its original position. Then we por the - // alpha vector with the post-packed color vector to get the final color. - - a = _mm_andnot_si128(a, _mm_set1_epi32(0x00004000)); // Mask out the bit before A - a = _mm_packs_epi32(a, zero_vec128); // Pack 32-bit down to 16-bit - a = _mm_slli_epi16(a, 1); // Shift the A bit back to where it needs to be - - // Assemble the RGB colors, pack the 32-bit color into a signed 16-bit color, then por the alpha bit back in. - color = _mm_or_si128(_mm_or_si128(r, g), b); - color = _mm_packs_epi32(color, zero_vec128); - color = _mm_or_si128(color, a); - - _mm_storel_epi64((__m128i *)(dstRGBA5551 + i), color); + for (; i < pixCount; i++) + { + dstRGBA6665[i] = this->_framebufferColor[i]; + dstRGBA5551[i] = R6G6B6TORGB15(this->_framebufferColor[i].r, this->_framebufferColor[i].g, this->_framebufferColor[i].b) | ((this->_framebufferColor[i].a == 0) ? 0x0000 : 0x8000); + } } - - for (; i < pixCount; i++) + else if (dstRGBA6665 != NULL) { - dstRGBA6665[i] = this->_framebufferColor[i]; - dstRGBA5551[i] = R6G6B6TORGB15(this->_framebufferColor[i].r, this->_framebufferColor[i].g, this->_framebufferColor[i].b) | ((this->_framebufferColor[i].a == 0) ? 0x0000 : 0x8000); + memcpy(dstRGBA6665, this->_framebufferColor, this->_framebufferColorSizeBytes); + } + else + { + for (; i < ssePixCount; i += 4) + { + // Convert to RGBA5551 + __m128i color = _mm_load_si128((__m128i *)(this->_framebufferColor + i)); + __m128i r = _mm_and_si128(color, _mm_set1_epi32(0x0000003E)); // Read from R + r = _mm_srli_epi32(r, 1); // Shift to R + + __m128i g = _mm_and_si128(color, _mm_set1_epi32(0x00003E00)); // Read from G + g = _mm_srli_epi32(g, 4); // Shift in G + + __m128i b = _mm_and_si128(color, _mm_set1_epi32(0x003E0000)); // Read from B + b = _mm_srli_epi32(b, 7); // Shift to B + + __m128i a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A + a = _mm_cmpeq_epi32(a, zero_vec128); // Determine A + + // From here on, we're going to do an SSE2 trick to pack 32-bit down to unsigned + // 16-bit. Since SSE2 only has packssdw (signed saturated 16-bit pack), using + // packssdw on the alpha bit (0x8000) will result in a value of 0x7FFF, which is + // incorrect. Now if we were to use SSE4.1's packusdw (unsigned saturated 16-bit + // pack), we wouldn't have to go through this hassle. But not everyone has an + // SSE4.1-capable CPU, so doing this the SSE2 way is more guaranteed to work for + // everyone's CPU. + // + // To use packssdw, we take a bit one position lower for the alpha bit, run + // packssdw, then shift the bit back to its original position. Then we por the + // alpha vector with the post-packed color vector to get the final color. + + a = _mm_andnot_si128(a, _mm_set1_epi32(0x00004000)); // Mask out the bit before A + a = _mm_packs_epi32(a, zero_vec128); // Pack 32-bit down to 16-bit + a = _mm_slli_epi16(a, 1); // Shift the A bit back to where it needs to be + + // Assemble the RGB colors, pack the 32-bit color into a signed 16-bit color, then por the alpha bit back in. + color = _mm_or_si128(_mm_or_si128(r, g), b); + color = _mm_packs_epi32(color, zero_vec128); + color = _mm_or_si128(color, a); + + _mm_storel_epi64((__m128i *)(dstRGBA5551 + i), color); + } + + for (; i < pixCount; i++) + { + dstRGBA5551[i] = R6G6B6TORGB15(this->_framebufferColor[i].r, this->_framebufferColor[i].g, this->_framebufferColor[i].b) | ((this->_framebufferColor[i].a == 0) ? 0x0000 : 0x8000); + } } return RENDER3DERROR_NOERR; diff --git a/desmume/src/render3D.h b/desmume/src/render3D.h index dd627d41c..bc9ecf55a 100644 --- a/desmume/src/render3D.h +++ b/desmume/src/render3D.h @@ -109,6 +109,9 @@ protected: size_t _framebufferColorSizeBytes; FragmentColor *_framebufferColor; + bool _willFlushFramebufferRGBA6665; + bool _willFlushFramebufferRGBA5551; + CACHE_ALIGN u16 clearImageColor16Buffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT]; CACHE_ALIGN u32 clearImageDepthBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT]; CACHE_ALIGN u8 clearImageFogBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT]; @@ -155,6 +158,9 @@ public: virtual Render3DError VramReconfigureSignal(); // Called when the emulator reconfigures its VRAM. You may need to invalidate your texture cache. virtual Render3DError SetFramebufferSize(size_t w, size_t h); // Called whenever the output framebuffer size changes. + + virtual void GetFramebufferFlushStates(bool &willFlushRGBA6665, bool &willFlushRGBA5551); + virtual void SetFramebufferFlushStates(bool willFlushRGBA6665, bool willFlushRGBA5551); }; #ifdef ENABLE_SSE2