Render3D:

- Provide a means of specifying which specific framebuffers need to be flushed for each frame.
This commit is contained in:
rogerman 2016-02-08 21:20:13 +00:00
parent ed1879dc8f
commit f68df5b976
6 changed files with 364 additions and 153 deletions

View File

@ -4089,6 +4089,18 @@ void GPUEngineA::SetCustomFramebufferSize(size_t w, size_t h)
free_aligned(oldColorRGBA5551Buffer);
}
// Reports whether the 3D rendered layer is needed, which here means that
// BG0 is enabled and the DISPCNT.BG0_3D field is nonzero.
bool GPUEngineA::Is3DRenderedLayerNeeded()
{
	// With BG0 disabled the answer is already known, so the register
	// field is not consulted at all.
	if (!this->_enableLayer[GPULayerID_BG0])
	{
		return false;
	}
	
	return (this->_IORegisterMap->DISPCNT.BG0_3D != 0);
}
bool GPUEngineA::Is3DCapturingNeeded()
{
const IOREG_DISPCAPCNT &DISPCAPCNT = this->_IORegisterMap->DISPCAPCNT;
return ( (DISPCAPCNT.CaptureEnable != 0) && (vramConfiguration.banks[DISPCAPCNT.VRAMWriteBlock].purpose == VramConfiguration::LCDC) && (DISPCAPCNT.SrcA != 0) );
}
template<bool ISCUSTOMRENDERINGNEEDED>
void GPUEngineA::RenderLine(const u16 l)
{
@ -5681,7 +5693,10 @@ void GPUSubsystem::RenderLine(const u16 l, bool isFrameSkipRequested)
{
if (l == 0)
{
CurrentRenderer->SetFramebufferFlushStates(this->_engineMain->Is3DRenderedLayerNeeded(), this->_engineMain->Is3DCapturingNeeded());
CurrentRenderer->RenderFinish();
CurrentRenderer->SetFramebufferFlushStates(true, true);
this->_event->DidFrameBegin();
this->UpdateVRAM3DUsageProperties();

View File

@ -1403,6 +1403,9 @@ public:
FragmentColor* Get3DFramebufferRGBA6665() const;
u16* Get3DFramebufferRGBA5551() const;
virtual void SetCustomFramebufferSize(size_t w, size_t h);
// True when BG0 is enabled and DISPCNT.BG0_3D is nonzero.
bool Is3DRenderedLayerNeeded();
// True when display capture is enabled, the selected VRAM write block is
// configured as LCDC, and DISPCAPCNT.SrcA is nonzero.
bool Is3DCapturingNeeded();
template<bool ISCUSTOMRENDERINGNEEDED> void RenderLine(const u16 l);
void FramebufferPostprocess();

View File

@ -895,7 +895,7 @@ void OpenGLRenderer::SetVersion(unsigned int major, unsigned int minor, unsigned
Render3DError OpenGLRenderer::FlushFramebuffer(const FragmentColor *__restrict srcRGBA8888, FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
{
if (srcRGBA8888 == NULL)
if ( srcRGBA8888 == NULL || ((dstRGBA6665 == NULL) && (dstRGBA5551 == NULL)) )
{
return RENDER3DERROR_NOERR;
}
@ -907,69 +907,152 @@ Render3DError OpenGLRenderer::FlushFramebuffer(const FragmentColor *__restrict s
const size_t pixCount = this->_framebufferWidth;
const size_t ssePixCount = pixCount - (pixCount % 4);
for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
if ( (dstRGBA6665 != NULL) && (dstRGBA5551 != NULL) )
{
size_t x = 0;
for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
{
size_t x = 0;
#if defined(ENABLE_SSSE3) && defined(LOCAL_LE)
for (; x < ssePixCount; x += 4, ir += 4, iw += 4)
{
// Convert to RGBA6665
__m128i color = _mm_load_si128((__m128i *)(srcRGBA8888 + ir));
color = _mm_srli_epi32(color, 2);
__m128i a = _mm_srli_epi32(color, 1); // Special handling for 5-bit alpha
a = _mm_and_si128(a, _mm_set1_epi32(0x1F000000));
color = _mm_and_si128(color, _mm_set1_epi32(0x003F3F3F));
color = _mm_or_si128(color, a);
color = _mm_shuffle_epi8(color, _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2)); // Swizzle RGBA to BGRA
_mm_store_si128((__m128i *)(dstRGBA6665 + iw), color);
// Convert to RGBA5551
color = _mm_load_si128((__m128i *)(srcRGBA8888 + ir));
__m128i b = _mm_and_si128(color, _mm_set1_epi32(0x000000F8)); // Read from R
b = _mm_slli_epi32(b, 7); // Shift to B
__m128i g = _mm_and_si128(color, _mm_set1_epi32(0x0000F800)); // Read from G
g = _mm_srli_epi32(g, 6); // Shift in G
__m128i r = _mm_and_si128(color, _mm_set1_epi32(0x00F80000)); // Read from B
r = _mm_srli_epi32(r, 19); // Shift to R
a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A
a = _mm_cmpeq_epi32(a, _mm_setzero_si128()); // Determine A
a = _mm_andnot_si128(a, _mm_set1_epi32(0x00008000)); // Mask to A
color = _mm_or_si128(_mm_or_si128(_mm_or_si128(b, g), r), a);
// All the colors are currently placed on 32 bit boundaries, so we need to swizzle them
// to the lower 64 bits of our vector before we store them back to memory.
// Note: Do not attempt to use packssdw here since packing with the 0x8000 bit set will
// result in values of 0x7FFF, which are incorrect values in this case.
color = _mm_shuffle_epi8(color, _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
_mm_storel_epi64((__m128i *)(dstRGBA5551 + iw), color);
}
for (; x < ssePixCount; x += 4, ir += 4, iw += 4)
{
__m128i color;
// Convert to RGBA6665
color = _mm_load_si128((__m128i *)(srcRGBA8888 + ir));
color = _mm_srli_epi32(color, 2);
__m128i a = _mm_srli_epi32(color, 1); // Special handling for 5-bit alpha
a = _mm_and_si128(a, _mm_set1_epi32(0x1F000000));
color = _mm_and_si128(color, _mm_set1_epi32(0x003F3F3F));
color = _mm_or_si128(color, a);
color = _mm_shuffle_epi8(color, _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2)); // Swizzle RGBA to BGRA
_mm_store_si128((__m128i *)(dstRGBA6665 + iw), color);
// Convert to RGBA5551
color = _mm_load_si128((__m128i *)(srcRGBA8888 + ir));
__m128i b = _mm_and_si128(color, _mm_set1_epi32(0x000000F8)); // Read from R
b = _mm_slli_epi32(b, 7); // Shift to B
__m128i g = _mm_and_si128(color, _mm_set1_epi32(0x0000F800)); // Read from G
g = _mm_srli_epi32(g, 6); // Shift in G
__m128i r = _mm_and_si128(color, _mm_set1_epi32(0x00F80000)); // Read from B
r = _mm_srli_epi32(r, 19); // Shift to R
a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A
a = _mm_cmpeq_epi32(a, _mm_setzero_si128()); // Determine A
a = _mm_andnot_si128(a, _mm_set1_epi32(0x00008000)); // Mask to A
color = _mm_or_si128(_mm_or_si128(_mm_or_si128(b, g), r), a);
// All the colors are currently placed on 32 bit boundaries, so we need to swizzle them
// to the lower 64 bits of our vector before we store them back to memory.
// Note: Do not attempt to use packssdw here since packing with the 0x8000 bit set will
// result in values of 0x7FFF, which are incorrect values in this case.
color = _mm_shuffle_epi8(color, _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
_mm_storel_epi64((__m128i *)(dstRGBA5551 + iw), color);
}
#endif // defined(ENABLE_SSSE3) && defined(LOCAL_LE)
for (; x < pixCount; x++, ir++, iw++)
{
// Use the correct endian format since OpenGL uses the native endian of
// the architecture it is running on.
for (; x < pixCount; x++, ir++, iw++)
{
// Use the correct endian format since OpenGL uses the native endian of
// the architecture it is running on.
#ifdef LOCAL_BE
dstRGBA6665[iw].color = BGRA8888_32_To_RGBA6665_32(srcRGBA8888[ir].color);
dstRGBA5551[iw] = R5G5B5TORGB15( (srcRGBA8888[ir].b >> 3) & 0x1F,
(srcRGBA8888[ir].g >> 3) & 0x1F,
(srcRGBA8888[ir].r >> 3) & 0x1F) |
((this->_framebufferColor[ir].a == 0) ? 0x0000 : 0x8000);
dstRGBA6665[iw].color = BGRA8888_32_To_RGBA6665_32(srcRGBA8888[ir].color);
dstRGBA5551[iw] = R5G5B5TORGB15( (srcRGBA8888[ir].b >> 3) & 0x1F,
(srcRGBA8888[ir].g >> 3) & 0x1F,
(srcRGBA8888[ir].r >> 3) & 0x1F) |
((this->_framebufferColor[ir].a == 0) ? 0x0000 : 0x8000);
#else
dstRGBA6665[iw].color = BGRA8888_32Rev_To_RGBA6665_32Rev(srcRGBA8888[ir].color);
dstRGBA5551[iw] = R5G5B5TORGB15( (srcRGBA8888[ir].b >> 3) & 0x1F,
(srcRGBA8888[ir].g >> 3) & 0x1F,
(srcRGBA8888[ir].r >> 3) & 0x1F) |
((srcRGBA8888[ir].a == 0) ? 0x0000 : 0x8000);
dstRGBA6665[iw].color = BGRA8888_32Rev_To_RGBA6665_32Rev(srcRGBA8888[ir].color);
dstRGBA5551[iw] = R5G5B5TORGB15( (srcRGBA8888[ir].b >> 3) & 0x1F,
(srcRGBA8888[ir].g >> 3) & 0x1F,
(srcRGBA8888[ir].r >> 3) & 0x1F) |
((srcRGBA8888[ir].a == 0) ? 0x0000 : 0x8000);
#endif
}
}
}
else if (dstRGBA6665 != NULL)
{
for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
{
size_t x = 0;
#if defined(ENABLE_SSSE3) && defined(LOCAL_LE)
for (; x < ssePixCount; x += 4, ir += 4, iw += 4)
{
__m128i color = _mm_load_si128((__m128i *)(srcRGBA8888 + ir));
color = _mm_srli_epi32(color, 2);
__m128i a = _mm_srli_epi32(color, 1); // Special handling for 5-bit alpha
a = _mm_and_si128(a, _mm_set1_epi32(0x1F000000));
color = _mm_and_si128(color, _mm_set1_epi32(0x003F3F3F));
color = _mm_or_si128(color, a);
color = _mm_shuffle_epi8(color, _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2)); // Swizzle RGBA to BGRA
_mm_store_si128((__m128i *)(dstRGBA6665 + iw), color);
}
#endif // defined(ENABLE_SSSE3) && defined(LOCAL_LE)
for (; x < pixCount; x++, ir++, iw++)
{
#ifdef LOCAL_BE
dstRGBA6665[iw].color = BGRA8888_32_To_RGBA6665_32(srcRGBA8888[ir].color);
#else
dstRGBA6665[iw].color = BGRA8888_32Rev_To_RGBA6665_32Rev(srcRGBA8888[ir].color);
#endif
}
}
}
else
{
for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
{
size_t x = 0;
#if defined(ENABLE_SSSE3) && defined(LOCAL_LE)
for (; x < ssePixCount; x += 4, ir += 4, iw += 4)
{
__m128i color = _mm_load_si128((__m128i *)(srcRGBA8888 + ir));
__m128i b = _mm_and_si128(color, _mm_set1_epi32(0x000000F8)); // Read from R
b = _mm_slli_epi32(b, 7); // Shift to B
__m128i g = _mm_and_si128(color, _mm_set1_epi32(0x0000F800)); // Read from G
g = _mm_srli_epi32(g, 6); // Shift in G
__m128i r = _mm_and_si128(color, _mm_set1_epi32(0x00F80000)); // Read from B
r = _mm_srli_epi32(r, 19); // Shift to R
__m128i a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A
a = _mm_cmpeq_epi32(a, _mm_setzero_si128()); // Determine A
a = _mm_andnot_si128(a, _mm_set1_epi32(0x00008000)); // Mask to A
color = _mm_or_si128(_mm_or_si128(_mm_or_si128(b, g), r), a);
color = _mm_shuffle_epi8(color, _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
_mm_storel_epi64((__m128i *)(dstRGBA5551 + iw), color);
}
#endif // defined(ENABLE_SSSE3) && defined(LOCAL_LE)
for (; x < pixCount; x++, ir++, iw++)
{
#ifdef LOCAL_BE
dstRGBA5551[iw] = R5G5B5TORGB15( (srcRGBA8888[ir].b >> 3) & 0x1F,
(srcRGBA8888[ir].g >> 3) & 0x1F,
(srcRGBA8888[ir].r >> 3) & 0x1F) |
((this->_framebufferColor[ir].a == 0) ? 0x0000 : 0x8000);
#else
dstRGBA5551[iw] = R5G5B5TORGB15( (srcRGBA8888[ir].b >> 3) & 0x1F,
(srcRGBA8888[ir].g >> 3) & 0x1F,
(srcRGBA8888[ir].r >> 3) & 0x1F) |
((srcRGBA8888[ir].a == 0) ? 0x0000 : 0x8000);
#endif
}
}
}
@ -2736,28 +2819,34 @@ Render3DError OpenGLRenderer_1_2::RenderFinish()
return OGLERROR_NOERR;
}
if(!BEGINGL())
{
GPU->GetEventHandler()->DidRender3DEnd();
return OGLERROR_BEGINGL_FAILED;
}
FragmentColor *framebufferRGBA6665 = (this->_willFlushFramebufferRGBA6665) ? GPU->GetEngineMain()->Get3DFramebufferRGBA6665() : NULL;
u16 *framebufferRGBA5551 = (this->_willFlushFramebufferRGBA5551) ? GPU->GetEngineMain()->Get3DFramebufferRGBA5551() : NULL;
if (this->isPBOSupported)
if ( (framebufferRGBA6665 != NULL) || (framebufferRGBA5551 != NULL) )
{
const FragmentColor *__restrict mappedBufferPtr = (FragmentColor *__restrict)glMapBufferARB(GL_PIXEL_PACK_BUFFER_ARB, GL_READ_ONLY_ARB);
if (mappedBufferPtr != NULL)
if(!BEGINGL())
{
this->FlushFramebuffer(mappedBufferPtr, GPU->GetEngineMain()->Get3DFramebufferRGBA6665(), GPU->GetEngineMain()->Get3DFramebufferRGBA5551());
glUnmapBufferARB(GL_PIXEL_PACK_BUFFER_ARB);
GPU->GetEventHandler()->DidRender3DEnd();
return OGLERROR_BEGINGL_FAILED;
}
if (this->isPBOSupported)
{
const FragmentColor *__restrict mappedBufferPtr = (FragmentColor *__restrict)glMapBufferARB(GL_PIXEL_PACK_BUFFER_ARB, GL_READ_ONLY_ARB);
if (mappedBufferPtr != NULL)
{
this->FlushFramebuffer(mappedBufferPtr, framebufferRGBA6665, framebufferRGBA5551);
glUnmapBufferARB(GL_PIXEL_PACK_BUFFER_ARB);
}
}
else
{
glReadPixels(0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_BGRA, GL_UNSIGNED_BYTE, this->_framebufferColor);
this->FlushFramebuffer(this->_framebufferColor, framebufferRGBA6665, framebufferRGBA5551);
}
ENDGL();
}
else
{
glReadPixels(0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_BGRA, GL_UNSIGNED_BYTE, this->_framebufferColor);
this->FlushFramebuffer(this->_framebufferColor, GPU->GetEngineMain()->Get3DFramebufferRGBA6665(), GPU->GetEngineMain()->Get3DFramebufferRGBA5551());
}
ENDGL();
this->_pixelReadNeedsFinish = false;
@ -3314,28 +3403,34 @@ Render3DError OpenGLRenderer_1_5::RenderFinish()
return OGLERROR_NOERR;
}
if(!BEGINGL())
{
GPU->GetEventHandler()->DidRender3DEnd();
return OGLERROR_BEGINGL_FAILED;
}
FragmentColor *framebufferRGBA6665 = (this->_willFlushFramebufferRGBA6665) ? GPU->GetEngineMain()->Get3DFramebufferRGBA6665() : NULL;
u16 *framebufferRGBA5551 = (this->_willFlushFramebufferRGBA5551) ? GPU->GetEngineMain()->Get3DFramebufferRGBA5551() : NULL;
if (this->isPBOSupported)
if ( (framebufferRGBA6665 != NULL) || (framebufferRGBA5551 != NULL) )
{
const FragmentColor *__restrict mappedBufferPtr = (FragmentColor *__restrict)glMapBuffer(GL_PIXEL_PACK_BUFFER_ARB, GL_READ_ONLY);
if (mappedBufferPtr != NULL)
if(!BEGINGL())
{
this->FlushFramebuffer(mappedBufferPtr, GPU->GetEngineMain()->Get3DFramebufferRGBA6665(), GPU->GetEngineMain()->Get3DFramebufferRGBA5551());
glUnmapBuffer(GL_PIXEL_PACK_BUFFER_ARB);
GPU->GetEventHandler()->DidRender3DEnd();
return OGLERROR_BEGINGL_FAILED;
}
if (this->isPBOSupported)
{
const FragmentColor *__restrict mappedBufferPtr = (FragmentColor *__restrict)glMapBuffer(GL_PIXEL_PACK_BUFFER_ARB, GL_READ_ONLY);
if (mappedBufferPtr != NULL)
{
this->FlushFramebuffer(mappedBufferPtr, framebufferRGBA6665, framebufferRGBA5551);
glUnmapBuffer(GL_PIXEL_PACK_BUFFER_ARB);
}
}
else
{
glReadPixels(0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_BGRA, GL_UNSIGNED_BYTE, this->_framebufferColor);
this->FlushFramebuffer(this->_framebufferColor, framebufferRGBA6665, framebufferRGBA5551);
}
ENDGL();
}
else
{
glReadPixels(0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_BGRA, GL_UNSIGNED_BYTE, this->_framebufferColor);
this->FlushFramebuffer(this->_framebufferColor, GPU->GetEngineMain()->Get3DFramebufferRGBA6665(), GPU->GetEngineMain()->Get3DFramebufferRGBA5551());
}
ENDGL();
this->_pixelReadNeedsFinish = false;
@ -4107,21 +4202,27 @@ Render3DError OpenGLRenderer_2_1::RenderFinish()
return OGLERROR_NOERR;
}
if(!BEGINGL())
{
GPU->GetEventHandler()->DidRender3DEnd();
return OGLERROR_BEGINGL_FAILED;
}
FragmentColor *framebufferRGBA6665 = (this->_willFlushFramebufferRGBA6665) ? GPU->GetEngineMain()->Get3DFramebufferRGBA6665() : NULL;
u16 *framebufferRGBA5551 = (this->_willFlushFramebufferRGBA5551) ? GPU->GetEngineMain()->Get3DFramebufferRGBA5551() : NULL;
const FragmentColor *__restrict mappedBufferPtr = (FragmentColor *__restrict)glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY);
if (mappedBufferPtr != NULL)
if ( (framebufferRGBA6665 != NULL) || (framebufferRGBA5551 != NULL) )
{
this->FlushFramebuffer(mappedBufferPtr, GPU->GetEngineMain()->Get3DFramebufferRGBA6665(), GPU->GetEngineMain()->Get3DFramebufferRGBA5551());
glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
if(!BEGINGL())
{
GPU->GetEventHandler()->DidRender3DEnd();
return OGLERROR_BEGINGL_FAILED;
}
const FragmentColor *__restrict mappedBufferPtr = (FragmentColor *__restrict)glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY);
if (mappedBufferPtr != NULL)
{
this->FlushFramebuffer(mappedBufferPtr, framebufferRGBA6665, framebufferRGBA5551);
glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
}
ENDGL();
}
ENDGL();
this->_pixelReadNeedsFinish = false;
GPU->GetEventHandler()->DidRender3DEnd();

View File

@ -1961,7 +1961,9 @@ Render3DError SoftRasterizerRenderer::EndRender(const u64 frameCount)
this->RenderEdgeMarkingAndFog(this->postprocessParam[0]);
}
this->FlushFramebuffer(GPU->GetEngineMain()->Get3DFramebufferRGBA6665(), GPU->GetEngineMain()->Get3DFramebufferRGBA5551());
FragmentColor *framebufferRGBA6665 = (this->_willFlushFramebufferRGBA6665) ? GPU->GetEngineMain()->Get3DFramebufferRGBA6665() : NULL;
u16 *framebufferRGBA5551 = (this->_willFlushFramebufferRGBA5551) ? GPU->GetEngineMain()->Get3DFramebufferRGBA5551() : NULL;
this->FlushFramebuffer(framebufferRGBA6665, framebufferRGBA5551);
}
return RENDER3DERROR_NOERR;
@ -2005,7 +2007,9 @@ Render3DError SoftRasterizerRenderer::RenderFinish()
}
}
this->FlushFramebuffer(GPU->GetEngineMain()->Get3DFramebufferRGBA6665(), GPU->GetEngineMain()->Get3DFramebufferRGBA5551());
FragmentColor *framebufferRGBA6665 = (this->_willFlushFramebufferRGBA6665) ? GPU->GetEngineMain()->Get3DFramebufferRGBA6665() : NULL;
u16 *framebufferRGBA5551 = (this->_willFlushFramebufferRGBA5551) ? GPU->GetEngineMain()->Get3DFramebufferRGBA5551() : NULL;
this->FlushFramebuffer(framebufferRGBA6665, framebufferRGBA5551);
GPU->GetEventHandler()->DidRender3DEnd();
return RENDER3DERROR_NOERR;

View File

@ -228,6 +228,9 @@ Render3D::Render3D()
_framebufferColorSizeBytes = 0;
_framebufferColor = NULL;
_willFlushFramebufferRGBA6665 = true;
_willFlushFramebufferRGBA5551 = true;
Reset();
}
@ -282,6 +285,18 @@ Render3DError Render3D::SetFramebufferSize(size_t w, size_t h)
return RENDER3DERROR_NOERR;
}
void Render3D::GetFramebufferFlushStates(bool &willFlushRGBA6665, bool &willFlushRGBA5551)
{
willFlushRGBA6665 = this->_willFlushFramebufferRGBA6665;
willFlushRGBA5551 = this->_willFlushFramebufferRGBA5551;
}
void Render3D::SetFramebufferFlushStates(bool willFlushRGBA6665, bool willFlushRGBA5551)
{
this->_willFlushFramebufferRGBA6665 = willFlushRGBA6665;
this->_willFlushFramebufferRGBA5551 = willFlushRGBA5551;
}
Render3DError Render3D::BeginRender(const GFX3D &engine)
{
return RENDER3DERROR_NOERR;
@ -309,12 +324,17 @@ Render3DError Render3D::EndRender(const u64 frameCount)
Render3DError Render3D::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
{
memcpy(dstRGBA6665, this->_framebufferColor, this->_framebufferColorSizeBytes);
// Convert to RGBA5551
for (size_t i = 0; i < (this->_framebufferWidth * this->_framebufferHeight); i++)
if (dstRGBA6665 != NULL)
{
dstRGBA5551[i] = R6G6B6TORGB15(this->_framebufferColor[i].r, this->_framebufferColor[i].g, this->_framebufferColor[i].b) | ((this->_framebufferColor[i].a == 0) ? 0x0000 : 0x8000);
memcpy(dstRGBA6665, this->_framebufferColor, this->_framebufferColorSizeBytes);
}
if (dstRGBA5551 != NULL)
{
for (size_t i = 0; i < (this->_framebufferWidth * this->_framebufferHeight); i++)
{
dstRGBA5551[i] = R6G6B6TORGB15(this->_framebufferColor[i].r, this->_framebufferColor[i].g, this->_framebufferColor[i].b) | ((this->_framebufferColor[i].a == 0) ? 0x0000 : 0x8000);
}
}
return RENDER3DERROR_NOERR;
@ -453,6 +473,9 @@ Render3DError Render3D::Reset()
memset(this->clearImagePolyIDBuffer, 0, sizeof(this->clearImagePolyIDBuffer));
memset(this->clearImageFogBuffer, 0, sizeof(this->clearImageFogBuffer));
this->_willFlushFramebufferRGBA6665 = true;
this->_willFlushFramebufferRGBA5551 = true;
TexCache_Reset();
return RENDER3DERROR_NOERR;
@ -505,59 +528,118 @@ Render3DError Render3D::VramReconfigureSignal()
Render3DError Render3D_SSE2::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
{
if ( (dstRGBA6665 == NULL) && (dstRGBA5551 == NULL) )
{
return RENDER3DERROR_NOERR;
}
const __m128i zero_vec128 = _mm_setzero_si128();
size_t i = 0;
const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight;
const size_t ssePixCount = pixCount - (pixCount % 4);
for (; i < ssePixCount; i += 4)
if ( (dstRGBA6665 != NULL) && (dstRGBA5551 != NULL) )
{
// Copy the framebufferColor buffer
__m128i color = _mm_load_si128((__m128i *)(this->_framebufferColor + i));
_mm_store_si128((__m128i *)(dstRGBA6665 + i), color);
for (; i < ssePixCount; i += 4)
{
// Copy the framebufferColor buffer
__m128i color = _mm_load_si128((__m128i *)(this->_framebufferColor + i));
_mm_store_si128((__m128i *)(dstRGBA6665 + i), color);
// Convert to RGBA5551
__m128i r = _mm_and_si128(color, _mm_set1_epi32(0x0000003E)); // Read from R
r = _mm_srli_epi32(r, 1); // Shift to R
__m128i g = _mm_and_si128(color, _mm_set1_epi32(0x00003E00)); // Read from G
g = _mm_srli_epi32(g, 4); // Shift in G
__m128i b = _mm_and_si128(color, _mm_set1_epi32(0x003E0000)); // Read from B
b = _mm_srli_epi32(b, 7); // Shift to B
__m128i a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A
a = _mm_cmpeq_epi32(a, zero_vec128); // Determine A
// From here on, we're going to do an SSE2 trick to pack 32-bit down to unsigned
// 16-bit. Since SSE2 only has packssdw (signed saturated 16-bit pack), using
// packssdw on the alpha bit (0x8000) will result in a value of 0x7FFF, which is
// incorrect. Now if we were to use SSE4.1's packusdw (unsigned saturated 16-bit
// pack), we wouldn't have to go through this hassle. But not everyone has an
// SSE4.1-capable CPU, so doing this the SSE2 way is more guaranteed to work for
// everyone's CPU.
//
// To use packssdw, we take a bit one position lower for the alpha bit, run
// packssdw, then shift the bit back to its original position. Then we por the
// alpha vector with the post-packed color vector to get the final color.
a = _mm_andnot_si128(a, _mm_set1_epi32(0x00004000)); // Mask out the bit before A
a = _mm_packs_epi32(a, zero_vec128); // Pack 32-bit down to 16-bit
a = _mm_slli_epi16(a, 1); // Shift the A bit back to where it needs to be
// Assemble the RGB colors, pack the 32-bit color into a signed 16-bit color, then por the alpha bit back in.
color = _mm_or_si128(_mm_or_si128(r, g), b);
color = _mm_packs_epi32(color, zero_vec128);
color = _mm_or_si128(color, a);
_mm_storel_epi64((__m128i *)(dstRGBA5551 + i), color);
}
// Convert to RGBA5551
__m128i r = _mm_and_si128(color, _mm_set1_epi32(0x0000003E)); // Read from R
r = _mm_srli_epi32(r, 1); // Shift to R
__m128i g = _mm_and_si128(color, _mm_set1_epi32(0x00003E00)); // Read from G
g = _mm_srli_epi32(g, 4); // Shift in G
__m128i b = _mm_and_si128(color, _mm_set1_epi32(0x003E0000)); // Read from B
b = _mm_srli_epi32(b, 7); // Shift to B
__m128i a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A
a = _mm_cmpeq_epi32(a, zero_vec128); // Determine A
// From here on, we're going to do an SSE2 trick to pack 32-bit down to unsigned
// 16-bit. Since SSE2 only has packssdw (signed saturated 16-bit pack), using
// packssdw on the alpha bit (0x8000) will result in a value of 0x7FFF, which is
// incorrect. Now if we were to use SSE4.1's packusdw (unsigned saturated 16-bit
// pack), we wouldn't have to go through this hassle. But not everyone has an
// SSE4.1-capable CPU, so doing this the SSE2 way is more guaranteed to work for
// everyone's CPU.
//
// To use packssdw, we take a bit one position lower for the alpha bit, run
// packssdw, then shift the bit back to its original position. Then we por the
// alpha vector with the post-packed color vector to get the final color.
a = _mm_andnot_si128(a, _mm_set1_epi32(0x00004000)); // Mask out the bit before A
a = _mm_packs_epi32(a, zero_vec128); // Pack 32-bit down to 16-bit
a = _mm_slli_epi16(a, 1); // Shift the A bit back to where it needs to be
// Assemble the RGB colors, pack the 32-bit color into a signed 16-bit color, then por the alpha bit back in.
color = _mm_or_si128(_mm_or_si128(r, g), b);
color = _mm_packs_epi32(color, zero_vec128);
color = _mm_or_si128(color, a);
_mm_storel_epi64((__m128i *)(dstRGBA5551 + i), color);
for (; i < pixCount; i++)
{
dstRGBA6665[i] = this->_framebufferColor[i];
dstRGBA5551[i] = R6G6B6TORGB15(this->_framebufferColor[i].r, this->_framebufferColor[i].g, this->_framebufferColor[i].b) | ((this->_framebufferColor[i].a == 0) ? 0x0000 : 0x8000);
}
}
for (; i < pixCount; i++)
else if (dstRGBA6665 != NULL)
{
dstRGBA6665[i] = this->_framebufferColor[i];
dstRGBA5551[i] = R6G6B6TORGB15(this->_framebufferColor[i].r, this->_framebufferColor[i].g, this->_framebufferColor[i].b) | ((this->_framebufferColor[i].a == 0) ? 0x0000 : 0x8000);
memcpy(dstRGBA6665, this->_framebufferColor, this->_framebufferColorSizeBytes);
}
else
{
for (; i < ssePixCount; i += 4)
{
// Convert to RGBA5551
__m128i color = _mm_load_si128((__m128i *)(this->_framebufferColor + i));
__m128i r = _mm_and_si128(color, _mm_set1_epi32(0x0000003E)); // Read from R
r = _mm_srli_epi32(r, 1); // Shift to R
__m128i g = _mm_and_si128(color, _mm_set1_epi32(0x00003E00)); // Read from G
g = _mm_srli_epi32(g, 4); // Shift in G
__m128i b = _mm_and_si128(color, _mm_set1_epi32(0x003E0000)); // Read from B
b = _mm_srli_epi32(b, 7); // Shift to B
__m128i a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A
a = _mm_cmpeq_epi32(a, zero_vec128); // Determine A
// From here on, we're going to do an SSE2 trick to pack 32-bit down to unsigned
// 16-bit. Since SSE2 only has packssdw (signed saturated 16-bit pack), using
// packssdw on the alpha bit (0x8000) will result in a value of 0x7FFF, which is
// incorrect. Now if we were to use SSE4.1's packusdw (unsigned saturated 16-bit
// pack), we wouldn't have to go through this hassle. But not everyone has an
// SSE4.1-capable CPU, so doing this the SSE2 way is more guaranteed to work for
// everyone's CPU.
//
// To use packssdw, we take a bit one position lower for the alpha bit, run
// packssdw, then shift the bit back to its original position. Then we por the
// alpha vector with the post-packed color vector to get the final color.
a = _mm_andnot_si128(a, _mm_set1_epi32(0x00004000)); // Mask out the bit before A
a = _mm_packs_epi32(a, zero_vec128); // Pack 32-bit down to 16-bit
a = _mm_slli_epi16(a, 1); // Shift the A bit back to where it needs to be
// Assemble the RGB colors, pack the 32-bit color into a signed 16-bit color, then por the alpha bit back in.
color = _mm_or_si128(_mm_or_si128(r, g), b);
color = _mm_packs_epi32(color, zero_vec128);
color = _mm_or_si128(color, a);
_mm_storel_epi64((__m128i *)(dstRGBA5551 + i), color);
}
for (; i < pixCount; i++)
{
dstRGBA5551[i] = R6G6B6TORGB15(this->_framebufferColor[i].r, this->_framebufferColor[i].g, this->_framebufferColor[i].b) | ((this->_framebufferColor[i].a == 0) ? 0x0000 : 0x8000);
}
}
return RENDER3DERROR_NOERR;

View File

@ -109,6 +109,9 @@ protected:
size_t _framebufferColorSizeBytes;
FragmentColor *_framebufferColor;
// Per-format flush selectors. Both are set to true by the constructor and by Reset().
// When a flag is false, the renderers pass NULL for that destination so the flush skips it.
bool _willFlushFramebufferRGBA6665; // Governs the RGBA6665 (FragmentColor) destination.
bool _willFlushFramebufferRGBA5551; // Governs the RGBA5551 (u16) destination.
CACHE_ALIGN u16 clearImageColor16Buffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
CACHE_ALIGN u32 clearImageDepthBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
CACHE_ALIGN u8 clearImageFogBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
@ -155,6 +158,9 @@ public:
virtual Render3DError VramReconfigureSignal(); // Called when the emulator reconfigures its VRAM. You may need to invalidate your texture cache.
virtual Render3DError SetFramebufferSize(size_t w, size_t h); // Called whenever the output framebuffer size changes.
virtual void GetFramebufferFlushStates(bool &willFlushRGBA6665, bool &willFlushRGBA5551); // Reads the current flush flags back into the two output references.
virtual void SetFramebufferFlushStates(bool willFlushRGBA6665, bool willFlushRGBA5551); // Records which output framebuffers (RGBA6665 and/or RGBA5551) should be flushed.
};
#ifdef ENABLE_SSE2