Render3D:
- SSSE3-specific optimizations now only require SSE2. - Better optimize clear image operations.
This commit is contained in:
parent
611d0c9036
commit
fbda969347
|
@ -935,8 +935,7 @@ Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA
|
|||
a = _mm_cmpgt_epi32(a, _mm_set1_epi32(0x00000000)); // Determine A
|
||||
a = _mm_and_si128(a, _mm_set1_epi32(0x00008000)); // Mask to A
|
||||
|
||||
color = b;
|
||||
color = _mm_or_si128(color, g);
|
||||
color = _mm_or_si128(b, g);
|
||||
color = _mm_or_si128(color, r);
|
||||
color = _mm_or_si128(color, a);
|
||||
|
||||
|
@ -2393,22 +2392,22 @@ Render3DError OpenGLRenderer_1_2::ClearUsingImage(const u16 *__restrict colorBuf
|
|||
// Blit the working depth buffer
|
||||
glReadBuffer(GL_COLOR_ATTACHMENT1_EXT);
|
||||
glDrawBuffer(GL_COLOR_ATTACHMENT1_EXT);
|
||||
glBlitFramebufferEXT(0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST);
|
||||
glBlitFramebufferEXT(0, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GPU_FRAMEBUFFER_NATIVE_WIDTH, 0, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST);
|
||||
|
||||
// Blit the polygon ID buffer
|
||||
glReadBuffer(GL_COLOR_ATTACHMENT2_EXT);
|
||||
glDrawBuffer(GL_COLOR_ATTACHMENT2_EXT);
|
||||
glBlitFramebufferEXT(0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST);
|
||||
glBlitFramebufferEXT(0, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GPU_FRAMEBUFFER_NATIVE_WIDTH, 0, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST);
|
||||
|
||||
// Blit the fog buffer
|
||||
glReadBuffer(GL_COLOR_ATTACHMENT3_EXT);
|
||||
glDrawBuffer(GL_COLOR_ATTACHMENT3_EXT);
|
||||
glBlitFramebufferEXT(0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST);
|
||||
glBlitFramebufferEXT(0, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GPU_FRAMEBUFFER_NATIVE_WIDTH, 0, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST);
|
||||
|
||||
// Blit the color buffer. Do this last so that color attachment 0 is set to the read FBO.
|
||||
glReadBuffer(GL_COLOR_ATTACHMENT0_EXT);
|
||||
glDrawBuffer(GL_COLOR_ATTACHMENT0_EXT);
|
||||
glBlitFramebufferEXT(0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT, GL_NEAREST);
|
||||
glBlitFramebufferEXT(0, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GPU_FRAMEBUFFER_NATIVE_WIDTH, 0, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT, GL_NEAREST);
|
||||
|
||||
glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, OGLRef.fboRenderID);
|
||||
glDrawBuffers(4, RenderDrawList);
|
||||
|
|
|
@ -1345,22 +1345,22 @@ Render3DError OpenGLRenderer_3_2::ClearUsingImage(const u16 *__restrict colorBuf
|
|||
// Blit the working depth buffer
|
||||
glReadBuffer(GL_COLOR_ATTACHMENT1);
|
||||
glDrawBuffer(GL_COLOR_ATTACHMENT1);
|
||||
glBlitFramebuffer(0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST);
|
||||
glBlitFramebuffer(0, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GPU_FRAMEBUFFER_NATIVE_WIDTH, 0, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST);
|
||||
|
||||
// Blit the polygon ID buffer
|
||||
glReadBuffer(GL_COLOR_ATTACHMENT2);
|
||||
glDrawBuffer(GL_COLOR_ATTACHMENT2);
|
||||
glBlitFramebuffer(0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST);
|
||||
glBlitFramebuffer(0, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GPU_FRAMEBUFFER_NATIVE_WIDTH, 0, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST);
|
||||
|
||||
// Blit the fog buffer
|
||||
glReadBuffer(GL_COLOR_ATTACHMENT3);
|
||||
glDrawBuffer(GL_COLOR_ATTACHMENT3);
|
||||
glBlitFramebuffer(0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST);
|
||||
glBlitFramebuffer(0, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GPU_FRAMEBUFFER_NATIVE_WIDTH, 0, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST);
|
||||
|
||||
// Blit the color buffer. Do this last so that color attachment 0 is set to the read FBO.
|
||||
glReadBuffer(GL_COLOR_ATTACHMENT0);
|
||||
glDrawBuffer(GL_COLOR_ATTACHMENT0);
|
||||
glBlitFramebuffer(0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT, GL_NEAREST);
|
||||
glBlitFramebuffer(0, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GPU_FRAMEBUFFER_NATIVE_WIDTH, 0, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT, GL_NEAREST);
|
||||
|
||||
glBindFramebuffer(GL_FRAMEBUFFER, OGLRef.fboRenderID);
|
||||
glDrawBuffers(4, RenderDrawList);
|
||||
|
|
|
@ -1794,14 +1794,10 @@ Render3DError SoftRasterizerRenderer::RenderEdgeMarkingAndFog(const SoftRasteriz
|
|||
// - the character edges in-level are clearly transparent, and also show well through shield powerups.
|
||||
|
||||
FragmentColor edgeColor = this->edgeMarkTable[polyID>>3];
|
||||
bool upleft = false;
|
||||
bool up = false;
|
||||
bool upright = false;
|
||||
bool left = false;
|
||||
bool right = false;
|
||||
bool downleft = false;
|
||||
bool down = false;
|
||||
bool downright = false;
|
||||
|
||||
#define PIXOFFSET(dx,dy) ((dx)+(this->_framebufferWidth*(dy)))
|
||||
#define ISEDGE(dx,dy) ((x+(dx) < this->_framebufferWidth) && (y+(dy) < this->_framebufferHeight) && polyID != this->_framebufferAttributes->opaquePolyID[i+PIXOFFSET(dx,dy)] && depth >= this->_framebufferAttributes->depth[i+PIXOFFSET(dx,dy)])
|
||||
|
@ -1889,27 +1885,25 @@ Render3DError SoftRasterizerRenderer::UpdateToonTable(const u16 *toonTableBuffer
|
|||
|
||||
Render3DError SoftRasterizerRenderer::ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer)
|
||||
{
|
||||
const float lineDecrement = ((float)GPU_FRAMEBUFFER_NATIVE_HEIGHT / (float)this->_framebufferHeight) + 0.000001;
|
||||
const float readIncrement = ((float)GPU_FRAMEBUFFER_NATIVE_WIDTH / (float)this->_framebufferWidth) + 0.000001;
|
||||
float line = GPU_FRAMEBUFFER_NATIVE_HEIGHT - 1.0 + lineDecrement;
|
||||
float readLocation = (GPU_FRAMEBUFFER_NATIVE_HEIGHT - 1) * GPU_FRAMEBUFFER_NATIVE_WIDTH;
|
||||
const size_t xRatio = (size_t)((GPU_FRAMEBUFFER_NATIVE_WIDTH << 16) / this->_framebufferWidth) + 1;
|
||||
const size_t yRatio = (size_t)((GPU_FRAMEBUFFER_NATIVE_HEIGHT << 16) / this->_framebufferHeight) + 1;
|
||||
|
||||
// The clear image buffer is y-flipped, so we need to flip it back to normal here.
|
||||
for (size_t y = 0, iw = 0; y < this->_framebufferHeight; y++, readLocation = ((size_t)line * GPU_FRAMEBUFFER_NATIVE_WIDTH))
|
||||
for (size_t y = 0, iw = 0; y < this->_framebufferHeight; y++)
|
||||
{
|
||||
for (size_t x = 0; x < this->_framebufferWidth; x++, iw++, readLocation += readIncrement)
|
||||
const size_t readLine = (size_t)(((y * yRatio) >> 16) * GPU_FRAMEBUFFER_NATIVE_WIDTH);
|
||||
|
||||
for (size_t x = 0; x < this->_framebufferWidth; x++, iw++)
|
||||
{
|
||||
const size_t ir = (size_t)readLocation;
|
||||
const size_t ir = readLine + ((x * xRatio) >> 16);
|
||||
|
||||
this->_framebufferColor[iw].color = RGB15TO6665(colorBuffer[ir] & 0x7FFF, (colorBuffer[ir] >> 15) * 0x1F);
|
||||
this->_framebufferAttributes->isFogged[iw] = fogBuffer[ir];
|
||||
this->_framebufferAttributes->depth[iw] = depthBuffer[ir];
|
||||
this->_framebufferAttributes->isFogged[iw] = fogBuffer[ir];
|
||||
this->_framebufferAttributes->opaquePolyID[iw] = polyIDBuffer[ir];
|
||||
this->_framebufferAttributes->translucentPolyID[iw] = kUnsetTranslucentPolyID;
|
||||
this->_framebufferAttributes->isTranslucentPoly[iw] = 0;
|
||||
this->_framebufferAttributes->stencil[iw] = 0;
|
||||
}
|
||||
|
||||
line -= lineDecrement;
|
||||
}
|
||||
|
||||
return RENDER3DERROR_NOERR;
|
||||
|
|
|
@ -345,13 +345,11 @@ Render3DError Render3D::ClearFramebuffer(const GFX3D_State &renderState)
|
|||
const u8 xScroll = scrollBits & 0xFF;
|
||||
const u8 yScroll = (scrollBits >> 8) & 0xFF;
|
||||
|
||||
size_t dstIndex = (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT) - GPU_FRAMEBUFFER_NATIVE_WIDTH;
|
||||
|
||||
for (size_t iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
|
||||
for (size_t dstIndex = 0, iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
|
||||
{
|
||||
const size_t y = ((iy + yScroll) & 0xFF) << 8;
|
||||
|
||||
for (size_t ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; ix++)
|
||||
for (size_t ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; dstIndex++, ix++)
|
||||
{
|
||||
const size_t x = (ix + xScroll) & 0xFF;
|
||||
const size_t srcIndex = y | x;
|
||||
|
@ -367,11 +365,7 @@ Render3DError Render3D::ClearFramebuffer(const GFX3D_State &renderState)
|
|||
|
||||
this->clearImageFogBuffer[dstIndex] = BIT15(clearDepthBuffer[srcIndex]);
|
||||
this->clearImagePolyIDBuffer[dstIndex] = clearFragment.opaquePolyID;
|
||||
|
||||
dstIndex++;
|
||||
}
|
||||
|
||||
dstIndex -= GPU_FRAMEBUFFER_NATIVE_WIDTH * 2;
|
||||
}
|
||||
|
||||
error = this->ClearUsingImage(this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->clearImagePolyIDBuffer);
|
||||
|
@ -474,6 +468,65 @@ Render3DError Render3D::VramReconfigureSignal()
|
|||
|
||||
#ifdef ENABLE_SSE2
|
||||
|
||||
Render3DError Render3D_SSE2::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
|
||||
{
|
||||
static const __m128i zeroColor = _mm_set1_epi32(0);
|
||||
const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight;
|
||||
const size_t ssePixCount = pixCount - (pixCount % 4);
|
||||
|
||||
for (size_t i = 0; i < ssePixCount; i += 4)
|
||||
{
|
||||
// Copy the framebufferColor buffer
|
||||
__m128i color = _mm_load_si128((__m128i *)(this->_framebufferColor + i));
|
||||
_mm_store_si128((__m128i *)(dstRGBA6665 + i), color);
|
||||
|
||||
// Convert to RGBA5551
|
||||
__m128i r = _mm_and_si128(color, _mm_set1_epi32(0x0000003E)); // Read from R
|
||||
r = _mm_srli_epi32(r, 1); // Shift to R
|
||||
|
||||
__m128i g = _mm_and_si128(color, _mm_set1_epi32(0x00003E00)); // Read from G
|
||||
g = _mm_srli_epi32(g, 4); // Shift in G
|
||||
|
||||
__m128i b = _mm_and_si128(color, _mm_set1_epi32(0x003E0000)); // Read from B
|
||||
b = _mm_srli_epi32(b, 7); // Shift to B
|
||||
|
||||
__m128i a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A
|
||||
a = _mm_cmpgt_epi32(a, _mm_set1_epi32(0x00000000)); // Determine A
|
||||
|
||||
// From here on, we're going to do an SSE2 trick to pack 32-bit down to unsigned
|
||||
// 16-bit. Since SSE2 only has packssdw (signed 16-bit pack), then the alpha bit
|
||||
// may be undefined. Now if we were using SSE4.1's packusdw (unsigned 16-bit pack),
|
||||
// we wouldn't have to go through this hassle. But not everyone has an SSE4.1-capable
|
||||
// CPU, so doing this the SSE2 way is more guaranteed to work an everyone's CPU.
|
||||
//
|
||||
// To use packssdw, we take a bit one position lower for the alpha bit, run
|
||||
// packssdw, then shift the bit back to its original position. Then we por the
|
||||
// alpha vector with the post-packed color vector to get the final color.
|
||||
|
||||
a = _mm_and_si128(a, _mm_set1_epi32(0x00004000)); // Mask out the bit before A
|
||||
a = _mm_packs_epi32(a, zeroColor); // Pack 32-bit down to 16-bit
|
||||
a = _mm_slli_epi16(a, 1); // Shift the A bit back to where it needs to be
|
||||
|
||||
// Assemble the RGB colors
|
||||
color = _mm_or_si128(r, g);
|
||||
color = _mm_or_si128(color, b);
|
||||
|
||||
// Pack the 32-bit color into a signed 16-bit color, then por the alpha bit back in.
|
||||
color = _mm_packs_epi32(color, zeroColor);
|
||||
color = _mm_or_si128(color, a);
|
||||
|
||||
_mm_storel_epi64((__m128i *)(dstRGBA5551 + i), color);
|
||||
}
|
||||
|
||||
for (size_t i = ssePixCount; i < pixCount; i++)
|
||||
{
|
||||
dstRGBA6665[i] = this->_framebufferColor[i];
|
||||
dstRGBA5551[i] = R6G6B6TORGB15(this->_framebufferColor[i].r, this->_framebufferColor[i].g, this->_framebufferColor[i].b) | ((this->_framebufferColor[i].a == 0) ? 0x0000 : 0x8000);
|
||||
}
|
||||
|
||||
return RENDER3DERROR_NOERR;
|
||||
}
|
||||
|
||||
Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)
|
||||
{
|
||||
Render3DError error = RENDER3DERROR_NOERR;
|
||||
|
@ -504,210 +557,65 @@ Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)
|
|||
const u16 scrollBits = T1ReadWord(MMU.ARM9_REG, 0x356); //CLRIMAGE_OFFSET
|
||||
const u8 xScroll = scrollBits & 0xFF;
|
||||
const u8 yScroll = (scrollBits >> 8) & 0xFF;
|
||||
|
||||
size_t dstIndex = (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT) - GPU_FRAMEBUFFER_NATIVE_WIDTH;
|
||||
|
||||
static const __m128i addrOffset = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
|
||||
static const __m128i addrRolloverMask = _mm_set1_epi16(0x00FF);
|
||||
const __m128i opaquePolyID_vec128 = _mm_set1_epi8(clearFragment.opaquePolyID);
|
||||
|
||||
for (size_t iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
|
||||
{
|
||||
const size_t y = ((iy + yScroll) & 0xFF) << 8;
|
||||
__m128i y_vec128 = _mm_set1_epi16(y);
|
||||
|
||||
for (size_t ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; ix += 8)
|
||||
{
|
||||
__m128i addr_vec128 = _mm_set1_epi16(ix + xScroll);
|
||||
addr_vec128 = _mm_add_epi16(addr_vec128, addrOffset);
|
||||
addr_vec128 = _mm_and_si128(addr_vec128, addrRolloverMask);
|
||||
addr_vec128 = _mm_or_si128(addr_vec128, y_vec128);
|
||||
|
||||
this->clearImageColor16Buffer[dstIndex+7] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 7)];
|
||||
this->clearImageColor16Buffer[dstIndex+6] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 6)];
|
||||
this->clearImageColor16Buffer[dstIndex+5] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 5)];
|
||||
this->clearImageColor16Buffer[dstIndex+4] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 4)];
|
||||
this->clearImageColor16Buffer[dstIndex+3] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 3)];
|
||||
this->clearImageColor16Buffer[dstIndex+2] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 2)];
|
||||
this->clearImageColor16Buffer[dstIndex+1] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 1)];
|
||||
this->clearImageColor16Buffer[dstIndex+0] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 0)];
|
||||
|
||||
this->clearImageDepthBuffer[dstIndex+7] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 7)] & 0x7FFF];
|
||||
this->clearImageDepthBuffer[dstIndex+6] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 6)] & 0x7FFF];
|
||||
this->clearImageDepthBuffer[dstIndex+5] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 5)] & 0x7FFF];
|
||||
this->clearImageDepthBuffer[dstIndex+4] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 4)] & 0x7FFF];
|
||||
this->clearImageDepthBuffer[dstIndex+3] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 3)] & 0x7FFF];
|
||||
this->clearImageDepthBuffer[dstIndex+2] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 2)] & 0x7FFF];
|
||||
this->clearImageDepthBuffer[dstIndex+1] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 1)] & 0x7FFF];
|
||||
this->clearImageDepthBuffer[dstIndex+0] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 0)] & 0x7FFF];
|
||||
|
||||
this->clearImageFogBuffer[dstIndex+7] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 7)] );
|
||||
this->clearImageFogBuffer[dstIndex+6] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 6)] );
|
||||
this->clearImageFogBuffer[dstIndex+5] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 5)] );
|
||||
this->clearImageFogBuffer[dstIndex+4] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 4)] );
|
||||
this->clearImageFogBuffer[dstIndex+3] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 3)] );
|
||||
this->clearImageFogBuffer[dstIndex+2] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 2)] );
|
||||
this->clearImageFogBuffer[dstIndex+1] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 1)] );
|
||||
this->clearImageFogBuffer[dstIndex+0] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 0)] );
|
||||
|
||||
_mm_storel_epi64((__m128i *)(this->clearImagePolyIDBuffer + dstIndex), opaquePolyID_vec128);
|
||||
|
||||
dstIndex += 8;
|
||||
}
|
||||
|
||||
dstIndex -= GPU_FRAMEBUFFER_NATIVE_WIDTH * 2;
|
||||
}
|
||||
|
||||
error = this->ClearUsingImage(this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->clearImagePolyIDBuffer);
|
||||
if (error != RENDER3DERROR_NOERR)
|
||||
{
|
||||
error = this->ClearUsingValues(clearColor, clearFragment);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
error = this->ClearUsingValues(clearColor, clearFragment);
|
||||
}
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
#endif // ENABLE_SSE2
|
||||
|
||||
#ifdef ENABLE_SSSE3
|
||||
|
||||
Render3DError Render3D_SSSE3::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
|
||||
{
|
||||
// Convert to RGBA5551
|
||||
const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight;
|
||||
const size_t ssePixCount = pixCount - (pixCount % 4);
|
||||
|
||||
for (size_t i = 0; i < ssePixCount; i += 4)
|
||||
{
|
||||
__m128i color = _mm_load_si128((__m128i *)(this->_framebufferColor + i));
|
||||
_mm_store_si128((__m128i *)(dstRGBA6665 + i), color);
|
||||
|
||||
__m128i r = _mm_and_si128(color, _mm_set1_epi32(0x0000003E)); // Read from R
|
||||
r = _mm_srli_epi32(r, 1); // Shift to R
|
||||
|
||||
__m128i g = _mm_and_si128(color, _mm_set1_epi32(0x00003E00)); // Read from G
|
||||
g = _mm_srli_epi32(g, 4); // Shift in G
|
||||
|
||||
__m128i b = _mm_and_si128(color, _mm_set1_epi32(0x003E0000)); // Read from B
|
||||
b = _mm_srli_epi32(b, 7); // Shift to B
|
||||
|
||||
__m128i a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A
|
||||
a = _mm_cmpgt_epi32(a, _mm_set1_epi32(0x00000000)); // Determine A
|
||||
a = _mm_and_si128(a, _mm_set1_epi32(0x00008000)); // Mask to A
|
||||
|
||||
color = r;
|
||||
color = _mm_or_si128(color, g);
|
||||
color = _mm_or_si128(color, b);
|
||||
color = _mm_or_si128(color, a);
|
||||
|
||||
// All the colors are currently placed every other 16 bits, so we need to swizzle them
|
||||
// to the lower 64 bits of our vector before we store them back to memory.
|
||||
color = _mm_shuffle_epi8(color, _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
|
||||
_mm_storel_epi64((__m128i *)(dstRGBA5551 + i), color);
|
||||
}
|
||||
|
||||
for (size_t i = ssePixCount; i < pixCount; i++)
|
||||
{
|
||||
dstRGBA5551[i] = R6G6B6TORGB15(this->_framebufferColor[i].r, this->_framebufferColor[i].g, this->_framebufferColor[i].b) | ((this->_framebufferColor[i].a == 0) ? 0x0000 : 0x8000);
|
||||
}
|
||||
|
||||
return RENDER3DERROR_NOERR;
|
||||
}
|
||||
|
||||
Render3DError Render3D_SSSE3::ClearFramebuffer(const GFX3D_State &renderState)
|
||||
{
|
||||
Render3DError error = RENDER3DERROR_NOERR;
|
||||
|
||||
FragmentColor clearColor;
|
||||
clearColor.r = renderState.clearColor & 0x1F;
|
||||
clearColor.g = (renderState.clearColor >> 5) & 0x1F;
|
||||
clearColor.b = (renderState.clearColor >> 10) & 0x1F;
|
||||
clearColor.a = (renderState.clearColor >> 16) & 0x1F;
|
||||
|
||||
FragmentAttributes clearFragment;
|
||||
clearFragment.opaquePolyID = (renderState.clearColor >> 24) & 0x3F;
|
||||
//special value for uninitialized translucent polyid. without this, fires in spiderman2 dont display
|
||||
//I am not sure whether it is right, though. previously this was cleared to 0, as a guess,
|
||||
//but in spiderman2 some fires with polyid 0 try to render on top of the background
|
||||
clearFragment.translucentPolyID = kUnsetTranslucentPolyID;
|
||||
clearFragment.depth = renderState.clearDepth;
|
||||
clearFragment.stencil = 0;
|
||||
clearFragment.isTranslucentPoly = 0;
|
||||
clearFragment.isFogged = BIT15(renderState.clearColor);
|
||||
|
||||
if (renderState.enableClearImage)
|
||||
{
|
||||
//the lion, the witch, and the wardrobe (thats book 1, suck it you new-school numberers)
|
||||
//uses the scroll registers in the main game engine
|
||||
const u16 *__restrict clearColorBuffer = (u16 *__restrict)MMU.texInfo.textureSlotAddr[2];
|
||||
const u16 *__restrict clearDepthBuffer = (u16 *__restrict)MMU.texInfo.textureSlotAddr[3];
|
||||
const u16 scrollBits = T1ReadWord(MMU.ARM9_REG, 0x356); //CLRIMAGE_OFFSET
|
||||
const u8 xScroll = scrollBits & 0xFF;
|
||||
const u8 yScroll = (scrollBits >> 8) & 0xFF;
|
||||
|
||||
size_t dstIndex = (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT) - GPU_FRAMEBUFFER_NATIVE_WIDTH;
|
||||
|
||||
if (xScroll == 0 && yScroll == 0)
|
||||
{
|
||||
const __m128i opaquePolyID_vec128 = _mm_set1_epi8(clearFragment.opaquePolyID);
|
||||
|
||||
for (size_t iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
|
||||
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i += 16)
|
||||
{
|
||||
for (size_t ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; ix += 16)
|
||||
{
|
||||
static const __m128i depthBitMask_vec128 = _mm_set1_epi16(0x7FFF);
|
||||
static const __m128i fogBufferBitMask_vec128 = _mm_set1_epi16(BIT(15));
|
||||
const u16 addr = (iy << 8) | ix;
|
||||
|
||||
_mm_store_si128((__m128i *)(this->clearImageColor16Buffer + dstIndex + 8), *(__m128i *)(clearColorBuffer + addr + 8));
|
||||
_mm_store_si128((__m128i *)(this->clearImageColor16Buffer + dstIndex), *(__m128i *)(clearColorBuffer + addr));
|
||||
|
||||
__m128i clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + addr + 8));
|
||||
clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, depthBitMask_vec128);
|
||||
this->clearImageDepthBuffer[dstIndex+15] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 7)];
|
||||
this->clearImageDepthBuffer[dstIndex+14] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 6)];
|
||||
this->clearImageDepthBuffer[dstIndex+13] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 5)];
|
||||
this->clearImageDepthBuffer[dstIndex+12] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 4)];
|
||||
this->clearImageDepthBuffer[dstIndex+11] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 3)];
|
||||
this->clearImageDepthBuffer[dstIndex+10] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 2)];
|
||||
this->clearImageDepthBuffer[dstIndex+ 9] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 1)];
|
||||
this->clearImageDepthBuffer[dstIndex+ 8] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 0)];
|
||||
|
||||
clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + addr));
|
||||
clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, depthBitMask_vec128);
|
||||
this->clearImageDepthBuffer[dstIndex+ 7] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 7)];
|
||||
this->clearImageDepthBuffer[dstIndex+ 6] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 6)];
|
||||
this->clearImageDepthBuffer[dstIndex+ 5] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 5)];
|
||||
this->clearImageDepthBuffer[dstIndex+ 4] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 4)];
|
||||
this->clearImageDepthBuffer[dstIndex+ 3] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 3)];
|
||||
this->clearImageDepthBuffer[dstIndex+ 2] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 2)];
|
||||
this->clearImageDepthBuffer[dstIndex+ 1] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 1)];
|
||||
this->clearImageDepthBuffer[dstIndex+ 0] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 0)];
|
||||
|
||||
clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + addr + 8));
|
||||
clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, fogBufferBitMask_vec128);
|
||||
clearDepth_vec128 = _mm_srli_epi16(clearDepth_vec128, 15);
|
||||
|
||||
__m128i clearDepthFogBit_vec128 = _mm_shuffle_epi8(clearDepth_vec128, _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0));
|
||||
|
||||
clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + addr));
|
||||
clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, fogBufferBitMask_vec128);
|
||||
clearDepth_vec128 = _mm_srli_epi16(clearDepth_vec128, 15);
|
||||
|
||||
clearDepth_vec128 = _mm_shuffle_epi8(clearDepth_vec128, _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1));
|
||||
|
||||
_mm_store_si128((__m128i *)(this->clearImageFogBuffer + dstIndex), _mm_or_si128(clearDepth_vec128, clearDepthFogBit_vec128));
|
||||
_mm_store_si128((__m128i *)(this->clearImagePolyIDBuffer + dstIndex), opaquePolyID_vec128);
|
||||
|
||||
dstIndex += 16;
|
||||
}
|
||||
static const __m128i depthBitMask_vec128 = _mm_set1_epi16(0x7FFF);
|
||||
static const __m128i fogBufferBitMask_vec128 = _mm_set1_epi16(BIT(15));
|
||||
|
||||
dstIndex -= GPU_FRAMEBUFFER_NATIVE_WIDTH * 2;
|
||||
// Copy the colors to the color buffer. Since we can only copy 8 elements at once,
|
||||
// we need to load-store twice.
|
||||
_mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i + 8), _mm_load_si128((__m128i *)(clearColorBuffer + i + 8)) );
|
||||
_mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i), _mm_load_si128((__m128i *)(clearColorBuffer + i)) );
|
||||
|
||||
// Write the depth values to the depth buffer.
|
||||
__m128i clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i + 8));
|
||||
clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, depthBitMask_vec128);
|
||||
|
||||
__m128i depthValue_vec128 = _mm_set_epi32(dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 7)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 6)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 5)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 4)]);
|
||||
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i + 12), depthValue_vec128);
|
||||
|
||||
depthValue_vec128 = _mm_set_epi32(dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 3)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 2)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 1)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 0)]);
|
||||
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i + 8), depthValue_vec128);
|
||||
|
||||
clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i));
|
||||
clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, depthBitMask_vec128);
|
||||
|
||||
depthValue_vec128 = _mm_set_epi32(dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 7)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 6)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 5)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 4)]);
|
||||
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i + 4), depthValue_vec128);
|
||||
|
||||
depthValue_vec128 = _mm_set_epi32(dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 3)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 2)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 1)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 0)]);
|
||||
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i), depthValue_vec128);
|
||||
|
||||
// Write the fog flags to the fog flag buffer.
|
||||
clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i + 8)); // Read the upper values
|
||||
clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, fogBufferBitMask_vec128);
|
||||
const __m128i clearDepthFogBit_vec128 = _mm_srli_epi16(clearDepth_vec128, 15); // Save the upper bits in another register
|
||||
|
||||
clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i)); // Read the lower values
|
||||
clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, fogBufferBitMask_vec128);
|
||||
clearDepth_vec128 = _mm_srli_epi16(clearDepth_vec128, 15); // These are the lower bits
|
||||
|
||||
_mm_store_si128((__m128i *)(this->clearImageFogBuffer + i), _mm_packus_epi16(clearDepth_vec128, clearDepthFogBit_vec128));
|
||||
|
||||
// The one is easy. Just set the values in the polygon ID buffer.
|
||||
_mm_store_si128((__m128i *)(this->clearImagePolyIDBuffer + i), opaquePolyID_vec128);
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@ -716,12 +624,12 @@ Render3DError Render3D_SSSE3::ClearFramebuffer(const GFX3D_State &renderState)
|
|||
static const __m128i addrRolloverMask = _mm_set1_epi16(0x00FF);
|
||||
const __m128i opaquePolyID_vec128 = _mm_set1_epi8(clearFragment.opaquePolyID);
|
||||
|
||||
for (size_t iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
|
||||
for (size_t dstIndex = 0, iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
|
||||
{
|
||||
const size_t y = ((iy + yScroll) & 0xFF) << 8;
|
||||
__m128i y_vec128 = _mm_set1_epi16(y);
|
||||
|
||||
for (size_t ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; ix += 8)
|
||||
for (size_t ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; dstIndex += 8, ix += 8)
|
||||
{
|
||||
__m128i addr_vec128 = _mm_set1_epi16(ix + xScroll);
|
||||
addr_vec128 = _mm_add_epi16(addr_vec128, addrOffset);
|
||||
|
@ -756,11 +664,7 @@ Render3DError Render3D_SSSE3::ClearFramebuffer(const GFX3D_State &renderState)
|
|||
this->clearImageFogBuffer[dstIndex+0] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 0)] );
|
||||
|
||||
_mm_storel_epi64((__m128i *)(this->clearImagePolyIDBuffer + dstIndex), opaquePolyID_vec128);
|
||||
|
||||
dstIndex += 8;
|
||||
}
|
||||
|
||||
dstIndex -= GPU_FRAMEBUFFER_NATIVE_WIDTH * 2;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -778,4 +682,4 @@ Render3DError Render3D_SSSE3::ClearFramebuffer(const GFX3D_State &renderState)
|
|||
return error;
|
||||
}
|
||||
|
||||
#endif // ENABLE_SSSE3
|
||||
#endif // ENABLE_SSE2
|
||||
|
|
|
@ -158,16 +158,6 @@ public:
|
|||
#ifdef ENABLE_SSE2
|
||||
|
||||
class Render3D_SSE2 : public Render3D
|
||||
{
|
||||
public:
|
||||
virtual Render3DError ClearFramebuffer(const GFX3D_State &renderState);
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSSE3
|
||||
|
||||
class Render3D_SSSE3 : public Render3D_SSE2
|
||||
{
|
||||
protected:
|
||||
virtual Render3DError FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551);
|
||||
|
@ -178,4 +168,13 @@ public:
|
|||
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSSE3
|
||||
|
||||
class Render3D_SSSE3 : public Render3D_SSE2
|
||||
{
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#endif // RENDER3D_H
|
||||
|
|
Loading…
Reference in New Issue