- Do some code cleanup, especially with the SSE2-related code.
- Fix potential crash in the OpenGL renderer when changing the framebuffer size.
This commit is contained in:
rogerman 2015-07-07 22:16:34 +00:00
parent e53e289e77
commit 55516fc20e
6 changed files with 206 additions and 194 deletions

View File

@ -2206,7 +2206,7 @@ PLAIN_CLEAR:
if (gpu->LayersEnable[4])
{
//n.b. - this is clearing the sprite line buffer to the background color,
memset_u16(gpu->sprColor, backdrop_color, GPU_FRAMEBUFFER_NATIVE_WIDTH);
memset_u16_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>(gpu->sprColor, backdrop_color);
//zero 06-may-09: I properly supported window color effects for backdrop, but I am not sure
//how it interacts with this. I wish we knew why we needed this
@ -2531,14 +2531,14 @@ static INLINE void GPU_RenderLine_MasterBrightness(const GPUMasterBrightMode mod
{
if (factor < 16)
{
#ifdef ENABLE_SSE2
static size_t ssePixCount = pixCount - (pixCount % 4);
static const __m128i colorMask = _mm_set1_epi16(0x7FFF);
size_t i = 0;
for (size_t i = 0; i < ssePixCount; i += 8)
#ifdef ENABLE_SSE2
const size_t ssePixCount = pixCount - (pixCount % 8);
for (; i < ssePixCount; i += 8)
{
__m128i dstColor_vec128 = _mm_load_si128((__m128i *)(dstLine + i));
dstColor_vec128 = _mm_and_si128(dstColor_vec128, colorMask);
dstColor_vec128 = _mm_and_si128(dstColor_vec128, _mm_set1_epi16(0x7FFF));
dstLine[i+7] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 7) ];
dstLine[i+6] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 6) ];
@ -2549,17 +2549,11 @@ static INLINE void GPU_RenderLine_MasterBrightness(const GPUMasterBrightMode mod
dstLine[i+1] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 1) ];
dstLine[i+0] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 0) ];
}
for (size_t i = ssePixCount; i < pixCount; i++)
{
dstLine[i] = fadeInColors[factor][ dstLine[i] & 0x7FFF ];
}
#else
for (size_t i = 0; i < pixCount; i++)
{
dstLine[i] = fadeInColors[factor][ dstLine[i] & 0x7FFF ];
}
#endif
for (; i < pixCount; i++)
{
dstLine[i] = fadeInColors[factor][ dstLine[i] & 0x7FFF ];
}
}
else
{
@ -2573,14 +2567,14 @@ static INLINE void GPU_RenderLine_MasterBrightness(const GPUMasterBrightMode mod
{
if (factor < 16)
{
#ifdef ENABLE_SSE2
static size_t ssePixCount = pixCount - (pixCount % 4);
static const __m128i colorMask = _mm_set1_epi16(0x7FFF);
size_t i = 0;
for (size_t i = 0; i < ssePixCount; i += 8)
#ifdef ENABLE_SSE2
const size_t ssePixCount = pixCount - (pixCount % 8);
for (; i < ssePixCount; i += 8)
{
__m128i dstColor_vec128 = _mm_load_si128((__m128i *)(dstLine + i));
dstColor_vec128 = _mm_and_si128(dstColor_vec128, colorMask);
dstColor_vec128 = _mm_and_si128(dstColor_vec128, _mm_set1_epi16(0x7FFF));
dstLine[i+7] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 7) ];
dstLine[i+6] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 6) ];
@ -2591,17 +2585,11 @@ static INLINE void GPU_RenderLine_MasterBrightness(const GPUMasterBrightMode mod
dstLine[i+1] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 1) ];
dstLine[i+0] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 0) ];
}
for (size_t i = ssePixCount; i < pixCount; i++)
{
dstLine[i] = fadeOutColors[factor][ dstLine[i] & 0x7FFF ];
}
#else
for (size_t i = 0; i < pixCount; i++)
{
dstLine[i] = fadeOutColors[factor][ dstLine[i] & 0x7FFF ];
}
#endif
for (; i < pixCount; i++)
{
dstLine[i] = fadeOutColors[factor][ dstLine[i] & 0x7FFF ];
}
}
else
{
@ -2614,7 +2602,6 @@ static INLINE void GPU_RenderLine_MasterBrightness(const GPUMasterBrightMode mod
case GPUMasterBrightMode_Reserved:
break;
}
}
template<size_t WIN_NUM>
@ -2818,10 +2805,21 @@ void GPU_RenderLine(NDS_Screen *screen, const u16 l, bool skip)
{
//this has not been tested since the dma timing for dispfifo was changed around the time of
//newemuloop. it may not work.
for (size_t i = 0; i < 128; i++)
#ifdef ENABLE_SSE2
const __m128i fifoMask = _mm_set1_epi32(0x7FFF7FFF);
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(__m128i); i++)
{
__m128i fifoColor = _mm_set_epi32(DISP_FIFOrecv(), DISP_FIFOrecv(), DISP_FIFOrecv(), DISP_FIFOrecv());
fifoColor = _mm_shuffle_epi32(fifoColor, 0x1B); // We need to shuffle the four FIFO values back into the correct order, since they were originally loaded in reverse order.
((__m128i *)dstLine)[i] = fifoColor & fifoMask;
}
#else
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++)
{
((u32 *)dstLine)[i] = DISP_FIFOrecv() & 0x7FFF7FFF;
}
#endif
if (_gpuFramebufferWidth != GPU_FRAMEBUFFER_NATIVE_WIDTH)
{

View File

@ -893,7 +893,6 @@ void OpenGLRenderer::SetVersion(unsigned int major, unsigned int minor, unsigned
this->versionRevision = revision;
}
#if defined(ENABLE_SSSE3) && defined(LOCAL_LE)
Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
{
// Convert from 32-bit BGRA8888 format to 32-bit RGBA6665 reversed format. OpenGL
@ -905,7 +904,10 @@ Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA
for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
{
for (size_t x = 0; x < ssePixCount; x+=4, ir+=4, iw+=4)
size_t x = 0;
#if defined(ENABLE_SSSE3) && defined(LOCAL_LE)
for (; x < ssePixCount; x += 4, ir += 4, iw += 4)
{
// Convert to RGBA6665
__m128i color = _mm_load_si128((__m128i *)(this->_framebufferColor + ir));
@ -923,63 +925,40 @@ Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA
color = _mm_load_si128((__m128i *)(this->_framebufferColor + ir));
__m128i b = _mm_and_si128(color, _mm_set1_epi32(0x000000F8)); // Read from R
b = _mm_slli_epi32(b, 7); // Shift to B
b = _mm_slli_si128(b, 7); // Shift to B
__m128i g = _mm_and_si128(color, _mm_set1_epi32(0x0000F800)); // Read from G
g = _mm_srli_epi32(g, 6); // Shift in G
g = _mm_srli_si128(g, 6); // Shift in G
__m128i r = _mm_and_si128(color, _mm_set1_epi32(0x00F80000)); // Read from B
r = _mm_srli_epi32(r, 19); // Shift to R
r = _mm_srli_si128(r, 19); // Shift to R
a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A
a = _mm_cmpgt_epi32(a, _mm_set1_epi32(0x00000000)); // Determine A
a = _mm_and_si128(a, _mm_set1_epi32(0x00008000)); // Mask to A
color = _mm_or_si128(b, g);
color = _mm_or_si128(color, r);
color = _mm_or_si128(color, a);
color = b | g | r | a;
// All the colors are currently placed every other 16 bits, so we need to swizzle them
// to the lower 64 bits of our vector before we store them back to memory.
color = _mm_shuffle_epi8(color, _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
_mm_storel_epi64((__m128i *)(dstRGBA5551 + iw), color);
}
#endif // defined(ENABLE_SSSE3) && defined(LOCAL_LE)
for (size_t x = ssePixCount; x < pixCount; x++, ir++, iw++)
{
dstRGBA6665[iw].color = BGRA8888_32Rev_To_RGBA6665_32Rev(this->_framebufferColor[ir].color);
dstRGBA5551[iw] = R5G5B5TORGB15((this->_framebufferColor[ir].b >> 3) & 0x1F,
(this->_framebufferColor[ir].g >> 3) & 0x1F,
(this->_framebufferColor[ir].r >> 3) & 0x1F) |
((this->_framebufferColor[ir].a == 0) ? 0x0000 : 0x8000);
}
}
return RENDER3DERROR_NOERR;
}
#else // Code path where SSSE3 or little-endian is not supported
Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
{
// Convert from 32-bit BGRA8888 format to 32-bit RGBA6665 reversed format. OpenGL
// stores pixels using a flipped Y-coordinate, so this needs to be flipped back
// to the DS Y-coordinate.
for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
{
for (size_t x = 0; x < this->_framebufferWidth; x++, ir++, iw++)
for (; x < pixCount; x++, ir++, iw++)
{
// Use the correct endian format since OpenGL uses the native endian of
// the architecture it is running on.
#ifdef WORDS_BIGENDIAN
#ifdef LOCAL_BE
dstRGBA6665[iw].color = BGRA8888_32_To_RGBA6665_32(this->_framebufferColor[ir].color);
dstRGBA5551[iw] = R5G5B5TORGB15((this->_framebufferColor[ir].b >> 3) & 0x1F,
dstRGBA5551[iw] = R5G5B5TORGB15( (this->_framebufferColor[ir].b >> 3) & 0x1F,
(this->_framebufferColor[ir].g >> 3) & 0x1F,
(this->_framebufferColor[ir].r >> 3) & 0x1F) |
((this->_framebufferColor[ir].a == 0) ? 0x0000 : 0x8000);
#else
dstRGBA6665[iw].color = BGRA8888_32Rev_To_RGBA6665_32Rev(this->_framebufferColor[ir].color);
dstRGBA5551[iw] = R5G5B5TORGB15((this->_framebufferColor[ir].b >> 3) & 0x1F,
dstRGBA5551[iw] = R5G5B5TORGB15( (this->_framebufferColor[ir].b >> 3) & 0x1F,
(this->_framebufferColor[ir].g >> 3) & 0x1F,
(this->_framebufferColor[ir].r >> 3) & 0x1F) |
((this->_framebufferColor[ir].a == 0) ? 0x0000 : 0x8000);
@ -990,8 +969,6 @@ Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA
return RENDER3DERROR_NOERR;
}
#endif // defined(ENABLE_SSSE3) && defined(LOCAL_LE)
OpenGLRenderer_1_2::~OpenGLRenderer_1_2()
{
glFinish();
@ -1902,7 +1879,7 @@ Render3DError OpenGLRenderer_1_2::UploadClearImage(const u16 *__restrict colorBu
}
else
{
for (size_t i = 0; i < this->_framebufferWidth * this->_framebufferHeight; i++)
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++)
{
OGLRef.workingCIDepthStencilBuffer[i] = depthBuffer[i] << 8;
}
@ -2782,6 +2759,11 @@ Render3DError OpenGLRenderer_1_2::SetFramebufferSize(size_t w, size_t h)
return OGLERROR_NOERR;
}
if (!BEGINGL())
{
return OGLERROR_BEGINGL_FAILED;
}
if (this->isFBOSupported)
{
glActiveTextureARB(GL_TEXTURE0_ARB + OGLTextureUnitID_GColor);
@ -2843,6 +2825,8 @@ Render3DError OpenGLRenderer_1_2::SetFramebufferSize(size_t w, size_t h)
free_aligned(oldFramebufferColor);
ENDGL();
return OGLERROR_NOERR;
}
@ -2892,7 +2876,7 @@ Render3DError OpenGLRenderer_1_3::UploadClearImage(const u16 *__restrict colorBu
}
else
{
for (size_t i = 0; i < this->_framebufferWidth * this->_framebufferHeight; i++)
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++)
{
OGLRef.workingCIDepthStencilBuffer[i] = depthBuffer[i] << 8;
}
@ -2931,6 +2915,11 @@ Render3DError OpenGLRenderer_1_3::SetFramebufferSize(size_t w, size_t h)
return OGLERROR_NOERR;
}
if (!BEGINGL())
{
return OGLERROR_BEGINGL_FAILED;
}
if (this->isFBOSupported)
{
glActiveTexture(GL_TEXTURE0 + OGLTextureUnitID_GColor);
@ -2992,6 +2981,8 @@ Render3DError OpenGLRenderer_1_3::SetFramebufferSize(size_t w, size_t h)
free_aligned(oldFramebufferColor);
ENDGL();
return OGLERROR_NOERR;
}

View File

@ -1148,7 +1148,7 @@ Render3DError OpenGLRenderer_3_2::BeginRender(const GFX3D &engine)
{
OGLRenderRef &OGLRef = *this->ref;
if(!BEGINGL())
if (!BEGINGL())
{
return OGLERROR_BEGINGL_FAILED;
}
@ -1560,6 +1560,11 @@ Render3DError OpenGLRenderer_3_2::SetFramebufferSize(size_t w, size_t h)
return OGLERROR_NOERR;
}
if (!BEGINGL())
{
return OGLERROR_BEGINGL_FAILED;
}
glActiveTexture(GL_TEXTURE0 + OGLTextureUnitID_GColor);
glBindTexture(GL_TEXTURE_2D, OGLRef.texGDepthStencilID);
glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH24_STENCIL8, w, h, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, NULL);
@ -1615,5 +1620,7 @@ Render3DError OpenGLRenderer_3_2::SetFramebufferSize(size_t w, size_t h)
free_aligned(oldFramebufferColor);
ENDGL();
return OGLERROR_NOERR;
}

View File

@ -125,23 +125,37 @@ static void memset_u16(void *dst, const u16 val, const size_t length)
__m128i *dst_vec128 = (__m128i *)dst;
const __m128i val_vec128 = _mm_set1_epi16(val);
const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val));
//MACRODO_N(length_vec128, (dst_vec128[X] = val_vec128));
for (size_t i = 0; i < length_vec128; i++)
_mm_stream_si128(dst_vec128 + i, val_vec128);
}
// Fills a buffer with a 16-bit value, where the element count LENGTH is a
// compile-time constant.
//
// dst - Destination buffer. It is written with _mm_store_si128, so it must
//       be 16-byte aligned.
// val - The 16-bit value replicated into every lane of the stored vector.
//
// The stores are emitted through MACRODO_N (defined elsewhere; presumably an
// unrolling helper that supplies the index X -- confirm against its
// definition). Only LENGTH / (sizeof(__m128i) / sizeof(u16)) whole vectors
// are stored; any remainder elements are left untouched.
template <size_t LENGTH>
static void memset_u16_fast(void *dst, const u16 val)
{
	const __m128i fillVec128 = _mm_set1_epi16(val);
	__m128i *const outVec128 = (__m128i *)dst;
	
	MACRODO_N(LENGTH / (sizeof(fillVec128) / sizeof(val)), _mm_store_si128(outVec128 + (X), fillVec128));
}
// Fills a buffer with a 32-bit value using 128-bit streaming stores.
//
// dst    - Destination buffer. It is written with _mm_stream_si128, so it
//          must be 16-byte aligned.
// val    - The 32-bit value replicated into every lane of the stored vector.
// length - Number of 32-bit elements to fill. Only whole vectors are written
//          (length divided by the number of elements per __m128i, rounded
//          down); any remainder elements are left untouched.
static void memset_u32(void *dst, const u32 val, const size_t length)
{
	const __m128i fillVec128 = _mm_set1_epi32(val);
	const size_t vecCount = length / (sizeof(fillVec128) / sizeof(val));
	__m128i *outVec128 = (__m128i *)dst;
	
	for (const __m128i *const endVec128 = outVec128 + vecCount; outVec128 != endVec128; outVec128++)
	{
		_mm_stream_si128(outVec128, fillVec128);
	}
}
// Fills a buffer with a 32-bit value, where the element count LENGTH is a
// compile-time constant.
//
// dst - Destination buffer. It is written with _mm_store_si128, so it must
//       be 16-byte aligned.
// val - The 32-bit value replicated into every lane of the stored vector.
//
// The stores are emitted through MACRODO_N (defined elsewhere; presumably an
// unrolling helper that supplies the index X -- confirm against its
// definition). Only LENGTH / (sizeof(__m128i) / sizeof(u32)) whole vectors
// are stored; any remainder elements are left untouched.
template <size_t LENGTH>
static void memset_u32_fast(void *dst, const u32 val)
{
	const __m128i fillVec128 = _mm_set1_epi32(val);
	__m128i *const outVec128 = (__m128i *)dst;
	
	MACRODO_N(LENGTH / (sizeof(fillVec128) / sizeof(val)), _mm_store_si128(outVec128 + (X), fillVec128));
}
#else //no sse2
static void memset_u16(void *dst, const u16 val, const size_t length)
@ -150,7 +164,6 @@ static void memset_u16(void *dst, const u16 val, const size_t length)
u64 *dst_u64 = (u64 *)dst;
const u64 val_u64 = ((u64)val << 48) | ((u64)val << 32) | ((u64)val << 16) | (u64)val;
const size_t length_u64 = length / (sizeof(val_u64) / sizeof(val));
//MACRODO_N(length_u64, (dst_u64[X] = val_u64));
for (size_t i = 0; i < length_u64; i++)
dst_u64[i] = val_u64;
@ -160,13 +173,25 @@ static void memset_u16(void *dst, const u16 val, const size_t length)
#endif
}
template <size_t LENGTH>
static void memset_u16_fast(void *dst, const u16 val)
{
#ifdef HOST_64
u64 *dst_u64 = (u64 *)dst;
const u64 val_u64 = ((u64)val << 48) | ((u64)val << 32) | ((u64)val << 16) | (u64)val;
MACRODO_N(LENGTH / (sizeof(val_u64) / sizeof(val)), (dst_u64[(X)] = val_u64));
#else
for (size_t i = 0; i < LENGTH; i++)
((u16 *)dst)[i] = val;
#endif
}
static void memset_u32(void *dst, const u32 val, const size_t length)
{
#ifdef HOST_64
u64 *dst_u64 = (u64 *)dst;
const u64 val_u64 = ((u64)val << 32) | (u64)val;
const size_t length_u64 = length / (sizeof(val_u64) / sizeof(val));
//MACRODO_N(length_u64, (dst_u64[X] = val_u64));
for (size_t i = 0; i < length_u64; i++)
dst_u64[i] = val_u64;
@ -176,7 +201,20 @@ static void memset_u32(void *dst, const u32 val, const size_t length)
#endif
}
template <size_t LENGTH>
static void memset_u32_fast(void *dst, const u32 val)
{
#ifdef HOST_64
u64 *dst_u64 = (u64 *)dst;
const u64 val_u64 = ((u64)val << 32) | (u64)val;
MACRODO_N(LENGTH / (sizeof(val_u64) / sizeof(val)), (dst_u64[(X)] = val_u64));
#else
for (size_t i = 0; i < LENGTH; i++)
((u16 *)dst)[i] = val;
#endif
}
#endif // ENABLE_SSE2
// NOSSE version always used in gfx3d.cpp
void _NOSSE_MatrixMultVec4x4 (const float *matrix, float *vecPtr);
@ -233,8 +271,6 @@ FORCEINLINE void MatrixMultiply(float * matrix, const float * rightMatrix)
_mm_store_ps(matrix+12,row3);
}
FORCEINLINE void MatrixMultVec4x4(const float *matrix, float *vecPtr)
{
_mm_store_ps(vecPtr,_util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(vecPtr)));
@ -311,18 +347,6 @@ FORCEINLINE void vector_fix2float(float* matrix, const float divisor)
_mm_store_ps(matrix+12,_mm_div_ps(_mm_load_ps(matrix+12),val));
}
// Fills NUM bytes at _dst with val by forwarding to memset().
//
// WARNING (kept from the original author): "I do not think this is as fast as a
// memset, for some reason. At least in vc2005 with sse enabled. Better figure out
// why before using it."
// NOTE(review): the warning presumably refers to the commented-out SSE store
// sequence kept inside the body, since the live code is a plain memset() call.
template<int NUM>
static FORCEINLINE void memset_u8(void* _dst, u8 val)
{
memset(_dst,val,NUM);
// Disabled alternative: replicate val into a 128-bit vector and store it NUM/16 times.
//const u8* dst = (u8*)_dst;
//u32 u32val = (val<<24)|(val<<16)|(val<<8)|val;
//const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
//MACRODO_N(NUM/16,_mm_store_si128((__m128i*)(dst+(X)*16), temp));
}
#else //no sse
void MatrixMultVec4x4 (const float *matrix, float *vecPtr);
@ -345,12 +369,6 @@ FORCEINLINE void vector_fix2float(float* matrix, const float divisor)
matrix[i] /= divisor;
}
// Fills NUM bytes at dst with val by forwarding to memset().
// NUM is a compile-time constant supplied as the template argument.
template<int NUM>
static FORCEINLINE void memset_u8(void* dst, u8 val)
{
memset(dst,val,NUM);
}
#endif //switched SSE functions
void MatrixMultVec4x4 (const s32 *matrix, s32 *vecPtr);
@ -360,5 +378,5 @@ void MatrixMultVec4x4_M2(const s32 *matrix, s32 *vecPtr);
void MatrixMultiply(s32* matrix, const s32* rightMatrix);
void MatrixScale(s32 *matrix, const s32 *ptr);
void MatrixTranslate(s32 *matrix, const s32 *ptr);
#endif
#endif // MATRIX_H

View File

@ -577,7 +577,6 @@ public:
FragmentColor shaderOutput;
bool isOpaquePixel;
//FragmentColor &dstColor = this->_softRender->GetFramebuffer()[fragmentIndex];
u32 &dstAttributeDepth = this->_softRender->_framebufferAttributes->depth[fragmentIndex];
u8 &dstAttributeOpaquePolyID = this->_softRender->_framebufferAttributes->opaquePolyID[fragmentIndex];
u8 &dstAttributeTranslucentPolyID = this->_softRender->_framebufferAttributes->translucentPolyID[fragmentIndex];
@ -2063,9 +2062,6 @@ Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor
convertedClearColor.g = GFX3D_5TO6(clearColor.g);
convertedClearColor.b = GFX3D_5TO6(clearColor.b);
const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight;
const size_t ssePixCount = pixCount - (pixCount % 16);
const __m128i color_vec128 = _mm_set1_epi32(convertedClearColor.color);
const __m128i attrDepth_vec128 = _mm_set1_epi32(clearAttributes.depth);
const __m128i attrOpaquePolyID_vec128 = _mm_set1_epi8(clearAttributes.opaquePolyID);
@ -2074,7 +2070,11 @@ Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor
const __m128i attrIsFogged_vec128 = _mm_set1_epi8(clearAttributes.isFogged);
const __m128i attrIsTranslucentPoly_vec128 = _mm_set1_epi8(clearAttributes.isTranslucentPoly);
for (size_t i = 0; i < ssePixCount; i += 16)
size_t i = 0;
const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight;
const size_t ssePixCount = pixCount - (pixCount % 16);
for (; i < ssePixCount; i += 16)
{
_mm_stream_si128((__m128i *)(this->_framebufferColor + i + 0), color_vec128);
_mm_stream_si128((__m128i *)(this->_framebufferColor + i + 4), color_vec128);
@ -2093,7 +2093,7 @@ Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor
_mm_stream_si128((__m128i *)(this->_framebufferAttributes->isTranslucentPoly + i), attrIsTranslucentPoly_vec128);
}
for (size_t i = ssePixCount; i < pixCount; i++)
for (; i < pixCount; i++)
{
this->_framebufferColor[i] = convertedClearColor;
this->_framebufferAttributes->SetAtIndex(i, clearAttributes);

View File

@ -158,9 +158,9 @@ void FragmentAttributesBuffer::SetAtIndex(const size_t index, const FragmentAttr
void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr)
{
#ifdef ENABLE_SSE2
const size_t sseCount = count - (count % 16);
size_t i = 0;
#ifdef ENABLE_SSE2
const __m128i attrDepth_vec128 = _mm_set1_epi32(attr.depth);
const __m128i attrOpaquePolyID_vec128 = _mm_set1_epi8(attr.opaquePolyID);
const __m128i attrTranslucentPolyID_vec128 = _mm_set1_epi8(attr.translucentPolyID);
@ -168,7 +168,8 @@ void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr)
const __m128i attrIsFogged_vec128 = _mm_set1_epi8(attr.isFogged);
const __m128i attrIsTranslucentPoly_vec128 = _mm_set1_epi8(attr.isTranslucentPoly);
for (size_t i = 0; i < sseCount; i += 16)
const size_t sseCount = count - (count % 16);
for (; i < sseCount; i += 16)
{
_mm_stream_si128((__m128i *)(this->depth + 0), attrDepth_vec128);
_mm_stream_si128((__m128i *)(this->depth + 4), attrDepth_vec128);
@ -181,17 +182,12 @@ void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr)
_mm_stream_si128((__m128i *)this->isFogged, attrIsFogged_vec128);
_mm_stream_si128((__m128i *)this->isTranslucentPoly, attrIsTranslucentPoly_vec128);
}
for (size_t i = sseCount; i < count; i++)
{
this->SetAtIndex(i, attr);
}
#else
for (size_t i = 0; i < count; i++)
{
this->SetAtIndex(i, attr);
}
#endif
for (; i < count; i++)
{
this->SetAtIndex(i, attr);
}
}
Render3D::Render3D()
@ -345,6 +341,18 @@ Render3DError Render3D::ClearFramebuffer(const GFX3D_State &renderState)
const u8 xScroll = scrollBits & 0xFF;
const u8 yScroll = (scrollBits >> 8) & 0xFF;
if (xScroll == 0 && yScroll == 0)
{
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++)
{
this->clearImageColor16Buffer[i] = clearColorBuffer[i];
this->clearImageDepthBuffer[i] = dsDepthToD24_LUT[clearDepthBuffer[i] & 0x7FFF];
this->clearImageFogBuffer[i] = BIT15(clearDepthBuffer[i]);
this->clearImagePolyIDBuffer[i] = clearFragment.opaquePolyID;
}
}
else
{
for (size_t dstIndex = 0, iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
{
const size_t y = ((iy + yScroll) & 0xFF) << 8;
@ -367,6 +375,7 @@ Render3DError Render3D::ClearFramebuffer(const GFX3D_State &renderState)
this->clearImagePolyIDBuffer[dstIndex] = clearFragment.opaquePolyID;
}
}
}
error = this->ClearUsingImage(this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->clearImagePolyIDBuffer);
if (error != RENDER3DERROR_NOERR)
@ -470,11 +479,13 @@ Render3DError Render3D::VramReconfigureSignal()
Render3DError Render3D_SSE2::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
{
static const __m128i zeroColor = _mm_set1_epi32(0);
const __m128i zero_vec128 = _mm_setzero_si128();
size_t i = 0;
const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight;
const size_t ssePixCount = pixCount - (pixCount % 4);
for (size_t i = 0; i < ssePixCount; i += 4)
for (; i < ssePixCount; i += 4)
{
// Copy the framebufferColor buffer
__m128i color = _mm_load_si128((__m128i *)(this->_framebufferColor + i));
@ -482,16 +493,16 @@ Render3DError Render3D_SSE2::FlushFramebuffer(FragmentColor *__restrict dstRGBA6
// Convert to RGBA5551
__m128i r = _mm_and_si128(color, _mm_set1_epi32(0x0000003E)); // Read from R
r = _mm_srli_epi32(r, 1); // Shift to R
r = _mm_srli_si128(r, 1); // Shift to R
__m128i g = _mm_and_si128(color, _mm_set1_epi32(0x00003E00)); // Read from G
g = _mm_srli_epi32(g, 4); // Shift in G
g = _mm_srli_si128(g, 4); // Shift in G
__m128i b = _mm_and_si128(color, _mm_set1_epi32(0x003E0000)); // Read from B
b = _mm_srli_epi32(b, 7); // Shift to B
b = _mm_srli_si128(b, 7); // Shift to B
__m128i a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A
a = _mm_cmpgt_epi32(a, _mm_set1_epi32(0x00000000)); // Determine A
a = _mm_cmpgt_epi32(a, zero_vec128); // Determine A
// From here on, we're going to do an SSE2 trick to pack 32-bit down to unsigned
// 16-bit. Since SSE2 only has packssdw (signed 16-bit pack), then the alpha bit
@ -504,21 +515,18 @@ Render3DError Render3D_SSE2::FlushFramebuffer(FragmentColor *__restrict dstRGBA6
// alpha vector with the post-packed color vector to get the final color.
a = _mm_and_si128(a, _mm_set1_epi32(0x00004000)); // Mask out the bit before A
a = _mm_packs_epi32(a, zeroColor); // Pack 32-bit down to 16-bit
a = _mm_slli_epi16(a, 1); // Shift the A bit back to where it needs to be
a = _mm_packs_epi32(a, zero_vec128); // Pack 32-bit down to 16-bit
a = _mm_slli_si128(a, 1); // Shift the A bit back to where it needs to be
// Assemble the RGB colors
color = _mm_or_si128(r, g);
color = _mm_or_si128(color, b);
// Pack the 32-bit color into a signed 16-bit color, then por the alpha bit back in.
color = _mm_packs_epi32(color, zeroColor);
color = _mm_or_si128(color, a);
// Assemble the RGB colors, pack the 32-bit color into a signed 16-bit color, then por the alpha bit back in.
color = r | g | b;
color = _mm_packs_epi32(color, zero_vec128);
color |= a;
_mm_storel_epi64((__m128i *)(dstRGBA5551 + i), color);
}
for (size_t i = ssePixCount; i < pixCount; i++)
for (; i < pixCount; i++)
{
dstRGBA6665[i] = this->_framebufferColor[i];
dstRGBA5551[i] = R6G6B6TORGB15(this->_framebufferColor[i].r, this->_framebufferColor[i].g, this->_framebufferColor[i].b) | ((this->_framebufferColor[i].a == 0) ? 0x0000 : 0x8000);
@ -560,59 +568,49 @@ Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)
if (xScroll == 0 && yScroll == 0)
{
const __m128i depthBitMask_vec128 = _mm_set1_epi16(0x7FFF);
const __m128i fogBufferBitMask_vec128 = _mm_set1_epi16(BIT(15));
const __m128i opaquePolyID_vec128 = _mm_set1_epi8(clearFragment.opaquePolyID);
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i += 16)
{
static const __m128i depthBitMask_vec128 = _mm_set1_epi16(0x7FFF);
static const __m128i fogBufferBitMask_vec128 = _mm_set1_epi16(BIT(15));
// Copy the colors to the color buffer. Since we can only copy 8 elements at once,
// we need to load-store twice.
_mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i + 8), _mm_load_si128((__m128i *)(clearColorBuffer + i + 8)) );
_mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i), _mm_load_si128((__m128i *)(clearColorBuffer + i)) );
// Write the depth values to the depth buffer.
__m128i clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i + 8));
clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, depthBitMask_vec128);
__m128i clearDepthHi_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i + 8));
__m128i clearDepthLo_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i));
clearDepthHi_vec128 = _mm_and_si128(clearDepthHi_vec128, depthBitMask_vec128);
clearDepthLo_vec128 = _mm_and_si128(clearDepthLo_vec128, depthBitMask_vec128);
__m128i depthValue_vec128 = _mm_set_epi32(dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 7)],
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 6)],
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 5)],
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 4)]);
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i + 12), depthValue_vec128);
depthValue_vec128 = _mm_set_epi32(dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 3)],
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 2)],
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 1)],
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 0)]);
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i + 8), depthValue_vec128);
clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i));
clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, depthBitMask_vec128);
depthValue_vec128 = _mm_set_epi32(dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 7)],
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 6)],
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 5)],
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 4)]);
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i + 4), depthValue_vec128);
depthValue_vec128 = _mm_set_epi32(dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 3)],
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 2)],
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 1)],
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 0)]);
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i), depthValue_vec128);
this->clearImageDepthBuffer[i+15] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 7)];
this->clearImageDepthBuffer[i+14] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 6)];
this->clearImageDepthBuffer[i+13] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 5)];
this->clearImageDepthBuffer[i+12] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 4)];
this->clearImageDepthBuffer[i+11] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 3)];
this->clearImageDepthBuffer[i+10] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 2)];
this->clearImageDepthBuffer[i+ 9] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 1)];
this->clearImageDepthBuffer[i+ 8] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 0)];
this->clearImageDepthBuffer[i+ 7] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 7)];
this->clearImageDepthBuffer[i+ 6] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 6)];
this->clearImageDepthBuffer[i+ 5] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 5)];
this->clearImageDepthBuffer[i+ 4] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 4)];
this->clearImageDepthBuffer[i+ 3] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 3)];
this->clearImageDepthBuffer[i+ 2] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 2)];
this->clearImageDepthBuffer[i+ 1] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 1)];
this->clearImageDepthBuffer[i+ 0] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 0)];
// Write the fog flags to the fog flag buffer.
clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i + 8)); // Read the upper values
clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, fogBufferBitMask_vec128);
const __m128i clearDepthFogBit_vec128 = _mm_srli_epi16(clearDepth_vec128, 15); // Save the upper bits in another register
clearDepthHi_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i + 8));
clearDepthLo_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i));
clearDepthHi_vec128 = _mm_and_si128(clearDepthHi_vec128, fogBufferBitMask_vec128);
clearDepthLo_vec128 = _mm_and_si128(clearDepthLo_vec128, fogBufferBitMask_vec128);
clearDepthHi_vec128 = _mm_srli_si128(clearDepthHi_vec128, 15);
clearDepthLo_vec128 = _mm_srli_si128(clearDepthLo_vec128, 15);
clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i)); // Read the lower values
clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, fogBufferBitMask_vec128);
clearDepth_vec128 = _mm_srli_epi16(clearDepth_vec128, 15); // These are the lower bits
_mm_store_si128((__m128i *)(this->clearImageFogBuffer + i), _mm_packus_epi16(clearDepth_vec128, clearDepthFogBit_vec128));
_mm_store_si128((__m128i *)(this->clearImageFogBuffer + i), _mm_packus_epi16(clearDepthLo_vec128, clearDepthHi_vec128));
// The one is easy. Just set the values in the polygon ID buffer.
_mm_store_si128((__m128i *)(this->clearImagePolyIDBuffer + i), opaquePolyID_vec128);
@ -620,8 +618,8 @@ Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)
}
else
{
static const __m128i addrOffset = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
static const __m128i addrRolloverMask = _mm_set1_epi16(0x00FF);
const __m128i addrOffset = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
const __m128i addrRolloverMask = _mm_set1_epi16(0x00FF);
const __m128i opaquePolyID_vec128 = _mm_set1_epi8(clearFragment.opaquePolyID);
for (size_t dstIndex = 0, iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)