diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index 4a4c8da94..b632749c6 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -2206,7 +2206,7 @@ PLAIN_CLEAR: if (gpu->LayersEnable[4]) { //n.b. - this is clearing the sprite line buffer to the background color, - memset_u16(gpu->sprColor, backdrop_color, GPU_FRAMEBUFFER_NATIVE_WIDTH); + memset_u16_fast(gpu->sprColor, backdrop_color); //zero 06-may-09: I properly supported window color effects for backdrop, but I am not sure //how it interacts with this. I wish we knew why we needed this @@ -2531,14 +2531,14 @@ static INLINE void GPU_RenderLine_MasterBrightness(const GPUMasterBrightMode mod { if (factor < 16) { -#ifdef ENABLE_SSE2 - static size_t ssePixCount = pixCount - (pixCount % 4); - static const __m128i colorMask = _mm_set1_epi16(0x7FFF); + size_t i = 0; - for (size_t i = 0; i < ssePixCount; i += 8) +#ifdef ENABLE_SSE2 + const size_t ssePixCount = pixCount - (pixCount % 8); + for (; i < ssePixCount; i += 8) { __m128i dstColor_vec128 = _mm_load_si128((__m128i *)(dstLine + i)); - dstColor_vec128 = _mm_and_si128(dstColor_vec128, colorMask); + dstColor_vec128 = _mm_and_si128(dstColor_vec128, _mm_set1_epi16(0x7FFF)); dstLine[i+7] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 7) ]; dstLine[i+6] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 6) ]; @@ -2549,17 +2549,11 @@ static INLINE void GPU_RenderLine_MasterBrightness(const GPUMasterBrightMode mod dstLine[i+1] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 1) ]; dstLine[i+0] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 0) ]; } - - for (size_t i = ssePixCount; i < pixCount; i++) - { - dstLine[i] = fadeInColors[factor][ dstLine[i] & 0x7FFF ]; - } -#else - for (size_t i = 0; i < pixCount; i++) - { - dstLine[i] = fadeInColors[factor][ dstLine[i] & 0x7FFF ]; - } #endif + for (; i < pixCount; i++) + { + dstLine[i] = fadeInColors[factor][ dstLine[i] & 0x7FFF ]; + } } else { @@ -2573,14 +2567,14 @@ static 
INLINE void GPU_RenderLine_MasterBrightness(const GPUMasterBrightMode mod { if (factor < 16) { -#ifdef ENABLE_SSE2 - static size_t ssePixCount = pixCount - (pixCount % 4); - static const __m128i colorMask = _mm_set1_epi16(0x7FFF); + size_t i = 0; - for (size_t i = 0; i < ssePixCount; i += 8) +#ifdef ENABLE_SSE2 + const size_t ssePixCount = pixCount - (pixCount % 8); + for (; i < ssePixCount; i += 8) { __m128i dstColor_vec128 = _mm_load_si128((__m128i *)(dstLine + i)); - dstColor_vec128 = _mm_and_si128(dstColor_vec128, colorMask); + dstColor_vec128 = _mm_and_si128(dstColor_vec128, _mm_set1_epi16(0x7FFF)); dstLine[i+7] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 7) ]; dstLine[i+6] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 6) ]; @@ -2591,17 +2585,11 @@ static INLINE void GPU_RenderLine_MasterBrightness(const GPUMasterBrightMode mod dstLine[i+1] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 1) ]; dstLine[i+0] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 0) ]; } - - for (size_t i = ssePixCount; i < pixCount; i++) - { - dstLine[i] = fadeOutColors[factor][ dstLine[i] & 0x7FFF ]; - } -#else - for (size_t i = 0; i < pixCount; i++) - { - dstLine[i] = fadeOutColors[factor][ dstLine[i] & 0x7FFF ]; - } #endif + for (; i < pixCount; i++) + { + dstLine[i] = fadeOutColors[factor][ dstLine[i] & 0x7FFF ]; + } } else { @@ -2614,7 +2602,6 @@ static INLINE void GPU_RenderLine_MasterBrightness(const GPUMasterBrightMode mod case GPUMasterBrightMode_Reserved: break; } - } template @@ -2818,10 +2805,21 @@ void GPU_RenderLine(NDS_Screen *screen, const u16 l, bool skip) { //this has not been tested since the dma timing for dispfifo was changed around the time of //newemuloop. it may not work. 
- for (size_t i = 0; i < 128; i++) +#ifdef ENABLE_SSE2 + const __m128i fifoMask = _mm_set1_epi32(0x7FFF7FFF); + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(__m128i); i++) + { + const u32 fifo0 = DISP_FIFOrecv(), fifo1 = DISP_FIFOrecv(), fifo2 = DISP_FIFOrecv(), fifo3 = DISP_FIFOrecv(); // Read the four FIFO values through sequenced declarators; the evaluation order of function arguments is unspecified, so the reads must not be passed directly as arguments. + const __m128i fifoColor = _mm_setr_epi32(fifo0, fifo1, fifo2, fifo3); // Place the values in memory order, with the first value read in the lowest lane. + + ((__m128i *)dstLine)[i] = _mm_and_si128(fifoColor, fifoMask); + } +#else + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++) { ((u32 *)dstLine)[i] = DISP_FIFOrecv() & 0x7FFF7FFF; } +#endif if (_gpuFramebufferWidth != GPU_FRAMEBUFFER_NATIVE_WIDTH) { diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index f04ca0bbc..cb6bce604 100644 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -893,7 +893,6 @@ void OpenGLRenderer::SetVersion(unsigned int major, unsigned int minor, unsigned this->versionRevision = revision; } -#if defined(ENABLE_SSSE3) && defined(LOCAL_LE) Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551) { // Convert from 32-bit BGRA8888 format to 32-bit RGBA6665 reversed format. 
OpenGL @@ -905,7 +904,10 @@ Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2)) { - for (size_t x = 0; x < ssePixCount; x+=4, ir+=4, iw+=4) + size_t x = 0; + +#if defined(ENABLE_SSSE3) && defined(LOCAL_LE) + for (; x < ssePixCount; x += 4, ir += 4, iw += 4) { // Convert to RGBA6665 __m128i color = _mm_load_si128((__m128i *)(this->_framebufferColor + ir)); @@ -923,65 +925,42 @@ Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA color = _mm_load_si128((__m128i *)(this->_framebufferColor + ir)); __m128i b = _mm_and_si128(color, _mm_set1_epi32(0x000000F8)); // Read from R - b = _mm_slli_epi32(b, 7); // Shift to B + b = _mm_slli_epi32(b, 7); // Shift to B (bit shift within each 32-bit lane) __m128i g = _mm_and_si128(color, _mm_set1_epi32(0x0000F800)); // Read from G - g = _mm_srli_epi32(g, 6); // Shift in G + g = _mm_srli_epi32(g, 6); // Shift in G (bit shift within each 32-bit lane) __m128i r = _mm_and_si128(color, _mm_set1_epi32(0x00F80000)); // Read from B - r = _mm_srli_epi32(r, 19); // Shift to R + r = _mm_srli_epi32(r, 19); // Shift to R (bit shift within each 32-bit lane) a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A a = _mm_cmpgt_epi32(a, _mm_set1_epi32(0x00000000)); // Determine A a = _mm_and_si128(a, _mm_set1_epi32(0x00008000)); // Mask to A - color = _mm_or_si128(b, g); - color = _mm_or_si128(color, r); - color = _mm_or_si128(color, a); + color = _mm_or_si128(_mm_or_si128(b, g), _mm_or_si128(r, a)); // All the colors are currently placed every other 16 bits, so we need to swizzle them // to the lower 64 bits of our vector before we store them back to memory. 
color = _mm_shuffle_epi8(color, _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); _mm_storel_epi64((__m128i *)(dstRGBA5551 + iw), color); } +#endif // defined(ENABLE_SSSE3) && defined(LOCAL_LE) - for (size_t x = ssePixCount; x < pixCount; x++, ir++, iw++) - { - dstRGBA6665[iw].color = BGRA8888_32Rev_To_RGBA6665_32Rev(this->_framebufferColor[ir].color); - dstRGBA5551[iw] = R5G5B5TORGB15((this->_framebufferColor[ir].b >> 3) & 0x1F, - (this->_framebufferColor[ir].g >> 3) & 0x1F, - (this->_framebufferColor[ir].r >> 3) & 0x1F) | - ((this->_framebufferColor[ir].a == 0) ? 0x0000 : 0x8000); - } - } - - return RENDER3DERROR_NOERR; -} - -#else // Code path where SSSE3 or little-endian is not supported - -Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551) -{ - // Convert from 32-bit BGRA8888 format to 32-bit RGBA6665 reversed format. OpenGL - // stores pixels using a flipped Y-coordinate, so this needs to be flipped back - // to the DS Y-coordinate. - for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2)) - { - for (size_t x = 0; x < this->_framebufferWidth; x++, ir++, iw++) + for (; x < pixCount; x++, ir++, iw++) { // Use the correct endian format since OpenGL uses the native endian of // the architecture it is running on. -#ifdef WORDS_BIGENDIAN +#ifdef LOCAL_BE dstRGBA6665[iw].color = BGRA8888_32_To_RGBA6665_32(this->_framebufferColor[ir].color); - dstRGBA5551[iw] = R5G5B5TORGB15((this->_framebufferColor[ir].b >> 3) & 0x1F, - (this->_framebufferColor[ir].g >> 3) & 0x1F, - (this->_framebufferColor[ir].r >> 3) & 0x1F) | + dstRGBA5551[iw] = R5G5B5TORGB15( (this->_framebufferColor[ir].b >> 3) & 0x1F, + (this->_framebufferColor[ir].g >> 3) & 0x1F, + (this->_framebufferColor[ir].r >> 3) & 0x1F) | ((this->_framebufferColor[ir].a == 0) ? 
0x0000 : 0x8000); #else dstRGBA6665[iw].color = BGRA8888_32Rev_To_RGBA6665_32Rev(this->_framebufferColor[ir].color); - dstRGBA5551[iw] = R5G5B5TORGB15((this->_framebufferColor[ir].b >> 3) & 0x1F, - (this->_framebufferColor[ir].g >> 3) & 0x1F, - (this->_framebufferColor[ir].r >> 3) & 0x1F) | + dstRGBA5551[iw] = R5G5B5TORGB15( (this->_framebufferColor[ir].b >> 3) & 0x1F, + (this->_framebufferColor[ir].g >> 3) & 0x1F, + (this->_framebufferColor[ir].r >> 3) & 0x1F) | ((this->_framebufferColor[ir].a == 0) ? 0x0000 : 0x8000); #endif } @@ -990,8 +969,6 @@ Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA return RENDER3DERROR_NOERR; } -#endif // defined(ENABLE_SSSE3) && defined(LOCAL_LE) - OpenGLRenderer_1_2::~OpenGLRenderer_1_2() { glFinish(); @@ -1902,7 +1879,7 @@ Render3DError OpenGLRenderer_1_2::UploadClearImage(const u16 *__restrict colorBu } else { - for (size_t i = 0; i < this->_framebufferWidth * this->_framebufferHeight; i++) + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++) { OGLRef.workingCIDepthStencilBuffer[i] = depthBuffer[i] << 8; } @@ -2782,6 +2759,11 @@ Render3DError OpenGLRenderer_1_2::SetFramebufferSize(size_t w, size_t h) return OGLERROR_NOERR; } + if (!BEGINGL()) + { + return OGLERROR_BEGINGL_FAILED; + } + if (this->isFBOSupported) { glActiveTextureARB(GL_TEXTURE0_ARB + OGLTextureUnitID_GColor); @@ -2843,6 +2825,8 @@ Render3DError OpenGLRenderer_1_2::SetFramebufferSize(size_t w, size_t h) free_aligned(oldFramebufferColor); + ENDGL(); + return OGLERROR_NOERR; } @@ -2892,7 +2876,7 @@ Render3DError OpenGLRenderer_1_3::UploadClearImage(const u16 *__restrict colorBu } else { - for (size_t i = 0; i < this->_framebufferWidth * this->_framebufferHeight; i++) + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++) { OGLRef.workingCIDepthStencilBuffer[i] = depthBuffer[i] << 8; } @@ -2931,6 +2915,11 @@ Render3DError 
OpenGLRenderer_1_3::SetFramebufferSize(size_t w, size_t h) return OGLERROR_NOERR; } + if (!BEGINGL()) + { + return OGLERROR_BEGINGL_FAILED; + } + if (this->isFBOSupported) { glActiveTexture(GL_TEXTURE0 + OGLTextureUnitID_GColor); @@ -2992,6 +2981,8 @@ Render3DError OpenGLRenderer_1_3::SetFramebufferSize(size_t w, size_t h) free_aligned(oldFramebufferColor); + ENDGL(); + return OGLERROR_NOERR; } diff --git a/desmume/src/OGLRender_3_2.cpp b/desmume/src/OGLRender_3_2.cpp index e876fc6a4..148680123 100644 --- a/desmume/src/OGLRender_3_2.cpp +++ b/desmume/src/OGLRender_3_2.cpp @@ -1148,7 +1148,7 @@ Render3DError OpenGLRenderer_3_2::BeginRender(const GFX3D &engine) { OGLRenderRef &OGLRef = *this->ref; - if(!BEGINGL()) + if (!BEGINGL()) { return OGLERROR_BEGINGL_FAILED; } @@ -1560,6 +1560,11 @@ Render3DError OpenGLRenderer_3_2::SetFramebufferSize(size_t w, size_t h) return OGLERROR_NOERR; } + if (!BEGINGL()) + { + return OGLERROR_BEGINGL_FAILED; + } + glActiveTexture(GL_TEXTURE0 + OGLTextureUnitID_GColor); glBindTexture(GL_TEXTURE_2D, OGLRef.texGDepthStencilID); glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH24_STENCIL8, w, h, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, NULL); @@ -1615,5 +1620,7 @@ Render3DError OpenGLRenderer_3_2::SetFramebufferSize(size_t w, size_t h) free_aligned(oldFramebufferColor); + ENDGL(); + return OGLERROR_NOERR; } diff --git a/desmume/src/matrix.h b/desmume/src/matrix.h index ba27a3c9f..e6eb5862e 100644 --- a/desmume/src/matrix.h +++ b/desmume/src/matrix.h @@ -125,23 +125,37 @@ static void memset_u16(void *dst, const u16 val, const size_t length) __m128i *dst_vec128 = (__m128i *)dst; const __m128i val_vec128 = _mm_set1_epi16(val); const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val)); - //MACRODO_N(length_vec128, (dst_vec128[X] = val_vec128)); for (size_t i = 0; i < length_vec128; i++) _mm_stream_si128(dst_vec128 + i, val_vec128); } +template +static void memset_u16_fast(void *dst, const u16 val) +{ + __m128i *dst_vec128 = (__m128i 
*)dst; + const __m128i val_vec128 = _mm_set1_epi16(val); + MACRODO_N(LENGTH / (sizeof(val_vec128) / sizeof(val)), _mm_store_si128(dst_vec128 + (X), val_vec128)); +} + static void memset_u32(void *dst, const u32 val, const size_t length) { __m128i *dst_vec128 = (__m128i *)dst; const __m128i val_vec128 = _mm_set1_epi32(val); const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val)); - //MACRODO_N(length_vec128, (dst_vec128[X] = val_vec128)); for (size_t i = 0; i < length_vec128; i++) _mm_stream_si128(dst_vec128 + i, val_vec128); } +template +static void memset_u32_fast(void *dst, const u32 val) +{ + __m128i *dst_vec128 = (__m128i *)dst; + const __m128i val_vec128 = _mm_set1_epi32(val); + MACRODO_N(LENGTH / (sizeof(val_vec128) / sizeof(val)), _mm_store_si128(dst_vec128 + (X), val_vec128)); +} + #else //no sse2 static void memset_u16(void *dst, const u16 val, const size_t length) @@ -150,7 +164,6 @@ static void memset_u16(void *dst, const u16 val, const size_t length) u64 *dst_u64 = (u64 *)dst; const u64 val_u64 = ((u64)val << 48) | ((u64)val << 32) | ((u64)val << 16) | (u64)val; const size_t length_u64 = length / (sizeof(val_u64) / sizeof(val)); - //MACRODO_N(length_u64, (dst_u64[X] = val_u64)); for (size_t i = 0; i < length_u64; i++) dst_u64[i] = val_u64; @@ -160,13 +173,25 @@ static void memset_u16(void *dst, const u16 val, const size_t length) #endif } +template +static void memset_u16_fast(void *dst, const u16 val) +{ +#ifdef HOST_64 + u64 *dst_u64 = (u64 *)dst; + const u64 val_u64 = ((u64)val << 48) | ((u64)val << 32) | ((u64)val << 16) | (u64)val; + MACRODO_N(LENGTH / (sizeof(val_u64) / sizeof(val)), (dst_u64[(X)] = val_u64)); +#else + for (size_t i = 0; i < LENGTH; i++) + ((u16 *)dst)[i] = val; +#endif +} + static void memset_u32(void *dst, const u32 val, const size_t length) { #ifdef HOST_64 u64 *dst_u64 = (u64 *)dst; const u64 val_u64 = ((u64)val << 32) | (u64)val; const size_t length_u64 = length / (sizeof(val_u64) / sizeof(val)); - 
//MACRODO_N(length_u64, (dst_u64[X] = val_u64)); for (size_t i = 0; i < length_u64; i++) dst_u64[i] = val_u64; @@ -176,7 +201,20 @@ static void memset_u32(void *dst, const u32 val, const size_t length) #endif } +template +static void memset_u32_fast(void *dst, const u32 val) +{ +#ifdef HOST_64 + u64 *dst_u64 = (u64 *)dst; + const u64 val_u64 = ((u64)val << 32) | (u64)val; + MACRODO_N(LENGTH / (sizeof(val_u64) / sizeof(val)), (dst_u64[(X)] = val_u64)); +#else + for (size_t i = 0; i < LENGTH; i++) + ((u32 *)dst)[i] = val; // Store whole 32-bit elements; a u16 store would truncate val and fill only half of the buffer. #endif +} + +#endif // ENABLE_SSE2 // NOSSE version always used in gfx3d.cpp void _NOSSE_MatrixMultVec4x4 (const float *matrix, float *vecPtr); @@ -233,8 +271,6 @@ FORCEINLINE void MatrixMultiply(float * matrix, const float * rightMatrix) _mm_store_ps(matrix+12,row3); } - - FORCEINLINE void MatrixMultVec4x4(const float *matrix, float *vecPtr) { _mm_store_ps(vecPtr,_util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(vecPtr))); @@ -311,18 +347,6 @@ FORCEINLINE void vector_fix2float(float* matrix, const float divisor) _mm_store_ps(matrix+12,_mm_div_ps(_mm_load_ps(matrix+12),val)); } -//WARNING: I do not think this is as fast as a memset, for some reason. -//at least in vc2005 with sse enabled. 
better figure out why before using it -template -static FORCEINLINE void memset_u8(void* _dst, u8 val) -{ - memset(_dst,val,NUM); - //const u8* dst = (u8*)_dst; - //u32 u32val = (val<<24)|(val<<16)|(val<<8)|val; - //const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val); - //MACRODO_N(NUM/16,_mm_store_si128((__m128i*)(dst+(X)*16), temp)); -} - #else //no sse void MatrixMultVec4x4 (const float *matrix, float *vecPtr); @@ -345,12 +369,6 @@ FORCEINLINE void vector_fix2float(float* matrix, const float divisor) matrix[i] /= divisor; } -template -static FORCEINLINE void memset_u8(void* dst, u8 val) -{ - memset(dst,val,NUM); -} - #endif //switched SSE functions void MatrixMultVec4x4 (const s32 *matrix, s32 *vecPtr); @@ -360,5 +378,5 @@ void MatrixMultVec4x4_M2(const s32 *matrix, s32 *vecPtr); void MatrixMultiply(s32* matrix, const s32* rightMatrix); void MatrixScale(s32 *matrix, const s32 *ptr); void MatrixTranslate(s32 *matrix, const s32 *ptr); -#endif +#endif // MATRIX_H diff --git a/desmume/src/rasterize.cpp b/desmume/src/rasterize.cpp index 7b4719eb8..8df34f24f 100644 --- a/desmume/src/rasterize.cpp +++ b/desmume/src/rasterize.cpp @@ -577,7 +577,6 @@ public: FragmentColor shaderOutput; bool isOpaquePixel; - //FragmentColor &dstColor = this->_softRender->GetFramebuffer()[fragmentIndex]; u32 &dstAttributeDepth = this->_softRender->_framebufferAttributes->depth[fragmentIndex]; u8 &dstAttributeOpaquePolyID = this->_softRender->_framebufferAttributes->opaquePolyID[fragmentIndex]; u8 &dstAttributeTranslucentPolyID = this->_softRender->_framebufferAttributes->translucentPolyID[fragmentIndex]; @@ -2063,9 +2062,6 @@ Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor convertedClearColor.g = GFX3D_5TO6(clearColor.g); convertedClearColor.b = GFX3D_5TO6(clearColor.b); - const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight; - const size_t ssePixCount = pixCount - (pixCount % 16); - const __m128i color_vec128 = 
_mm_set1_epi32(convertedClearColor.color); const __m128i attrDepth_vec128 = _mm_set1_epi32(clearAttributes.depth); const __m128i attrOpaquePolyID_vec128 = _mm_set1_epi8(clearAttributes.opaquePolyID); @@ -2074,7 +2070,11 @@ Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor const __m128i attrIsFogged_vec128 = _mm_set1_epi8(clearAttributes.isFogged); const __m128i attrIsTranslucentPoly_vec128 = _mm_set1_epi8(clearAttributes.isTranslucentPoly); - for (size_t i = 0; i < ssePixCount; i += 16) + size_t i = 0; + const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight; + const size_t ssePixCount = pixCount - (pixCount % 16); + + for (; i < ssePixCount; i += 16) { _mm_stream_si128((__m128i *)(this->_framebufferColor + i + 0), color_vec128); _mm_stream_si128((__m128i *)(this->_framebufferColor + i + 4), color_vec128); @@ -2093,7 +2093,7 @@ Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor _mm_stream_si128((__m128i *)(this->_framebufferAttributes->isTranslucentPoly + i), attrIsTranslucentPoly_vec128); } - for (size_t i = ssePixCount; i < pixCount; i++) + for (; i < pixCount; i++) { this->_framebufferColor[i] = convertedClearColor; this->_framebufferAttributes->SetAtIndex(i, clearAttributes); diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp index 0933f0d2e..9f895870e 100644 --- a/desmume/src/render3D.cpp +++ b/desmume/src/render3D.cpp @@ -158,9 +158,9 @@ void FragmentAttributesBuffer::SetAtIndex(const size_t index, const FragmentAttr void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr) { -#ifdef ENABLE_SSE2 - const size_t sseCount = count - (count % 16); + size_t i = 0; +#ifdef ENABLE_SSE2 const __m128i attrDepth_vec128 = _mm_set1_epi32(attr.depth); const __m128i attrOpaquePolyID_vec128 = _mm_set1_epi8(attr.opaquePolyID); const __m128i attrTranslucentPolyID_vec128 = _mm_set1_epi8(attr.translucentPolyID); @@ -168,7 +168,8 @@ void FragmentAttributesBuffer::SetAll(const 
FragmentAttributes &attr) const __m128i attrIsFogged_vec128 = _mm_set1_epi8(attr.isFogged); const __m128i attrIsTranslucentPoly_vec128 = _mm_set1_epi8(attr.isTranslucentPoly); - for (size_t i = 0; i < sseCount; i += 16) + const size_t sseCount = count - (count % 16); + for (; i < sseCount; i += 16) { _mm_stream_si128((__m128i *)(this->depth + 0), attrDepth_vec128); _mm_stream_si128((__m128i *)(this->depth + 4), attrDepth_vec128); @@ -181,17 +182,12 @@ void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr) _mm_stream_si128((__m128i *)this->isFogged, attrIsFogged_vec128); _mm_stream_si128((__m128i *)this->isTranslucentPoly, attrIsTranslucentPoly_vec128); } - - for (size_t i = sseCount; i < count; i++) - { - this->SetAtIndex(i, attr); - } -#else - for (size_t i = 0; i < count; i++) - { - this->SetAtIndex(i, attr); - } #endif + + for (; i < count; i++) + { + this->SetAtIndex(i, attr); + } } Render3D::Render3D() @@ -345,26 +341,39 @@ Render3DError Render3D::ClearFramebuffer(const GFX3D_State &renderState) const u8 xScroll = scrollBits & 0xFF; const u8 yScroll = (scrollBits >> 8) & 0xFF; - for (size_t dstIndex = 0, iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++) + if (xScroll == 0 && yScroll == 0) { - const size_t y = ((iy + yScroll) & 0xFF) << 8; - - for (size_t ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; dstIndex++, ix++) + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++) { - const size_t x = (ix + xScroll) & 0xFF; - const size_t srcIndex = y | x; + this->clearImageColor16Buffer[i] = clearColorBuffer[i]; + this->clearImageDepthBuffer[i] = dsDepthToD24_LUT[clearDepthBuffer[i] & 0x7FFF]; + this->clearImageFogBuffer[i] = BIT15(clearDepthBuffer[i]); + this->clearImagePolyIDBuffer[i] = clearFragment.opaquePolyID; + } + } + else + { + for (size_t dstIndex = 0, iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++) + { + const size_t y = ((iy + yScroll) & 0xFF) << 8; - //this is tested by harry potter and the order of the 
phoenix. - //TODO (optimization) dont do this if we are mapped to blank memory (such as in sonic chronicles) - //(or use a special zero fill in the bulk clearing above) - this->clearImageColor16Buffer[dstIndex] = clearColorBuffer[srcIndex]; - - //this is tested quite well in the sonic chronicles main map mode - //where depth values are used for trees etc you can walk behind - this->clearImageDepthBuffer[dstIndex] = dsDepthToD24_LUT[clearDepthBuffer[srcIndex] & 0x7FFF]; - - this->clearImageFogBuffer[dstIndex] = BIT15(clearDepthBuffer[srcIndex]); - this->clearImagePolyIDBuffer[dstIndex] = clearFragment.opaquePolyID; + for (size_t ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; dstIndex++, ix++) + { + const size_t x = (ix + xScroll) & 0xFF; + const size_t srcIndex = y | x; + + //this is tested by harry potter and the order of the phoenix. + //TODO (optimization) dont do this if we are mapped to blank memory (such as in sonic chronicles) + //(or use a special zero fill in the bulk clearing above) + this->clearImageColor16Buffer[dstIndex] = clearColorBuffer[srcIndex]; + + //this is tested quite well in the sonic chronicles main map mode + //where depth values are used for trees etc you can walk behind + this->clearImageDepthBuffer[dstIndex] = dsDepthToD24_LUT[clearDepthBuffer[srcIndex] & 0x7FFF]; + + this->clearImageFogBuffer[dstIndex] = BIT15(clearDepthBuffer[srcIndex]); + this->clearImagePolyIDBuffer[dstIndex] = clearFragment.opaquePolyID; + } } } @@ -470,11 +479,13 @@ Render3DError Render3D::VramReconfigureSignal() Render3DError Render3D_SSE2::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551) { - static const __m128i zeroColor = _mm_set1_epi32(0); + const __m128i zero_vec128 = _mm_setzero_si128(); + + size_t i = 0; const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight; const size_t ssePixCount = pixCount - (pixCount % 4); - for (size_t i = 0; i < ssePixCount; i += 4) + for (; i < ssePixCount; i += 4) { // Copy the 
framebufferColor buffer __m128i color = _mm_load_si128((__m128i *)(this->_framebufferColor + i)); @@ -482,16 +493,16 @@ Render3DError Render3D_SSE2::FlushFramebuffer(FragmentColor *__restrict dstRGBA6 // Convert to RGBA5551 __m128i r = _mm_and_si128(color, _mm_set1_epi32(0x0000003E)); // Read from R - r = _mm_srli_epi32(r, 1); // Shift to R + r = _mm_srli_epi32(r, 1); // Shift to R (bit shift within each 32-bit lane) __m128i g = _mm_and_si128(color, _mm_set1_epi32(0x00003E00)); // Read from G - g = _mm_srli_epi32(g, 4); // Shift in G + g = _mm_srli_epi32(g, 4); // Shift in G (bit shift within each 32-bit lane) __m128i b = _mm_and_si128(color, _mm_set1_epi32(0x003E0000)); // Read from B - b = _mm_srli_epi32(b, 7); // Shift to B + b = _mm_srli_epi32(b, 7); // Shift to B (bit shift within each 32-bit lane) __m128i a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A - a = _mm_cmpgt_epi32(a, _mm_set1_epi32(0x00000000)); // Determine A + a = _mm_cmpgt_epi32(a, zero_vec128); // Determine A // From here on, we're going to do an SSE2 trick to pack 32-bit down to unsigned // 16-bit. Since SSE2 only has packssdw (signed 16-bit pack), then the alpha bit @@ -504,21 +515,18 @@ Render3DError Render3D_SSE2::FlushFramebuffer(FragmentColor *__restrict dstRGBA6 // alpha vector with the post-packed color vector to get the final color. a = _mm_and_si128(a, _mm_set1_epi32(0x00004000)); // Mask out the bit before A - a = _mm_packs_epi32(a, zeroColor); // Pack 32-bit down to 16-bit - a = _mm_slli_epi16(a, 1); // Shift the A bit back to where it needs to be + a = _mm_packs_epi32(a, zero_vec128); // Pack 32-bit down to 16-bit + a = _mm_slli_epi16(a, 1); // Shift the A bit back to where it needs to be - // Assemble the RGB colors - color = _mm_or_si128(r, g); - color = _mm_or_si128(color, b); - - // Pack the 32-bit color into a signed 16-bit color, then por the alpha bit back in. - color = _mm_packs_epi32(color, zeroColor); - color = _mm_or_si128(color, a); + // Assemble the RGB colors, pack the 32-bit color into a signed 16-bit color, then por the alpha bit back in. 
+ color = _mm_or_si128(_mm_or_si128(r, g), b); + color = _mm_packs_epi32(color, zero_vec128); + color = _mm_or_si128(color, a); _mm_storel_epi64((__m128i *)(dstRGBA5551 + i), color); } - for (size_t i = ssePixCount; i < pixCount; i++) + for (; i < pixCount; i++) { dstRGBA6665[i] = this->_framebufferColor[i]; dstRGBA5551[i] = R6G6B6TORGB15(this->_framebufferColor[i].r, this->_framebufferColor[i].g, this->_framebufferColor[i].b) | ((this->_framebufferColor[i].a == 0) ? 0x0000 : 0x8000); @@ -560,59 +568,49 @@ Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState) if (xScroll == 0 && yScroll == 0) { + const __m128i depthBitMask_vec128 = _mm_set1_epi16(0x7FFF); + const __m128i fogBufferBitMask_vec128 = _mm_set1_epi16(BIT(15)); const __m128i opaquePolyID_vec128 = _mm_set1_epi8(clearFragment.opaquePolyID); for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i += 16) { - static const __m128i depthBitMask_vec128 = _mm_set1_epi16(0x7FFF); - static const __m128i fogBufferBitMask_vec128 = _mm_set1_epi16(BIT(15)); - // Copy the colors to the color buffer. Since we can only copy 8 elements at once, // we need to load-store twice. _mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i + 8), _mm_load_si128((__m128i *)(clearColorBuffer + i + 8)) ); _mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i), _mm_load_si128((__m128i *)(clearColorBuffer + i)) ); // Write the depth values to the depth buffer. 
- __m128i clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i + 8)); - clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, depthBitMask_vec128); + __m128i clearDepthHi_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i + 8)); + __m128i clearDepthLo_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i)); + clearDepthHi_vec128 = _mm_and_si128(clearDepthHi_vec128, depthBitMask_vec128); + clearDepthLo_vec128 = _mm_and_si128(clearDepthLo_vec128, depthBitMask_vec128); - __m128i depthValue_vec128 = _mm_set_epi32(dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 7)], - dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 6)], - dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 5)], - dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 4)]); - _mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i + 12), depthValue_vec128); - - depthValue_vec128 = _mm_set_epi32(dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 3)], - dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 2)], - dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 1)], - dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 0)]); - _mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i + 8), depthValue_vec128); - - clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i)); - clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, depthBitMask_vec128); - - depthValue_vec128 = _mm_set_epi32(dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 7)], - dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 6)], - dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 5)], - dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 4)]); - _mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i + 4), depthValue_vec128); - - depthValue_vec128 = _mm_set_epi32(dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 3)], - dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 2)], - dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 1)], - 
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 0)]); - _mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i), depthValue_vec128); + this->clearImageDepthBuffer[i+15] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 7)]; + this->clearImageDepthBuffer[i+14] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 6)]; + this->clearImageDepthBuffer[i+13] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 5)]; + this->clearImageDepthBuffer[i+12] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 4)]; + this->clearImageDepthBuffer[i+11] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 3)]; + this->clearImageDepthBuffer[i+10] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 2)]; + this->clearImageDepthBuffer[i+ 9] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 1)]; + this->clearImageDepthBuffer[i+ 8] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 0)]; + this->clearImageDepthBuffer[i+ 7] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 7)]; + this->clearImageDepthBuffer[i+ 6] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 6)]; + this->clearImageDepthBuffer[i+ 5] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 5)]; + this->clearImageDepthBuffer[i+ 4] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 4)]; + this->clearImageDepthBuffer[i+ 3] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 3)]; + this->clearImageDepthBuffer[i+ 2] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 2)]; + this->clearImageDepthBuffer[i+ 1] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 1)]; + this->clearImageDepthBuffer[i+ 0] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 0)]; // Write the fog flags to the fog flag buffer. 
- clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i + 8)); // Read the upper values - clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, fogBufferBitMask_vec128); - const __m128i clearDepthFogBit_vec128 = _mm_srli_epi16(clearDepth_vec128, 15); // Save the upper bits in another register + clearDepthHi_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i + 8)); + clearDepthLo_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i)); + clearDepthHi_vec128 = _mm_and_si128(clearDepthHi_vec128, fogBufferBitMask_vec128); + clearDepthLo_vec128 = _mm_and_si128(clearDepthLo_vec128, fogBufferBitMask_vec128); + clearDepthHi_vec128 = _mm_srli_epi16(clearDepthHi_vec128, 15); // Bit shift within each 16-bit lane moves the fog flag from bit 15 to bit 0 + clearDepthLo_vec128 = _mm_srli_epi16(clearDepthLo_vec128, 15); // Bit shift within each 16-bit lane moves the fog flag from bit 15 to bit 0 - clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i)); // Read the lower values - clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, fogBufferBitMask_vec128); - clearDepth_vec128 = _mm_srli_epi16(clearDepth_vec128, 15); // These are the lower bits - - _mm_store_si128((__m128i *)(this->clearImageFogBuffer + i), _mm_packus_epi16(clearDepth_vec128, clearDepthFogBit_vec128)); + _mm_store_si128((__m128i *)(this->clearImageFogBuffer + i), _mm_packus_epi16(clearDepthLo_vec128, clearDepthHi_vec128)); // The one is easy. Just set the values in the polygon ID buffer. _mm_store_si128((__m128i *)(this->clearImagePolyIDBuffer + i), opaquePolyID_vec128); @@ -620,8 +618,8 @@ Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState) } else { - static const __m128i addrOffset = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); - static const __m128i addrRolloverMask = _mm_set1_epi16(0x00FF); + const __m128i addrOffset = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); + const __m128i addrRolloverMask = _mm_set1_epi16(0x00FF); const __m128i opaquePolyID_vec128 = _mm_set1_epi8(clearFragment.opaquePolyID); for (size_t dstIndex = 0, iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)