From e41857fa826f329fcf5c2c868f40143c8c8472bc Mon Sep 17 00:00:00 2001 From: rogerman Date: Mon, 27 Jul 2015 05:06:53 +0000 Subject: [PATCH] Render3D: - Revert the SSE2 bit shift optimizations that were done in r5216. Fixes a regression related to fog, as well as a regression that caused a flickering problem in the title screen of Pokemon Ranger: Shadows of Almia. (Fixes bug #1487.) --- desmume/src/OGLRender.cpp | 12 ++++++------ desmume/src/render3D.cpp | 16 ++++++++-------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index cb6bce604..b931887f8 100644 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -925,19 +925,19 @@ Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA color = _mm_load_si128((__m128i *)(this->_framebufferColor + ir)); __m128i b = _mm_and_si128(color, _mm_set1_epi32(0x000000F8)); // Read from R - b = _mm_slli_si128(b, 7); // Shift to B + b = _mm_slli_epi32(b, 7); // Shift to B __m128i g = _mm_and_si128(color, _mm_set1_epi32(0x0000F800)); // Read from G - g = _mm_srli_si128(g, 6); // Shift in G + g = _mm_srli_epi32(g, 6); // Shift in G __m128i r = _mm_and_si128(color, _mm_set1_epi32(0x00F80000)); // Read from B - r = _mm_srli_si128(r, 19); // Shift to R + r = _mm_srli_epi32(r, 19); // Shift to R a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A - a = _mm_cmpgt_epi32(a, _mm_set1_epi32(0x00000000)); // Determine A - a = _mm_and_si128(a, _mm_set1_epi32(0x00008000)); // Mask to A + a = _mm_cmpeq_epi32(a, _mm_setzero_si128()); // Determine A + a = _mm_andnot_si128(a, _mm_set1_epi32(0x00008000)); // Mask to A - color = b | g | r | a; + color = _mm_or_si128(_mm_or_si128(_mm_or_si128(b, g), r), a); // All the colors are currently placed every other 16 bits, so we need to swizzle them // to the lower 64 bits of our vector before we store them back to memory. diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp index 7cafb8723..10d7db7c4 100644 --- a/desmume/src/render3D.cpp +++ b/desmume/src/render3D.cpp @@ -493,16 +493,16 @@ Render3DError Render3D_SSE2::FlushFramebuffer(FragmentColor *__restrict dstRGBA6 // Convert to RGBA5551 __m128i r = _mm_and_si128(color, _mm_set1_epi32(0x0000003E)); // Read from R - r = _mm_srli_si128(r, 1); // Shift to R + r = _mm_srli_epi32(r, 1); // Shift to R __m128i g = _mm_and_si128(color, _mm_set1_epi32(0x00003E00)); // Read from G - g = _mm_srli_si128(g, 4); // Shift in G + g = _mm_srli_epi32(g, 4); // Shift in G __m128i b = _mm_and_si128(color, _mm_set1_epi32(0x003E0000)); // Read from B - b = _mm_srli_si128(b, 7); // Shift to B + b = _mm_srli_epi32(b, 7); // Shift to B __m128i a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A - a = _mm_cmpgt_epi32(a, zero_vec128); // Determine A + a = _mm_cmpeq_epi32(a, zero_vec128); // Determine A // From here on, we're going to do an SSE2 trick to pack 32-bit down to unsigned // 16-bit. Since SSE2 only has packssdw (signed 16-bit pack), then the alpha bit @@ -514,9 +514,9 @@ Render3DError Render3D_SSE2::FlushFramebuffer(FragmentColor *__restrict dstRGBA6 // packssdw, then shift the bit back to its original position. Then we por the // alpha vector with the post-packed color vector to get the final color. - a = _mm_and_si128(a, _mm_set1_epi32(0x00004000)); // Mask out the bit before A + a = _mm_andnot_si128(a, _mm_set1_epi32(0x00004000)); // Mask out the bit before A a = _mm_packs_epi32(a, zero_vec128); // Pack 32-bit down to 16-bit - a = _mm_slli_si128(a, 1); // Shift the A bit back to where it needs to be + a = _mm_slli_epi16(a, 1); // Shift the A bit back to where it needs to be // Assemble the RGB colors, pack the 32-bit color into a signed 16-bit color, then por the alpha bit back in. color = _mm_or_si128(_mm_or_si128(r, g), b); @@ -607,8 +607,8 @@ Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState) clearDepthLo_vec128 = _mm_loadu_si128((__m128i *)(clearDepthBuffer + i)); clearDepthHi_vec128 = _mm_and_si128(clearDepthHi_vec128, fogBufferBitMask_vec128); clearDepthLo_vec128 = _mm_and_si128(clearDepthLo_vec128, fogBufferBitMask_vec128); - clearDepthHi_vec128 = _mm_srli_si128(clearDepthHi_vec128, 15); - clearDepthLo_vec128 = _mm_srli_si128(clearDepthLo_vec128, 15); + clearDepthHi_vec128 = _mm_srli_epi16(clearDepthHi_vec128, 15); + clearDepthLo_vec128 = _mm_srli_epi16(clearDepthLo_vec128, 15); _mm_store_si128((__m128i *)(this->clearImageFogBuffer + i), _mm_packus_epi16(clearDepthLo_vec128, clearDepthHi_vec128));