From e41857fa826f329fcf5c2c868f40143c8c8472bc Mon Sep 17 00:00:00 2001
From: rogerman <rogerman@users.sf.net>
Date: Mon, 27 Jul 2015 05:06:53 +0000
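Subject: [PATCH] Render3D: - Revert the SSE2 bit shift optimizations that were
 done in r5216. Fixes a regression related to fog, as well as a regression
 that caused a flickering problem in the title screen of Pokemon Ranger:
 Shadows of Almia. (Fixes bug #1487.)

The color and fog conversions need per-element bit shifts, so the
whole-register byte shifts (_mm_slli_si128 / _mm_srli_si128) are replaced
with _mm_slli_epi32 / _mm_srli_epi32 for the 32-bit color channels and
_mm_slli_epi16 / _mm_srli_epi16 for the 16-bit alpha and fog values.

The alpha bit is now derived with _mm_cmpeq_epi32 against zero followed
by _mm_andnot_si128, instead of the signed _mm_cmpgt_epi32 followed by
_mm_and_si128, so the bit is set whenever the alpha byte is non-zero.
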
---
 desmume/src/OGLRender.cpp | 12 ++++++------
 desmume/src/render3D.cpp  | 16 ++++++++--------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp
index cb6bce604..b931887f8 100644
--- a/desmume/src/OGLRender.cpp
+++ b/desmume/src/OGLRender.cpp
@@ -925,19 +925,19 @@ Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA
 			color = _mm_load_si128((__m128i *)(this->_framebufferColor + ir));
 			
 			__m128i b = _mm_and_si128(color, _mm_set1_epi32(0x000000F8));	// Read from R
-			b = _mm_slli_si128(b, 7);										// Shift to B
+			b = _mm_slli_epi32(b, 7);										// Shift to B
 			
 			__m128i g = _mm_and_si128(color, _mm_set1_epi32(0x0000F800));	// Read from G
-			g = _mm_srli_si128(g, 6);										// Shift in G
+			g = _mm_srli_epi32(g, 6);										// Shift in G
 			
 			__m128i r = _mm_and_si128(color, _mm_set1_epi32(0x00F80000));	// Read from B
-			r = _mm_srli_si128(r, 19);										// Shift to R
+			r = _mm_srli_epi32(r, 19);										// Shift to R
 			
 			a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000));			// Read from A
-			a = _mm_cmpgt_epi32(a, _mm_set1_epi32(0x00000000));				// Determine A
-			a = _mm_and_si128(a, _mm_set1_epi32(0x00008000));				// Mask to A
+			a = _mm_cmpeq_epi32(a, _mm_setzero_si128());					// Determine A
+			a = _mm_andnot_si128(a, _mm_set1_epi32(0x00008000));			// Mask to A
 			
-			color = b | g | r | a;
+			color = _mm_or_si128(_mm_or_si128(_mm_or_si128(b, g), r), a);
 			
 			// All the colors are currently placed every other 16 bits, so we need to swizzle them
 			// to the lower 64 bits of our vector before we store them back to memory.
diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp
index 7cafb8723..10d7db7c4 100644
--- a/desmume/src/render3D.cpp
+++ b/desmume/src/render3D.cpp
@@ -493,16 +493,16 @@ Render3DError Render3D_SSE2::FlushFramebuffer(FragmentColor *__restrict dstRGBA6
 		
 		// Convert to RGBA5551
 		__m128i r = _mm_and_si128(color, _mm_set1_epi32(0x0000003E));	// Read from R
-		r = _mm_srli_si128(r, 1);										// Shift to R
+		r = _mm_srli_epi32(r, 1);										// Shift to R
 		
 		__m128i g = _mm_and_si128(color, _mm_set1_epi32(0x00003E00));	// Read from G
-		g = _mm_srli_si128(g, 4);										// Shift in G
+		g = _mm_srli_epi32(g, 4);										// Shift in G
 		
 		__m128i b = _mm_and_si128(color, _mm_set1_epi32(0x003E0000));	// Read from B
-		b = _mm_srli_si128(b, 7);										// Shift to B
+		b = _mm_srli_epi32(b, 7);										// Shift to B
 		
 		__m128i a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000));	// Read from A
-		a = _mm_cmpgt_epi32(a, zero_vec128);							// Determine A
+		a = _mm_cmpeq_epi32(a, zero_vec128);							// Determine A
 		
 		// From here on, we're going to do an SSE2 trick to pack 32-bit down to unsigned
 		// 16-bit. Since SSE2 only has packssdw (signed 16-bit pack), then the alpha bit
@@ -514,9 +514,9 @@ Render3DError Render3D_SSE2::FlushFramebuffer(FragmentColor *__restrict dstRGBA6
 		// packssdw, then shift the bit back to its original position. Then we por the
 		// alpha vector with the post-packed color vector to get the final color.
 		
-		a = _mm_and_si128(a, _mm_set1_epi32(0x00004000));				// Mask out the bit before A
+		a = _mm_andnot_si128(a, _mm_set1_epi32(0x00004000));			// Mask out the bit before A
 		a = _mm_packs_epi32(a, zero_vec128);							// Pack 32-bit down to 16-bit
-		a = _mm_slli_si128(a, 1);										// Shift the A bit back to where it needs to be
+		a = _mm_slli_epi16(a, 1);										// Shift the A bit back to where it needs to be
 		
 		// Assemble the RGB colors, pack the 32-bit color into a signed 16-bit color, then por the alpha bit back in.
 		color = _mm_or_si128(_mm_or_si128(r, g), b);
@@ -607,8 +607,8 @@ Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)
 				clearDepthLo_vec128 = _mm_loadu_si128((__m128i *)(clearDepthBuffer + i));
 				clearDepthHi_vec128 = _mm_and_si128(clearDepthHi_vec128, fogBufferBitMask_vec128);
 				clearDepthLo_vec128 = _mm_and_si128(clearDepthLo_vec128, fogBufferBitMask_vec128);
-				clearDepthHi_vec128 = _mm_srli_si128(clearDepthHi_vec128, 15);
-				clearDepthLo_vec128 = _mm_srli_si128(clearDepthLo_vec128, 15);
+				clearDepthHi_vec128 = _mm_srli_epi16(clearDepthHi_vec128, 15);
+				clearDepthLo_vec128 = _mm_srli_epi16(clearDepthLo_vec128, 15);
 				
 				_mm_store_si128((__m128i *)(this->clearImageFogBuffer + i), _mm_packus_epi16(clearDepthLo_vec128, clearDepthHi_vec128));