diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp
index 04e356722..a288a6adc 100644
--- a/desmume/src/OGLRender.cpp
+++ b/desmume/src/OGLRender.cpp
@@ -938,7 +938,12 @@ Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA
 			a = _mm_andnot_si128(a, _mm_set1_epi32(0x00008000));			// Mask to A
 			
 			color = _mm_or_si128(_mm_or_si128(_mm_or_si128(b, g), r), a);
-			color = _mm_packs_epi32(color, _mm_setzero_si128());
+			
+			// All the colors are currently placed on 32-bit boundaries, so we need to swizzle them
+			// to the lower 64 bits of our vector before we store them back to memory.
+			// Note: Do not attempt to use packssdw here since packing with the 0x8000 bit set will
+			// result in values of 0x7FFF, which are incorrect values in this case.
+			color = _mm_shuffle_epi8(color, _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
 			_mm_storel_epi64((__m128i *)(dstRGBA5551 + iw), color);
 		}
 #endif // defined(ENABLE_SSSE3) && defined(LOCAL_LE)
diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp
index ac31258b3..3430027c0 100644
--- a/desmume/src/render3D.cpp
+++ b/desmume/src/render3D.cpp
@@ -529,11 +529,28 @@ Render3DError Render3D_SSE2::FlushFramebuffer(FragmentColor *__restrict dstRGBA6
 		
 		__m128i a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000));	// Read from A
 		a = _mm_cmpeq_epi32(a, zero_vec128);							// Determine A
-		a = _mm_andnot_si128(a, _mm_set1_epi16(0x00008000));			// Retrieve the alpha bit
 		
-		// Assemble, pack, and store the color.
-		color = _mm_or_si128(_mm_or_si128(_mm_or_si128(r, g), b), a);
+		// From here on, we're going to do an SSE2 trick to pack 32-bit down to unsigned
+		// 16-bit. Since SSE2 only has packssdw (signed saturated 16-bit pack), using
+		// packssdw on the alpha bit (0x8000) will result in a value of 0x7FFF, which is
+		// incorrect. Now if we were to use SSE4.1's packusdw (unsigned saturated 16-bit
+		// pack), we wouldn't have to go through this hassle. But not everyone has an
+		// SSE4.1-capable CPU, so doing this the SSE2 way is more guaranteed to work for
+		// everyone's CPU.
+		//
+		// To use packssdw, we represent the alpha bit one position lower (0x4000), run
+		// packssdw, then shift it left by one, back to its real position (0x8000). Then
+		// we por the alpha vector with the post-packed color vector to get the final color.
+		
+		a = _mm_andnot_si128(a, _mm_set1_epi32(0x00004000));			// Mask out the bit before A
+		a = _mm_packs_epi32(a, zero_vec128);							// Pack 32-bit down to 16-bit
+		a = _mm_slli_epi16(a, 1);										// Shift the A bit back to where it needs to be
+		
+		// Assemble the RGB colors, pack the 32-bit color into a signed 16-bit color, then por the alpha bit back in.
+		color = _mm_or_si128(_mm_or_si128(r, g), b);
 		color = _mm_packs_epi32(color, zero_vec128);
+		color = _mm_or_si128(color, a);
+		
 		_mm_storel_epi64((__m128i *)(dstRGBA5551 + i), color);
 	}