diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp
index 4a4c8da94..b632749c6 100644
--- a/desmume/src/GPU.cpp
+++ b/desmume/src/GPU.cpp
@@ -2206,7 +2206,7 @@ PLAIN_CLEAR:
 	if (gpu->LayersEnable[4]) 
 	{
 		//n.b. - this is clearing the sprite line buffer to the background color,
-		memset_u16(gpu->sprColor, backdrop_color, GPU_FRAMEBUFFER_NATIVE_WIDTH);
+		memset_u16_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>(gpu->sprColor, backdrop_color);
 		
 		//zero 06-may-09: I properly supported window color effects for backdrop, but I am not sure
 		//how it interacts with this. I wish we knew why we needed this
@@ -2531,14 +2531,14 @@ static INLINE void GPU_RenderLine_MasterBrightness(const GPUMasterBrightMode mod
 		{
 			if (factor < 16)
 			{
-#ifdef ENABLE_SSE2
-				static size_t ssePixCount = pixCount - (pixCount % 4);
-				static const __m128i colorMask = _mm_set1_epi16(0x7FFF);
+				size_t i = 0;
 				
-				for (size_t i = 0; i < ssePixCount; i += 8)
+#ifdef ENABLE_SSE2
+				const size_t ssePixCount = pixCount - (pixCount % 8);
+				for (; i < ssePixCount; i += 8)
 				{
 					__m128i dstColor_vec128 = _mm_load_si128((__m128i *)(dstLine + i));
-					dstColor_vec128 = _mm_and_si128(dstColor_vec128, colorMask);
+					dstColor_vec128 = _mm_and_si128(dstColor_vec128, _mm_set1_epi16(0x7FFF));
 					
 					dstLine[i+7] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 7) ];
 					dstLine[i+6] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 6) ];
@@ -2549,17 +2549,11 @@ static INLINE void GPU_RenderLine_MasterBrightness(const GPUMasterBrightMode mod
 					dstLine[i+1] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 1) ];
 					dstLine[i+0] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 0) ];
 				}
-				
-				for (size_t i = ssePixCount; i < pixCount; i++)
-				{
-					dstLine[i] = fadeInColors[factor][ dstLine[i] & 0x7FFF ];
-				}
-#else
-				for (size_t i = 0; i < pixCount; i++)
-				{
-					dstLine[i] = fadeInColors[factor][ dstLine[i] & 0x7FFF ];
-				}
 #endif
+				for (; i < pixCount; i++)
+				{
+					dstLine[i] = fadeInColors[factor][ dstLine[i] & 0x7FFF ];
+				}
 			}
 			else
 			{
@@ -2573,14 +2567,14 @@ static INLINE void GPU_RenderLine_MasterBrightness(const GPUMasterBrightMode mod
 		{
 			if (factor < 16)
 			{
-#ifdef ENABLE_SSE2
-				static size_t ssePixCount = pixCount - (pixCount % 4);
-				static const __m128i colorMask = _mm_set1_epi16(0x7FFF);
+				size_t i = 0;
 				
-				for (size_t i = 0; i < ssePixCount; i += 8)
+#ifdef ENABLE_SSE2
+				const size_t ssePixCount = pixCount - (pixCount % 8);
+				for (; i < ssePixCount; i += 8)
 				{
 					__m128i dstColor_vec128 = _mm_load_si128((__m128i *)(dstLine + i));
-					dstColor_vec128 = _mm_and_si128(dstColor_vec128, colorMask);
+					dstColor_vec128 = _mm_and_si128(dstColor_vec128, _mm_set1_epi16(0x7FFF));
 					
 					dstLine[i+7] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 7) ];
 					dstLine[i+6] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 6) ];
@@ -2591,17 +2585,11 @@ static INLINE void GPU_RenderLine_MasterBrightness(const GPUMasterBrightMode mod
 					dstLine[i+1] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 1) ];
 					dstLine[i+0] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 0) ];
 				}
-				
-				for (size_t i = ssePixCount; i < pixCount; i++)
-				{
-					dstLine[i] = fadeOutColors[factor][ dstLine[i] & 0x7FFF ];
-				}
-#else
-				for (size_t i = 0; i < pixCount; i++)
-				{
-					dstLine[i] = fadeOutColors[factor][ dstLine[i] & 0x7FFF ];
-				}
 #endif
+				for (; i < pixCount; i++)
+				{
+					dstLine[i] = fadeOutColors[factor][ dstLine[i] & 0x7FFF ];
+				}
 			}
 			else
 			{
@@ -2614,7 +2602,6 @@ static INLINE void GPU_RenderLine_MasterBrightness(const GPUMasterBrightMode mod
 		case GPUMasterBrightMode_Reserved:
 			break;
 	}
-	
 }
 
 template<size_t WIN_NUM>
@@ -2818,10 +2805,21 @@ void GPU_RenderLine(NDS_Screen *screen, const u16 l, bool skip)
 			{
 				//this has not been tested since the dma timing for dispfifo was changed around the time of
 				//newemuloop. it may not work.
-				for (size_t i = 0; i < 128; i++)
+#ifdef ENABLE_SSE2
+				const __m128i fifoMask = _mm_set1_epi32(0x7FFF7FFF);
+				for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(__m128i); i++)
+				{
+					// Pop the four FIFO words through separate, ordered initializers. C++ leaves the evaluation order of
+					// function arguments unspecified, so calling DISP_FIFOrecv() four times inside _mm_set_epi32() would
+					// produce a compiler-dependent pixel order. The first word read goes to the lowest address, like the scalar path.
+					const u32 fifo0 = DISP_FIFOrecv(), fifo1 = DISP_FIFOrecv(), fifo2 = DISP_FIFOrecv(), fifo3 = DISP_FIFOrecv();
+					_mm_store_si128((__m128i *)dstLine + i, _mm_and_si128(_mm_set_epi32(fifo3, fifo2, fifo1, fifo0), fifoMask));
+				}
+#else
+				for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++)
 				{
 					((u32 *)dstLine)[i] = DISP_FIFOrecv() & 0x7FFF7FFF;
 				}
+#endif
 				
 				if (_gpuFramebufferWidth != GPU_FRAMEBUFFER_NATIVE_WIDTH)
 				{
diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp
index f04ca0bbc..cb6bce604 100644
--- a/desmume/src/OGLRender.cpp
+++ b/desmume/src/OGLRender.cpp
@@ -893,7 +893,6 @@ void OpenGLRenderer::SetVersion(unsigned int major, unsigned int minor, unsigned
 	this->versionRevision = revision;
 }
 
-#if defined(ENABLE_SSSE3) && defined(LOCAL_LE)
 Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
 {
 	// Convert from 32-bit BGRA8888 format to 32-bit RGBA6665 reversed format. OpenGL
@@ -905,7 +904,10 @@ Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA
 	
 	for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
 	{
-		for (size_t x = 0; x < ssePixCount; x+=4, ir+=4, iw+=4)
+		size_t x = 0;
+		
+#if defined(ENABLE_SSSE3) && defined(LOCAL_LE)
+		for (; x < ssePixCount; x += 4, ir += 4, iw += 4)
 		{
 			// Convert to RGBA6665
 			__m128i color = _mm_load_si128((__m128i *)(this->_framebufferColor + ir));
@@ -923,65 +925,42 @@ Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA
 			color = _mm_load_si128((__m128i *)(this->_framebufferColor + ir));
 			
 			__m128i b = _mm_and_si128(color, _mm_set1_epi32(0x000000F8));	// Read from R
-			b = _mm_slli_epi32(b, 7);										// Shift to B
+			b = _mm_slli_epi32(b, 7);										// Shift to B (bit shift per 32-bit lane; _mm_slli_si128 would shift the register by bytes)
 			
 			__m128i g = _mm_and_si128(color, _mm_set1_epi32(0x0000F800));	// Read from G
-			g = _mm_srli_epi32(g, 6);										// Shift in G
+			g = _mm_srli_epi32(g, 6);										// Shift in G (bit shift per 32-bit lane)
 			
 			__m128i r = _mm_and_si128(color, _mm_set1_epi32(0x00F80000));	// Read from B
-			r = _mm_srli_epi32(r, 19);										// Shift to R
+			r = _mm_srli_epi32(r, 19);										// Shift to R (bit shift per 32-bit lane)
 			
 			a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000));			// Read from A
 			a = _mm_cmpgt_epi32(a, _mm_set1_epi32(0x00000000));				// Determine A
 			a = _mm_and_si128(a, _mm_set1_epi32(0x00008000));				// Mask to A
 			
-			color = _mm_or_si128(b, g);
-			color = _mm_or_si128(color, r);
-			color = _mm_or_si128(color, a);
+			color = _mm_or_si128(_mm_or_si128(b, g), _mm_or_si128(r, a));	// Intrinsics only: operator| on __m128i is a GCC/Clang extension
 			
 			// All the colors are currently placed every other 16 bits, so we need to swizzle them
 			// to the lower 64 bits of our vector before we store them back to memory.
 			color = _mm_shuffle_epi8(color, _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
 			_mm_storel_epi64((__m128i *)(dstRGBA5551 + iw), color);
 		}
+#endif // defined(ENABLE_SSSE3) && defined(LOCAL_LE)
 		
-		for (size_t x = ssePixCount; x < pixCount; x++, ir++, iw++)
-		{
-			dstRGBA6665[iw].color = BGRA8888_32Rev_To_RGBA6665_32Rev(this->_framebufferColor[ir].color);
-			dstRGBA5551[iw] = R5G5B5TORGB15((this->_framebufferColor[ir].b >> 3) & 0x1F,
-											(this->_framebufferColor[ir].g >> 3) & 0x1F,
-											(this->_framebufferColor[ir].r >> 3) & 0x1F) |
-											((this->_framebufferColor[ir].a == 0) ? 0x0000 : 0x8000);
-		}
-	}
-	
-	return RENDER3DERROR_NOERR;
-}
-
-#else // Code path where SSSE3 or little-endian is not supported
-
-Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
-{
-	// Convert from 32-bit BGRA8888 format to 32-bit RGBA6665 reversed format. OpenGL
-	// stores pixels using a flipped Y-coordinate, so this needs to be flipped back
-	// to the DS Y-coordinate.
-	for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
-	{
-		for (size_t x = 0; x < this->_framebufferWidth; x++, ir++, iw++)
+		for (; x < pixCount; x++, ir++, iw++)
 		{
 			// Use the correct endian format since OpenGL uses the native endian of
 			// the architecture it is running on.
-#ifdef WORDS_BIGENDIAN
+#ifdef LOCAL_BE
 			dstRGBA6665[iw].color = BGRA8888_32_To_RGBA6665_32(this->_framebufferColor[ir].color);
-			dstRGBA5551[iw] = R5G5B5TORGB15((this->_framebufferColor[ir].b >> 3) & 0x1F,
-											(this->_framebufferColor[ir].g >> 3) & 0x1F,
-											(this->_framebufferColor[ir].r >> 3) & 0x1F) |
+			dstRGBA5551[iw] = R5G5B5TORGB15( (this->_framebufferColor[ir].b >> 3) & 0x1F,
+											 (this->_framebufferColor[ir].g >> 3) & 0x1F,
+											 (this->_framebufferColor[ir].r >> 3) & 0x1F) |
 											((this->_framebufferColor[ir].a == 0) ? 0x0000 : 0x8000);
 #else
 			dstRGBA6665[iw].color = BGRA8888_32Rev_To_RGBA6665_32Rev(this->_framebufferColor[ir].color);
-			dstRGBA5551[iw] = R5G5B5TORGB15((this->_framebufferColor[ir].b >> 3) & 0x1F,
-											(this->_framebufferColor[ir].g >> 3) & 0x1F,
-											(this->_framebufferColor[ir].r >> 3) & 0x1F) |
+			dstRGBA5551[iw] = R5G5B5TORGB15( (this->_framebufferColor[ir].b >> 3) & 0x1F,
+											 (this->_framebufferColor[ir].g >> 3) & 0x1F,
+											 (this->_framebufferColor[ir].r >> 3) & 0x1F) |
 											((this->_framebufferColor[ir].a == 0) ? 0x0000 : 0x8000);
 #endif
 		}
@@ -990,8 +969,6 @@ Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA
 	return RENDER3DERROR_NOERR;
 }
 
-#endif // defined(ENABLE_SSSE3) && defined(LOCAL_LE)
-
 OpenGLRenderer_1_2::~OpenGLRenderer_1_2()
 {
 	glFinish();
@@ -1902,7 +1879,7 @@ Render3DError OpenGLRenderer_1_2::UploadClearImage(const u16 *__restrict colorBu
 	}
 	else
 	{
-		for (size_t i = 0; i < this->_framebufferWidth * this->_framebufferHeight; i++)
+		for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++)
 		{
 			OGLRef.workingCIDepthStencilBuffer[i] = depthBuffer[i] << 8;
 		}
@@ -2782,6 +2759,11 @@ Render3DError OpenGLRenderer_1_2::SetFramebufferSize(size_t w, size_t h)
 		return OGLERROR_NOERR;
 	}
 	
+	if (!BEGINGL())
+	{
+		return OGLERROR_BEGINGL_FAILED;
+	}
+	
 	if (this->isFBOSupported)
 	{
 		glActiveTextureARB(GL_TEXTURE0_ARB + OGLTextureUnitID_GColor);
@@ -2843,6 +2825,8 @@ Render3DError OpenGLRenderer_1_2::SetFramebufferSize(size_t w, size_t h)
 	
 	free_aligned(oldFramebufferColor);
 	
+	ENDGL();
+	
 	return OGLERROR_NOERR;
 }
 
@@ -2892,7 +2876,7 @@ Render3DError OpenGLRenderer_1_3::UploadClearImage(const u16 *__restrict colorBu
 	}
 	else
 	{
-		for (size_t i = 0; i < this->_framebufferWidth * this->_framebufferHeight; i++)
+		for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++)
 		{
 			OGLRef.workingCIDepthStencilBuffer[i] = depthBuffer[i] << 8;
 		}
@@ -2931,6 +2915,11 @@ Render3DError OpenGLRenderer_1_3::SetFramebufferSize(size_t w, size_t h)
 		return OGLERROR_NOERR;
 	}
 	
+	if (!BEGINGL())
+	{
+		return OGLERROR_BEGINGL_FAILED;
+	}
+	
 	if (this->isFBOSupported)
 	{
 		glActiveTexture(GL_TEXTURE0 + OGLTextureUnitID_GColor);
@@ -2992,6 +2981,8 @@ Render3DError OpenGLRenderer_1_3::SetFramebufferSize(size_t w, size_t h)
 	
 	free_aligned(oldFramebufferColor);
 	
+	ENDGL();
+	
 	return OGLERROR_NOERR;
 }
 
diff --git a/desmume/src/OGLRender_3_2.cpp b/desmume/src/OGLRender_3_2.cpp
index e876fc6a4..148680123 100644
--- a/desmume/src/OGLRender_3_2.cpp
+++ b/desmume/src/OGLRender_3_2.cpp
@@ -1148,7 +1148,7 @@ Render3DError OpenGLRenderer_3_2::BeginRender(const GFX3D &engine)
 {
 	OGLRenderRef &OGLRef = *this->ref;
 	
-	if(!BEGINGL())
+	if (!BEGINGL())
 	{
 		return OGLERROR_BEGINGL_FAILED;
 	}
@@ -1560,6 +1560,11 @@ Render3DError OpenGLRenderer_3_2::SetFramebufferSize(size_t w, size_t h)
 		return OGLERROR_NOERR;
 	}
 	
+	if (!BEGINGL())
+	{
+		return OGLERROR_BEGINGL_FAILED;
+	}
+	
 	glActiveTexture(GL_TEXTURE0 + OGLTextureUnitID_GColor);
 	glBindTexture(GL_TEXTURE_2D, OGLRef.texGDepthStencilID);
 	glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH24_STENCIL8, w, h, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, NULL);
@@ -1615,5 +1620,7 @@ Render3DError OpenGLRenderer_3_2::SetFramebufferSize(size_t w, size_t h)
 	
 	free_aligned(oldFramebufferColor);
 	
+	ENDGL();
+	
 	return OGLERROR_NOERR;
 }
diff --git a/desmume/src/matrix.h b/desmume/src/matrix.h
index ba27a3c9f..e6eb5862e 100644
--- a/desmume/src/matrix.h
+++ b/desmume/src/matrix.h
@@ -125,23 +125,37 @@ static void memset_u16(void *dst, const u16 val, const size_t length)
 	__m128i *dst_vec128 = (__m128i *)dst;
 	const __m128i val_vec128 = _mm_set1_epi16(val);
 	const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val));
-	//MACRODO_N(length_vec128, (dst_vec128[X] = val_vec128));
 	
 	for (size_t i = 0; i < length_vec128; i++)
 		_mm_stream_si128(dst_vec128 + i, val_vec128);
 }
 
+template <size_t LENGTH> // LENGTH: compile-time count of u16 elements to fill, starting at dst
+static void memset_u16_fast(void *dst, const u16 val)
+{
+	__m128i *dst_vec128 = (__m128i *)dst; // The _mm_store_si128 below requires dst to be 16-byte aligned
+	const __m128i val_vec128 = _mm_set1_epi16(val); // val replicated into every 16-bit lane
+	MACRODO_N(LENGTH / (sizeof(val_vec128) / sizeof(val)), _mm_store_si128(dst_vec128 + (X), val_vec128)); // One store per whole vector; elements past the last whole vector are not written. Presumably MACRODO_N repeats the statement with X as the index -- confirm against its definition
+}
+
 static void memset_u32(void *dst, const u32 val, const size_t length)
 {
 	__m128i *dst_vec128 = (__m128i *)dst;
 	const __m128i val_vec128 = _mm_set1_epi32(val);
 	const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val));
-	//MACRODO_N(length_vec128, (dst_vec128[X] = val_vec128));
 	
 	for (size_t i = 0; i < length_vec128; i++)
 		_mm_stream_si128(dst_vec128 + i, val_vec128);
 }
 
+template <size_t LENGTH> // LENGTH: compile-time count of u32 elements to fill, starting at dst
+static void memset_u32_fast(void *dst, const u32 val)
+{
+	__m128i *dst_vec128 = (__m128i *)dst; // The _mm_store_si128 below requires dst to be 16-byte aligned
+	const __m128i val_vec128 = _mm_set1_epi32(val); // val replicated into every 32-bit lane
+	MACRODO_N(LENGTH / (sizeof(val_vec128) / sizeof(val)), _mm_store_si128(dst_vec128 + (X), val_vec128)); // One store per whole vector; elements past the last whole vector are not written. Presumably MACRODO_N repeats the statement with X as the index -- confirm against its definition
+}
+
 #else //no sse2
 
 static void memset_u16(void *dst, const u16 val, const size_t length)
@@ -150,7 +164,6 @@ static void memset_u16(void *dst, const u16 val, const size_t length)
 	u64 *dst_u64 = (u64 *)dst;
 	const u64 val_u64 = ((u64)val << 48) | ((u64)val << 32) | ((u64)val << 16) | (u64)val;
 	const size_t length_u64 = length / (sizeof(val_u64) / sizeof(val));
-	//MACRODO_N(length_u64, (dst_u64[X] = val_u64));
 	
 	for (size_t i = 0; i < length_u64; i++)
 		dst_u64[i] = val_u64;
@@ -160,13 +173,25 @@ static void memset_u16(void *dst, const u16 val, const size_t length)
 #endif
 }
 
+template <size_t LENGTH> // LENGTH: compile-time count of u16 elements to fill, starting at dst
+static void memset_u16_fast(void *dst, const u16 val)
+{
+#ifdef HOST_64
+	u64 *dst_u64 = (u64 *)dst; // Filled one u64 at a time; NOTE(review): assumes dst is suitably aligned for u64 -- confirm with callers
+	const u64 val_u64 = ((u64)val << 48) | ((u64)val << 32) | ((u64)val << 16) | (u64)val; // Four copies of val packed into one 64-bit word
+	MACRODO_N(LENGTH / (sizeof(val_u64) / sizeof(val)), (dst_u64[(X)] = val_u64)); // One store per whole u64; elements past the last whole u64 are not written
+#else
+	for (size_t i = 0; i < LENGTH; i++) // Fallback: plain element-by-element fill
+		((u16 *)dst)[i] = val;
+#endif
+}
+
 static void memset_u32(void *dst, const u32 val, const size_t length)
 {
 #ifdef HOST_64
 	u64 *dst_u64 = (u64 *)dst;
 	const u64 val_u64 = ((u64)val << 32) | (u64)val;
 	const size_t length_u64 = length / (sizeof(val_u64) / sizeof(val));
-	//MACRODO_N(length_u64, (dst_u64[X] = val_u64));
 	
 	for (size_t i = 0; i < length_u64; i++)
 		dst_u64[i] = val_u64;
@@ -176,7 +201,20 @@ static void memset_u32(void *dst, const u32 val, const size_t length)
 #endif
 }
 
+template <size_t LENGTH> // LENGTH: compile-time count of u32 elements to fill, starting at dst
+static void memset_u32_fast(void *dst, const u32 val)
+{
+#ifdef HOST_64
+	u64 *dst_u64 = (u64 *)dst; // Filled one u64 at a time; NOTE(review): assumes dst is suitably aligned for u64 -- confirm with callers
+	const u64 val_u64 = ((u64)val << 32) | (u64)val; // Two copies of val packed into one 64-bit word
+	MACRODO_N(LENGTH / (sizeof(val_u64) / sizeof(val)), (dst_u64[(X)] = val_u64)); // One store per whole u64; elements past the last whole u64 are not written
+#else
+	for (size_t i = 0; i < LENGTH; i++) // Fallback: plain element-by-element fill
+		((u32 *)dst)[i] = val; // Must store through u32*: a u16* store truncates val and fills only half of the buffer
+#endif
+}
+
+#endif // ENABLE_SSE2
 
 // NOSSE version always used in gfx3d.cpp
 void _NOSSE_MatrixMultVec4x4 (const float *matrix, float *vecPtr);
@@ -233,8 +271,6 @@ FORCEINLINE void MatrixMultiply(float * matrix, const float * rightMatrix)
 	_mm_store_ps(matrix+12,row3);
 }
 
-
-
 FORCEINLINE void MatrixMultVec4x4(const float *matrix, float *vecPtr)
 {
 	_mm_store_ps(vecPtr,_util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(vecPtr)));
@@ -311,18 +347,6 @@ FORCEINLINE void vector_fix2float(float* matrix, const float divisor)
 		_mm_store_ps(matrix+12,_mm_div_ps(_mm_load_ps(matrix+12),val));
 }
 
-//WARNING: I do not think this is as fast as a memset, for some reason.
-//at least in vc2005 with sse enabled. better figure out why before using it
-template<int NUM>
-static FORCEINLINE void memset_u8(void* _dst, u8 val)
-{
-	memset(_dst,val,NUM);
-	//const u8* dst = (u8*)_dst;
-	//u32 u32val = (val<<24)|(val<<16)|(val<<8)|val;
-	//const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
-	//MACRODO_N(NUM/16,_mm_store_si128((__m128i*)(dst+(X)*16), temp));
-}
-
 #else //no sse
 
 void MatrixMultVec4x4 (const float *matrix, float *vecPtr);
@@ -345,12 +369,6 @@ FORCEINLINE void vector_fix2float(float* matrix, const float divisor)
 		matrix[i] /= divisor;
 }
 
-template<int NUM>
-static FORCEINLINE void memset_u8(void* dst, u8 val)
-{
-	memset(dst,val,NUM);
-}
-
 #endif //switched SSE functions
 
 void MatrixMultVec4x4 (const s32 *matrix, s32 *vecPtr);
@@ -360,5 +378,5 @@ void MatrixMultVec4x4_M2(const s32 *matrix, s32 *vecPtr);
 void MatrixMultiply(s32* matrix, const s32* rightMatrix);
 void MatrixScale(s32 *matrix, const s32 *ptr);
 void MatrixTranslate(s32 *matrix, const s32 *ptr);
-#endif
 
+#endif // MATRIX_H
diff --git a/desmume/src/rasterize.cpp b/desmume/src/rasterize.cpp
index 7b4719eb8..8df34f24f 100644
--- a/desmume/src/rasterize.cpp
+++ b/desmume/src/rasterize.cpp
@@ -577,7 +577,6 @@ public:
 		FragmentColor shaderOutput;
 		bool isOpaquePixel;
 		
-		//FragmentColor &dstColor				= this->_softRender->GetFramebuffer()[fragmentIndex];
 		u32 &dstAttributeDepth				= this->_softRender->_framebufferAttributes->depth[fragmentIndex];
 		u8 &dstAttributeOpaquePolyID		= this->_softRender->_framebufferAttributes->opaquePolyID[fragmentIndex];
 		u8 &dstAttributeTranslucentPolyID	= this->_softRender->_framebufferAttributes->translucentPolyID[fragmentIndex];
@@ -2063,9 +2062,6 @@ Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor
 	convertedClearColor.g = GFX3D_5TO6(clearColor.g);
 	convertedClearColor.b = GFX3D_5TO6(clearColor.b);
 	
-	const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight;
-	const size_t ssePixCount = pixCount - (pixCount % 16);
-	
 	const __m128i color_vec128					= _mm_set1_epi32(convertedClearColor.color);
 	const __m128i attrDepth_vec128				= _mm_set1_epi32(clearAttributes.depth);
 	const __m128i attrOpaquePolyID_vec128		= _mm_set1_epi8(clearAttributes.opaquePolyID);
@@ -2074,7 +2070,11 @@ Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor
 	const __m128i attrIsFogged_vec128			= _mm_set1_epi8(clearAttributes.isFogged);
 	const __m128i attrIsTranslucentPoly_vec128	= _mm_set1_epi8(clearAttributes.isTranslucentPoly);
 	
-	for (size_t i = 0; i < ssePixCount; i += 16)
+	size_t i = 0;
+	const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight;
+	const size_t ssePixCount = pixCount - (pixCount % 16);
+	
+	for (; i < ssePixCount; i += 16)
 	{
 		_mm_stream_si128((__m128i *)(this->_framebufferColor + i +  0), color_vec128);
 		_mm_stream_si128((__m128i *)(this->_framebufferColor + i +  4), color_vec128);
@@ -2093,7 +2093,7 @@ Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor
 		_mm_stream_si128((__m128i *)(this->_framebufferAttributes->isTranslucentPoly + i), attrIsTranslucentPoly_vec128);
 	}
 	
-	for (size_t i = ssePixCount; i < pixCount; i++)
+	for (; i < pixCount; i++)
 	{
 		this->_framebufferColor[i] = convertedClearColor;
 		this->_framebufferAttributes->SetAtIndex(i, clearAttributes);
diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp
index 0933f0d2e..9f895870e 100644
--- a/desmume/src/render3D.cpp
+++ b/desmume/src/render3D.cpp
@@ -158,9 +158,9 @@ void FragmentAttributesBuffer::SetAtIndex(const size_t index, const FragmentAttr
 
 void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr)
 {
-#ifdef ENABLE_SSE2
-	const size_t sseCount = count - (count % 16);
+	size_t i = 0;
 	
+#ifdef ENABLE_SSE2
 	const __m128i attrDepth_vec128				= _mm_set1_epi32(attr.depth);
 	const __m128i attrOpaquePolyID_vec128		= _mm_set1_epi8(attr.opaquePolyID);
 	const __m128i attrTranslucentPolyID_vec128	= _mm_set1_epi8(attr.translucentPolyID);
@@ -168,7 +168,8 @@ void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr)
 	const __m128i attrIsFogged_vec128			= _mm_set1_epi8(attr.isFogged);
 	const __m128i attrIsTranslucentPoly_vec128	= _mm_set1_epi8(attr.isTranslucentPoly);
 	
-	for (size_t i = 0; i < sseCount; i += 16)
+	const size_t sseCount = count - (count % 16);
+	for (; i < sseCount; i += 16)
 	{
 		_mm_stream_si128((__m128i *)(this->depth +  0), attrDepth_vec128);
 		_mm_stream_si128((__m128i *)(this->depth +  4), attrDepth_vec128);
@@ -181,17 +182,12 @@ void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr)
 		_mm_stream_si128((__m128i *)this->isFogged, attrIsFogged_vec128);
 		_mm_stream_si128((__m128i *)this->isTranslucentPoly, attrIsTranslucentPoly_vec128);
 	}
-	
-	for (size_t i = sseCount; i < count; i++)
-	{
-		this->SetAtIndex(i, attr);
-	}
-#else
-	for (size_t i = 0; i < count; i++)
-	{
-		this->SetAtIndex(i, attr);
-	}
 #endif
+	
+	for (; i < count; i++)
+	{
+		this->SetAtIndex(i, attr);
+	}
 }
 
 Render3D::Render3D()
@@ -345,26 +341,39 @@ Render3DError Render3D::ClearFramebuffer(const GFX3D_State &renderState)
 		const u8 xScroll = scrollBits & 0xFF;
 		const u8 yScroll = (scrollBits >> 8) & 0xFF;
 		
-		for (size_t dstIndex = 0, iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
+		if (xScroll == 0 && yScroll == 0)
 		{
-			const size_t y = ((iy + yScroll) & 0xFF) << 8;
-			
-			for (size_t ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; dstIndex++, ix++)
+			for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++)
 			{
-				const size_t x = (ix + xScroll) & 0xFF;
-				const size_t srcIndex = y | x;
+				this->clearImageColor16Buffer[i] = clearColorBuffer[i];
+				this->clearImageDepthBuffer[i] = dsDepthToD24_LUT[clearDepthBuffer[i] & 0x7FFF];
+				this->clearImageFogBuffer[i] = BIT15(clearDepthBuffer[i]);
+				this->clearImagePolyIDBuffer[i] = clearFragment.opaquePolyID;
+			}
+		}
+		else
+		{
+			for (size_t dstIndex = 0, iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
+			{
+				const size_t y = ((iy + yScroll) & 0xFF) << 8;
 				
-				//this is tested by harry potter and the order of the phoenix.
-				//TODO (optimization) dont do this if we are mapped to blank memory (such as in sonic chronicles)
-				//(or use a special zero fill in the bulk clearing above)
-				this->clearImageColor16Buffer[dstIndex] = clearColorBuffer[srcIndex];
-				
-				//this is tested quite well in the sonic chronicles main map mode
-				//where depth values are used for trees etc you can walk behind
-				this->clearImageDepthBuffer[dstIndex] = dsDepthToD24_LUT[clearDepthBuffer[srcIndex] & 0x7FFF];
-				
-				this->clearImageFogBuffer[dstIndex] = BIT15(clearDepthBuffer[srcIndex]);
-				this->clearImagePolyIDBuffer[dstIndex] = clearFragment.opaquePolyID;
+				for (size_t ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; dstIndex++, ix++)
+				{
+					const size_t x = (ix + xScroll) & 0xFF;
+					const size_t srcIndex = y | x;
+					
+					//this is tested by harry potter and the order of the phoenix.
+					//TODO (optimization) dont do this if we are mapped to blank memory (such as in sonic chronicles)
+					//(or use a special zero fill in the bulk clearing above)
+					this->clearImageColor16Buffer[dstIndex] = clearColorBuffer[srcIndex];
+					
+					//this is tested quite well in the sonic chronicles main map mode
+					//where depth values are used for trees etc you can walk behind
+					this->clearImageDepthBuffer[dstIndex] = dsDepthToD24_LUT[clearDepthBuffer[srcIndex] & 0x7FFF];
+					
+					this->clearImageFogBuffer[dstIndex] = BIT15(clearDepthBuffer[srcIndex]);
+					this->clearImagePolyIDBuffer[dstIndex] = clearFragment.opaquePolyID;
+				}
 			}
 		}
 		
@@ -470,11 +479,13 @@ Render3DError Render3D::VramReconfigureSignal()
 
 Render3DError Render3D_SSE2::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
 {
-	static const __m128i zeroColor = _mm_set1_epi32(0);
+	const __m128i zero_vec128 = _mm_setzero_si128();
+	
+	size_t i = 0;
 	const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight;
 	const size_t ssePixCount = pixCount - (pixCount % 4);
 	
-	for (size_t i = 0; i < ssePixCount; i += 4)
+	for (; i < ssePixCount; i += 4)
 	{
 		// Copy the framebufferColor buffer
 		__m128i color = _mm_load_si128((__m128i *)(this->_framebufferColor + i));
@@ -482,16 +493,16 @@ Render3DError Render3D_SSE2::FlushFramebuffer(FragmentColor *__restrict dstRGBA6
 		
 		// Convert to RGBA5551
 		__m128i r = _mm_and_si128(color, _mm_set1_epi32(0x0000003E));	// Read from R
-		r = _mm_srli_epi32(r, 1);										// Shift to R
+		r = _mm_srli_epi32(r, 1);										// Shift to R (bit shift per 32-bit lane; _mm_srli_si128 would shift the register by bytes)
 		
 		__m128i g = _mm_and_si128(color, _mm_set1_epi32(0x00003E00));	// Read from G
-		g = _mm_srli_epi32(g, 4);										// Shift in G
+		g = _mm_srli_epi32(g, 4);										// Shift in G (bit shift per 32-bit lane)
 		
 		__m128i b = _mm_and_si128(color, _mm_set1_epi32(0x003E0000));	// Read from B
-		b = _mm_srli_epi32(b, 7);										// Shift to B
+		b = _mm_srli_epi32(b, 7);										// Shift to B (bit shift per 32-bit lane)
 		
 		__m128i a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000));	// Read from A
-		a = _mm_cmpgt_epi32(a, _mm_set1_epi32(0x00000000));				// Determine A
+		a = _mm_cmpgt_epi32(a, zero_vec128);							// Determine A
 		
 		// From here on, we're going to do an SSE2 trick to pack 32-bit down to unsigned
 		// 16-bit. Since SSE2 only has packssdw (signed 16-bit pack), then the alpha bit
@@ -504,21 +515,18 @@ Render3DError Render3D_SSE2::FlushFramebuffer(FragmentColor *__restrict dstRGBA6
 		// alpha vector with the post-packed color vector to get the final color.
 		
 		a = _mm_and_si128(a, _mm_set1_epi32(0x00004000));				// Mask out the bit before A
-		a = _mm_packs_epi32(a, zeroColor);								// Pack 32-bit down to 16-bit
-		a = _mm_slli_epi16(a, 1);										// Shift the A bit back to where it needs to be
+		a = _mm_packs_epi32(a, zero_vec128);							// Pack 32-bit down to 16-bit
+		a = _mm_slli_epi16(a, 1);										// Shift the A bit back to where it needs to be (1 bit per 16-bit lane, not 1 byte)
 		
-		// Assemble the RGB colors
-		color = _mm_or_si128(r, g);
-		color = _mm_or_si128(color, b);
-		
-		// Pack the 32-bit color into a signed 16-bit color, then por the alpha bit back in.
-		color = _mm_packs_epi32(color, zeroColor);
-		color = _mm_or_si128(color, a);
+		// Assemble the RGB colors, pack the 32-bit color into a signed 16-bit color, then OR the alpha bit back in.
+		color = _mm_or_si128(_mm_or_si128(r, g), b);					// Intrinsics only: operator| on __m128i is a GCC/Clang extension
+		color = _mm_packs_epi32(color, zero_vec128);
+		color = _mm_or_si128(color, a);
 		
 		_mm_storel_epi64((__m128i *)(dstRGBA5551 + i), color);
 	}
 	
-	for (size_t i = ssePixCount; i < pixCount; i++)
+	for (; i < pixCount; i++)
 	{
 		dstRGBA6665[i] = this->_framebufferColor[i];
 		dstRGBA5551[i] = R6G6B6TORGB15(this->_framebufferColor[i].r, this->_framebufferColor[i].g, this->_framebufferColor[i].b) | ((this->_framebufferColor[i].a == 0) ? 0x0000 : 0x8000);
@@ -560,59 +568,49 @@ Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)
 				
 		if (xScroll == 0 && yScroll == 0)
 		{
+			const __m128i depthBitMask_vec128 = _mm_set1_epi16(0x7FFF);
+			const __m128i fogBufferBitMask_vec128 = _mm_set1_epi16(BIT(15));
 			const __m128i opaquePolyID_vec128 = _mm_set1_epi8(clearFragment.opaquePolyID);
 			
 			for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i += 16)
 			{
-				static const __m128i depthBitMask_vec128 = _mm_set1_epi16(0x7FFF);
-				static const __m128i fogBufferBitMask_vec128 = _mm_set1_epi16(BIT(15));
-				
 				// Copy the colors to the color buffer. Since we can only copy 8 elements at once,
 				// we need to load-store twice.
 				_mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i + 8), _mm_load_si128((__m128i *)(clearColorBuffer + i + 8)) );
 				_mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i), _mm_load_si128((__m128i *)(clearColorBuffer + i)) );
 				
 				// Write the depth values to the depth buffer.
-				__m128i clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i + 8));
-				clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, depthBitMask_vec128);
+				__m128i clearDepthHi_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i + 8));
+				__m128i clearDepthLo_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i));
+				clearDepthHi_vec128 = _mm_and_si128(clearDepthHi_vec128, depthBitMask_vec128);
+				clearDepthLo_vec128 = _mm_and_si128(clearDepthLo_vec128, depthBitMask_vec128);
 				
-				__m128i depthValue_vec128 = _mm_set_epi32(dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 7)],
-														  dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 6)],
-														  dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 5)],
-														  dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 4)]);
-				_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i + 12), depthValue_vec128);
-				
-				depthValue_vec128 = _mm_set_epi32(dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 3)],
-												  dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 2)],
-												  dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 1)],
-												  dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 0)]);
-				_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i + 8), depthValue_vec128);
-				
-				clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i));
-				clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, depthBitMask_vec128);
-				
-				depthValue_vec128 = _mm_set_epi32(dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 7)],
-												  dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 6)],
-												  dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 5)],
-												  dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 4)]);
-				_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i + 4), depthValue_vec128);
-				
-				depthValue_vec128 = _mm_set_epi32(dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 3)],
-												  dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 2)],
-												  dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 1)],
-												  dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 0)]);
-				_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i), depthValue_vec128);
+				this->clearImageDepthBuffer[i+15] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 7)];
+				this->clearImageDepthBuffer[i+14] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 6)];
+				this->clearImageDepthBuffer[i+13] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 5)];
+				this->clearImageDepthBuffer[i+12] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 4)];
+				this->clearImageDepthBuffer[i+11] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 3)];
+				this->clearImageDepthBuffer[i+10] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 2)];
+				this->clearImageDepthBuffer[i+ 9] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 1)];
+				this->clearImageDepthBuffer[i+ 8] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 0)];
+				this->clearImageDepthBuffer[i+ 7] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 7)];
+				this->clearImageDepthBuffer[i+ 6] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 6)];
+				this->clearImageDepthBuffer[i+ 5] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 5)];
+				this->clearImageDepthBuffer[i+ 4] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 4)];
+				this->clearImageDepthBuffer[i+ 3] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 3)];
+				this->clearImageDepthBuffer[i+ 2] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 2)];
+				this->clearImageDepthBuffer[i+ 1] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 1)];
+				this->clearImageDepthBuffer[i+ 0] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 0)];
 				
 				// Write the fog flags to the fog flag buffer.
-				clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i + 8)); // Read the upper values
-				clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, fogBufferBitMask_vec128);
-				const __m128i clearDepthFogBit_vec128 = _mm_srli_epi16(clearDepth_vec128, 15); // Save the upper bits in another register
+				clearDepthHi_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i + 8)); // Reload: the copies used for the depth lookup had bit 15 masked off
+				clearDepthLo_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i));
+				clearDepthHi_vec128 = _mm_and_si128(clearDepthHi_vec128, fogBufferBitMask_vec128);
+				clearDepthLo_vec128 = _mm_and_si128(clearDepthLo_vec128, fogBufferBitMask_vec128);
+				clearDepthHi_vec128 = _mm_srli_epi16(clearDepthHi_vec128, 15); // Bit shift per 16-bit lane: moves each fog flag from bit 15 down to bit 0
+				clearDepthLo_vec128 = _mm_srli_epi16(clearDepthLo_vec128, 15); // (_mm_srli_si128 would shift the whole register right by 15 bytes instead)
 				
-				clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i)); // Read the lower values
-				clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, fogBufferBitMask_vec128);
-				clearDepth_vec128 = _mm_srli_epi16(clearDepth_vec128, 15); // These are the lower bits
-				
-				_mm_store_si128((__m128i *)(this->clearImageFogBuffer + i), _mm_packus_epi16(clearDepth_vec128, clearDepthFogBit_vec128));
+				_mm_store_si128((__m128i *)(this->clearImageFogBuffer + i), _mm_packus_epi16(clearDepthLo_vec128, clearDepthHi_vec128));
 				
 				// The one is easy. Just set the values in the polygon ID buffer.
 				_mm_store_si128((__m128i *)(this->clearImagePolyIDBuffer + i), opaquePolyID_vec128);
@@ -620,8 +618,8 @@ Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)
 		}
 		else
 		{
-			static const __m128i addrOffset = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
-			static const __m128i addrRolloverMask = _mm_set1_epi16(0x00FF);
+			const __m128i addrOffset = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
+			const __m128i addrRolloverMask = _mm_set1_epi16(0x00FF);
 			const __m128i opaquePolyID_vec128 = _mm_set1_epi8(clearFragment.opaquePolyID);
 			
 			for (size_t dstIndex = 0, iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)