From c49a480d51b4a44c6873734a1cc8905f6685933f Mon Sep 17 00:00:00 2001
From: rogerman <rogerman@users.noreply.github.com>
Date: Sat, 4 Sep 2021 19:31:55 -0700
Subject: [PATCH] SoftRasterizer: Due to how the class inheritance works,
 SoftRasterizer_AVX actually requires AVX2 instead of just AVX. And in
 reality, AVX2 is just more practical than AVX for what we're doing here. -
 Also do some minor refactoring of Render3D.

---
 desmume/src/OGLRender.h   |   4 +-
 desmume/src/rasterize.cpp |  14 +-
 desmume/src/rasterize.h   |  12 +-
 desmume/src/render3D.cpp  | 338 +++++++++++---------------------------
 desmume/src/render3D.h    |  15 +-
 5 files changed, 116 insertions(+), 267 deletions(-)
diff --git a/desmume/src/OGLRender.h b/desmume/src/OGLRender.h
index 5ef0bcf90..c9ca4794f 100755
--- a/desmume/src/OGLRender.h
+++ b/desmume/src/OGLRender.h
@@ -679,8 +679,8 @@ public:
 	void SetUpscalingBuffer(void *upscaleBuffer);
 };
 
-#if defined(ENABLE_AVX)
-class OpenGLRenderer : public Render3D_AVX
+#if defined(ENABLE_AVX2)
+class OpenGLRenderer : public Render3D_AVX2
 #elif defined(ENABLE_SSE2)
 class OpenGLRenderer : public Render3D_SSE2
 #elif defined(ENABLE_ALTIVEC)
diff --git a/desmume/src/rasterize.cpp b/desmume/src/rasterize.cpp
index 5c4609ec8..c7c4b5bb4 100644
--- a/desmume/src/rasterize.cpp
+++ b/desmume/src/rasterize.cpp
@@ -1419,8 +1419,8 @@ static void* SoftRasterizer_RunClearUsingValues(void *arg)
 
 static Render3D* SoftRasterizerRendererCreate()
 {
-#if defined(ENABLE_AVX)
-	return new SoftRasterizerRenderer_AVX;
+#if defined(ENABLE_AVX2)
+	return new SoftRasterizerRenderer_AVX2;
 #elif defined(ENABLE_SSE2)
 	return new SoftRasterizerRenderer_SSE2;
 #elif defined(ENABLE_ALTIVEC)
@@ -1434,8 +1434,8 @@ static void SoftRasterizerRendererDestroy()
 {
 	if (CurrentRenderer != BaseRenderer)
 	{
-#if defined(ENABLE_AVX)
-		SoftRasterizerRenderer_AVX *oldRenderer = (SoftRasterizerRenderer_AVX *)CurrentRenderer;
+#if defined(ENABLE_AVX2)
+		SoftRasterizerRenderer_AVX2 *oldRenderer = (SoftRasterizerRenderer_AVX2 *)CurrentRenderer;
 #elif defined(ENABLE_SSE2)
 		SoftRasterizerRenderer_SSE2 *oldRenderer = (SoftRasterizerRenderer_SSE2 *)CurrentRenderer;
 #elif defined(ENABLE_ALTIVEC)
@@ -2606,9 +2606,9 @@ Render3DError SoftRasterizer_SIMD<SIMDBYTES>::SetFramebufferSize(size_t w, size_
 
 #endif
 
-#if defined(ENABLE_AVX)
+#if defined(ENABLE_AVX2)
 
-void SoftRasterizerRenderer_AVX::LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes)
+void SoftRasterizerRenderer_AVX2::LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes)
 {
 	this->_clearColor_v256u32					= _mm256_set1_epi32(clearColor6665.color);
 	this->_clearDepth_v256u32					= _mm256_set1_epi32(clearAttributes.depth);
@@ -2620,7 +2620,7 @@ void SoftRasterizerRenderer_AVX::LoadClearValues(const FragmentColor &clearColor
 	this->_clearAttrPolyFacing_v256u8			= _mm256_set1_epi8(clearAttributes.polyFacing);
 }
 
-void SoftRasterizerRenderer_AVX::ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel)
+void SoftRasterizerRenderer_AVX2::ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel)
 {
 	for (size_t i = startPixel; i < endPixel; i+=sizeof(v256u8))
 	{
diff --git a/desmume/src/rasterize.h b/desmume/src/rasterize.h
index 419793352..85e0976dc 100644
--- a/desmume/src/rasterize.h
+++ b/desmume/src/rasterize.h
@@ -1,5 +1,5 @@
 /*
-	Copyright (C) 2009-2019 DeSmuME team
+	Copyright (C) 2009-2021 DeSmuME team
 
 	This file is free software: you can redistribute it and/or modify
 	it under the terms of the GNU General Public License as published by
@@ -134,8 +134,8 @@ public:
 	template<bool SLI, bool USELINEHACK> FORCEINLINE void Render();
 };
 
-#if defined(ENABLE_AVX)
-class SoftRasterizerRenderer : public Render3D_AVX
+#if defined(ENABLE_AVX2)
+class SoftRasterizerRenderer : public Render3D_AVX2
 #elif defined(ENABLE_SSE2)
 class SoftRasterizerRenderer : public Render3D_SSE2
 #elif defined(ENABLE_ALTIVEC)
@@ -218,7 +218,7 @@ template <size_t SIMDBYTES>
 class SoftRasterizer_SIMD : public SoftRasterizerRenderer
 {
 protected:
-#if defined(ENABLE_AVX)
+#if defined(ENABLE_AVX2)
 	v256u32 _clearColor_v256u32;
 	v256u32 _clearDepth_v256u32;
 	v256u8 _clearAttrOpaquePolyID_v256u8;
@@ -247,8 +247,8 @@ public:
 	virtual Render3DError SetFramebufferSize(size_t w, size_t h);
 };
 
-#if defined(ENABLE_AVX)
-class SoftRasterizerRenderer_AVX : public SoftRasterizer_SIMD<32>
+#if defined(ENABLE_AVX2)
+class SoftRasterizerRenderer_AVX2 : public SoftRasterizer_SIMD<32>
 {
 protected:
 	virtual void LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes);
diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp
index 1bd08bd99..890f62203 100644
--- a/desmume/src/render3D.cpp
+++ b/desmume/src/render3D.cpp
@@ -503,6 +503,16 @@ Render3DError Render3D::FlushFramebuffer(const FragmentColor *__restrict srcFram
 	return RENDER3DERROR_NOERR;
 }
 
+void Render3D::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog)
+{
+	for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++)
+	{
+		outColor16[i] = inColor16[i];
+		outDepth24[i] = DS_DEPTH15TO24(inDepth16[i]);
+		outFog[i] = BIT15(inDepth16[i]);
+	}
+}
+
 template <bool ISCOLORBLANK, bool ISDEPTHBLANK>
 void Render3D::_ClearImageScrolledLoop(const u8 xScroll, const u8 yScroll, const u16 *__restrict inColor16, const u16 *__restrict inDepth16,
 									   u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog)
@@ -585,12 +595,7 @@ Render3DError Render3D::ClearFramebuffer(const GFX3D_State &renderState)
 		
 		if (xScroll == 0 && yScroll == 0)
 		{
-			for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++)
-			{
-				this->clearImageColor16Buffer[i] = clearColorBuffer[i];
-				this->clearImageDepthBuffer[i] = DS_DEPTH15TO24(clearDepthBuffer[i]);
-				this->clearImageFogBuffer[i] = BIT15(clearDepthBuffer[i]);
-			}
+			this->_ClearImageBaseLoop(clearColorBuffer, clearDepthBuffer, this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer);
 		}
 		else
 		{
@@ -773,252 +778,95 @@ Render3DError Render3D_SIMD<SIMDBYTES>::SetFramebufferSize(size_t w, size_t h)
 	return error;
 }
 
-#if defined(ENABLE_AVX) || defined(ENABLE_SSE2)
-
-#if defined(ENABLE_AVX)
-Render3DError Render3D_AVX::ClearFramebuffer(const GFX3D_State &renderState)
-#elif defined(ENABLE_SSE2)
-Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)
-#endif
+#if defined(ENABLE_AVX2)
+void Render3D_AVX2::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog)
 {
-	Render3DError error = RENDER3DERROR_NOERR;
+	const __m256i calcDepthConstants = _mm256_set1_epi32(0x01FF0200);
 	
-	if (renderState.enableClearImage)
+	for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i+=sizeof(v256u16))
 	{
-		//the lion, the witch, and the wardrobe (thats book 1, suck it you new-school numberers)
-		//uses the scroll registers in the main game engine
-		const u16 *__restrict clearColorBuffer = (u16 *__restrict)MMU.texInfo.textureSlotAddr[2];
-		const u16 *__restrict clearDepthBuffer = (u16 *__restrict)MMU.texInfo.textureSlotAddr[3];
-		const u16 scrollBits = T1ReadWord(MMU.ARM9_REG, 0x356); //CLRIMAGE_OFFSET
-		const u8 xScroll = scrollBits & 0xFF;
-		const u8 yScroll = (scrollBits >> 8) & 0xFF;
+		// Copy the colors to the color buffer.
+		_mm256_store_si256( (__m256i *)(outColor16 + i) + 0, _mm256_load_si256((__m256i *)(inColor16 + i) + 0) );
+		_mm256_store_si256( (__m256i *)(outColor16 + i) + 1, _mm256_load_si256((__m256i *)(inColor16 + i) + 1) );
 		
-#ifdef ENABLE_AVX2
-		const __m256i calcDepthConstants = _mm256_set1_epi32(0x01FF0200);
-#else
-		const __m128i calcDepthConstants = _mm_set1_epi32(0x01FF0200);
-#endif
-		if (xScroll == 0 && yScroll == 0)
-		{
-#ifdef ENABLE_AVX2
-			for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i+=sizeof(v256u16))
-			{
-				// Copy the colors to the color buffer.
-				_mm256_store_si256( (__m256i *)(this->clearImageColor16Buffer + i) + 0, _mm256_load_si256((__m256i *)(clearColorBuffer + i) + 0) );
-				_mm256_store_si256( (__m256i *)(this->clearImageColor16Buffer + i) + 1, _mm256_load_si256((__m256i *)(clearColorBuffer + i) + 1) );
-				
-				// Write the depth values to the depth buffer using the following formula from GBATEK.
-				// 15-bit to 24-bit depth formula from http://problemkaputt.de/gbatek.htm#ds3drearplane
-				//    D24 = (D15 * 0x0200) + (((D15 + 1) >> 15) * 0x01FF);
-				//
-				// For now, let's forget GBATEK (which could be wrong) and try using a simpified formula:
-				//    D24 = (D15 * 0x0200) + 0x01FF;
-				const __m256i clearDepthLo = _mm256_load_si256((__m256i *)(clearDepthBuffer + i) + 0);
-				const __m256i clearDepthHi = _mm256_load_si256((__m256i *)(clearDepthBuffer + i) + 1);
-				
-				const __m256i clearDepthValueLo = _mm256_permute4x64_epi64( _mm256_and_si256(clearDepthLo, _mm256_set1_epi16(0x7FFF)), 0xD8 );
-				const __m256i clearDepthValueHi = _mm256_permute4x64_epi64( _mm256_and_si256(clearDepthHi, _mm256_set1_epi16(0x7FFF)), 0xD8 );
-				
-				__m256i calcDepth0 = _mm256_unpacklo_epi16(clearDepthValueLo, _mm256_set1_epi16(1));
-				__m256i calcDepth1 = _mm256_unpackhi_epi16(clearDepthValueLo, _mm256_set1_epi16(1));
-				__m256i calcDepth2 = _mm256_unpacklo_epi16(clearDepthValueHi, _mm256_set1_epi16(1));
-				__m256i calcDepth3 = _mm256_unpackhi_epi16(clearDepthValueHi, _mm256_set1_epi16(1));
-				
-				calcDepth0 = _mm256_madd_epi16(calcDepth0, calcDepthConstants);
-				calcDepth1 = _mm256_madd_epi16(calcDepth1, calcDepthConstants);
-				calcDepth2 = _mm256_madd_epi16(calcDepth2, calcDepthConstants);
-				calcDepth3 = _mm256_madd_epi16(calcDepth3, calcDepthConstants);
-				
-				_mm256_store_si256((__m256i *)(this->clearImageDepthBuffer + i) + 0, calcDepth0);
-				_mm256_store_si256((__m256i *)(this->clearImageDepthBuffer + i) + 1, calcDepth1);
-				_mm256_store_si256((__m256i *)(this->clearImageDepthBuffer + i) + 2, calcDepth2);
-				_mm256_store_si256((__m256i *)(this->clearImageDepthBuffer + i) + 3, calcDepth3);
-				
-				// Write the fog flags to the fog flag buffer.
-				const __m256i clearFogLo = _mm256_srli_epi16(clearDepthLo, 15);
-				const __m256i clearFogHi = _mm256_srli_epi16(clearDepthHi, 15);
-				_mm256_store_si256( (__m256i *)(this->clearImageFogBuffer + i), _mm256_permute4x64_epi64(_mm256_packus_epi16(clearFogLo, clearFogHi), 0xD8) );
-			}
-#else
-			for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i+=sizeof(v128u16))
-			{
-				// Copy the colors to the color buffer.
-				_mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i) + 0, _mm_load_si128((__m128i *)(clearColorBuffer + i) + 0) );
-				_mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i) + 1, _mm_load_si128((__m128i *)(clearColorBuffer + i) + 1) );
-				
-				// Write the depth values to the depth buffer using the following formula from GBATEK.
-				// 15-bit to 24-bit depth formula from http://problemkaputt.de/gbatek.htm#ds3drearplane
-				//    D24 = (D15 * 0x0200) + (((D15 + 1) >> 15) * 0x01FF);
-				//
-				// For now, let's forget GBATEK (which could be wrong) and try using a simpified formula:
-				//    D24 = (D15 * 0x0200) + 0x01FF;
-				const __m128i clearDepthLo = _mm_load_si128((__m128i *)(clearDepthBuffer + i) + 0);
-				const __m128i clearDepthHi = _mm_load_si128((__m128i *)(clearDepthBuffer + i) + 1);
-				
-				const __m128i clearDepthValueLo = _mm_and_si128(clearDepthLo, _mm_set1_epi16(0x7FFF));
-				const __m128i clearDepthValueHi = _mm_and_si128(clearDepthHi, _mm_set1_epi16(0x7FFF));
-				
-				__m128i calcDepth0 = _mm_unpacklo_epi16(clearDepthValueLo, _mm_set1_epi16(1));
-				__m128i calcDepth1 = _mm_unpackhi_epi16(clearDepthValueLo, _mm_set1_epi16(1));
-				__m128i calcDepth2 = _mm_unpacklo_epi16(clearDepthValueHi, _mm_set1_epi16(1));
-				__m128i calcDepth3 = _mm_unpackhi_epi16(clearDepthValueHi, _mm_set1_epi16(1));
-				
-				calcDepth0 = _mm_madd_epi16(calcDepth0, calcDepthConstants);
-				calcDepth1 = _mm_madd_epi16(calcDepth1, calcDepthConstants);
-				calcDepth2 = _mm_madd_epi16(calcDepth2, calcDepthConstants);
-				calcDepth3 = _mm_madd_epi16(calcDepth3, calcDepthConstants);
-				
-				_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i) + 0, calcDepth0);
-				_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i) + 1, calcDepth1);
-				_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i) + 2, calcDepth2);
-				_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i) + 3, calcDepth3);
-				
-				// Write the fog flags to the fog flag buffer.
-				const __m128i clearFogLo = _mm_srli_epi16(clearDepthLo, 15);
-				const __m128i clearFogHi = _mm_srli_epi16(clearDepthHi, 15);
-				_mm_store_si128((__m128i *)(this->clearImageFogBuffer + i), _mm_packs_epi16(clearFogLo, clearFogHi));
-			}
-#endif
-		}
-		else
-		{
-			// FIXME: Fix SSE2 support for scrolled clear images.
-			// The depth-related code below doesn't actually work, and I don't know why
-			// this is, so just use the scalar version for now.
-			// - rogerman, 2018/09/19
-			/*
-			 const size_t shiftCount = xScroll & 0x07;
-			
-			for (size_t dstIndex = 0, iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
-			{
-				const size_t y = ((iy + yScroll) & 0xFF) << 8;
-				
-				for (size_t ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; dstIndex += 8, ix += 8)
-				{
-					const size_t x = (ix + xScroll) & 0xFF;
-					
-					__m128i clearColor;
-					__m128i clearDepth_vec128;
-					
-					if (shiftCount == 0)
-					{
-						const size_t srcIndex = y | x;
-						
-						clearColor = _mm_load_si128((__m128i *)(clearColorBuffer + srcIndex));
-						clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + srcIndex));
-					}
-					else
-					{
-						const size_t x1 = x & 0xF8;
-						const size_t x0 = (x1 == 0) ? (GPU_FRAMEBUFFER_NATIVE_WIDTH - 8) : x1 - 8;
-						const size_t srcIndex0 = y | x0;
-						const size_t srcIndex1 = y | x1;
-						
-						const __m128i clearColor0 = _mm_load_si128((__m128i *)(clearColorBuffer + srcIndex0));
-						const __m128i clearColor1 = _mm_load_si128((__m128i *)(clearColorBuffer + srcIndex1));
-						const __m128i clearDepth0 = _mm_load_si128((__m128i *)(clearDepthBuffer + srcIndex0));
-						const __m128i clearDepth1 = _mm_load_si128((__m128i *)(clearDepthBuffer + srcIndex1));
-						
-						switch (shiftCount)
-						{
-							case 1:
-								clearColor        = _mm_alignr_epi8(clearColor1, clearColor0, 14);
-								clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 14);
-								break;
-								
-							case 2:
-								clearColor        = _mm_alignr_epi8(clearColor1, clearColor0, 12);
-								clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 12);
-								break;
-								
-							case 3:
-								clearColor        = _mm_alignr_epi8(clearColor1, clearColor0, 10);
-								clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 10);
-								break;
-								
-							case 4:
-								clearColor        = _mm_alignr_epi8(clearColor1, clearColor0, 8);
-								clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 8);
-								break;
-								
-							case 5:
-								clearColor        = _mm_alignr_epi8(clearColor1, clearColor0, 6);
-								clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 6);
-								break;
-								
-							case 6:
-								clearColor        = _mm_alignr_epi8(clearColor1, clearColor0, 4);
-								clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 4);
-								break;
-								
-							case 7:
-								clearColor        = _mm_alignr_epi8(clearColor1, clearColor0, 2);
-								clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 2);
-								break;
-								
-							default:
-								clearColor        = _mm_setzero_si128();
-								clearDepth_vec128 = _mm_setzero_si128();
-								break;
-						}
-					}
-					
-					const __m128i clearDepthValue = _mm_and_si128(clearDepth_vec128, _mm_set1_epi16(0x7FFF));
-					const __m128i depthPlusOne = _mm_srli_epi16( _mm_adds_epu16(clearDepthValue, _mm_set1_epi16(1)), 15);
-					const __m128i clearFog = _mm_srli_epi16(clearDepth_vec128, 15);
-					
-					__m128i calcDepth0 = _mm_unpacklo_epi16(clearDepthValue, depthPlusOne);
-					__m128i calcDepth1 = _mm_unpackhi_epi16(clearDepthValue, depthPlusOne);
-					calcDepth0 = _mm_madd_epi16(calcDepth0, calcDepthConstants);
-					calcDepth1 = _mm_madd_epi16(calcDepth1, calcDepthConstants);
-					
-					_mm_store_si128((__m128i *)(this->clearImageColor16Buffer + dstIndex), clearColor);
-					_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + dstIndex + 0), calcDepth0);
-					_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + dstIndex + 4), calcDepth1);
-					_mm_storel_epi64((__m128i *)(this->clearImageFogBuffer + dstIndex), _mm_packs_epi16(clearFog, _mm_setzero_si128()));
-				}
-			}
-			*/
-			const bool isClearColorBlank = (clearColorBuffer >= (u16 *)MMU.blank_memory);
-			const bool isClearDepthBlank = (clearDepthBuffer >= (u16 *)MMU.blank_memory);
-			
-			if (!isClearColorBlank && !isClearDepthBlank)
-			{
-				this->_ClearImageScrolledLoop<false, false>(xScroll, yScroll, clearColorBuffer, clearDepthBuffer,
-															this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer);
-			}
-			else if (isClearColorBlank)
-			{
-				this->_ClearImageScrolledLoop< true, false>(xScroll, yScroll, clearColorBuffer, clearDepthBuffer,
-															this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer);
-			}
-			else if (isClearDepthBlank)
-			{
-				this->_ClearImageScrolledLoop<false,  true>(xScroll, yScroll, clearColorBuffer, clearDepthBuffer,
-															this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer);
-			}
-			else
-			{
-				this->_ClearImageScrolledLoop< true,  true>(xScroll, yScroll, clearColorBuffer, clearDepthBuffer,
-															this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer);
-			}
-		}
+		// Write the depth values to the depth buffer using the following formula from GBATEK.
+		// 15-bit to 24-bit depth formula from http://problemkaputt.de/gbatek.htm#ds3drearplane
+		//    D24 = (D15 * 0x0200) + (((D15 + 1) >> 15) * 0x01FF);
+		//
+		// For now, let's forget GBATEK (which could be wrong) and try using a simpified formula:
+		//    D24 = (D15 * 0x0200) + 0x01FF;
+		const __m256i clearDepthLo = _mm256_load_si256((__m256i *)(inDepth16 + i) + 0);
+		const __m256i clearDepthHi = _mm256_load_si256((__m256i *)(inDepth16 + i) + 1);
 		
-		error = this->ClearUsingImage(this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->_clearAttributes.opaquePolyID);
-		if (error != RENDER3DERROR_NOERR)
-		{
-			error = this->ClearUsingValues(this->_clearColor6665, this->_clearAttributes);
-		}
+		const __m256i clearDepthValueLo = _mm256_permute4x64_epi64( _mm256_and_si256(clearDepthLo, _mm256_set1_epi16(0x7FFF)), 0xD8 );
+		const __m256i clearDepthValueHi = _mm256_permute4x64_epi64( _mm256_and_si256(clearDepthHi, _mm256_set1_epi16(0x7FFF)), 0xD8 );
+		
+		__m256i calcDepth0 = _mm256_unpacklo_epi16(clearDepthValueLo, _mm256_set1_epi16(1));
+		__m256i calcDepth1 = _mm256_unpackhi_epi16(clearDepthValueLo, _mm256_set1_epi16(1));
+		__m256i calcDepth2 = _mm256_unpacklo_epi16(clearDepthValueHi, _mm256_set1_epi16(1));
+		__m256i calcDepth3 = _mm256_unpackhi_epi16(clearDepthValueHi, _mm256_set1_epi16(1));
+		
+		calcDepth0 = _mm256_madd_epi16(calcDepth0, calcDepthConstants);
+		calcDepth1 = _mm256_madd_epi16(calcDepth1, calcDepthConstants);
+		calcDepth2 = _mm256_madd_epi16(calcDepth2, calcDepthConstants);
+		calcDepth3 = _mm256_madd_epi16(calcDepth3, calcDepthConstants);
+		
+		_mm256_store_si256((__m256i *)(outDepth24 + i) + 0, calcDepth0);
+		_mm256_store_si256((__m256i *)(outDepth24 + i) + 1, calcDepth1);
+		_mm256_store_si256((__m256i *)(outDepth24 + i) + 2, calcDepth2);
+		_mm256_store_si256((__m256i *)(outDepth24 + i) + 3, calcDepth3);
+		
+		// Write the fog flags to the fog flag buffer.
+		const __m256i clearFogLo = _mm256_srli_epi16(clearDepthLo, 15);
+		const __m256i clearFogHi = _mm256_srli_epi16(clearDepthHi, 15);
+		_mm256_store_si256( (__m256i *)(outFog + i), _mm256_permute4x64_epi64(_mm256_packus_epi16(clearFogLo, clearFogHi), 0xD8) );
 	}
-	else
-	{
-		error = this->ClearUsingValues(this->_clearColor6665, this->_clearAttributes);
-	}
-	
-	return error;
 }
-
-#endif // defined(ENABLE_AVX) || defined(ENABLE_SSE2)
+#elif defined(ENABLE_SSE2)
+void Render3D_SSE2::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog)
+{
+	const __m128i calcDepthConstants = _mm_set1_epi32(0x01FF0200);
+	
+	for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i+=sizeof(v128u16))
+	{
+		// Copy the colors to the color buffer.
+		_mm_store_si128( (__m128i *)(outColor16 + i) + 0, _mm_load_si128((__m128i *)(inColor16 + i) + 0) );
+		_mm_store_si128( (__m128i *)(outColor16 + i) + 1, _mm_load_si128((__m128i *)(inColor16 + i) + 1) );
+		
+		// Write the depth values to the depth buffer using the following formula from GBATEK.
+		// 15-bit to 24-bit depth formula from http://problemkaputt.de/gbatek.htm#ds3drearplane
+		//    D24 = (D15 * 0x0200) + (((D15 + 1) >> 15) * 0x01FF);
+		//
+		// For now, let's forget GBATEK (which could be wrong) and try using a simpified formula:
+		//    D24 = (D15 * 0x0200) + 0x01FF;
+		const __m128i clearDepthLo = _mm_load_si128((__m128i *)(inDepth16 + i) + 0);
+		const __m128i clearDepthHi = _mm_load_si128((__m128i *)(inDepth16 + i) + 1);
+		
+		const __m128i clearDepthValueLo = _mm_and_si128(clearDepthLo, _mm_set1_epi16(0x7FFF));
+		const __m128i clearDepthValueHi = _mm_and_si128(clearDepthHi, _mm_set1_epi16(0x7FFF));
+		
+		__m128i calcDepth0 = _mm_unpacklo_epi16(clearDepthValueLo, _mm_set1_epi16(1));
+		__m128i calcDepth1 = _mm_unpackhi_epi16(clearDepthValueLo, _mm_set1_epi16(1));
+		__m128i calcDepth2 = _mm_unpacklo_epi16(clearDepthValueHi, _mm_set1_epi16(1));
+		__m128i calcDepth3 = _mm_unpackhi_epi16(clearDepthValueHi, _mm_set1_epi16(1));
+		
+		calcDepth0 = _mm_madd_epi16(calcDepth0, calcDepthConstants);
+		calcDepth1 = _mm_madd_epi16(calcDepth1, calcDepthConstants);
+		calcDepth2 = _mm_madd_epi16(calcDepth2, calcDepthConstants);
+		calcDepth3 = _mm_madd_epi16(calcDepth3, calcDepthConstants);
+		
+		_mm_store_si128((__m128i *)(outDepth24 + i) + 0, calcDepth0);
+		_mm_store_si128((__m128i *)(outDepth24 + i) + 1, calcDepth1);
+		_mm_store_si128((__m128i *)(outDepth24 + i) + 2, calcDepth2);
+		_mm_store_si128((__m128i *)(outDepth24 + i) + 3, calcDepth3);
+		
+		// Write the fog flags to the fog flag buffer.
+		const __m128i clearFogLo = _mm_srli_epi16(clearDepthLo, 15);
+		const __m128i clearFogHi = _mm_srli_epi16(clearDepthHi, 15);
+		_mm_store_si128((__m128i *)(this->clearImageFogBuffer + i), _mm_packs_epi16(clearFogLo, clearFogHi));
+	}
+}
+#endif
 
 template Render3D_SIMD<16>::Render3D_SIMD();
 template Render3D_SIMD<32>::Render3D_SIMD();
diff --git a/desmume/src/render3D.h b/desmume/src/render3D.h
index 587affc22..d169da18c 100644
--- a/desmume/src/render3D.h
+++ b/desmume/src/render3D.h
@@ -192,7 +192,8 @@ protected:
 	CACHE_ALIGN u16 clearImageColor16Buffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
 	CACHE_ALIGN u32 clearImageDepthBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
 	CACHE_ALIGN u8 clearImageFogBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
-	
+	
+	virtual void _ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog);
 	template<bool ISCOLORBLANK, bool ISDEPTHBLANK> void _ClearImageScrolledLoop(const u8 xScroll, const u8 yScroll, const u16 *__restrict inColor16, const u16 *__restrict inDepth16,
 																				u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog);
 	
@@ -281,20 +282,20 @@ public:
 	virtual Render3DError SetFramebufferSize(size_t w, size_t h);
 };
 
-#if defined(ENABLE_AVX)
+#if defined(ENABLE_AVX2)
 
-class Render3D_AVX : public Render3D_SIMD<32>
+class Render3D_AVX2 : public Render3D_SIMD<32>
 {
-public:
-	virtual Render3DError ClearFramebuffer(const GFX3D_State &renderState);
+public:
+	virtual void _ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog);
 };
 
 #elif defined(ENABLE_SSE2)
 
 class Render3D_SSE2 : public Render3D_SIMD<16>
 {
-public:
-	virtual Render3DError ClearFramebuffer(const GFX3D_State &renderState);
+public:
+	virtual void _ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog);
 };
 
 #elif defined(ENABLE_ALTIVEC)