From e8328eda3374f72c7902be15ff93e21f5f4110cb Mon Sep 17 00:00:00 2001
From: rogerman <rogerman@users.noreply.github.com>
Date: Tue, 5 Apr 2022 23:15:51 -0700
Subject: [PATCH] GPU: Clean up some old header stuff now that the SIMD code
 has been factored out.

---
 desmume/src/GPU.h                   | 37 +----------------------------
 desmume/src/GPU_Operations_AVX2.cpp |  3 ++-
 desmume/src/GPU_Operations_SSE2.cpp |  4 ++--
 desmume/src/types.h                 | 22 ++++++++++++++++-
 4 files changed, 26 insertions(+), 40 deletions(-)

diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h
index f480aca2e..5266e22cc 100644
--- a/desmume/src/GPU.h
+++ b/desmume/src/GPU.h
@@ -2,7 +2,7 @@
 	Copyright (C) 2006 yopyop
 	Copyright (C) 2006-2007 Theo Berkau
 	Copyright (C) 2007 shash
-	Copyright (C) 2009-2021 DeSmuME team
+	Copyright (C) 2009-2022 DeSmuME team
 
 	This file is free software: you can redistribute it and/or modify
 	it under the terms of the GNU General Public License as published by
@@ -27,41 +27,6 @@
 #include "types.h"
 #include "./utils/colorspacehandler/colorspacehandler.h"
 
-#ifdef ENABLE_SSE2
-#include "./utils/colorspacehandler/colorspacehandler_SSE2.h"
-#endif
-
-#ifdef ENABLE_SSSE3
-#include <tmmintrin.h>
-#endif
-
-#ifdef ENABLE_SSE4_1
-#include <smmintrin.h>
-#endif
-
-#ifdef ENABLE_AVX2
-#include "./utils/colorspacehandler/colorspacehandler_AVX2.h"
-#endif
-
-#ifdef ENABLE_AVX512_1
-#include "./utils/colorspacehandler/colorspacehandler_AVX512.h"
-#endif
-
-// Note: Technically, the shift count of palignr can be any value of [0-255]. But practically speaking, the
-// shift count should be a value of [0-15]. If we assume that the value range will always be [0-15], we can
-// then substitute the palignr instruction with an SSE2 equivalent.
-#if defined(ENABLE_SSE2) && !defined(ENABLE_SSSE3)
-	#define _mm_alignr_epi8(a, b, immShiftCount) _mm_or_si128(_mm_slli_si128(a, 16-(immShiftCount)), _mm_srli_si128(b, (immShiftCount)))
-#endif
-
-// Note: The SSE4.1 version of pblendvb only requires that the MSBs of the 8-bit mask vector are set in order to
-// pass the b byte through. However, our SSE2 substitute of pblendvb requires that all of the bits of the 8-bit
-// mask vector are set. So when using this intrinsic in practice, just set/clear all mask bits together, and it
-// should work fine for both SSE4.1 and SSE2.
-#if defined(ENABLE_SSE2) && !defined(ENABLE_SSE4_1)
-	#define _mm_blendv_epi8(a, b, fullmask) _mm_or_si128(_mm_and_si128((fullmask), (b)), _mm_andnot_si128((fullmask), (a)))
-#endif
-
 class GPUEngineBase;
 class NDSDisplay;
 class EMUFILE;
diff --git a/desmume/src/GPU_Operations_AVX2.cpp b/desmume/src/GPU_Operations_AVX2.cpp
index 820233a57..f27115eeb 100644
--- a/desmume/src/GPU_Operations_AVX2.cpp
+++ b/desmume/src/GPU_Operations_AVX2.cpp
@@ -1,5 +1,5 @@
 /*
-	Copyright (C) 2021 DeSmuME team
+	Copyright (C) 2021-2022 DeSmuME team
 
 	This file is free software: you can redistribute it and/or modify
 	it under the terms of the GNU General Public License as published by
@@ -21,6 +21,7 @@
 #else
 
 #include "GPU_Operations_AVX2.h"
+#include "./utils/colorspacehandler/colorspacehandler_AVX2.h"
 
 
 static const ColorOperation_AVX2 colorop_vec;
diff --git a/desmume/src/GPU_Operations_SSE2.cpp b/desmume/src/GPU_Operations_SSE2.cpp
index 4a1ea7990..3bc6bd9ff 100644
--- a/desmume/src/GPU_Operations_SSE2.cpp
+++ b/desmume/src/GPU_Operations_SSE2.cpp
@@ -1,5 +1,5 @@
 /*
-	Copyright (C) 2021 DeSmuME team
+	Copyright (C) 2021-2022 DeSmuME team
 
 	This file is free software: you can redistribute it and/or modify
 	it under the terms of the GNU General Public License as published by
@@ -21,7 +21,7 @@
 #else
 
 #include "GPU_Operations_SSE2.h"
-#include <emmintrin.h>
+#include "./utils/colorspacehandler/colorspacehandler_SSE2.h"
 
 
 static const ColorOperation_SSE2 colorop_vec;
diff --git a/desmume/src/types.h b/desmume/src/types.h
index 0fe31b347..01f9a8a81 100644
--- a/desmume/src/types.h
+++ b/desmume/src/types.h
@@ -288,7 +288,27 @@ typedef __m128i v128u16;
 typedef __m128i v128s16;
 typedef __m128i v128u32;
 typedef __m128i v128s32;
-#endif
+
+#ifdef ENABLE_SSSE3
+	#include <tmmintrin.h>
+#else
+	// Note: Technically, the shift count of palignr can be any value of [0-255]. But practically speaking, the
+	// shift count should be a value of [0-15]. Assuming the count always stays within [0-15], palignr can be
+	// substituted with an SSE2 equivalent: shift a left by (16 - count), shift b right by count, then OR the two.
+	#define _mm_alignr_epi8(a, b, immShiftCount) _mm_or_si128(_mm_slli_si128(a, 16-(immShiftCount)), _mm_srli_si128(b, (immShiftCount)))
+#endif // ENABLE_SSSE3
+
+#ifdef ENABLE_SSE4_1
+	#include <smmintrin.h>
+#else
+	// Note: The SSE4.1 version of pblendvb only requires that the MSBs of the 8-bit mask vector are set in order to
+	// pass the b byte through. However, our SSE2 substitute computes (fullmask AND b) OR ((NOT fullmask) AND a), so
+	// every bit of a mask byte takes part in the selection. When using this intrinsic in practice, set or clear all
+	// 8 bits of each mask byte together, and it should work fine for both SSE4.1 and SSE2.
+	#define _mm_blendv_epi8(a, b, fullmask) _mm_or_si128(_mm_and_si128((fullmask), (b)), _mm_andnot_si128((fullmask), (a)))
+#endif // ENABLE_SSE4_1
+
+#endif // ENABLE_SSE2
 
 #if defined(ENABLE_AVX) || defined(ENABLE_AVX512_0)