From e8328eda3374f72c7902be15ff93e21f5f4110cb Mon Sep 17 00:00:00 2001 From: rogerman Date: Tue, 5 Apr 2022 23:15:51 -0700 Subject: [PATCH] GPU: Clean up some old header stuff now that the SIMD code has been factored out. --- desmume/src/GPU.h | 37 +---------------------------- desmume/src/GPU_Operations_AVX2.cpp | 3 ++- desmume/src/GPU_Operations_SSE2.cpp | 4 ++-- desmume/src/types.h | 22 ++++++++++++++++- 4 files changed, 26 insertions(+), 40 deletions(-) diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index f480aca2e..5266e22cc 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -2,7 +2,7 @@ Copyright (C) 2006 yopyop Copyright (C) 2006-2007 Theo Berkau Copyright (C) 2007 shash - Copyright (C) 2009-2021 DeSmuME team + Copyright (C) 2009-2022 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -27,41 +27,6 @@ #include "types.h" #include "./utils/colorspacehandler/colorspacehandler.h" -#ifdef ENABLE_SSE2 -#include "./utils/colorspacehandler/colorspacehandler_SSE2.h" -#endif - -#ifdef ENABLE_SSSE3 -#include <tmmintrin.h> -#endif - -#ifdef ENABLE_SSE4_1 -#include <smmintrin.h> -#endif - -#ifdef ENABLE_AVX2 -#include "./utils/colorspacehandler/colorspacehandler_AVX2.h" -#endif - -#ifdef ENABLE_AVX512_1 -#include "./utils/colorspacehandler/colorspacehandler_AVX512.h" -#endif - -// Note: Technically, the shift count of palignr can be any value of [0-255]. But practically speaking, the -// shift count should be a value of [0-15]. If we assume that the value range will always be [0-15], we can -// then substitute the palignr instruction with an SSE2 equivalent. 
-#if defined(ENABLE_SSE2) && !defined(ENABLE_SSSE3) - #define _mm_alignr_epi8(a, b, immShiftCount) _mm_or_si128(_mm_slli_si128(a, 16-(immShiftCount)), _mm_srli_si128(b, (immShiftCount))) -#endif - -// Note: The SSE4.1 version of pblendvb only requires that the MSBs of the 8-bit mask vector are set in order to -// pass the b byte through. However, our SSE2 substitute of pblendvb requires that all of the bits of the 8-bit -// mask vector are set. So when using this intrinsic in practice, just set/clear all mask bits together, and it -// should work fine for both SSE4.1 and SSE2. -#if defined(ENABLE_SSE2) && !defined(ENABLE_SSE4_1) - #define _mm_blendv_epi8(a, b, fullmask) _mm_or_si128(_mm_and_si128((fullmask), (b)), _mm_andnot_si128((fullmask), (a))) -#endif - class GPUEngineBase; class NDSDisplay; class EMUFILE; diff --git a/desmume/src/GPU_Operations_AVX2.cpp b/desmume/src/GPU_Operations_AVX2.cpp index 820233a57..f27115eeb 100644 --- a/desmume/src/GPU_Operations_AVX2.cpp +++ b/desmume/src/GPU_Operations_AVX2.cpp @@ -1,5 +1,5 @@ /* - Copyright (C) 2021 DeSmuME team + Copyright (C) 2021-2022 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -21,6 +21,7 @@ #else #include "GPU_Operations_AVX2.h" +#include "./utils/colorspacehandler/colorspacehandler_AVX2.h" static const ColorOperation_AVX2 colorop_vec; diff --git a/desmume/src/GPU_Operations_SSE2.cpp b/desmume/src/GPU_Operations_SSE2.cpp index 4a1ea7990..3bc6bd9ff 100644 --- a/desmume/src/GPU_Operations_SSE2.cpp +++ b/desmume/src/GPU_Operations_SSE2.cpp @@ -1,5 +1,5 @@ /* - Copyright (C) 2021 DeSmuME team + Copyright (C) 2021-2022 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -21,7 +21,7 @@ #else #include "GPU_Operations_SSE2.h" -#include <emmintrin.h> +#include "./utils/colorspacehandler/colorspacehandler_SSE2.h" static const 
ColorOperation_SSE2 colorop_vec; diff --git a/desmume/src/types.h b/desmume/src/types.h index 0fe31b347..01f9a8a81 100644 --- a/desmume/src/types.h +++ b/desmume/src/types.h @@ -288,7 +288,27 @@ typedef __m128i v128u16; typedef __m128i v128s16; typedef __m128i v128u32; typedef __m128i v128s32; -#endif + +#ifdef ENABLE_SSSE3 + #include <tmmintrin.h> +#else + // Note: Technically, the shift count of palignr can be any value of [0-255]. But practically speaking, the + // shift count should be a value of [0-15]. If we assume that the value range will always be [0-15], we can + // then substitute the palignr instruction with an SSE2 equivalent. + #define _mm_alignr_epi8(a, b, immShiftCount) _mm_or_si128(_mm_slli_si128(a, 16-(immShiftCount)), _mm_srli_si128(b, (immShiftCount))) +#endif // ENABLE_SSSE3 + +#ifdef ENABLE_SSE4_1 + #include <smmintrin.h> +#else + // Note: The SSE4.1 version of pblendvb only requires that the MSBs of the 8-bit mask vector are set in order to + // pass the b byte through. However, our SSE2 substitute of pblendvb requires that all of the bits of the 8-bit + // mask vector are set. So when using this intrinsic in practice, just set/clear all mask bits together, and it + // should work fine for both SSE4.1 and SSE2. + #define _mm_blendv_epi8(a, b, fullmask) _mm_or_si128(_mm_and_si128((fullmask), (b)), _mm_andnot_si128((fullmask), (a))) +#endif // ENABLE_SSE4_1 + +#endif // ENABLE_SSE2 #if defined(ENABLE_AVX) || defined(ENABLE_AVX512_0)