diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h
index 5266e22cc..e76e8419f 100644
--- a/desmume/src/GPU.h
+++ b/desmume/src/GPU.h
@@ -27,6 +27,31 @@
 #include "types.h"
 #include "./utils/colorspacehandler/colorspacehandler.h"
 
+// For now, let's keep these SSE2 compatibility functions here to avoid build issues with Linux.
+// These should be moved to a more universal file like "types.h" so that they are available
+// everywhere, but Linux builds seem to be very finicky with their include structure. So let's
+// not rock the boat and make Linux builds happy.
+// - rogerman, 2022/04/06
+
+#if defined(ENABLE_SSSE3)
+	#include <tmmintrin.h>
+#elif defined(ENABLE_SSE2)
+	// Note: Technically, the shift count of palignr can be any value of [0-255]. But practically speaking, the
+	// shift count should be a value of [0-15]. If we assume that the value range will always be [0-15], we can
+	// then substitute the palignr instruction with an SSE2 equivalent.
+	#define _mm_alignr_epi8(a, b, immShiftCount) _mm_or_si128(_mm_slli_si128((a), 16-(immShiftCount)), _mm_srli_si128((b), (immShiftCount)))
+#endif // ENABLE_SSSE3
+
+#if defined(ENABLE_SSE4_1)
+	#include <smmintrin.h>
+#elif defined(ENABLE_SSE2)
+	// Note: The SSE4.1 version of pblendvb only requires that the MSBs of the 8-bit mask vector are set in order to
+	// pass the b byte through. However, our SSE2 substitute of pblendvb requires that all of the bits of the 8-bit
+	// mask vector are set. So when using this intrinsic in practice, just set/clear all mask bits together, and it
+	// should work fine for both SSE4.1 and SSE2.
+	#define _mm_blendv_epi8(a, b, fullmask) _mm_or_si128(_mm_and_si128((fullmask), (b)), _mm_andnot_si128((fullmask), (a)))
+#endif // ENABLE_SSE4_1
+
 class GPUEngineBase;
 class NDSDisplay;
 class EMUFILE;
diff --git a/desmume/src/types.h b/desmume/src/types.h
index 0e18c39c2..0fe31b347 100644
--- a/desmume/src/types.h
+++ b/desmume/src/types.h
@@ -288,27 +288,7 @@ typedef __m128i v128u16;
 typedef __m128i v128s16;
 typedef __m128i v128u32;
 typedef __m128i v128s32;
-
-#ifdef ENABLE_SSSE3
-	#include <tmmintrin.h>
-#else
-	// Note: Technically, the shift count of palignr can be any value of [0-255]. But practically speaking, the
-	// shift count should be a value of [0-15]. If we assume that the value range will always be [0-15], we can
-	// then substitute the palignr instruction with an SSE2 equivalent.
-	#define _mm_alignr_epi8(a, b, immShiftCount) _mm_or_si128(_mm_slli_si128((a), 16-(immShiftCount)), _mm_srli_si128((b), (immShiftCount)))
-#endif // ENABLE_SSSE3
-
-#ifdef ENABLE_SSE4_1
-	#include <smmintrin.h>
-#else
-	// Note: The SSE4.1 version of pblendvb only requires that the MSBs of the 8-bit mask vector are set in order to
-	// pass the b byte through. However, our SSE2 substitute of pblendvb requires that all of the bits of the 8-bit
-	// mask vector are set. So when using this intrinsic in practice, just set/clear all mask bits together, and it
-	// should work fine for both SSE4.1 and SSE2.
-	#define _mm_blendv_epi8(a, b, fullmask) _mm_or_si128(_mm_and_si128((fullmask), (b)), _mm_andnot_si128((fullmask), (a)))
-#endif // ENABLE_SSE4_1
-
-#endif // ENABLE_SSE2
+#endif
 
 #if defined(ENABLE_AVX) || defined(ENABLE_AVX512_0)