From dde0da24ab8b9a53bccca012b428c64981296473 Mon Sep 17 00:00:00 2001 From: rogerman Date: Thu, 23 Jun 2016 20:30:24 +0000 Subject: [PATCH] GPU: - Avoid generating autovectorized SSE2 code for loops where a hand-coded SSE2 loop already exists. (MSVC and Clang only.) --- desmume/src/GPU.cpp | 49 ++++++++++++++++++++++++ desmume/src/GPU.h | 5 ++- desmume/src/OGLRender.cpp | 16 ++++++++ desmume/src/rasterize.cpp | 3 ++ desmume/src/render3D.cpp | 3 ++ desmume/src/types.h | 80 ++++++++++++++++++++++----------------- 6 files changed, 120 insertions(+), 36 deletions(-) diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index 6cedf3d4f..7632060f8 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -2917,6 +2917,10 @@ void GPUEngineBase::_RenderPixelsCustom(void *__restrict dstColorLine, u8 *__res (GPU->GetDisplayInfo().colorFormat == NDSColorFormat_BGR555_Rev) ? (u16 *)(dstColorLine16 + i) : (u16 *)(dstColorLine32 + i), dstLayerID + i); } +#endif + +#ifdef ENABLE_SSE2 +#pragma LOOPVECTORIZE_DISABLE #endif for (; i < dstPixCount; i++) { @@ -2969,6 +2973,10 @@ void GPUEngineBase::_RenderPixelsCustomVRAM(void *__restrict dstColorLine, u8 *_ (GPU->GetDisplayInfo().colorFormat == NDSColorFormat_BGR555_Rev) ? 
(u16 *)(dstColorLine16 + i) : (u16 *)(dstColorLine32 + i), dstLayerID + i); } +#endif + +#ifdef ENABLE_SSE2 +#pragma LOOPVECTORIZE_DISABLE #endif for (; i < dstPixCount; i++) { @@ -4004,6 +4012,10 @@ void GPUEngineBase::ApplyMasterBrightness() const __m128i dstColor_vec128 = _mm_load_si128((__m128i *)((u16 *)dst + i)); _mm_store_si128( (__m128i *)((u16 *)dst + i), this->_ColorEffectIncreaseBrightness(dstColor_vec128, intensity_vec128) ); } +#endif + +#ifdef ENABLE_SSE2 +#pragma LOOPVECTORIZE_DISABLE #endif for (; i < pixCount; i++) { @@ -4077,6 +4089,10 @@ void GPUEngineBase::ApplyMasterBrightness() const __m128i dstColor_vec128 = _mm_load_si128((__m128i *)((u16 *)dst + i)); _mm_store_si128( (__m128i *)((u16 *)dst + i), this->_ColorEffectDecreaseBrightness(dstColor_vec128, intensity_vec128) ); } +#endif + +#ifdef ENABLE_SSE2 +#pragma LOOPVECTORIZE_DISABLE #endif for (; i < pixCount; i++) { @@ -5041,6 +5057,10 @@ void* GPUEngineA::_RenderLine_Layers(const u16 l) (dispInfo.colorFormat == NDSColorFormat_BGR555_Rev) ? 
(u16 *)(dstColorLine16 + dstX) : (u16 *)(dstColorLine32 + dstX), dstLayerIDPtr + dstX); } +#endif + +#ifdef ENABLE_SSE2 +#pragma LOOPVECTORIZE_DISABLE #endif for (; dstX < customLineWidth; dstX++) { @@ -5669,6 +5689,10 @@ void GPUEngineA::_RenderLine_DispCapture_Copy(const u16 *src, u16 *dst, const si { _mm_store_si128((__m128i *)(dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)(src + i)), alpha_vec128 ) ); } +#endif + +#ifdef ENABLE_SSE2 +#pragma LOOPVECTORIZE_DISABLE #endif for (; i < pixCountExt; i++) { @@ -5686,6 +5710,10 @@ void GPUEngineA::_RenderLine_DispCapture_Copy(const u16 *src, u16 *dst, const si { _mm_store_si128((__m128i *)(dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)(src + i)), alpha_vec128 ) ); } +#endif + +#ifdef ENABLE_SSE2 +#pragma LOOPVECTORIZE_DISABLE #endif for (; i < captureLengthExt; i++) { @@ -5824,6 +5852,9 @@ void GPUEngineA::_RenderLine_DispCapture_BlendToCustomDstBuffer(const u16 *srcA, } #endif +#ifdef ENABLE_SSE2 +#pragma LOOPVECTORIZE_DISABLE +#endif for (; i < length; i++) { const u16 colorA = (!CAPTUREFROMNATIVESRCA) ? 
srcA[i] : srcA[offset + i]; @@ -7095,6 +7126,9 @@ void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restric } #endif +#ifdef ENABLE_SSE2 +#pragma LOOPVECTORIZE_DISABLE +#endif for (; i < pixCount; i++) { dst[i] = ConvertColor555To8888Opaque(src[i]); @@ -7127,6 +7161,9 @@ void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restric } #endif +#ifdef ENABLE_SSE2 +#pragma LOOPVECTORIZE_DISABLE +#endif for (; i < pixCount; i++) { dst[i] = ConvertColor555To6665Opaque(src[i]); @@ -7146,6 +7183,9 @@ void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) } #endif +#ifdef ENABLE_SSE2 +#pragma LOOPVECTORIZE_DISABLE +#endif for (; i < pixCount; i++) { dst[i] = ConvertColor8888To6665(src[i]); @@ -7165,6 +7205,9 @@ void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) } #endif +#ifdef ENABLE_SSE2 +#pragma LOOPVECTORIZE_DISABLE +#endif for (; i < pixCount; i++) { dst[i] = ConvertColor6665To8888(src[i]); @@ -7191,6 +7234,9 @@ void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst } #endif +#ifdef ENABLE_SSE2 +#pragma LOOPVECTORIZE_DISABLE +#endif for (; i < pixCount; i++) { dst[i] = ConvertColor8888To5551(src[i]); @@ -7217,6 +7263,9 @@ void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst } #endif +#ifdef ENABLE_SSE2 +#pragma LOOPVECTORIZE_DISABLE +#endif for (; i < pixCount; i++) { dst[i] = ConvertColor6665To5551(src[i]); diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index 7b3a3c905..95676e7ed 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -1802,7 +1802,7 @@ template FORCEINLINE void ConvertColor555To6665Opaque(const __m128i src, __m128i &dstLo, __m128i &dstHi) { // Conversion algorithm: - // RGB 5-bit to 6-bit formula: dstRGB8 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) if (SWAP_RB) { dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 17), 
_mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_slli_epi32(src, 12), _mm_set1_epi32(0x00010000))); @@ -1836,6 +1836,9 @@ FORCEINLINE void ConvertColor555To6665Opaque(const __m128i src, __m128i &dstLo, template FORCEINLINE __m128i ConvertColor8888To6665(const __m128i src) { + // Conversion algorithm: + // RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2) + // Alpha 8-bit to 5-bit formula: dstA5 = (srcA8 >> 3) __m128i rgb; const __m128i a = _mm_and_si128( _mm_srli_epi32(src, 3), _mm_set1_epi32(0x1F000000) ); diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index f87f25a02..a04aed888 100644 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -994,6 +994,10 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor _mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), ConvertColor8888To6665(srcColorHi) ); _mm_store_si128( (__m128i *)(dstRGBA5551 + i), ConvertColor8888To5551(srcColorLo, srcColorHi) ); } +#endif + +#ifdef ENABLE_SSE2 +#pragma LOOPVECTORIZE_DISABLE #endif for (; i < pixCount; i++) { @@ -1025,6 +1029,10 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor _mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), srcColorHi ); _mm_store_si128( (__m128i *)(dstRGBA5551 + i), ConvertColor8888To5551(srcColorLo, srcColorHi) ); } +#endif + +#ifdef ENABLE_SSE2 +#pragma LOOPVECTORIZE_DISABLE #endif for (; i < pixCount; i++) { @@ -1064,6 +1072,10 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor _mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), ConvertColor8888To6665(srcColorHi) ); _mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ConvertColor8888To5551(srcColorLo, srcColorHi) ); } +#endif + +#ifdef ENABLE_SSE2 +#pragma LOOPVECTORIZE_DISABLE #endif for (; x < pixCount; x++, ir++, iw++) { @@ -1105,6 +1117,10 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor _mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), 
srcColorHi ); _mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ConvertColor8888To5551(srcColorLo, srcColorHi) ); } +#endif + +#ifdef ENABLE_SSE2 +#pragma LOOPVECTORIZE_DISABLE #endif for (; x < pixCount; x++, ir++, iw++) { diff --git a/desmume/src/rasterize.cpp b/desmume/src/rasterize.cpp index ad4362d20..38e27fa1d 100644 --- a/desmume/src/rasterize.cpp +++ b/desmume/src/rasterize.cpp @@ -2059,6 +2059,9 @@ Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor _mm_stream_si128((__m128i *)(this->_framebufferAttributes->isTranslucentPoly + i), attrIsTranslucentPoly_vec128); } +#ifdef ENABLE_SSE2 +#pragma LOOPVECTORIZE_DISABLE +#endif for (; i < pixCount; i++) { this->_framebufferColor[i] = clearColor6665; diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp index 5ac323d92..5f99aaf9c 100644 --- a/desmume/src/render3D.cpp +++ b/desmume/src/render3D.cpp @@ -237,6 +237,9 @@ void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr) } #endif +#ifdef ENABLE_SSE2 +#pragma LOOPVECTORIZE_DISABLE +#endif for (; i < count; i++) { this->SetAtIndex(i, attr); diff --git a/desmume/src/types.h b/desmume/src/types.h index f51d04dcf..f721eac85 100644 --- a/desmume/src/types.h +++ b/desmume/src/types.h @@ -35,13 +35,13 @@ //enforce a constraint: gdb stub requires developer #if defined(GDB_STUB) && !defined(DEVELOPER) -#define DEVELOPER + #define DEVELOPER #endif #ifdef DEVELOPER -#define IF_DEVELOPER(X) X + #define IF_DEVELOPER(X) X #else -#define IF_DEVELOPER(X) + #define IF_DEVELOPER(X) #endif #ifdef HOST_WINDOWS @@ -96,19 +96,19 @@ //dont apply these to types without further testing. it only works portably here on declarations of variables //cant we find a pattern other people use more successfully? 
#if _MSC_VER >= 1900 -#define DS_ALIGN(X) alignas(X) + #define DS_ALIGN(X) alignas(X) #elif defined(_MSC_VER) || defined(__INTEL_COMPILER) -#define DS_ALIGN(X) __declspec(align(X)) + #define DS_ALIGN(X) __declspec(align(X)) #elif defined(__GNUC__) -#define DS_ALIGN(X) __attribute__ ((aligned (X))) + #define DS_ALIGN(X) __attribute__ ((aligned (X))) #else -#define DS_ALIGN(X) + #define DS_ALIGN(X) #endif #ifdef HOST_64 -#define CACHE_ALIGN_SIZE 64 + #define CACHE_ALIGN_SIZE 64 #else -#define CACHE_ALIGN_SIZE 32 + #define CACHE_ALIGN_SIZE 32 #endif //use this for example when you want a byte value to be better-aligned @@ -117,49 +117,59 @@ //--------------------------------------------- #ifdef __MINGW32__ -#define FASTCALL __attribute__((fastcall)) -#define ASMJIT_CALL_CONV kX86FuncConvGccFastCall + #define FASTCALL __attribute__((fastcall)) + #define ASMJIT_CALL_CONV kX86FuncConvGccFastCall #elif defined (__i386__) && !defined(__clang__) -#define FASTCALL __attribute__((regparm(3))) -#define ASMJIT_CALL_CONV kX86FuncConvGccRegParm3 + #define FASTCALL __attribute__((regparm(3))) + #define ASMJIT_CALL_CONV kX86FuncConvGccRegParm3 #elif defined(_MSC_VER) || defined(__INTEL_COMPILER) -#define FASTCALL -#define ASMJIT_CALL_CONV kX86FuncConvDefault + #define FASTCALL + #define ASMJIT_CALL_CONV kX86FuncConvDefault #else -#define FASTCALL -#define ASMJIT_CALL_CONV kX86FuncConvDefault + #define FASTCALL + #define ASMJIT_CALL_CONV kX86FuncConvDefault #endif #ifdef _MSC_VER -#define _CDECL_ __cdecl + #define _CDECL_ __cdecl #else -#define _CDECL_ + #define _CDECL_ #endif #ifndef INLINE -#if defined(_MSC_VER) || defined(__INTEL_COMPILER) -#define INLINE _inline -#else -#define INLINE inline -#endif + #if defined(_MSC_VER) || defined(__INTEL_COMPILER) + #define INLINE _inline + #else + #define INLINE inline + #endif #endif #ifndef FORCEINLINE -#if defined(_MSC_VER) || defined(__INTEL_COMPILER) -#define FORCEINLINE __forceinline -#define MSC_FORCEINLINE __forceinline -#else 
-#define FORCEINLINE inline __attribute__((always_inline)) -#define MSC_FORCEINLINE -#endif + #if defined(_MSC_VER) || defined(__INTEL_COMPILER) + #define FORCEINLINE __forceinline + #define MSC_FORCEINLINE __forceinline + #else + #define FORCEINLINE inline __attribute__((always_inline)) + #define MSC_FORCEINLINE + #endif #endif #ifndef NOINLINE -#ifdef __GNUC__ -#define NOINLINE __attribute__((noinline)) -#else -#define NOINLINE + #ifdef __GNUC__ + #define NOINLINE __attribute__((noinline)) + #else + #define NOINLINE + #endif #endif + +#ifndef LOOPVECTORIZE_DISABLE + #if defined(_MSC_VER) + #define LOOPVECTORIZE_DISABLE loop(no_vector) + #elif defined(__clang__) + #define LOOPVECTORIZE_DISABLE clang loop vectorize(disable) + #else + #define LOOPVECTORIZE_DISABLE + #endif #endif #if defined(__LP64__)