- Avoid generating autovectorized SSE2 code for loops where a hand-coded SSE2 loop already exists. (MSVC and Clang only.)
This commit is contained in:
rogerman 2016-06-23 20:30:24 +00:00
parent 3f895b85fb
commit dde0da24ab
6 changed files with 120 additions and 36 deletions

View File

@ -2917,6 +2917,10 @@ void GPUEngineBase::_RenderPixelsCustom(void *__restrict dstColorLine, u8 *__res
(GPU->GetDisplayInfo().colorFormat == NDSColorFormat_BGR555_Rev) ? (u16 *)(dstColorLine16 + i) : (u16 *)(dstColorLine32 + i), (GPU->GetDisplayInfo().colorFormat == NDSColorFormat_BGR555_Rev) ? (u16 *)(dstColorLine16 + i) : (u16 *)(dstColorLine32 + i),
dstLayerID + i); dstLayerID + i);
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; i < dstPixCount; i++) for (; i < dstPixCount; i++)
{ {
@ -2969,6 +2973,10 @@ void GPUEngineBase::_RenderPixelsCustomVRAM(void *__restrict dstColorLine, u8 *_
(GPU->GetDisplayInfo().colorFormat == NDSColorFormat_BGR555_Rev) ? (u16 *)(dstColorLine16 + i) : (u16 *)(dstColorLine32 + i), (GPU->GetDisplayInfo().colorFormat == NDSColorFormat_BGR555_Rev) ? (u16 *)(dstColorLine16 + i) : (u16 *)(dstColorLine32 + i),
dstLayerID + i); dstLayerID + i);
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; i < dstPixCount; i++) for (; i < dstPixCount; i++)
{ {
@ -4004,6 +4012,10 @@ void GPUEngineBase::ApplyMasterBrightness()
const __m128i dstColor_vec128 = _mm_load_si128((__m128i *)((u16 *)dst + i)); const __m128i dstColor_vec128 = _mm_load_si128((__m128i *)((u16 *)dst + i));
_mm_store_si128( (__m128i *)((u16 *)dst + i), this->_ColorEffectIncreaseBrightness(dstColor_vec128, intensity_vec128) ); _mm_store_si128( (__m128i *)((u16 *)dst + i), this->_ColorEffectIncreaseBrightness(dstColor_vec128, intensity_vec128) );
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
@ -4077,6 +4089,10 @@ void GPUEngineBase::ApplyMasterBrightness()
const __m128i dstColor_vec128 = _mm_load_si128((__m128i *)((u16 *)dst + i)); const __m128i dstColor_vec128 = _mm_load_si128((__m128i *)((u16 *)dst + i));
_mm_store_si128( (__m128i *)((u16 *)dst + i), this->_ColorEffectDecreaseBrightness(dstColor_vec128, intensity_vec128) ); _mm_store_si128( (__m128i *)((u16 *)dst + i), this->_ColorEffectDecreaseBrightness(dstColor_vec128, intensity_vec128) );
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
@ -5041,6 +5057,10 @@ void* GPUEngineA::_RenderLine_Layers(const u16 l)
(dispInfo.colorFormat == NDSColorFormat_BGR555_Rev) ? (u16 *)(dstColorLine16 + dstX) : (u16 *)(dstColorLine32 + dstX), (dispInfo.colorFormat == NDSColorFormat_BGR555_Rev) ? (u16 *)(dstColorLine16 + dstX) : (u16 *)(dstColorLine32 + dstX),
dstLayerIDPtr + dstX); dstLayerIDPtr + dstX);
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; dstX < customLineWidth; dstX++) for (; dstX < customLineWidth; dstX++)
{ {
@ -5669,6 +5689,10 @@ void GPUEngineA::_RenderLine_DispCapture_Copy(const u16 *src, u16 *dst, const si
{ {
_mm_store_si128((__m128i *)(dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)(src + i)), alpha_vec128 ) ); _mm_store_si128((__m128i *)(dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)(src + i)), alpha_vec128 ) );
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; i < pixCountExt; i++) for (; i < pixCountExt; i++)
{ {
@ -5686,6 +5710,10 @@ void GPUEngineA::_RenderLine_DispCapture_Copy(const u16 *src, u16 *dst, const si
{ {
_mm_store_si128((__m128i *)(dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)(src + i)), alpha_vec128 ) ); _mm_store_si128((__m128i *)(dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)(src + i)), alpha_vec128 ) );
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; i < captureLengthExt; i++) for (; i < captureLengthExt; i++)
{ {
@ -5824,6 +5852,9 @@ void GPUEngineA::_RenderLine_DispCapture_BlendToCustomDstBuffer(const u16 *srcA,
} }
#endif #endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif
for (; i < length; i++) for (; i < length; i++)
{ {
const u16 colorA = (!CAPTUREFROMNATIVESRCA) ? srcA[i] : srcA[offset + i]; const u16 colorA = (!CAPTUREFROMNATIVESRCA) ? srcA[i] : srcA[offset + i];
@ -7095,6 +7126,9 @@ void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restric
} }
#endif #endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
dst[i] = ConvertColor555To8888Opaque<SWAP_RB>(src[i]); dst[i] = ConvertColor555To8888Opaque<SWAP_RB>(src[i]);
@ -7127,6 +7161,9 @@ void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restric
} }
#endif #endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
dst[i] = ConvertColor555To6665Opaque<SWAP_RB>(src[i]); dst[i] = ConvertColor555To6665Opaque<SWAP_RB>(src[i]);
@ -7146,6 +7183,9 @@ void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount)
} }
#endif #endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
dst[i] = ConvertColor8888To6665<SWAP_RB>(src[i]); dst[i] = ConvertColor8888To6665<SWAP_RB>(src[i]);
@ -7165,6 +7205,9 @@ void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount)
} }
#endif #endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
dst[i] = ConvertColor6665To8888<SWAP_RB>(src[i]); dst[i] = ConvertColor6665To8888<SWAP_RB>(src[i]);
@ -7191,6 +7234,9 @@ void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst
} }
#endif #endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
dst[i] = ConvertColor8888To5551<SWAP_RB>(src[i]); dst[i] = ConvertColor8888To5551<SWAP_RB>(src[i]);
@ -7217,6 +7263,9 @@ void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst
} }
#endif #endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
dst[i] = ConvertColor6665To5551<SWAP_RB>(src[i]); dst[i] = ConvertColor6665To5551<SWAP_RB>(src[i]);

View File

@ -1802,7 +1802,7 @@ template <bool SWAP_RB>
FORCEINLINE void ConvertColor555To6665Opaque(const __m128i src, __m128i &dstLo, __m128i &dstHi) FORCEINLINE void ConvertColor555To6665Opaque(const __m128i src, __m128i &dstLo, __m128i &dstHi)
{ {
// Conversion algorithm: // Conversion algorithm:
// RGB 5-bit to 6-bit formula: dstRGB8 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
if (SWAP_RB) if (SWAP_RB)
{ {
dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 17), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_slli_epi32(src, 12), _mm_set1_epi32(0x00010000))); dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 17), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_slli_epi32(src, 12), _mm_set1_epi32(0x00010000)));
@ -1836,6 +1836,9 @@ FORCEINLINE void ConvertColor555To6665Opaque(const __m128i src, __m128i &dstLo,
template <bool SWAP_RB> template <bool SWAP_RB>
FORCEINLINE __m128i ConvertColor8888To6665(const __m128i src) FORCEINLINE __m128i ConvertColor8888To6665(const __m128i src)
{ {
// Conversion algorithm:
// RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2)
// Alpha 8-bit to 5-bit formula: dstA5 = (srcA8 >> 3)
__m128i rgb; __m128i rgb;
const __m128i a = _mm_and_si128( _mm_srli_epi32(src, 3), _mm_set1_epi32(0x1F000000) ); const __m128i a = _mm_and_si128( _mm_srli_epi32(src, 3), _mm_set1_epi32(0x1F000000) );

View File

@ -994,6 +994,10 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
_mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), ConvertColor8888To6665<true>(srcColorHi) ); _mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), ConvertColor8888To6665<true>(srcColorHi) );
_mm_store_si128( (__m128i *)(dstRGBA5551 + i), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) ); _mm_store_si128( (__m128i *)(dstRGBA5551 + i), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
@ -1025,6 +1029,10 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
_mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), srcColorHi ); _mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), srcColorHi );
_mm_store_si128( (__m128i *)(dstRGBA5551 + i), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) ); _mm_store_si128( (__m128i *)(dstRGBA5551 + i), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
@ -1064,6 +1072,10 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
_mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), ConvertColor8888To6665<true>(srcColorHi) ); _mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), ConvertColor8888To6665<true>(srcColorHi) );
_mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) ); _mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; x < pixCount; x++, ir++, iw++) for (; x < pixCount; x++, ir++, iw++)
{ {
@ -1105,6 +1117,10 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
_mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), srcColorHi ); _mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), srcColorHi );
_mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) ); _mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; x < pixCount; x++, ir++, iw++) for (; x < pixCount; x++, ir++, iw++)
{ {

View File

@ -2059,6 +2059,9 @@ Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor
_mm_stream_si128((__m128i *)(this->_framebufferAttributes->isTranslucentPoly + i), attrIsTranslucentPoly_vec128); _mm_stream_si128((__m128i *)(this->_framebufferAttributes->isTranslucentPoly + i), attrIsTranslucentPoly_vec128);
} }
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
this->_framebufferColor[i] = clearColor6665; this->_framebufferColor[i] = clearColor6665;

View File

@ -237,6 +237,9 @@ void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr)
} }
#endif #endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif
for (; i < count; i++) for (; i < count; i++)
{ {
this->SetAtIndex(i, attr); this->SetAtIndex(i, attr);

View File

@ -35,13 +35,13 @@
//enforce a constraint: gdb stub requires developer //enforce a constraint: gdb stub requires developer
#if defined(GDB_STUB) && !defined(DEVELOPER) #if defined(GDB_STUB) && !defined(DEVELOPER)
#define DEVELOPER #define DEVELOPER
#endif #endif
#ifdef DEVELOPER #ifdef DEVELOPER
#define IF_DEVELOPER(X) X #define IF_DEVELOPER(X) X
#else #else
#define IF_DEVELOPER(X) #define IF_DEVELOPER(X)
#endif #endif
#ifdef HOST_WINDOWS #ifdef HOST_WINDOWS
@ -96,19 +96,19 @@
//dont apply these to types without further testing. it only works portably here on declarations of variables //dont apply these to types without further testing. it only works portably here on declarations of variables
//cant we find a pattern other people use more successfully? //cant we find a pattern other people use more successfully?
#if _MSC_VER >= 1900 #if _MSC_VER >= 1900
#define DS_ALIGN(X) alignas(X) #define DS_ALIGN(X) alignas(X)
#elif defined(_MSC_VER) || defined(__INTEL_COMPILER) #elif defined(_MSC_VER) || defined(__INTEL_COMPILER)
#define DS_ALIGN(X) __declspec(align(X)) #define DS_ALIGN(X) __declspec(align(X))
#elif defined(__GNUC__) #elif defined(__GNUC__)
#define DS_ALIGN(X) __attribute__ ((aligned (X))) #define DS_ALIGN(X) __attribute__ ((aligned (X)))
#else #else
#define DS_ALIGN(X) #define DS_ALIGN(X)
#endif #endif
#ifdef HOST_64 #ifdef HOST_64
#define CACHE_ALIGN_SIZE 64 #define CACHE_ALIGN_SIZE 64
#else #else
#define CACHE_ALIGN_SIZE 32 #define CACHE_ALIGN_SIZE 32
#endif #endif
//use this for example when you want a byte value to be better-aligned //use this for example when you want a byte value to be better-aligned
@ -117,49 +117,59 @@
//--------------------------------------------- //---------------------------------------------
#ifdef __MINGW32__ #ifdef __MINGW32__
#define FASTCALL __attribute__((fastcall)) #define FASTCALL __attribute__((fastcall))
#define ASMJIT_CALL_CONV kX86FuncConvGccFastCall #define ASMJIT_CALL_CONV kX86FuncConvGccFastCall
#elif defined (__i386__) && !defined(__clang__) #elif defined (__i386__) && !defined(__clang__)
#define FASTCALL __attribute__((regparm(3))) #define FASTCALL __attribute__((regparm(3)))
#define ASMJIT_CALL_CONV kX86FuncConvGccRegParm3 #define ASMJIT_CALL_CONV kX86FuncConvGccRegParm3
#elif defined(_MSC_VER) || defined(__INTEL_COMPILER) #elif defined(_MSC_VER) || defined(__INTEL_COMPILER)
#define FASTCALL #define FASTCALL
#define ASMJIT_CALL_CONV kX86FuncConvDefault #define ASMJIT_CALL_CONV kX86FuncConvDefault
#else #else
#define FASTCALL #define FASTCALL
#define ASMJIT_CALL_CONV kX86FuncConvDefault #define ASMJIT_CALL_CONV kX86FuncConvDefault
#endif #endif
#ifdef _MSC_VER #ifdef _MSC_VER
#define _CDECL_ __cdecl #define _CDECL_ __cdecl
#else #else
#define _CDECL_ #define _CDECL_
#endif #endif
#ifndef INLINE #ifndef INLINE
#if defined(_MSC_VER) || defined(__INTEL_COMPILER) #if defined(_MSC_VER) || defined(__INTEL_COMPILER)
#define INLINE _inline #define INLINE _inline
#else #else
#define INLINE inline #define INLINE inline
#endif #endif
#endif #endif
#ifndef FORCEINLINE #ifndef FORCEINLINE
#if defined(_MSC_VER) || defined(__INTEL_COMPILER) #if defined(_MSC_VER) || defined(__INTEL_COMPILER)
#define FORCEINLINE __forceinline #define FORCEINLINE __forceinline
#define MSC_FORCEINLINE __forceinline #define MSC_FORCEINLINE __forceinline
#else #else
#define FORCEINLINE inline __attribute__((always_inline)) #define FORCEINLINE inline __attribute__((always_inline))
#define MSC_FORCEINLINE #define MSC_FORCEINLINE
#endif #endif
#endif #endif
#ifndef NOINLINE #ifndef NOINLINE
#ifdef __GNUC__ #ifdef __GNUC__
#define NOINLINE __attribute__((noinline)) #define NOINLINE __attribute__((noinline))
#else #else
#define NOINLINE #define NOINLINE
#endif
#endif #endif
// LOOPVECTORIZE_DISABLE
// Compiler-specific operand for a '#pragma' directive that asks the compiler not to
// auto-vectorize the loop that immediately follows it. Usage:
//
//     #pragma LOOPVECTORIZE_DISABLE
//     for (; i < count; i++) { ... }
//
// It is meant for the scalar remainder loops that follow a hand-coded SSE2 loop, where
// compiler-generated vector code would only duplicate the hand-written version.
//
// - MSVC:  expands to loop(no_vector). The loop pragma is only recognized starting with
//          Visual Studio 2012 (_MSC_VER 1700), so older versions get an empty expansion.
// - Clang: expands to clang loop vectorize(disable).
// - Other: expands to nothing.
//
// A definition supplied before this point (e.g. on the compiler command line) is kept.
//
// NOTE(review): whether macros are expanded inside a #pragma directive is left to the
// implementation. Confirm that every targeted compiler really expands this macro;
// where it does not, the directive is treated as an unknown pragma and has no effect.
#ifndef LOOPVECTORIZE_DISABLE
	#if defined(_MSC_VER)
		#if _MSC_VER >= 1700
			#define LOOPVECTORIZE_DISABLE loop(no_vector)
		#else
			#define LOOPVECTORIZE_DISABLE
		#endif
	#elif defined(__clang__)
		#define LOOPVECTORIZE_DISABLE clang loop vectorize(disable)
	#else
		#define LOOPVECTORIZE_DISABLE
	#endif
#endif
#if defined(__LP64__) #if defined(__LP64__)