- Avoid generating autovectorized SSE2 code for loops where a hand-coded SSE2 loop already exists. (MSVC and Clang only.)
This commit is contained in:
rogerman 2016-06-23 20:30:24 +00:00
parent 3f895b85fb
commit dde0da24ab
6 changed files with 120 additions and 36 deletions

View File

@ -2917,6 +2917,10 @@ void GPUEngineBase::_RenderPixelsCustom(void *__restrict dstColorLine, u8 *__res
(GPU->GetDisplayInfo().colorFormat == NDSColorFormat_BGR555_Rev) ? (u16 *)(dstColorLine16 + i) : (u16 *)(dstColorLine32 + i), (GPU->GetDisplayInfo().colorFormat == NDSColorFormat_BGR555_Rev) ? (u16 *)(dstColorLine16 + i) : (u16 *)(dstColorLine32 + i),
dstLayerID + i); dstLayerID + i);
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; i < dstPixCount; i++) for (; i < dstPixCount; i++)
{ {
@ -2969,6 +2973,10 @@ void GPUEngineBase::_RenderPixelsCustomVRAM(void *__restrict dstColorLine, u8 *_
(GPU->GetDisplayInfo().colorFormat == NDSColorFormat_BGR555_Rev) ? (u16 *)(dstColorLine16 + i) : (u16 *)(dstColorLine32 + i), (GPU->GetDisplayInfo().colorFormat == NDSColorFormat_BGR555_Rev) ? (u16 *)(dstColorLine16 + i) : (u16 *)(dstColorLine32 + i),
dstLayerID + i); dstLayerID + i);
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; i < dstPixCount; i++) for (; i < dstPixCount; i++)
{ {
@ -4004,6 +4012,10 @@ void GPUEngineBase::ApplyMasterBrightness()
const __m128i dstColor_vec128 = _mm_load_si128((__m128i *)((u16 *)dst + i)); const __m128i dstColor_vec128 = _mm_load_si128((__m128i *)((u16 *)dst + i));
_mm_store_si128( (__m128i *)((u16 *)dst + i), this->_ColorEffectIncreaseBrightness(dstColor_vec128, intensity_vec128) ); _mm_store_si128( (__m128i *)((u16 *)dst + i), this->_ColorEffectIncreaseBrightness(dstColor_vec128, intensity_vec128) );
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
@ -4077,6 +4089,10 @@ void GPUEngineBase::ApplyMasterBrightness()
const __m128i dstColor_vec128 = _mm_load_si128((__m128i *)((u16 *)dst + i)); const __m128i dstColor_vec128 = _mm_load_si128((__m128i *)((u16 *)dst + i));
_mm_store_si128( (__m128i *)((u16 *)dst + i), this->_ColorEffectDecreaseBrightness(dstColor_vec128, intensity_vec128) ); _mm_store_si128( (__m128i *)((u16 *)dst + i), this->_ColorEffectDecreaseBrightness(dstColor_vec128, intensity_vec128) );
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
@ -5041,6 +5057,10 @@ void* GPUEngineA::_RenderLine_Layers(const u16 l)
(dispInfo.colorFormat == NDSColorFormat_BGR555_Rev) ? (u16 *)(dstColorLine16 + dstX) : (u16 *)(dstColorLine32 + dstX), (dispInfo.colorFormat == NDSColorFormat_BGR555_Rev) ? (u16 *)(dstColorLine16 + dstX) : (u16 *)(dstColorLine32 + dstX),
dstLayerIDPtr + dstX); dstLayerIDPtr + dstX);
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; dstX < customLineWidth; dstX++) for (; dstX < customLineWidth; dstX++)
{ {
@ -5669,6 +5689,10 @@ void GPUEngineA::_RenderLine_DispCapture_Copy(const u16 *src, u16 *dst, const si
{ {
_mm_store_si128((__m128i *)(dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)(src + i)), alpha_vec128 ) ); _mm_store_si128((__m128i *)(dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)(src + i)), alpha_vec128 ) );
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; i < pixCountExt; i++) for (; i < pixCountExt; i++)
{ {
@ -5686,6 +5710,10 @@ void GPUEngineA::_RenderLine_DispCapture_Copy(const u16 *src, u16 *dst, const si
{ {
_mm_store_si128((__m128i *)(dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)(src + i)), alpha_vec128 ) ); _mm_store_si128((__m128i *)(dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)(src + i)), alpha_vec128 ) );
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; i < captureLengthExt; i++) for (; i < captureLengthExt; i++)
{ {
@ -5824,6 +5852,9 @@ void GPUEngineA::_RenderLine_DispCapture_BlendToCustomDstBuffer(const u16 *srcA,
} }
#endif #endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif
for (; i < length; i++) for (; i < length; i++)
{ {
const u16 colorA = (!CAPTUREFROMNATIVESRCA) ? srcA[i] : srcA[offset + i]; const u16 colorA = (!CAPTUREFROMNATIVESRCA) ? srcA[i] : srcA[offset + i];
@ -7095,6 +7126,9 @@ void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restric
} }
#endif #endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
dst[i] = ConvertColor555To8888Opaque<SWAP_RB>(src[i]); dst[i] = ConvertColor555To8888Opaque<SWAP_RB>(src[i]);
@ -7127,6 +7161,9 @@ void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restric
} }
#endif #endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
dst[i] = ConvertColor555To6665Opaque<SWAP_RB>(src[i]); dst[i] = ConvertColor555To6665Opaque<SWAP_RB>(src[i]);
@ -7146,6 +7183,9 @@ void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount)
} }
#endif #endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
dst[i] = ConvertColor8888To6665<SWAP_RB>(src[i]); dst[i] = ConvertColor8888To6665<SWAP_RB>(src[i]);
@ -7165,6 +7205,9 @@ void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount)
} }
#endif #endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
dst[i] = ConvertColor6665To8888<SWAP_RB>(src[i]); dst[i] = ConvertColor6665To8888<SWAP_RB>(src[i]);
@ -7191,6 +7234,9 @@ void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst
} }
#endif #endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
dst[i] = ConvertColor8888To5551<SWAP_RB>(src[i]); dst[i] = ConvertColor8888To5551<SWAP_RB>(src[i]);
@ -7217,6 +7263,9 @@ void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst
} }
#endif #endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
dst[i] = ConvertColor6665To5551<SWAP_RB>(src[i]); dst[i] = ConvertColor6665To5551<SWAP_RB>(src[i]);

View File

@ -1802,7 +1802,7 @@ template <bool SWAP_RB>
FORCEINLINE void ConvertColor555To6665Opaque(const __m128i src, __m128i &dstLo, __m128i &dstHi) FORCEINLINE void ConvertColor555To6665Opaque(const __m128i src, __m128i &dstLo, __m128i &dstHi)
{ {
// Conversion algorithm: // Conversion algorithm:
// RGB 5-bit to 6-bit formula: dstRGB8 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
if (SWAP_RB) if (SWAP_RB)
{ {
dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 17), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_slli_epi32(src, 12), _mm_set1_epi32(0x00010000))); dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 17), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_slli_epi32(src, 12), _mm_set1_epi32(0x00010000)));
@ -1836,6 +1836,9 @@ FORCEINLINE void ConvertColor555To6665Opaque(const __m128i src, __m128i &dstLo,
template <bool SWAP_RB> template <bool SWAP_RB>
FORCEINLINE __m128i ConvertColor8888To6665(const __m128i src) FORCEINLINE __m128i ConvertColor8888To6665(const __m128i src)
{ {
// Conversion algorithm:
// RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2)
// Alpha 8-bit to 5-bit formula: dstA5 = (srcA8 >> 3)
__m128i rgb; __m128i rgb;
const __m128i a = _mm_and_si128( _mm_srli_epi32(src, 3), _mm_set1_epi32(0x1F000000) ); const __m128i a = _mm_and_si128( _mm_srli_epi32(src, 3), _mm_set1_epi32(0x1F000000) );

View File

@ -994,6 +994,10 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
_mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), ConvertColor8888To6665<true>(srcColorHi) ); _mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), ConvertColor8888To6665<true>(srcColorHi) );
_mm_store_si128( (__m128i *)(dstRGBA5551 + i), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) ); _mm_store_si128( (__m128i *)(dstRGBA5551 + i), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
@ -1025,6 +1029,10 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
_mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), srcColorHi ); _mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), srcColorHi );
_mm_store_si128( (__m128i *)(dstRGBA5551 + i), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) ); _mm_store_si128( (__m128i *)(dstRGBA5551 + i), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
@ -1064,6 +1072,10 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
_mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), ConvertColor8888To6665<true>(srcColorHi) ); _mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), ConvertColor8888To6665<true>(srcColorHi) );
_mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) ); _mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; x < pixCount; x++, ir++, iw++) for (; x < pixCount; x++, ir++, iw++)
{ {
@ -1105,6 +1117,10 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
_mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), srcColorHi ); _mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), srcColorHi );
_mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) ); _mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
} }
#endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif #endif
for (; x < pixCount; x++, ir++, iw++) for (; x < pixCount; x++, ir++, iw++)
{ {

View File

@ -2059,6 +2059,9 @@ Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor
_mm_stream_si128((__m128i *)(this->_framebufferAttributes->isTranslucentPoly + i), attrIsTranslucentPoly_vec128); _mm_stream_si128((__m128i *)(this->_framebufferAttributes->isTranslucentPoly + i), attrIsTranslucentPoly_vec128);
} }
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif
for (; i < pixCount; i++) for (; i < pixCount; i++)
{ {
this->_framebufferColor[i] = clearColor6665; this->_framebufferColor[i] = clearColor6665;

View File

@ -237,6 +237,9 @@ void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr)
} }
#endif #endif
#ifdef ENABLE_SSE2
#pragma LOOPVECTORIZE_DISABLE
#endif
for (; i < count; i++) for (; i < count; i++)
{ {
this->SetAtIndex(i, attr); this->SetAtIndex(i, attr);

View File

@ -162,6 +162,16 @@
#endif #endif
#endif #endif
// LOOPVECTORIZE_DISABLE supplies the compiler-specific text of a loop hint that
// turns off auto-vectorization. It is written as "#pragma LOOPVECTORIZE_DISABLE"
// immediately before a scalar loop that follows a hand-coded SSE2 loop, so that
// the compiler does not generate a second, autovectorized SSE2 version of it.
// A definition supplied before this point takes precedence.
// NOTE(review): this relies on the compiler macro-expanding the tokens that
// follow #pragma, which is compiler-dependent -- confirm it takes effect on
// every targeted compiler (Clang in particular).
#ifndef LOOPVECTORIZE_DISABLE
// MSVC spelling of the hint: #pragma loop(no_vector)
#if defined(_MSC_VER)
#define LOOPVECTORIZE_DISABLE loop(no_vector)
// Clang spelling of the hint: #pragma clang loop vectorize(disable)
#elif defined(__clang__)
#define LOOPVECTORIZE_DISABLE clang loop vectorize(disable)
// Any other compiler: the macro is defined with no replacement text.
#else
#define LOOPVECTORIZE_DISABLE
#endif
#endif
#if defined(__LP64__) #if defined(__LP64__)
typedef unsigned char u8; typedef unsigned char u8;
typedef unsigned short u16; typedef unsigned short u16;