GPU:
- Avoid generating autovectorized SSE2 code for loops where a hand-coded SSE2 loop already exists. (MSVC and Clang only.)
This commit is contained in:
parent
3f895b85fb
commit
dde0da24ab
|
@ -2917,6 +2917,10 @@ void GPUEngineBase::_RenderPixelsCustom(void *__restrict dstColorLine, u8 *__res
|
||||||
(GPU->GetDisplayInfo().colorFormat == NDSColorFormat_BGR555_Rev) ? (u16 *)(dstColorLine16 + i) : (u16 *)(dstColorLine32 + i),
|
(GPU->GetDisplayInfo().colorFormat == NDSColorFormat_BGR555_Rev) ? (u16 *)(dstColorLine16 + i) : (u16 *)(dstColorLine32 + i),
|
||||||
dstLayerID + i);
|
dstLayerID + i);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef ENABLE_SSE2
|
||||||
|
#pragma LOOPVECTORIZE_DISABLE
|
||||||
#endif
|
#endif
|
||||||
for (; i < dstPixCount; i++)
|
for (; i < dstPixCount; i++)
|
||||||
{
|
{
|
||||||
|
@ -2969,6 +2973,10 @@ void GPUEngineBase::_RenderPixelsCustomVRAM(void *__restrict dstColorLine, u8 *_
|
||||||
(GPU->GetDisplayInfo().colorFormat == NDSColorFormat_BGR555_Rev) ? (u16 *)(dstColorLine16 + i) : (u16 *)(dstColorLine32 + i),
|
(GPU->GetDisplayInfo().colorFormat == NDSColorFormat_BGR555_Rev) ? (u16 *)(dstColorLine16 + i) : (u16 *)(dstColorLine32 + i),
|
||||||
dstLayerID + i);
|
dstLayerID + i);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef ENABLE_SSE2
|
||||||
|
#pragma LOOPVECTORIZE_DISABLE
|
||||||
#endif
|
#endif
|
||||||
for (; i < dstPixCount; i++)
|
for (; i < dstPixCount; i++)
|
||||||
{
|
{
|
||||||
|
@ -4004,6 +4012,10 @@ void GPUEngineBase::ApplyMasterBrightness()
|
||||||
const __m128i dstColor_vec128 = _mm_load_si128((__m128i *)((u16 *)dst + i));
|
const __m128i dstColor_vec128 = _mm_load_si128((__m128i *)((u16 *)dst + i));
|
||||||
_mm_store_si128( (__m128i *)((u16 *)dst + i), this->_ColorEffectIncreaseBrightness(dstColor_vec128, intensity_vec128) );
|
_mm_store_si128( (__m128i *)((u16 *)dst + i), this->_ColorEffectIncreaseBrightness(dstColor_vec128, intensity_vec128) );
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef ENABLE_SSE2
|
||||||
|
#pragma LOOPVECTORIZE_DISABLE
|
||||||
#endif
|
#endif
|
||||||
for (; i < pixCount; i++)
|
for (; i < pixCount; i++)
|
||||||
{
|
{
|
||||||
|
@ -4077,6 +4089,10 @@ void GPUEngineBase::ApplyMasterBrightness()
|
||||||
const __m128i dstColor_vec128 = _mm_load_si128((__m128i *)((u16 *)dst + i));
|
const __m128i dstColor_vec128 = _mm_load_si128((__m128i *)((u16 *)dst + i));
|
||||||
_mm_store_si128( (__m128i *)((u16 *)dst + i), this->_ColorEffectDecreaseBrightness(dstColor_vec128, intensity_vec128) );
|
_mm_store_si128( (__m128i *)((u16 *)dst + i), this->_ColorEffectDecreaseBrightness(dstColor_vec128, intensity_vec128) );
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef ENABLE_SSE2
|
||||||
|
#pragma LOOPVECTORIZE_DISABLE
|
||||||
#endif
|
#endif
|
||||||
for (; i < pixCount; i++)
|
for (; i < pixCount; i++)
|
||||||
{
|
{
|
||||||
|
@ -5041,6 +5057,10 @@ void* GPUEngineA::_RenderLine_Layers(const u16 l)
|
||||||
(dispInfo.colorFormat == NDSColorFormat_BGR555_Rev) ? (u16 *)(dstColorLine16 + dstX) : (u16 *)(dstColorLine32 + dstX),
|
(dispInfo.colorFormat == NDSColorFormat_BGR555_Rev) ? (u16 *)(dstColorLine16 + dstX) : (u16 *)(dstColorLine32 + dstX),
|
||||||
dstLayerIDPtr + dstX);
|
dstLayerIDPtr + dstX);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef ENABLE_SSE2
|
||||||
|
#pragma LOOPVECTORIZE_DISABLE
|
||||||
#endif
|
#endif
|
||||||
for (; dstX < customLineWidth; dstX++)
|
for (; dstX < customLineWidth; dstX++)
|
||||||
{
|
{
|
||||||
|
@ -5669,6 +5689,10 @@ void GPUEngineA::_RenderLine_DispCapture_Copy(const u16 *src, u16 *dst, const si
|
||||||
{
|
{
|
||||||
_mm_store_si128((__m128i *)(dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)(src + i)), alpha_vec128 ) );
|
_mm_store_si128((__m128i *)(dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)(src + i)), alpha_vec128 ) );
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef ENABLE_SSE2
|
||||||
|
#pragma LOOPVECTORIZE_DISABLE
|
||||||
#endif
|
#endif
|
||||||
for (; i < pixCountExt; i++)
|
for (; i < pixCountExt; i++)
|
||||||
{
|
{
|
||||||
|
@ -5686,6 +5710,10 @@ void GPUEngineA::_RenderLine_DispCapture_Copy(const u16 *src, u16 *dst, const si
|
||||||
{
|
{
|
||||||
_mm_store_si128((__m128i *)(dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)(src + i)), alpha_vec128 ) );
|
_mm_store_si128((__m128i *)(dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)(src + i)), alpha_vec128 ) );
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef ENABLE_SSE2
|
||||||
|
#pragma LOOPVECTORIZE_DISABLE
|
||||||
#endif
|
#endif
|
||||||
for (; i < captureLengthExt; i++)
|
for (; i < captureLengthExt; i++)
|
||||||
{
|
{
|
||||||
|
@ -5824,6 +5852,9 @@ void GPUEngineA::_RenderLine_DispCapture_BlendToCustomDstBuffer(const u16 *srcA,
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef ENABLE_SSE2
|
||||||
|
#pragma LOOPVECTORIZE_DISABLE
|
||||||
|
#endif
|
||||||
for (; i < length; i++)
|
for (; i < length; i++)
|
||||||
{
|
{
|
||||||
const u16 colorA = (!CAPTUREFROMNATIVESRCA) ? srcA[i] : srcA[offset + i];
|
const u16 colorA = (!CAPTUREFROMNATIVESRCA) ? srcA[i] : srcA[offset + i];
|
||||||
|
@ -7095,6 +7126,9 @@ void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restric
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef ENABLE_SSE2
|
||||||
|
#pragma LOOPVECTORIZE_DISABLE
|
||||||
|
#endif
|
||||||
for (; i < pixCount; i++)
|
for (; i < pixCount; i++)
|
||||||
{
|
{
|
||||||
dst[i] = ConvertColor555To8888Opaque<SWAP_RB>(src[i]);
|
dst[i] = ConvertColor555To8888Opaque<SWAP_RB>(src[i]);
|
||||||
|
@ -7127,6 +7161,9 @@ void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restric
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef ENABLE_SSE2
|
||||||
|
#pragma LOOPVECTORIZE_DISABLE
|
||||||
|
#endif
|
||||||
for (; i < pixCount; i++)
|
for (; i < pixCount; i++)
|
||||||
{
|
{
|
||||||
dst[i] = ConvertColor555To6665Opaque<SWAP_RB>(src[i]);
|
dst[i] = ConvertColor555To6665Opaque<SWAP_RB>(src[i]);
|
||||||
|
@ -7146,6 +7183,9 @@ void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount)
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef ENABLE_SSE2
|
||||||
|
#pragma LOOPVECTORIZE_DISABLE
|
||||||
|
#endif
|
||||||
for (; i < pixCount; i++)
|
for (; i < pixCount; i++)
|
||||||
{
|
{
|
||||||
dst[i] = ConvertColor8888To6665<SWAP_RB>(src[i]);
|
dst[i] = ConvertColor8888To6665<SWAP_RB>(src[i]);
|
||||||
|
@ -7165,6 +7205,9 @@ void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount)
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef ENABLE_SSE2
|
||||||
|
#pragma LOOPVECTORIZE_DISABLE
|
||||||
|
#endif
|
||||||
for (; i < pixCount; i++)
|
for (; i < pixCount; i++)
|
||||||
{
|
{
|
||||||
dst[i] = ConvertColor6665To8888<SWAP_RB>(src[i]);
|
dst[i] = ConvertColor6665To8888<SWAP_RB>(src[i]);
|
||||||
|
@ -7191,6 +7234,9 @@ void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef ENABLE_SSE2
|
||||||
|
#pragma LOOPVECTORIZE_DISABLE
|
||||||
|
#endif
|
||||||
for (; i < pixCount; i++)
|
for (; i < pixCount; i++)
|
||||||
{
|
{
|
||||||
dst[i] = ConvertColor8888To5551<SWAP_RB>(src[i]);
|
dst[i] = ConvertColor8888To5551<SWAP_RB>(src[i]);
|
||||||
|
@ -7217,6 +7263,9 @@ void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef ENABLE_SSE2
|
||||||
|
#pragma LOOPVECTORIZE_DISABLE
|
||||||
|
#endif
|
||||||
for (; i < pixCount; i++)
|
for (; i < pixCount; i++)
|
||||||
{
|
{
|
||||||
dst[i] = ConvertColor6665To5551<SWAP_RB>(src[i]);
|
dst[i] = ConvertColor6665To5551<SWAP_RB>(src[i]);
|
||||||
|
|
|
@ -1802,7 +1802,7 @@ template <bool SWAP_RB>
|
||||||
FORCEINLINE void ConvertColor555To6665Opaque(const __m128i src, __m128i &dstLo, __m128i &dstHi)
|
FORCEINLINE void ConvertColor555To6665Opaque(const __m128i src, __m128i &dstLo, __m128i &dstHi)
|
||||||
{
|
{
|
||||||
// Conversion algorithm:
|
// Conversion algorithm:
|
||||||
// RGB 5-bit to 6-bit formula: dstRGB8 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
|
// RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
|
||||||
if (SWAP_RB)
|
if (SWAP_RB)
|
||||||
{
|
{
|
||||||
dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 17), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_slli_epi32(src, 12), _mm_set1_epi32(0x00010000)));
|
dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 17), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_slli_epi32(src, 12), _mm_set1_epi32(0x00010000)));
|
||||||
|
@ -1836,6 +1836,9 @@ FORCEINLINE void ConvertColor555To6665Opaque(const __m128i src, __m128i &dstLo,
|
||||||
template <bool SWAP_RB>
|
template <bool SWAP_RB>
|
||||||
FORCEINLINE __m128i ConvertColor8888To6665(const __m128i src)
|
FORCEINLINE __m128i ConvertColor8888To6665(const __m128i src)
|
||||||
{
|
{
|
||||||
|
// Conversion algorithm:
|
||||||
|
// RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2)
|
||||||
|
// Alpha 8-bit to 5-bit formula: dstA5 = (srcA8 >> 3)
|
||||||
__m128i rgb;
|
__m128i rgb;
|
||||||
const __m128i a = _mm_and_si128( _mm_srli_epi32(src, 3), _mm_set1_epi32(0x1F000000) );
|
const __m128i a = _mm_and_si128( _mm_srli_epi32(src, 3), _mm_set1_epi32(0x1F000000) );
|
||||||
|
|
||||||
|
|
|
@ -994,6 +994,10 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
|
||||||
_mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), ConvertColor8888To6665<true>(srcColorHi) );
|
_mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), ConvertColor8888To6665<true>(srcColorHi) );
|
||||||
_mm_store_si128( (__m128i *)(dstRGBA5551 + i), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
|
_mm_store_si128( (__m128i *)(dstRGBA5551 + i), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef ENABLE_SSE2
|
||||||
|
#pragma LOOPVECTORIZE_DISABLE
|
||||||
#endif
|
#endif
|
||||||
for (; i < pixCount; i++)
|
for (; i < pixCount; i++)
|
||||||
{
|
{
|
||||||
|
@ -1025,6 +1029,10 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
|
||||||
_mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), srcColorHi );
|
_mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), srcColorHi );
|
||||||
_mm_store_si128( (__m128i *)(dstRGBA5551 + i), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
|
_mm_store_si128( (__m128i *)(dstRGBA5551 + i), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef ENABLE_SSE2
|
||||||
|
#pragma LOOPVECTORIZE_DISABLE
|
||||||
#endif
|
#endif
|
||||||
for (; i < pixCount; i++)
|
for (; i < pixCount; i++)
|
||||||
{
|
{
|
||||||
|
@ -1064,6 +1072,10 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
|
||||||
_mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), ConvertColor8888To6665<true>(srcColorHi) );
|
_mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), ConvertColor8888To6665<true>(srcColorHi) );
|
||||||
_mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
|
_mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef ENABLE_SSE2
|
||||||
|
#pragma LOOPVECTORIZE_DISABLE
|
||||||
#endif
|
#endif
|
||||||
for (; x < pixCount; x++, ir++, iw++)
|
for (; x < pixCount; x++, ir++, iw++)
|
||||||
{
|
{
|
||||||
|
@ -1105,6 +1117,10 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
|
||||||
_mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), srcColorHi );
|
_mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), srcColorHi );
|
||||||
_mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
|
_mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef ENABLE_SSE2
|
||||||
|
#pragma LOOPVECTORIZE_DISABLE
|
||||||
#endif
|
#endif
|
||||||
for (; x < pixCount; x++, ir++, iw++)
|
for (; x < pixCount; x++, ir++, iw++)
|
||||||
{
|
{
|
||||||
|
|
|
@ -2059,6 +2059,9 @@ Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor
|
||||||
_mm_stream_si128((__m128i *)(this->_framebufferAttributes->isTranslucentPoly + i), attrIsTranslucentPoly_vec128);
|
_mm_stream_si128((__m128i *)(this->_framebufferAttributes->isTranslucentPoly + i), attrIsTranslucentPoly_vec128);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef ENABLE_SSE2
|
||||||
|
#pragma LOOPVECTORIZE_DISABLE
|
||||||
|
#endif
|
||||||
for (; i < pixCount; i++)
|
for (; i < pixCount; i++)
|
||||||
{
|
{
|
||||||
this->_framebufferColor[i] = clearColor6665;
|
this->_framebufferColor[i] = clearColor6665;
|
||||||
|
|
|
@ -237,6 +237,9 @@ void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr)
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef ENABLE_SSE2
|
||||||
|
#pragma LOOPVECTORIZE_DISABLE
|
||||||
|
#endif
|
||||||
for (; i < count; i++)
|
for (; i < count; i++)
|
||||||
{
|
{
|
||||||
this->SetAtIndex(i, attr);
|
this->SetAtIndex(i, attr);
|
||||||
|
|
|
@ -162,6 +162,16 @@
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifndef LOOPVECTORIZE_DISABLE
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
#define LOOPVECTORIZE_DISABLE loop(no_vector)
|
||||||
|
#elif defined(__clang__)
|
||||||
|
#define LOOPVECTORIZE_DISABLE clang loop vectorize(disable)
|
||||||
|
#else
|
||||||
|
#define LOOPVECTORIZE_DISABLE
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(__LP64__)
|
#if defined(__LP64__)
|
||||||
typedef unsigned char u8;
|
typedef unsigned char u8;
|
||||||
typedef unsigned short u16;
|
typedef unsigned short u16;
|
||||||
|
|
Loading…
Reference in New Issue