GPU:
- Avoid generating autovectorized SSE2 code for loops where a hand-coded SSE2 loop already exists. (MSVC and Clang only.)
This commit is contained in:
parent
3f895b85fb
commit
dde0da24ab
|
@ -2917,6 +2917,10 @@ void GPUEngineBase::_RenderPixelsCustom(void *__restrict dstColorLine, u8 *__res
|
|||
(GPU->GetDisplayInfo().colorFormat == NDSColorFormat_BGR555_Rev) ? (u16 *)(dstColorLine16 + i) : (u16 *)(dstColorLine32 + i),
|
||||
dstLayerID + i);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; i < dstPixCount; i++)
|
||||
{
|
||||
|
@ -2969,6 +2973,10 @@ void GPUEngineBase::_RenderPixelsCustomVRAM(void *__restrict dstColorLine, u8 *_
|
|||
(GPU->GetDisplayInfo().colorFormat == NDSColorFormat_BGR555_Rev) ? (u16 *)(dstColorLine16 + i) : (u16 *)(dstColorLine32 + i),
|
||||
dstLayerID + i);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; i < dstPixCount; i++)
|
||||
{
|
||||
|
@ -4004,6 +4012,10 @@ void GPUEngineBase::ApplyMasterBrightness()
|
|||
const __m128i dstColor_vec128 = _mm_load_si128((__m128i *)((u16 *)dst + i));
|
||||
_mm_store_si128( (__m128i *)((u16 *)dst + i), this->_ColorEffectIncreaseBrightness(dstColor_vec128, intensity_vec128) );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
|
@ -4077,6 +4089,10 @@ void GPUEngineBase::ApplyMasterBrightness()
|
|||
const __m128i dstColor_vec128 = _mm_load_si128((__m128i *)((u16 *)dst + i));
|
||||
_mm_store_si128( (__m128i *)((u16 *)dst + i), this->_ColorEffectDecreaseBrightness(dstColor_vec128, intensity_vec128) );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
|
@ -5041,6 +5057,10 @@ void* GPUEngineA::_RenderLine_Layers(const u16 l)
|
|||
(dispInfo.colorFormat == NDSColorFormat_BGR555_Rev) ? (u16 *)(dstColorLine16 + dstX) : (u16 *)(dstColorLine32 + dstX),
|
||||
dstLayerIDPtr + dstX);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; dstX < customLineWidth; dstX++)
|
||||
{
|
||||
|
@ -5669,6 +5689,10 @@ void GPUEngineA::_RenderLine_DispCapture_Copy(const u16 *src, u16 *dst, const si
|
|||
{
|
||||
_mm_store_si128((__m128i *)(dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)(src + i)), alpha_vec128 ) );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; i < pixCountExt; i++)
|
||||
{
|
||||
|
@ -5686,6 +5710,10 @@ void GPUEngineA::_RenderLine_DispCapture_Copy(const u16 *src, u16 *dst, const si
|
|||
{
|
||||
_mm_store_si128((__m128i *)(dst + i), _mm_or_si128( _mm_load_si128( (__m128i *)(src + i)), alpha_vec128 ) );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; i < captureLengthExt; i++)
|
||||
{
|
||||
|
@ -5824,6 +5852,9 @@ void GPUEngineA::_RenderLine_DispCapture_BlendToCustomDstBuffer(const u16 *srcA,
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; i < length; i++)
|
||||
{
|
||||
const u16 colorA = (!CAPTUREFROMNATIVESRCA) ? srcA[i] : srcA[offset + i];
|
||||
|
@ -7095,6 +7126,9 @@ void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restric
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
dst[i] = ConvertColor555To8888Opaque<SWAP_RB>(src[i]);
|
||||
|
@ -7127,6 +7161,9 @@ void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restric
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
dst[i] = ConvertColor555To6665Opaque<SWAP_RB>(src[i]);
|
||||
|
@ -7146,6 +7183,9 @@ void ConvertColorBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount)
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
dst[i] = ConvertColor8888To6665<SWAP_RB>(src[i]);
|
||||
|
@ -7165,6 +7205,9 @@ void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount)
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
dst[i] = ConvertColor6665To8888<SWAP_RB>(src[i]);
|
||||
|
@ -7191,6 +7234,9 @@ void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
dst[i] = ConvertColor8888To5551<SWAP_RB>(src[i]);
|
||||
|
@ -7217,6 +7263,9 @@ void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
dst[i] = ConvertColor6665To5551<SWAP_RB>(src[i]);
|
||||
|
|
|
@ -1802,7 +1802,7 @@ template <bool SWAP_RB>
|
|||
FORCEINLINE void ConvertColor555To6665Opaque(const __m128i src, __m128i &dstLo, __m128i &dstHi)
|
||||
{
|
||||
// Conversion algorithm:
|
||||
// RGB 5-bit to 6-bit formula: dstRGB8 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
|
||||
// RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01)
|
||||
if (SWAP_RB)
|
||||
{
|
||||
dstLo = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(src, 17), _mm_set1_epi32(0x003E0000)), _mm_and_si128(_mm_slli_epi32(src, 12), _mm_set1_epi32(0x00010000)));
|
||||
|
@ -1836,6 +1836,9 @@ FORCEINLINE void ConvertColor555To6665Opaque(const __m128i src, __m128i &dstLo,
|
|||
template <bool SWAP_RB>
|
||||
FORCEINLINE __m128i ConvertColor8888To6665(const __m128i src)
|
||||
{
|
||||
// Conversion algorithm:
|
||||
// RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2)
|
||||
// Alpha 8-bit to 5-bit formula: dstA5 = (srcA8 >> 3)
|
||||
__m128i rgb;
|
||||
const __m128i a = _mm_and_si128( _mm_srli_epi32(src, 3), _mm_set1_epi32(0x1F000000) );
|
||||
|
||||
|
|
|
@ -994,6 +994,10 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
|
|||
_mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), ConvertColor8888To6665<true>(srcColorHi) );
|
||||
_mm_store_si128( (__m128i *)(dstRGBA5551 + i), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
|
@ -1025,6 +1029,10 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
|
|||
_mm_store_si128( (__m128i *)(dstFramebuffer + i + 4), srcColorHi );
|
||||
_mm_store_si128( (__m128i *)(dstRGBA5551 + i), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
|
@ -1064,6 +1072,10 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
|
|||
_mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), ConvertColor8888To6665<true>(srcColorHi) );
|
||||
_mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; x < pixCount; x++, ir++, iw++)
|
||||
{
|
||||
|
@ -1105,6 +1117,10 @@ Render3DError OpenGLRenderer::_FlushFramebufferConvertOnCPU(const FragmentColor
|
|||
_mm_store_si128( (__m128i *)(dstFramebuffer + iw + 4), srcColorHi );
|
||||
_mm_store_si128( (__m128i *)(dstRGBA5551 + iw), ConvertColor8888To5551<true>(srcColorLo, srcColorHi) );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; x < pixCount; x++, ir++, iw++)
|
||||
{
|
||||
|
|
|
@ -2059,6 +2059,9 @@ Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor
|
|||
_mm_stream_si128((__m128i *)(this->_framebufferAttributes->isTranslucentPoly + i), attrIsTranslucentPoly_vec128);
|
||||
}
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
this->_framebufferColor[i] = clearColor6665;
|
||||
|
|
|
@ -237,6 +237,9 @@ void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr)
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#pragma LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
for (; i < count; i++)
|
||||
{
|
||||
this->SetAtIndex(i, attr);
|
||||
|
|
|
@ -35,13 +35,13 @@
|
|||
|
||||
//enforce a constraint: gdb stub requires developer
|
||||
#if defined(GDB_STUB) && !defined(DEVELOPER)
|
||||
#define DEVELOPER
|
||||
#define DEVELOPER
|
||||
#endif
|
||||
|
||||
#ifdef DEVELOPER
|
||||
#define IF_DEVELOPER(X) X
|
||||
#define IF_DEVELOPER(X) X
|
||||
#else
|
||||
#define IF_DEVELOPER(X)
|
||||
#define IF_DEVELOPER(X)
|
||||
#endif
|
||||
|
||||
#ifdef HOST_WINDOWS
|
||||
|
@ -96,19 +96,19 @@
|
|||
//don't apply these to types without further testing. It only works portably here on declarations of variables
|
||||
//can't we find a pattern other people use more successfully?
|
||||
#if _MSC_VER >= 1900
|
||||
#define DS_ALIGN(X) alignas(X)
|
||||
#define DS_ALIGN(X) alignas(X)
|
||||
#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)
|
||||
#define DS_ALIGN(X) __declspec(align(X))
|
||||
#define DS_ALIGN(X) __declspec(align(X))
|
||||
#elif defined(__GNUC__)
|
||||
#define DS_ALIGN(X) __attribute__ ((aligned (X)))
|
||||
#define DS_ALIGN(X) __attribute__ ((aligned (X)))
|
||||
#else
|
||||
#define DS_ALIGN(X)
|
||||
#define DS_ALIGN(X)
|
||||
#endif
|
||||
|
||||
#ifdef HOST_64
|
||||
#define CACHE_ALIGN_SIZE 64
|
||||
#define CACHE_ALIGN_SIZE 64
|
||||
#else
|
||||
#define CACHE_ALIGN_SIZE 32
|
||||
#define CACHE_ALIGN_SIZE 32
|
||||
#endif
|
||||
|
||||
//use this for example when you want a byte value to be better-aligned
|
||||
|
@ -117,49 +117,59 @@
|
|||
//---------------------------------------------
|
||||
|
||||
#ifdef __MINGW32__
|
||||
#define FASTCALL __attribute__((fastcall))
|
||||
#define ASMJIT_CALL_CONV kX86FuncConvGccFastCall
|
||||
#define FASTCALL __attribute__((fastcall))
|
||||
#define ASMJIT_CALL_CONV kX86FuncConvGccFastCall
|
||||
#elif defined (__i386__) && !defined(__clang__)
|
||||
#define FASTCALL __attribute__((regparm(3)))
|
||||
#define ASMJIT_CALL_CONV kX86FuncConvGccRegParm3
|
||||
#define FASTCALL __attribute__((regparm(3)))
|
||||
#define ASMJIT_CALL_CONV kX86FuncConvGccRegParm3
|
||||
#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)
|
||||
#define FASTCALL
|
||||
#define ASMJIT_CALL_CONV kX86FuncConvDefault
|
||||
#define FASTCALL
|
||||
#define ASMJIT_CALL_CONV kX86FuncConvDefault
|
||||
#else
|
||||
#define FASTCALL
|
||||
#define ASMJIT_CALL_CONV kX86FuncConvDefault
|
||||
#define FASTCALL
|
||||
#define ASMJIT_CALL_CONV kX86FuncConvDefault
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CDECL_ __cdecl
|
||||
#define _CDECL_ __cdecl
|
||||
#else
|
||||
#define _CDECL_
|
||||
#define _CDECL_
|
||||
#endif
|
||||
|
||||
#ifndef INLINE
|
||||
#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
|
||||
#define INLINE _inline
|
||||
#else
|
||||
#define INLINE inline
|
||||
#endif
|
||||
#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
|
||||
#define INLINE _inline
|
||||
#else
|
||||
#define INLINE inline
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef FORCEINLINE
|
||||
#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
|
||||
#define FORCEINLINE __forceinline
|
||||
#define MSC_FORCEINLINE __forceinline
|
||||
#else
|
||||
#define FORCEINLINE inline __attribute__((always_inline))
|
||||
#define MSC_FORCEINLINE
|
||||
#endif
|
||||
#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
|
||||
#define FORCEINLINE __forceinline
|
||||
#define MSC_FORCEINLINE __forceinline
|
||||
#else
|
||||
#define FORCEINLINE inline __attribute__((always_inline))
|
||||
#define MSC_FORCEINLINE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef NOINLINE
|
||||
#ifdef __GNUC__
|
||||
#define NOINLINE __attribute__((noinline))
|
||||
#else
|
||||
#define NOINLINE
|
||||
#ifdef __GNUC__
|
||||
#define NOINLINE __attribute__((noinline))
|
||||
#else
|
||||
#define NOINLINE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef LOOPVECTORIZE_DISABLE
|
||||
#if defined(_MSC_VER)
|
||||
#define LOOPVECTORIZE_DISABLE loop(no_vector)
|
||||
#elif defined(__clang__)
|
||||
#define LOOPVECTORIZE_DISABLE clang loop vectorize(disable)
|
||||
#else
|
||||
#define LOOPVECTORIZE_DISABLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__LP64__)
|
||||
|
|
Loading…
Reference in New Issue