fix vs2010 compiling. gpu.cpp compiling is slow... :(

This commit is contained in:
zeromus 2016-06-24 18:29:00 +00:00
parent dde0da24ab
commit cdd5892c60
2 changed files with 15 additions and 11 deletions

View File

@ -7100,7 +7100,7 @@ void NDSDisplay::SetEngineByID(const GPUEngineID theID)
this->_gpu->SetDisplayByID(this->_ID); this->_gpu->SetDisplayByID(this->_ID);
} }
template <bool SWAP_RB, bool UNALIGNED> template <bool SWAP_RB, bool IS_UNALIGNED>
void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount)
{ {
size_t i = 0; size_t i = 0;
@ -7109,11 +7109,11 @@ void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restric
const size_t ssePixCount = pixCount - (pixCount % 8); const size_t ssePixCount = pixCount - (pixCount % 8);
for (; i < ssePixCount; i += 8) for (; i < ssePixCount; i += 8)
{ {
__m128i src_vec128 = (UNALIGNED) ? _mm_loadu_si128((__m128i *)(src + i)) : _mm_load_si128((__m128i *)(src + i)); __m128i src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((__m128i *)(src + i)) : _mm_load_si128((__m128i *)(src + i));
__m128i dstConvertedLo, dstConvertedHi; __m128i dstConvertedLo, dstConvertedHi;
ConvertColor555To8888Opaque<SWAP_RB>(src_vec128, dstConvertedLo, dstConvertedHi); ConvertColor555To8888Opaque<SWAP_RB>(src_vec128, dstConvertedLo, dstConvertedHi);
if (UNALIGNED) if (IS_UNALIGNED)
{ {
_mm_storeu_si128((__m128i *)(dst + i + 0), dstConvertedLo); _mm_storeu_si128((__m128i *)(dst + i + 0), dstConvertedLo);
_mm_storeu_si128((__m128i *)(dst + i + 4), dstConvertedHi); _mm_storeu_si128((__m128i *)(dst + i + 4), dstConvertedHi);
@ -7135,7 +7135,7 @@ void ConvertColorBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restric
} }
} }
template <bool SWAP_RB, bool UNALIGNED> template <bool SWAP_RB, bool IS_UNALIGNED>
void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount)
{ {
size_t i = 0; size_t i = 0;
@ -7144,11 +7144,11 @@ void ConvertColorBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restric
const size_t ssePixCount = pixCount - (pixCount % 8); const size_t ssePixCount = pixCount - (pixCount % 8);
for (; i < ssePixCount; i += 8) for (; i < ssePixCount; i += 8)
{ {
__m128i src_vec128 = (UNALIGNED) ? _mm_loadu_si128((__m128i *)(src + i)) : _mm_load_si128((__m128i *)(src + i)); __m128i src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((__m128i *)(src + i)) : _mm_load_si128((__m128i *)(src + i));
__m128i dstConvertedLo, dstConvertedHi; __m128i dstConvertedLo, dstConvertedHi;
ConvertColor555To6665Opaque<SWAP_RB>(src_vec128, dstConvertedLo, dstConvertedHi); ConvertColor555To6665Opaque<SWAP_RB>(src_vec128, dstConvertedLo, dstConvertedHi);
if (UNALIGNED) if (IS_UNALIGNED)
{ {
_mm_storeu_si128((__m128i *)(dst + i + 0), dstConvertedLo); _mm_storeu_si128((__m128i *)(dst + i + 0), dstConvertedLo);
_mm_storeu_si128((__m128i *)(dst + i + 4), dstConvertedHi); _mm_storeu_si128((__m128i *)(dst + i + 4), dstConvertedHi);
@ -7214,7 +7214,7 @@ void ConvertColorBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount)
} }
} }
template <bool SWAP_RB, bool UNALIGNED> template <bool SWAP_RB, bool IS_UNALIGNED>
void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount)
{ {
size_t i = 0; size_t i = 0;
@ -7223,7 +7223,7 @@ void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst
const size_t ssePixCount = pixCount - (pixCount % 8); const size_t ssePixCount = pixCount - (pixCount % 8);
for (; i < ssePixCount; i += 8) for (; i < ssePixCount; i += 8)
{ {
if (UNALIGNED) if (IS_UNALIGNED)
{ {
_mm_storeu_si128( (__m128i *)(dst + i), ConvertColor8888To5551<SWAP_RB>(_mm_loadu_si128((__m128i *)(src + i)), _mm_loadu_si128((__m128i *)(src + i + 4))) ); _mm_storeu_si128( (__m128i *)(dst + i), ConvertColor8888To5551<SWAP_RB>(_mm_loadu_si128((__m128i *)(src + i)), _mm_loadu_si128((__m128i *)(src + i + 4))) );
} }
@ -7243,7 +7243,7 @@ void ConvertColorBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst
} }
} }
template <bool SWAP_RB, bool UNALIGNED> template <bool SWAP_RB, bool IS_UNALIGNED>
void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount)
{ {
size_t i = 0; size_t i = 0;
@ -7252,7 +7252,7 @@ void ConvertColorBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst
const size_t ssePixCount = pixCount - (pixCount % 8); const size_t ssePixCount = pixCount - (pixCount % 8);
for (; i < ssePixCount; i += 8) for (; i < ssePixCount; i += 8)
{ {
if (UNALIGNED) if (IS_UNALIGNED)
{ {
_mm_storeu_si128( (__m128i *)(dst + i), ConvertColor6665To5551<SWAP_RB>(_mm_loadu_si128((__m128i *)(src + i)), _mm_loadu_si128((__m128i *)(src + i + 4))) ); _mm_storeu_si128( (__m128i *)(dst + i), ConvertColor6665To5551<SWAP_RB>(_mm_loadu_si128((__m128i *)(src + i)), _mm_loadu_si128((__m128i *)(src + i + 4))) );
} }

View File

@ -164,7 +164,11 @@
#ifndef LOOPVECTORIZE_DISABLE #ifndef LOOPVECTORIZE_DISABLE
#if defined(_MSC_VER) #if defined(_MSC_VER)
#if _MSC_VER >= 1700
#define LOOPVECTORIZE_DISABLE loop(no_vector) #define LOOPVECTORIZE_DISABLE loop(no_vector)
#else
#define LOOPVECTORIZE_DISABLE
#endif
#elif defined(__clang__) #elif defined(__clang__)
#define LOOPVECTORIZE_DISABLE clang loop vectorize(disable) #define LOOPVECTORIZE_DISABLE clang loop vectorize(disable)
#else #else