From f3c3228c70df318c6dcd07aa0984f6d3f5771403 Mon Sep 17 00:00:00 2001 From: rogerman Date: Tue, 7 Sep 2021 02:13:35 -0700 Subject: [PATCH] Windows Port: Fix Windows build. (Regression from commit 037d328.) --- desmume/src/GPU.cpp | 8 ++++++-- desmume/src/GPU_Operations.cpp | 11 ++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index 42c122be4..fbe286f97 100755 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -4585,15 +4585,16 @@ void GPUEngineA::_RenderLine_DispCapture_Blend_Buffer(const void *srcA, const vo #ifdef USEMANUALVECTORIZATION i = this->_RenderLine_DispCapture_Blend_VecLoop(srcA, srcB, dst, blendEVA, blendEVB, length); -#pragma LOOPVECTORIZE_DISABLE #endif - if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) { const FragmentColor *srcA_32 = (const FragmentColor *)srcA; const FragmentColor *srcB_32 = (const FragmentColor *)srcB; FragmentColor *dst32 = (FragmentColor *)dst; +#ifdef USEMANUALVECTORIZATION +#pragma LOOPVECTORIZE_DISABLE +#endif for (; i < length; i++) { const FragmentColor colorA = srcA_32[i]; @@ -4608,6 +4609,9 @@ void GPUEngineA::_RenderLine_DispCapture_Blend_Buffer(const void *srcA, const vo const u16 *srcB_16 = (const u16 *)srcB; u16 *dst16 = (u16 *)dst; +#ifdef USEMANUALVECTORIZATION +#pragma LOOPVECTORIZE_DISABLE +#endif for (; i < length; i++) { const u16 colorA = srcA_16[i]; diff --git a/desmume/src/GPU_Operations.cpp b/desmume/src/GPU_Operations.cpp index 0916fe1cc..b258daee1 100644 --- a/desmume/src/GPU_Operations.cpp +++ b/desmume/src/GPU_Operations.cpp @@ -1184,7 +1184,12 @@ void CopyLineExpandHinted(const void *__restrict srcBuffer, const size_t srcLine case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 4): CopyLineExpand<4, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 4, 4); break; - + +// Building on MSVC takes too long when LTO is on (typical use case), so remove these extra calls to +// CopyLineExpand() in order to reduce the number of permutations and make build times more sane. +// Other compilers, such as GCC and Clang, have no problems with building using LTO within a +// reasonable time frame. +#ifndef _MSC_VER case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 5): CopyLineExpand<5, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 5, 5); break; @@ -1200,7 +1205,7 @@ void CopyLineExpandHinted(const void *__restrict srcBuffer, const size_t srcLine case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 8): CopyLineExpand<8, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 8, 8); break; - + case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 9): CopyLineExpand<9, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 9, 9); break; @@ -1232,7 +1237,7 @@ void CopyLineExpandHinted(const void *__restrict srcBuffer, const size_t srcLine case (GPU_FRAMEBUFFER_NATIVE_WIDTH * 16): CopyLineExpand<16, SCALEVERTICAL, NEEDENDIANSWAP, ELEMENTSIZE>(dst, src, GPU_FRAMEBUFFER_NATIVE_WIDTH * 16, 16); break; - +#endif default: { if ((dstLineWidth % GPU_FRAMEBUFFER_NATIVE_WIDTH) == 0)