diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index b20182b12..cde834c55 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -5517,7 +5517,7 @@ void GPUEngineA::RenderLine_Layer3D(GPUEngineCompositorInfo &compInfo) } } -template +template void GPUEngineA::_RenderLine_DisplayCapture(const u16 l) { assert( (CAPTURELENGTH == GPU_FRAMEBUFFER_NATIVE_WIDTH/2) || (CAPTURELENGTH == GPU_FRAMEBUFFER_NATIVE_WIDTH) ); @@ -5708,11 +5708,11 @@ void GPUEngineA::_RenderLine_DisplayCapture(const u16 l) { if (this->isLineRenderNative[l]) { - this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst, CAPTURELENGTH, 1); + this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst, CAPTURELENGTH, 1); } else { - this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst, CAPTURELENGTH, 1); + this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst, CAPTURELENGTH, 1); } newCaptureLineNativeState = false; @@ -5727,11 +5727,11 @@ void GPUEngineA::_RenderLine_DisplayCapture(const u16 l) if (this->isLineRenderNative[l]) { - this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst, CAPTURELENGTH, 1); + this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst, CAPTURELENGTH, 1); } else { - this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst, CAPTURELENGTH, 1); + this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst, CAPTURELENGTH, 1); } newCaptureLineNativeState = this->isLineRenderNative[l]; @@ -5743,7 +5743,7 @@ void GPUEngineA::_RenderLine_DisplayCapture(const u16 l) { if ( (DISPCAPCNT.SrcB == 0) && !this->isLineCaptureNative[vramReadBlock][readLineIndexWithOffset] ) { - this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst, CAPTURELENGTH, 1); + this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst, CAPTURELENGTH, 1); newCaptureLineNativeState = false; } else @@ -5754,7 +5754,7 @@ void GPUEngineA::_RenderLine_DisplayCapture(const u16 l) this->_RenderLine_DispCapture_FIFOToBuffer(fifoLine); } - this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst, CAPTURELENGTH, 1); + this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst, CAPTURELENGTH, 1); newCaptureLineNativeState = true; } } @@ -5762,7 +5762,7 @@ void GPUEngineA::_RenderLine_DisplayCapture(const u16 l) { if ( (DISPCAPCNT.SrcB == 0) && !this->isLineCaptureNative[vramReadBlock][readLineIndexWithOffset] ) { - this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst, CAPTURELENGTH, 1); + this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst, CAPTURELENGTH, 1); } else { @@ -5772,7 +5772,7 @@ void GPUEngineA::_RenderLine_DisplayCapture(const u16 l) this->_RenderLine_DispCapture_FIFOToBuffer(fifoLine); } - this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst, CAPTURELENGTH, 1); + this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst, CAPTURELENGTH, 1); } newCaptureLineNativeState = false; @@ -5879,22 +5879,22 @@ void GPUEngineA::_RenderLine_DisplayCapture(const u16 l) { if (this->isLineRenderNative[l]) { - this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst_ext, captureLengthExt, captureLineCount); + this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst_ext, captureLengthExt, captureLineCount); } else { - this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst_ext, captureLengthExt, captureLineCount); + this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst_ext, captureLengthExt, captureLineCount); } } else { if (this->isLineRenderNative[l]) { - this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst_ext, captureLengthExt, captureLineCount); + this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst_ext, captureLengthExt, captureLineCount); } else { - this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst_ext, captureLengthExt, captureLineCount); + this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst_ext, captureLengthExt, captureLineCount); } } } @@ -5904,22 +5904,22 @@ void GPUEngineA::_RenderLine_DisplayCapture(const u16 l) { if ( (DISPCAPCNT.SrcB == 0) && !this->isLineCaptureNative[vramReadBlock][readLineIndexWithOffset] ) { - this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst_ext, captureLengthExt, captureLineCount); + this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst_ext, captureLengthExt, captureLineCount); } else { - this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst_ext, captureLengthExt, captureLineCount); + this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst_ext, captureLengthExt, captureLineCount); } } else { if ( (DISPCAPCNT.SrcB == 0) && !this->isLineCaptureNative[vramReadBlock][readLineIndexWithOffset] ) { - this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst_ext, captureLengthExt, captureLineCount); + this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst_ext, captureLengthExt, captureLineCount); } else { - this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst_ext, captureLengthExt, captureLineCount); + this->_RenderLine_DispCapture_Blend(srcA, srcB, cap_dst_ext, captureLengthExt, captureLineCount); } } } @@ -6244,7 +6244,7 @@ FragmentColor GPUEngineA::_RenderLine_DispCapture_BlendFunc(const FragmentColor } #ifdef ENABLE_SSE2 -template +template __m128i GPUEngineA::_RenderLine_DispCapture_BlendFunc_SSE2(const __m128i &srcA, const __m128i &srcB, const __m128i &blendEVA, const __m128i &blendEVB) { #ifdef ENABLE_SSSE3 @@ -6356,8 +6356,8 @@ __m128i GPUEngineA::_RenderLine_DispCapture_BlendFunc_SSE2(const __m128i &srcA, } #endif -template -void GPUEngineA::_RenderLine_DispCapture_BlendToCustomDstBuffer(const u16 *srcA, const u16 *srcB, u16 *dst, const u8 blendEVA, const u8 blendEVB, const size_t length, size_t l) +template +void GPUEngineA::_RenderLine_DispCapture_BlendToCustomDstBuffer(const void *srcA, const void *srcB, void *dst, const u8 blendEVA, const u8 blendEVB, const size_t length, size_t l) { #ifdef ENABLE_SSE2 const __m128i blendEVA_vec128 = _mm_set1_epi16(blendEVA); @@ -6368,49 +6368,91 @@ void GPUEngineA::_RenderLine_DispCapture_BlendToCustomDstBuffer(const u16 *srcA, size_t offset = _gpuDstToSrcIndex[_gpuDstLineIndex[l] * dispInfo.customWidth] - (l * GPU_FRAMEBUFFER_NATIVE_WIDTH); size_t i = 0; -#ifdef ENABLE_SSE2 - const size_t ssePixCount = length - (length % 8); - for (; i < ssePixCount; i += 8) + if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) { - __m128i srcA_vec128 = (!CAPTUREFROMNATIVESRCA) ? _mm_load_si128((__m128i *)(srcA + i)) : _mm_set_epi16(srcA[offset + i + 7], - srcA[offset + i + 6], - srcA[offset + i + 5], - srcA[offset + i + 4], - srcA[offset + i + 3], - srcA[offset + i + 2], - srcA[offset + i + 1], - srcA[offset + i + 0]); + const u32 *srcA_32 = (const u32 *)srcA; + const u32 *srcB_32 = (const u32 *)srcB; + FragmentColor *dst32 = (FragmentColor *)dst; - __m128i srcB_vec128 = (!CAPTUREFROMNATIVESRCB) ? _mm_load_si128((__m128i *)(srcB + i)) : _mm_set_epi16(srcB[offset + i + 7], - srcB[offset + i + 6], - srcB[offset + i + 5], - srcB[offset + i + 4], - srcB[offset + i + 3], - srcB[offset + i + 2], - srcB[offset + i + 1], - srcB[offset + i + 0]); - - _mm_store_si128( (__m128i *)(dst + i), this->_RenderLine_DispCapture_BlendFunc_SSE2(srcA_vec128, srcB_vec128, blendEVA_vec128, blendEVB_vec128) ); - } +#ifdef ENABLE_SSE2 + const size_t ssePixCount = length - (length % 4); + for (; i < ssePixCount; i+=4) + { + __m128i srcA_vec128 = (!CAPTUREFROMNATIVESRCA) ? _mm_load_si128((__m128i *)(srcA_32 + i)) : _mm_set_epi32(srcA_32[offset + i + 3], + srcA_32[offset + i + 2], + srcA_32[offset + i + 1], + srcA_32[offset + i + 0]); + + __m128i srcB_vec128 = (!CAPTUREFROMNATIVESRCB) ? _mm_load_si128((__m128i *)(srcB_32 + i)) : _mm_set_epi32(srcB_32[offset + i + 3], + srcB_32[offset + i + 2], + srcB_32[offset + i + 1], + srcB_32[offset + i + 0]); + + _mm_store_si128( (__m128i *)(dst32 + i), this->_RenderLine_DispCapture_BlendFunc_SSE2(srcA_vec128, srcB_vec128, blendEVA_vec128, blendEVB_vec128) ); + } #endif - + #ifdef ENABLE_SSE2 #pragma LOOPVECTORIZE_DISABLE #endif - for (; i < length; i++) + for (; i < length; i++) + { + const FragmentColor colorA = (!CAPTUREFROMNATIVESRCA) ? ((const FragmentColor *)srcA)[i] : ((const FragmentColor *)srcA)[offset + i]; + const FragmentColor colorB = (!CAPTUREFROMNATIVESRCB) ? ((const FragmentColor *)srcB)[i] : ((const FragmentColor *)srcB)[offset + i]; + + ((FragmentColor *)dst)[i] = this->_RenderLine_DispCapture_BlendFunc(colorA, colorB, blendEVA, blendEVB); + } + } + else { - const u16 colorA = (!CAPTUREFROMNATIVESRCA) ? srcA[i] : srcA[offset + i]; - const u16 colorB = (!CAPTUREFROMNATIVESRCB) ? srcB[i] : srcB[offset + i]; + const u16 *srcA_16 = (const u16 *)srcA; + const u16 *srcB_16 = (const u16 *)srcB; + u16 *dst16 = (u16 *)dst; - dst[i] = this->_RenderLine_DispCapture_BlendFunc(colorA, colorB, blendEVA, blendEVB); +#ifdef ENABLE_SSE2 + const size_t ssePixCount = length - (length % 8); + for (; i < ssePixCount; i+=8) + { + __m128i srcA_vec128 = (!CAPTUREFROMNATIVESRCA) ? _mm_load_si128((__m128i *)(srcA_16 + i)) : _mm_set_epi16(srcA_16[offset + i + 7], + srcA_16[offset + i + 6], + srcA_16[offset + i + 5], + srcA_16[offset + i + 4], + srcA_16[offset + i + 3], + srcA_16[offset + i + 2], + srcA_16[offset + i + 1], + srcA_16[offset + i + 0]); + + __m128i srcB_vec128 = (!CAPTUREFROMNATIVESRCB) ? _mm_load_si128((__m128i *)(srcB_16 + i)) : _mm_set_epi16(srcB_16[offset + i + 7], + srcB_16[offset + i + 6], + srcB_16[offset + i + 5], + srcB_16[offset + i + 4], + srcB_16[offset + i + 3], + srcB_16[offset + i + 2], + srcB_16[offset + i + 1], + srcB_16[offset + i + 0]); + + _mm_store_si128( (__m128i *)(dst16 + i), this->_RenderLine_DispCapture_BlendFunc_SSE2(srcA_vec128, srcB_vec128, blendEVA_vec128, blendEVB_vec128) ); + } +#endif + +#ifdef ENABLE_SSE2 +#pragma LOOPVECTORIZE_DISABLE +#endif + for (; i < length; i++) + { + const u16 colorA = (!CAPTUREFROMNATIVESRCA) ? srcA_16[i] : srcA_16[offset + i]; + const u16 colorB = (!CAPTUREFROMNATIVESRCB) ? srcB_16[i] : srcB_16[offset + i]; + + dst16[i] = this->_RenderLine_DispCapture_BlendFunc(colorA, colorB, blendEVA, blendEVB); + } } } -template -void GPUEngineA::_RenderLine_DispCapture_Blend(const u16 *srcA, const u16 *srcB, u16 *dst, const size_t captureLengthExt, const size_t l) +template +void GPUEngineA::_RenderLine_DispCapture_Blend(const void *srcA, const void *srcB, void *dst, const size_t captureLengthExt, const size_t l) { - const u8 blendEVA = GPU->GetEngineMain()->_dispCapCnt.EVA; - const u8 blendEVB = GPU->GetEngineMain()->_dispCapCnt.EVB; + const u8 blendEVA = this->_dispCapCnt.EVA; + const u8 blendEVB = this->_dispCapCnt.EVB; if (CAPTURETONATIVEDST) { @@ -6418,55 +6460,93 @@ void GPUEngineA::_RenderLine_DispCapture_Blend(const u16 *srcA, const u16 *srcB, const __m128i blendEVA_vec128 = _mm_set1_epi16(blendEVA); const __m128i blendEVB_vec128 = _mm_set1_epi16(blendEVB); - for (size_t i = 0; i < CAPTURELENGTH; i += 8) + if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) { - __m128i srcA_vec128 = (CAPTUREFROMNATIVESRCA) ? _mm_load_si128((__m128i *)(srcA + i)) : _mm_set_epi16(srcA[_gpuDstPitchIndex[i+7]], - srcA[_gpuDstPitchIndex[i+6]], - srcA[_gpuDstPitchIndex[i+5]], - srcA[_gpuDstPitchIndex[i+4]], - srcA[_gpuDstPitchIndex[i+3]], - srcA[_gpuDstPitchIndex[i+2]], - srcA[_gpuDstPitchIndex[i+1]], - srcA[_gpuDstPitchIndex[i+0]]); + const u32 *srcA_32 = (const u32 *)srcA; + const u32 *srcB_32 = (const u32 *)srcB; + FragmentColor *dst32 = (FragmentColor *)dst; - __m128i srcB_vec128 = (CAPTUREFROMNATIVESRCB) ? _mm_load_si128((__m128i *)(srcB + i)) : _mm_set_epi16(srcB[_gpuDstPitchIndex[i+7]], - srcB[_gpuDstPitchIndex[i+6]], - srcB[_gpuDstPitchIndex[i+5]], - srcB[_gpuDstPitchIndex[i+4]], - srcB[_gpuDstPitchIndex[i+3]], - srcB[_gpuDstPitchIndex[i+2]], - srcB[_gpuDstPitchIndex[i+1]], - srcB[_gpuDstPitchIndex[i+0]]); + for (size_t i = 0; i < CAPTURELENGTH; i+=4) + { + __m128i srcA_vec128 = (CAPTUREFROMNATIVESRCA) ? _mm_load_si128((__m128i *)(srcA_32 + i)) : _mm_set_epi32(srcA_32[_gpuDstPitchIndex[i+3]], + srcA_32[_gpuDstPitchIndex[i+2]], + srcA_32[_gpuDstPitchIndex[i+1]], + srcA_32[_gpuDstPitchIndex[i+0]]); + + __m128i srcB_vec128 = (CAPTUREFROMNATIVESRCB) ? _mm_load_si128((__m128i *)(srcB_32 + i)) : _mm_set_epi32(srcB_32[_gpuDstPitchIndex[i+3]], + srcB_32[_gpuDstPitchIndex[i+2]], + srcB_32[_gpuDstPitchIndex[i+1]], + srcB_32[_gpuDstPitchIndex[i+0]]); + + _mm_store_si128( (__m128i *)(dst32 + i), this->_RenderLine_DispCapture_BlendFunc_SSE2(srcA_vec128, srcB_vec128, blendEVA_vec128, blendEVB_vec128) ); + } + } + else + { + const u16 *srcA_16 = (const u16 *)srcA; + const u16 *srcB_16 = (const u16 *)srcB; + u16 *dst16 = (u16 *)dst; - _mm_store_si128( (__m128i *)(dst + i), this->_RenderLine_DispCapture_BlendFunc_SSE2(srcA_vec128, srcB_vec128, blendEVA_vec128, blendEVB_vec128) ); + for (size_t i = 0; i < CAPTURELENGTH; i+=8) + { + __m128i srcA_vec128 = (CAPTUREFROMNATIVESRCA) ? _mm_load_si128((__m128i *)(srcA_16 + i)) : _mm_set_epi16(srcA_16[_gpuDstPitchIndex[i+7]], + srcA_16[_gpuDstPitchIndex[i+6]], + srcA_16[_gpuDstPitchIndex[i+5]], + srcA_16[_gpuDstPitchIndex[i+4]], + srcA_16[_gpuDstPitchIndex[i+3]], + srcA_16[_gpuDstPitchIndex[i+2]], + srcA_16[_gpuDstPitchIndex[i+1]], + srcA_16[_gpuDstPitchIndex[i+0]]); + + __m128i srcB_vec128 = (CAPTUREFROMNATIVESRCB) ? _mm_load_si128((__m128i *)(srcB_16 + i)) : _mm_set_epi16(srcB_16[_gpuDstPitchIndex[i+7]], + srcB_16[_gpuDstPitchIndex[i+6]], + srcB_16[_gpuDstPitchIndex[i+5]], + srcB_16[_gpuDstPitchIndex[i+4]], + srcB_16[_gpuDstPitchIndex[i+3]], + srcB_16[_gpuDstPitchIndex[i+2]], + srcB_16[_gpuDstPitchIndex[i+1]], + srcB_16[_gpuDstPitchIndex[i+0]]); + + _mm_store_si128( (__m128i *)(dst16 + i), this->_RenderLine_DispCapture_BlendFunc_SSE2(srcA_vec128, srcB_vec128, blendEVA_vec128, blendEVB_vec128) ); + } } #else for (size_t i = 0; i < CAPTURELENGTH; i++) { - const u16 colorA = (CAPTUREFROMNATIVESRCA) ? srcA[i] : srcA[_gpuDstPitchIndex[i]]; - const u16 colorB = (CAPTUREFROMNATIVESRCB) ? srcB[i] : srcB[_gpuDstPitchIndex[i]]; - - dst[i] = this->_RenderLine_DispCapture_BlendFunc(colorA, colorB, blendEVA, blendEVB); + if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) + { + const FragmentColor colorA = (CAPTUREFROMNATIVESRCA) ? ((const FragmentColor *)srcA)[i] : ((const FragmentColor *)srcA)[_gpuDstPitchIndex[i]]; + const FragmentColor colorB = (CAPTUREFROMNATIVESRCB) ? ((const FragmentColor *)srcB)[i] : ((const FragmentColor *)srcB)[_gpuDstPitchIndex[i]]; + + ((FragmentColor *)dst)[i] = this->_RenderLine_DispCapture_BlendFunc(colorA, colorB, blendEVA, blendEVB); + } + else + { + const u16 colorA = (CAPTUREFROMNATIVESRCA) ? ((u16 *)srcA)[i] : ((u16 *)srcA)[_gpuDstPitchIndex[i]]; + const u16 colorB = (CAPTUREFROMNATIVESRCB) ? ((u16 *)srcB)[i] : ((u16 *)srcB)[_gpuDstPitchIndex[i]]; + + ((u16 *)dst)[i] = this->_RenderLine_DispCapture_BlendFunc(colorA, colorB, blendEVA, blendEVB); + } } #endif } else { - const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo(); + const size_t lineWidth = GPU->GetDisplayInfo().customWidth; const size_t captureLineCount = _gpuCaptureLineCount[l]; if (CAPTURELENGTH == GPU_FRAMEBUFFER_NATIVE_WIDTH) { - this->_RenderLine_DispCapture_BlendToCustomDstBuffer(srcA, srcB, dst, blendEVA, blendEVB, captureLengthExt * captureLineCount, l); + this->_RenderLine_DispCapture_BlendToCustomDstBuffer(srcA, srcB, dst, blendEVA, blendEVB, captureLengthExt * captureLineCount, l); } else { for (size_t line = 0; line < captureLineCount; line++) { - this->_RenderLine_DispCapture_BlendToCustomDstBuffer(srcA, srcB, dst, blendEVA, blendEVB, captureLengthExt, l); - srcA += dispInfo.customWidth; - srcB += dispInfo.customWidth; - dst += dispInfo.customWidth; + this->_RenderLine_DispCapture_BlendToCustomDstBuffer(srcA, srcB, dst, blendEVA, blendEVB, captureLengthExt, l); + srcA = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? (void *)((FragmentColor *)srcA + lineWidth) : (void *)((u16 *)srcA + lineWidth); + srcB = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? (void *)((FragmentColor *)srcB + lineWidth) : (void *)((u16 *)srcB + lineWidth); + dst = (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) ? (void *)((FragmentColor *)dst + lineWidth) : (void *)((u16 *)dst + lineWidth); } } } diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index be994077d..47c1455db 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -1567,11 +1567,11 @@ protected: template __m128i _RenderLine_DispCapture_BlendFunc_SSE2(const __m128i &srcA, const __m128i &srcB, const __m128i &blendEVA, const __m128i &blendEVB); #endif - template - void _RenderLine_DispCapture_BlendToCustomDstBuffer(const u16 *srcA, const u16 *srcB, u16 *dst, const u8 blendEVA, const u8 blendEVB, const size_t length, size_t l); // Do not use restrict pointers, since srcB and dst can be the same + template + void _RenderLine_DispCapture_BlendToCustomDstBuffer(const void *srcA, const void *srcB, void *dst, const u8 blendEVA, const u8 blendEVB, const size_t length, size_t l); // Do not use restrict pointers, since srcB and dst can be the same - template - void _RenderLine_DispCapture_Blend(const u16 *srcA, const u16 *srcB, u16 *dst, const size_t captureLengthExt, const size_t l); // Do not use restrict pointers, since srcB and dst can be the same + template + void _RenderLine_DispCapture_Blend(const void *srcA, const void *srcB, void *dst, const size_t captureLengthExt, const size_t l); // Do not use restrict pointers, since srcB and dst can be the same template void _HandleDisplayModeVRAM(const size_t l); template void _HandleDisplayModeMainMemory(const size_t l);