diff --git a/desmume/src/FIFO.cpp b/desmume/src/FIFO.cpp index 38f10876b..46e9c94e5 100644 --- a/desmume/src/FIFO.cpp +++ b/desmume/src/FIFO.cpp @@ -1,7 +1,7 @@ /* Copyright 2006 yopyop Copyright 2007 shash - Copyright 2007-2015 DeSmuME team + Copyright 2007-2021 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -29,6 +29,29 @@ #include "NDSSystem.h" #include "gfx3d.h" +#if defined(ENABLE_AVX512_1) + #define USEVECTORSIZE_512 + #define VECTORSIZE 64 + #include "./utils/colorspacehandler/colorspacehandler_AVX512.h" +#elif defined(ENABLE_AVX2) + #define USEVECTORSIZE_256 + #define VECTORSIZE 32 + #include "./utils/colorspacehandler/colorspacehandler_AVX2.h" +#elif defined(ENABLE_SSE2) + #define USEVECTORSIZE_128 + #define VECTORSIZE 16 + #include "./utils/colorspacehandler/colorspacehandler_SSE2.h" +#elif defined(ENABLE_ALTIVEC) + #define USEVECTORSIZE_128 + #define VECTORSIZE 16 + #include "./utils/colorspacehandler/colorspacehandler_AltiVec.h" +#endif + +#if defined(USEVECTORSIZE_512) || defined(USEVECTORSIZE_256) || defined(USEVECTORSIZE_128) + #define USEMANUALVECTORIZATION +#endif + + // ========================================================= IPC FIFO IPC_FIFO ipc_fifo[2]; @@ -317,23 +340,233 @@ void DISP_FIFOinit() memset(&disp_fifo, 0, sizeof(DISP_FIFO)); } -void DISP_FIFOsend(u32 val) +void DISP_FIFOsend_u32(u32 val) { //INFO("DISP_FIFO send value 0x%08X (head 0x%06X, tail 0x%06X)\n", val, disp_fifo.head, disp_fifo.tail); disp_fifo.buf[disp_fifo.tail] = val; + disp_fifo.tail++; - if (disp_fifo.tail > 0x5FFF) - disp_fifo.tail = 0; + if (disp_fifo.head >= 0x6000) + { + disp_fifo.head -= 0x6000; + } } -u32 DISP_FIFOrecv() +u32 DISP_FIFOrecv_u32() { //if (disp_fifo.tail == disp_fifo.head) return (0); // FIFO is empty u32 val = disp_fifo.buf[disp_fifo.head]; + disp_fifo.head++; - if (disp_fifo.head > 0x5FFF) - disp_fifo.head = 0; - return (val); + if (disp_fifo.head >= 0x6000) + { + disp_fifo.head -= 0x6000; + } + + return val; +} + +static void _DISP_FIFOrecv_LineAdvance() +{ + disp_fifo.head += (GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16)) / sizeof(u32); + if (disp_fifo.head >= 0x6000) + { + disp_fifo.head -= 0x6000; + } +} + +void DISP_FIFOrecv_Line16(u16 *__restrict dst) +{ +#ifndef ENABLE_ALTIVEC // buffer_copy_fast() doesn't support endian swapping + if ( (disp_fifo.head + (GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16)) / sizeof(u32) <= 0x6000) +#ifdef USEMANUALVECTORIZATION + && (disp_fifo.head == (disp_fifo.head & ~(VECTORSIZE - 1))) +#endif + ) + { + buffer_copy_fast(dst, disp_fifo.buf + disp_fifo.head); + _DISP_FIFOrecv_LineAdvance(); + } + else +#endif // ENABLE_ALTIVEC + { + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++) + { + ((u32 *)dst)[i] = LE_TO_LOCAL_32( DISP_FIFOrecv_u32() ); + } + } +} + +#ifdef USEMANUALVECTORIZATION + +template +void _DISP_FIFOrecv_LineOpaque16_vec(u32 *__restrict dst) +{ + buffer_copy_or_constant_s16_fast(dst, disp_fifo.buf + disp_fifo.head, 0x8000); + _DISP_FIFOrecv_LineAdvance(); +} + +template +void _DISP_FIFOrecv_LineOpaque32_vec(u32 *__restrict dst) +{ +#if defined(ENABLE_AVX512_0) + for (size_t i = 0, d = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(v512u16); i++, d+=2) + { + const v512u16 fifoColor = _mm512_load_si512((v512u16 *)(disp_fifo.buf + disp_fifo.head)); + + disp_fifo.head += (sizeof(v512u16)/sizeof(u32)); + if (disp_fifo.head >= 0x6000) + { + disp_fifo.head -= 0x6000; + } + + v512u32 dstLo = _mm512_setzero_si512(); + v512u32 dstHi = _mm512_setzero_si512(); + + if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) + { + ColorspaceConvert555To6665Opaque_AVX512(fifoColor, dstLo, dstHi); + } + else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) + { + ColorspaceConvert555To8888Opaque_AVX512(fifoColor, dstLo, dstHi); + } + + _mm512_store_si512((v512u32 *)dst + d + 0, dstLo); + _mm512_store_si512((v512u32 *)dst + d + 1, dstHi); + } +#elif defined(ENABLE_AVX2) + for (size_t i = 0, d = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(v256u16); i++, d+=2) + { + const v256u16 fifoColor = _mm256_load_si256((v256u16 *)(disp_fifo.buf + disp_fifo.head)); + + disp_fifo.head += (sizeof(v256u16)/sizeof(u32)); + if (disp_fifo.head >= 0x6000) + { + disp_fifo.head -= 0x6000; + } + + v256u32 dstLo = _mm256_setzero_si256(); + v256u32 dstHi = _mm256_setzero_si256(); + + if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) + { + ColorspaceConvert555To6665Opaque_AVX2(fifoColor, dstLo, dstHi); + } + else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) + { + ColorspaceConvert555To8888Opaque_AVX2(fifoColor, dstLo, dstHi); + } + + _mm256_store_si256((v256u32 *)dst + d + 0, dstLo); + _mm256_store_si256((v256u32 *)dst + d + 1, dstHi); + } +#elif defined(ENABLE_SSE2) + for (size_t i = 0, d = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(v128u16); i++, d+=2) + { + const v128u16 fifoColor = _mm_load_si128((v128u16 *)(disp_fifo.buf + disp_fifo.head)); + + disp_fifo.head += (sizeof(v128u16)/sizeof(u32)); + if (disp_fifo.head >= 0x6000) + { + disp_fifo.head -= 0x6000; + } + + v128u32 dstLo = _mm_setzero_si128(); + v128u32 dstHi = _mm_setzero_si128(); + + if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) + { + ColorspaceConvert555To6665Opaque_SSE2(fifoColor, dstLo, dstHi); + } + else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) + { + ColorspaceConvert555To8888Opaque_SSE2(fifoColor, dstLo, dstHi); + } + + _mm_store_si128((v128u32 *)dst + d + 0, dstLo); + _mm_store_si128((v128u32 *)dst + d + 1, dstHi); + } +#elif defined(ENABLE_ALTIVEC) + for (size_t i = 0, d = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16); i+=16, d+=32) + { + const v128u16 fifoColor = vec_ld(disp_fifo.head, disp_fifo.buf); + + disp_fifo.head += (sizeof(v128u16)/sizeof(u32)); + if (disp_fifo.head >= 0x6000) + { + disp_fifo.head -= 0x6000; + } + + v128u32 dstLo = ((v128u32){0,0,0,0}); + v128u32 dstHi = ((v128u32){0,0,0,0}); + + if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) + { + ColorspaceConvert555To6665Opaque_Altivec(fifoColor, dstLo, dstHi); + } + else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) + { + ColorspaceConvert555To8888Opaque_Altivec(fifoColor, dstLo, dstHi); + } + + vec_st(dstLo, d + 0, dst); + vec_st(dstHi, d + 16, dst); + } +#endif +} + +#endif + +template +void DISP_FIFOrecv_LineOpaque(u32 *__restrict dst) +{ +#ifdef USEMANUALVECTORIZATION + if ( (disp_fifo.head + (GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16)) / sizeof(u32) <= 0x6000) && (disp_fifo.head == (disp_fifo.head & ~(VECTORSIZE - 1))) ) + { + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + _DISP_FIFOrecv_LineOpaque16_vec(dst); + } + else + { + _DISP_FIFOrecv_LineOpaque32_vec(dst); + } + } + else +#endif + { + if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev) + { + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++) + { + const u32 src = DISP_FIFOrecv_u32(); +#ifdef MSB_FIRST + dst[i] = (src >> 16) | (src << 16) | 0x80008000; +#else + dst[i] = src | 0x80008000; +#endif + } + } + else + { + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i+=2) + { + const u32 src = DISP_FIFOrecv_u32(); + + if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) + { + dst[i+0] = COLOR555TO6665_OPAQUE((src >> 0) & 0x7FFF); + dst[i+1] = COLOR555TO6665_OPAQUE((src >> 16) & 0x7FFF); + } + else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) + { + dst[i+0] = COLOR555TO8888_OPAQUE((src >> 0) & 0x7FFF); + dst[i+1] = COLOR555TO8888_OPAQUE((src >> 16) & 0x7FFF); + } + } + } + } } void DISP_FIFOreset() @@ -341,3 +574,7 @@ void DISP_FIFOreset() disp_fifo.head = 0; disp_fifo.tail = 0; } + +template void DISP_FIFOrecv_LineOpaque(u32 *__restrict dst); +template void DISP_FIFOrecv_LineOpaque(u32 *__restrict dst); +template void DISP_FIFOrecv_LineOpaque(u32 *__restrict dst); diff --git a/desmume/src/FIFO.h b/desmume/src/FIFO.h index fbca2d620..da1d85aff 100644 --- a/desmume/src/FIFO.h +++ b/desmume/src/FIFO.h @@ -1,7 +1,7 @@ /* Copyright 2006 yopyop Copyright 2007 shash - Copyright 2007-2011 DeSmuME team + Copyright 2007-2021 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -22,6 +22,7 @@ #define FIFO_H #include "types.h" +#include "./utils/colorspacehandler/colorspacehandler.h" //=================================================== IPC FIFO typedef struct @@ -78,15 +79,20 @@ void GFX_FIFOcnt(u32 val); //=================================================== Display memory FIFO typedef struct { - u32 buf[0x6000]; // 256x192 32K color - u32 head; // head - u32 tail; // tail + CACHE_ALIGN u32 buf[0x6000]; // 256x192 32K color + u32 head; // head + u32 tail; // tail } DISP_FIFO; extern DISP_FIFO disp_fifo; void DISP_FIFOinit(); -void DISP_FIFOsend(u32 val); -u32 DISP_FIFOrecv(); + +void DISP_FIFOsend_u32(u32 val); +u32 DISP_FIFOrecv_u32(); + +void DISP_FIFOrecv_Line16(u16 *__restrict dst); +template void DISP_FIFOrecv_LineOpaque(u32 *__restrict dst); + void DISP_FIFOreset(); #endif diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index b6aed0d89..091bb6ebf 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -2,7 +2,7 @@ Copyright (C) 2006 yopyop Copyright (C) 2006-2007 Theo Berkau Copyright (C) 2007 shash - Copyright (C) 2008-2019 DeSmuME team + Copyright (C) 2008-2021 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -7784,22 +7784,7 @@ void GPUEngineA::_RenderLine_DisplayCapture(const GPUEngineCompositorInfo &compI void GPUEngineA::_RenderLine_DispCapture_FIFOToBuffer(u16 *fifoLineBuffer) { -#ifdef ENABLE_SSE2 - for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(__m128i); i++) - { - const u32 srcA = DISP_FIFOrecv(); - const u32 srcB = DISP_FIFOrecv(); - const u32 srcC = DISP_FIFOrecv(); - const u32 srcD = DISP_FIFOrecv(); - const __m128i fifoColor = _mm_setr_epi32(srcA, srcB, srcC, srcD); - _mm_store_si128((__m128i *)fifoLineBuffer + i, fifoColor); - } -#else - for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++) - { - ((u32 *)fifoLineBuffer)[i] = LE_TO_LOCAL_32( DISP_FIFOrecv() ); - } -#endif + DISP_FIFOrecv_Line16(fifoLineBuffer); } template @@ -8461,100 +8446,23 @@ void GPUEngineA::_HandleDisplayModeMainMemory(const GPUEngineLineInfo &lineInfo) // Displays video using color data directly read from main memory. // Doing this should always result in an output line that is at the native size (192px x 1px). -#ifdef ENABLE_SSE2 switch (OUTPUTFORMAT) { case NDSColorFormat_BGR555_Rev: { u32 *__restrict dst = (u32 *__restrict)((u16 *)this->_nativeBuffer + (lineInfo.indexNative * GPU_FRAMEBUFFER_NATIVE_WIDTH)); - const __m128i alphaBit = _mm_set1_epi16(0x8000); - - for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(__m128i); i++) - { - const u32 srcA = DISP_FIFOrecv(); - const u32 srcB = DISP_FIFOrecv(); - const u32 srcC = DISP_FIFOrecv(); - const u32 srcD = DISP_FIFOrecv(); - const __m128i fifoColor = _mm_setr_epi32(srcA, srcB, srcC, srcD); - _mm_store_si128((__m128i *)dst + i, _mm_or_si128(fifoColor, alphaBit)); - } + DISP_FIFOrecv_LineOpaque(dst); break; } case NDSColorFormat_BGR666_Rev: case NDSColorFormat_BGR888_Rev: { - FragmentColor *__restrict dst = (FragmentColor *__restrict)this->_nativeBuffer + (lineInfo.indexNative * GPU_FRAMEBUFFER_NATIVE_WIDTH); - - for (size_t i = 0, d = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(__m128i); i++, d+=2) - { - const u32 srcA = DISP_FIFOrecv(); - const u32 srcB = DISP_FIFOrecv(); - const u32 srcC = DISP_FIFOrecv(); - const u32 srcD = DISP_FIFOrecv(); - const __m128i fifoColor = _mm_setr_epi32(srcA, srcB, srcC, srcD); - - __m128i dstLo = _mm_setzero_si128(); - __m128i dstHi = _mm_setzero_si128(); - - if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev) - { - ColorspaceConvert555To6665Opaque_SSE2(fifoColor, dstLo, dstHi); - } - else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev) - { - ColorspaceConvert555To8888Opaque_SSE2(fifoColor, dstLo, dstHi); - } - - _mm_store_si128((__m128i *)dst + d + 0, dstLo); - _mm_store_si128((__m128i *)dst + d + 1, dstHi); - } + u32 *__restrict dst = (u32 *__restrict)this->_nativeBuffer + (lineInfo.indexNative * GPU_FRAMEBUFFER_NATIVE_WIDTH); + DISP_FIFOrecv_LineOpaque(dst); break; } } -#else - switch (OUTPUTFORMAT) - { - case NDSColorFormat_BGR555_Rev: - { - u32 *__restrict dst = (u32 *__restrict)((u16 *)this->_nativeBuffer + (lineInfo.indexNative * GPU_FRAMEBUFFER_NATIVE_WIDTH)); - for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++) - { - const u32 src = DISP_FIFOrecv(); -#ifdef MSB_FIRST - dst[i] = (src >> 16) | (src << 16) | 0x80008000; -#else - dst[i] = src | 0x80008000; -#endif - } - break; - } - - case NDSColorFormat_BGR666_Rev: - { - FragmentColor *__restrict dst = (FragmentColor *__restrict)this->_nativeBuffer + (lineInfo.indexNative * GPU_FRAMEBUFFER_NATIVE_WIDTH); - for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i+=2) - { - const u32 src = DISP_FIFOrecv(); - dst[i+0].color = COLOR555TO6665_OPAQUE((src >> 0) & 0x7FFF); - dst[i+1].color = COLOR555TO6665_OPAQUE((src >> 16) & 0x7FFF); - } - break; - } - - case NDSColorFormat_BGR888_Rev: - { - FragmentColor *__restrict dst = (FragmentColor *__restrict)this->_nativeBuffer + (lineInfo.indexNative * GPU_FRAMEBUFFER_NATIVE_WIDTH); - for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i+=2) - { - const u32 src = DISP_FIFOrecv(); - dst[i+0].color = COLOR555TO8888_OPAQUE((src >> 0) & 0x7FFF); - dst[i+1].color = COLOR555TO8888_OPAQUE((src >> 16) & 0x7FFF); - } - break; - } - } -#endif } template diff --git a/desmume/src/MMU.cpp b/desmume/src/MMU.cpp index ebbaafeb9..6097e24d5 100644 --- a/desmume/src/MMU.cpp +++ b/desmume/src/MMU.cpp @@ -1,7 +1,7 @@ /* Copyright (C) 2006 yopyop Copyright (C) 2007 shash - Copyright (C) 2007-2018 DeSmuME team + Copyright (C) 2007-2021 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -3427,7 +3427,7 @@ void FASTCALL _MMU_ARM9_write08(u32 adr, u8 val) return; case REG_DISPA_DISPMMEMFIFO: - DISP_FIFOsend(val); + DISP_FIFOsend_u32(val); return; case REG_DISPB_BG0HOFS: @@ -3936,7 +3936,7 @@ void FASTCALL _MMU_ARM9_write16(u32 adr, u16 val) return; case REG_DISPA_DISPMMEMFIFO: - DISP_FIFOsend(val); + DISP_FIFOsend_u32(val); return; case REG_DISPA_MASTERBRIGHT: @@ -4503,7 +4503,7 @@ void FASTCALL _MMU_ARM9_write32(u32 adr, u32 val) return; case REG_DISPA_DISPMMEMFIFO: - DISP_FIFOsend(val); + DISP_FIFOsend_u32(val); return; case REG_DISPA_MASTERBRIGHT: