FIFO.cpp: Add new manually vectorized versions of FIFO buffer line copies for AVX2, AVX-512, and AltiVec.

This commit is contained in:
rogerman 2021-08-26 18:05:53 -07:00
parent 79437371e3
commit 31851c2524
4 changed files with 266 additions and 115 deletions

View File

@ -1,7 +1,7 @@
/*
Copyright 2006 yopyop
Copyright 2007 shash
Copyright 2007-2015 DeSmuME team
Copyright 2007-2021 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -29,6 +29,29 @@
#include "NDSSystem.h"
#include "gfx3d.h"
#if defined(ENABLE_AVX512_1)
#define USEVECTORSIZE_512
#define VECTORSIZE 64
#include "./utils/colorspacehandler/colorspacehandler_AVX512.h"
#elif defined(ENABLE_AVX2)
#define USEVECTORSIZE_256
#define VECTORSIZE 32
#include "./utils/colorspacehandler/colorspacehandler_AVX2.h"
#elif defined(ENABLE_SSE2)
#define USEVECTORSIZE_128
#define VECTORSIZE 16
#include "./utils/colorspacehandler/colorspacehandler_SSE2.h"
#elif defined(ENABLE_ALTIVEC)
#define USEVECTORSIZE_128
#define VECTORSIZE 16
#include "./utils/colorspacehandler/colorspacehandler_AltiVec.h"
#endif
#if defined(USEVECTORSIZE_512) || defined(USEVECTORSIZE_256) || defined(USEVECTORSIZE_128)
#define USEMANUALVECTORIZATION
#endif
// ========================================================= IPC FIFO
IPC_FIFO ipc_fifo[2];
@ -317,23 +340,233 @@ void DISP_FIFOinit()
memset(&disp_fifo, 0, sizeof(DISP_FIFO));
}
void DISP_FIFOsend(u32 val)
void DISP_FIFOsend_u32(u32 val)
{
//INFO("DISP_FIFO send value 0x%08X (head 0x%06X, tail 0x%06X)\n", val, disp_fifo.head, disp_fifo.tail);
disp_fifo.buf[disp_fifo.tail] = val;
disp_fifo.tail++;
if (disp_fifo.tail > 0x5FFF)
disp_fifo.tail = 0;
if (disp_fifo.head >= 0x6000)
{
disp_fifo.head -= 0x6000;
}
}
u32 DISP_FIFOrecv()
u32 DISP_FIFOrecv_u32()
{
//if (disp_fifo.tail == disp_fifo.head) return (0); // FIFO is empty
u32 val = disp_fifo.buf[disp_fifo.head];
disp_fifo.head++;
if (disp_fifo.head > 0x5FFF)
disp_fifo.head = 0;
return (val);
if (disp_fifo.head >= 0x6000)
{
disp_fifo.head -= 0x6000;
}
return val;
}
static void _DISP_FIFOrecv_LineAdvance()
{
disp_fifo.head += (GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16)) / sizeof(u32);
if (disp_fifo.head >= 0x6000)
{
disp_fifo.head -= 0x6000;
}
}
void DISP_FIFOrecv_Line16(u16 *__restrict dst)
{
#ifndef ENABLE_ALTIVEC // buffer_copy_fast() doesn't support endian swapping
if ( (disp_fifo.head + (GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16)) / sizeof(u32) <= 0x6000)
#ifdef USEMANUALVECTORIZATION
&& (disp_fifo.head == (disp_fifo.head & ~(VECTORSIZE - 1)))
#endif
)
{
buffer_copy_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16)>(dst, disp_fifo.buf + disp_fifo.head);
_DISP_FIFOrecv_LineAdvance();
}
else
#endif // ENABLE_ALTIVEC
{
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++)
{
((u32 *)dst)[i] = LE_TO_LOCAL_32( DISP_FIFOrecv_u32() );
}
}
}
#ifdef USEMANUALVECTORIZATION
template <NDSColorFormat OUTPUTFORMAT>
void _DISP_FIFOrecv_LineOpaque16_vec(u32 *__restrict dst)
{
buffer_copy_or_constant_s16_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16), false>(dst, disp_fifo.buf + disp_fifo.head, 0x8000);
_DISP_FIFOrecv_LineAdvance();
}
template <NDSColorFormat OUTPUTFORMAT>
void _DISP_FIFOrecv_LineOpaque32_vec(u32 *__restrict dst)
{
#if defined(ENABLE_AVX512_0)
for (size_t i = 0, d = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(v512u16); i++, d+=2)
{
const v512u16 fifoColor = _mm512_load_si512((v512u16 *)(disp_fifo.buf + disp_fifo.head));
disp_fifo.head += (sizeof(v512u16)/sizeof(u32));
if (disp_fifo.head >= 0x6000)
{
disp_fifo.head -= 0x6000;
}
v512u32 dstLo = _mm512_setzero_si512();
v512u32 dstHi = _mm512_setzero_si512();
if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
{
ColorspaceConvert555To6665Opaque_AVX512<false>(fifoColor, dstLo, dstHi);
}
else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
{
ColorspaceConvert555To8888Opaque_AVX512<false>(fifoColor, dstLo, dstHi);
}
_mm512_store_si512((v512u32 *)dst + d + 0, dstLo);
_mm512_store_si512((v512u32 *)dst + d + 1, dstHi);
}
#elif defined(ENABLE_AVX2)
for (size_t i = 0, d = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(v256u16); i++, d+=2)
{
const v256u16 fifoColor = _mm256_load_si256((v256u16 *)(disp_fifo.buf + disp_fifo.head));
disp_fifo.head += (sizeof(v256u16)/sizeof(u32));
if (disp_fifo.head >= 0x6000)
{
disp_fifo.head -= 0x6000;
}
v256u32 dstLo = _mm256_setzero_si256();
v256u32 dstHi = _mm256_setzero_si256();
if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
{
ColorspaceConvert555To6665Opaque_AVX2<false>(fifoColor, dstLo, dstHi);
}
else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
{
ColorspaceConvert555To8888Opaque_AVX2<false>(fifoColor, dstLo, dstHi);
}
_mm256_store_si256((v256u32 *)dst + d + 0, dstLo);
_mm256_store_si256((v256u32 *)dst + d + 1, dstHi);
}
#elif defined(ENABLE_SSE2)
for (size_t i = 0, d = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(v128u16); i++, d+=2)
{
const v128u16 fifoColor = _mm_load_si128((v128u16 *)(disp_fifo.buf + disp_fifo.head));
disp_fifo.head += (sizeof(v128u16)/sizeof(u32));
if (disp_fifo.head >= 0x6000)
{
disp_fifo.head -= 0x6000;
}
v128u32 dstLo = _mm_setzero_si128();
v128u32 dstHi = _mm_setzero_si128();
if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
{
ColorspaceConvert555To6665Opaque_SSE2<false>(fifoColor, dstLo, dstHi);
}
else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
{
ColorspaceConvert555To8888Opaque_SSE2<false>(fifoColor, dstLo, dstHi);
}
_mm_store_si128((v128u32 *)dst + d + 0, dstLo);
_mm_store_si128((v128u32 *)dst + d + 1, dstHi);
}
#elif defined(ENABLE_ALTIVEC)
for (size_t i = 0, d = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16); i+=16, d+=32)
{
const v128u16 fifoColor = vec_ld(disp_fifo.head, disp_fifo.buf);
disp_fifo.head += (sizeof(v128u16)/sizeof(u32));
if (disp_fifo.head >= 0x6000)
{
disp_fifo.head -= 0x6000;
}
v128u32 dstLo = ((v128u32){0,0,0,0});
v128u32 dstHi = ((v128u32){0,0,0,0});
if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
{
ColorspaceConvert555To6665Opaque_Altivec<false>(fifoColor, dstLo, dstHi);
}
else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
{
ColorspaceConvert555To8888Opaque_Altivec<false>(fifoColor, dstLo, dstHi);
}
vec_st(dstLo, d + 0, dst);
vec_st(dstHi, d + 16, dst);
}
#endif
}
#endif
template <NDSColorFormat OUTPUTFORMAT>
void DISP_FIFOrecv_LineOpaque(u32 *__restrict dst)
{
#ifdef USEMANUALVECTORIZATION
if ( (disp_fifo.head + (GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16)) / sizeof(u32) <= 0x6000) && (disp_fifo.head == (disp_fifo.head & ~(VECTORSIZE - 1))) )
{
if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
{
_DISP_FIFOrecv_LineOpaque16_vec<OUTPUTFORMAT>(dst);
}
else
{
_DISP_FIFOrecv_LineOpaque32_vec<OUTPUTFORMAT>(dst);
}
}
else
#endif
{
if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
{
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++)
{
const u32 src = DISP_FIFOrecv_u32();
#ifdef MSB_FIRST
dst[i] = (src >> 16) | (src << 16) | 0x80008000;
#else
dst[i] = src | 0x80008000;
#endif
}
}
else
{
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i+=2)
{
const u32 src = DISP_FIFOrecv_u32();
if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
{
dst[i+0] = COLOR555TO6665_OPAQUE((src >> 0) & 0x7FFF);
dst[i+1] = COLOR555TO6665_OPAQUE((src >> 16) & 0x7FFF);
}
else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
{
dst[i+0] = COLOR555TO8888_OPAQUE((src >> 0) & 0x7FFF);
dst[i+1] = COLOR555TO8888_OPAQUE((src >> 16) & 0x7FFF);
}
}
}
}
}
void DISP_FIFOreset()
@ -341,3 +574,7 @@ void DISP_FIFOreset()
disp_fifo.head = 0;
disp_fifo.tail = 0;
}
template void DISP_FIFOrecv_LineOpaque<NDSColorFormat_BGR555_Rev>(u32 *__restrict dst);
template void DISP_FIFOrecv_LineOpaque<NDSColorFormat_BGR666_Rev>(u32 *__restrict dst);
template void DISP_FIFOrecv_LineOpaque<NDSColorFormat_BGR888_Rev>(u32 *__restrict dst);

View File

@ -1,7 +1,7 @@
/*
Copyright 2006 yopyop
Copyright 2007 shash
Copyright 2007-2011 DeSmuME team
Copyright 2007-2021 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -22,6 +22,7 @@
#define FIFO_H
#include "types.h"
#include "./utils/colorspacehandler/colorspacehandler.h"
//=================================================== IPC FIFO
typedef struct
@ -78,15 +79,20 @@ void GFX_FIFOcnt(u32 val);
//=================================================== Display memory FIFO
typedef struct
{
u32 buf[0x6000]; // 256x192 32K color
CACHE_ALIGN u32 buf[0x6000]; // 256x192 32K color
u32 head; // head
u32 tail; // tail
} DISP_FIFO;
extern DISP_FIFO disp_fifo;
void DISP_FIFOinit();
void DISP_FIFOsend(u32 val);
u32 DISP_FIFOrecv();
void DISP_FIFOsend_u32(u32 val);
u32 DISP_FIFOrecv_u32();
void DISP_FIFOrecv_Line16(u16 *__restrict dst);
template<NDSColorFormat OUTPUTFORMAT> void DISP_FIFOrecv_LineOpaque(u32 *__restrict dst);
void DISP_FIFOreset();
#endif

View File

@ -2,7 +2,7 @@
Copyright (C) 2006 yopyop
Copyright (C) 2006-2007 Theo Berkau
Copyright (C) 2007 shash
Copyright (C) 2008-2019 DeSmuME team
Copyright (C) 2008-2021 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -7784,22 +7784,7 @@ void GPUEngineA::_RenderLine_DisplayCapture(const GPUEngineCompositorInfo &compI
void GPUEngineA::_RenderLine_DispCapture_FIFOToBuffer(u16 *fifoLineBuffer)
{
#ifdef ENABLE_SSE2
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(__m128i); i++)
{
const u32 srcA = DISP_FIFOrecv();
const u32 srcB = DISP_FIFOrecv();
const u32 srcC = DISP_FIFOrecv();
const u32 srcD = DISP_FIFOrecv();
const __m128i fifoColor = _mm_setr_epi32(srcA, srcB, srcC, srcD);
_mm_store_si128((__m128i *)fifoLineBuffer + i, fifoColor);
}
#else
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++)
{
((u32 *)fifoLineBuffer)[i] = LE_TO_LOCAL_32( DISP_FIFOrecv() );
}
#endif
DISP_FIFOrecv_Line16(fifoLineBuffer);
}
template<NDSColorFormat COLORFORMAT, int SOURCESWITCH, size_t CAPTURELENGTH, bool CAPTUREFROMNATIVESRC, bool CAPTURETONATIVEDST>
@ -8461,100 +8446,23 @@ void GPUEngineA::_HandleDisplayModeMainMemory(const GPUEngineLineInfo &lineInfo)
// Displays video using color data directly read from main memory.
// Doing this should always result in an output line that is at the native size (192px x 1px).
#ifdef ENABLE_SSE2
switch (OUTPUTFORMAT)
{
case NDSColorFormat_BGR555_Rev:
{
u32 *__restrict dst = (u32 *__restrict)((u16 *)this->_nativeBuffer + (lineInfo.indexNative * GPU_FRAMEBUFFER_NATIVE_WIDTH));
const __m128i alphaBit = _mm_set1_epi16(0x8000);
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(__m128i); i++)
{
const u32 srcA = DISP_FIFOrecv();
const u32 srcB = DISP_FIFOrecv();
const u32 srcC = DISP_FIFOrecv();
const u32 srcD = DISP_FIFOrecv();
const __m128i fifoColor = _mm_setr_epi32(srcA, srcB, srcC, srcD);
_mm_store_si128((__m128i *)dst + i, _mm_or_si128(fifoColor, alphaBit));
}
DISP_FIFOrecv_LineOpaque<OUTPUTFORMAT>(dst);
break;
}
case NDSColorFormat_BGR666_Rev:
case NDSColorFormat_BGR888_Rev:
{
FragmentColor *__restrict dst = (FragmentColor *__restrict)this->_nativeBuffer + (lineInfo.indexNative * GPU_FRAMEBUFFER_NATIVE_WIDTH);
for (size_t i = 0, d = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(__m128i); i++, d+=2)
{
const u32 srcA = DISP_FIFOrecv();
const u32 srcB = DISP_FIFOrecv();
const u32 srcC = DISP_FIFOrecv();
const u32 srcD = DISP_FIFOrecv();
const __m128i fifoColor = _mm_setr_epi32(srcA, srcB, srcC, srcD);
__m128i dstLo = _mm_setzero_si128();
__m128i dstHi = _mm_setzero_si128();
if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
{
ColorspaceConvert555To6665Opaque_SSE2<false>(fifoColor, dstLo, dstHi);
}
else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
{
ColorspaceConvert555To8888Opaque_SSE2<false>(fifoColor, dstLo, dstHi);
}
_mm_store_si128((__m128i *)dst + d + 0, dstLo);
_mm_store_si128((__m128i *)dst + d + 1, dstHi);
}
u32 *__restrict dst = (u32 *__restrict)this->_nativeBuffer + (lineInfo.indexNative * GPU_FRAMEBUFFER_NATIVE_WIDTH);
DISP_FIFOrecv_LineOpaque<OUTPUTFORMAT>(dst);
break;
}
}
#else
switch (OUTPUTFORMAT)
{
case NDSColorFormat_BGR555_Rev:
{
u32 *__restrict dst = (u32 *__restrict)((u16 *)this->_nativeBuffer + (lineInfo.indexNative * GPU_FRAMEBUFFER_NATIVE_WIDTH));
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++)
{
const u32 src = DISP_FIFOrecv();
#ifdef MSB_FIRST
dst[i] = (src >> 16) | (src << 16) | 0x80008000;
#else
dst[i] = src | 0x80008000;
#endif
}
break;
}
case NDSColorFormat_BGR666_Rev:
{
FragmentColor *__restrict dst = (FragmentColor *__restrict)this->_nativeBuffer + (lineInfo.indexNative * GPU_FRAMEBUFFER_NATIVE_WIDTH);
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i+=2)
{
const u32 src = DISP_FIFOrecv();
dst[i+0].color = COLOR555TO6665_OPAQUE((src >> 0) & 0x7FFF);
dst[i+1].color = COLOR555TO6665_OPAQUE((src >> 16) & 0x7FFF);
}
break;
}
case NDSColorFormat_BGR888_Rev:
{
FragmentColor *__restrict dst = (FragmentColor *__restrict)this->_nativeBuffer + (lineInfo.indexNative * GPU_FRAMEBUFFER_NATIVE_WIDTH);
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i+=2)
{
const u32 src = DISP_FIFOrecv();
dst[i+0].color = COLOR555TO8888_OPAQUE((src >> 0) & 0x7FFF);
dst[i+1].color = COLOR555TO8888_OPAQUE((src >> 16) & 0x7FFF);
}
break;
}
}
#endif
}
template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, bool MOSAIC, bool WILLPERFORMWINDOWTEST, bool WILLDEFERCOMPOSITING>

View File

@ -1,7 +1,7 @@
/*
Copyright (C) 2006 yopyop
Copyright (C) 2007 shash
Copyright (C) 2007-2018 DeSmuME team
Copyright (C) 2007-2021 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -3427,7 +3427,7 @@ void FASTCALL _MMU_ARM9_write08(u32 adr, u8 val)
return;
case REG_DISPA_DISPMMEMFIFO:
DISP_FIFOsend(val);
DISP_FIFOsend_u32(val);
return;
case REG_DISPB_BG0HOFS:
@ -3936,7 +3936,7 @@ void FASTCALL _MMU_ARM9_write16(u32 adr, u16 val)
return;
case REG_DISPA_DISPMMEMFIFO:
DISP_FIFOsend(val);
DISP_FIFOsend_u32(val);
return;
case REG_DISPA_MASTERBRIGHT:
@ -4503,7 +4503,7 @@ void FASTCALL _MMU_ARM9_write32(u32 adr, u32 val)
return;
case REG_DISPA_DISPMMEMFIFO:
DISP_FIFOsend(val);
DISP_FIFOsend_u32(val);
return;
case REG_DISPA_MASTERBRIGHT: