FIFO.cpp: Code cleanup; remove AltiVec-specific code from display FIFO, as it is no longer needed.

- The new code works by pre-swapping big-endian words on disp_fifo.buf write, rather than swapping the big-endian words during disp_fifo.buf read.
- There is a behavior change here. Previously, every 8-bit or 16-bit write to disp_fifo.buf incremented disp_fifo.tail. Now, an 8-bit or 16-bit write increments disp_fifo.tail only when it writes to the most significant bit of the FIFO value's 32-bit word.
- Behavior is unchanged when doing 32-bit writes. In practice, the rare games that use the display FIFO have only ever done 32-bit writes, so this scenario is well tested.
This commit is contained in:
rogerman 2022-07-21 15:31:40 -07:00
parent f8a7723e86
commit 5ab59eac86
4 changed files with 132 additions and 101 deletions

View File

@ -44,7 +44,6 @@
#elif defined(ENABLE_ALTIVEC)
#define USEVECTORSIZE_128
#define VECTORSIZE 16
#include "./utils/colorspacehandler/colorspacehandler_AltiVec.h"
#endif
#if defined(USEVECTORSIZE_512) || defined(USEVECTORSIZE_256) || defined(USEVECTORSIZE_128)
@ -340,12 +339,96 @@ void DISP_FIFOinit()
memset(&disp_fifo, 0, sizeof(DISP_FIFO));
}
void DISP_FIFOsend_u32(u32 val)
template <typename T, size_t ADDROFFSET>
void DISP_FIFOsend(const T val)
{
//INFO("DISP_FIFO send value 0x%08X (head 0x%06X, tail 0x%06X)\n", val, disp_fifo.head, disp_fifo.tail);
disp_fifo.buf[disp_fifo.tail] = val;
const size_t numBytes = sizeof(T);
const size_t baseWriteAddress = disp_fifo.tail * sizeof(u32);
const size_t finalWriteAddress = baseWriteAddress + ADDROFFSET;
switch (numBytes)
{
case 1:
{
#ifndef MSB_FIRST
HostWriteByte((u8 *)disp_fifo.buf, finalWriteAddress, val);
#else
switch (ADDROFFSET)
{
case 0:
HostWriteByte((u8 *)disp_fifo.buf, baseWriteAddress + 2, val);
break;
case 1:
HostWriteByte((u8 *)disp_fifo.buf, baseWriteAddress + 3, val);
break;
case 2:
HostWriteByte((u8 *)disp_fifo.buf, baseWriteAddress + 0, val);
break;
case 3:
HostWriteByte((u8 *)disp_fifo.buf, baseWriteAddress + 1, val);
break;
default:
break;
}
#endif
#ifndef MSB_FIRST
if (ADDROFFSET == 3)
#else
if (ADDROFFSET == 1)
#endif
{
disp_fifo.tail++;
}
break;
}
case 2:
{
#ifndef MSB_FIRST
HostWriteWord((u8 *)disp_fifo.buf, finalWriteAddress, val);
#else
switch (ADDROFFSET)
{
case 0:
HostWriteWord((u8 *)disp_fifo.buf, baseWriteAddress + 2, val);
break;
case 2:
HostWriteWord((u8 *)disp_fifo.buf, baseWriteAddress + 0, val);
break;
default:
break;
}
#endif
#ifndef MSB_FIRST
if (ADDROFFSET == 2)
#else
if (ADDROFFSET == 0)
#endif
{
disp_fifo.tail++;
}
break;
}
case 4:
HostWriteTwoWords((u8 *)disp_fifo.buf, finalWriteAddress, val);
disp_fifo.tail++;
break;
default:
break;
}
if (disp_fifo.tail >= 0x6000)
{
disp_fifo.tail = 0;
@ -380,19 +463,7 @@ void DISP_FIFOrecv_Line16(u16 *__restrict dst)
#ifdef USEMANUALVECTORIZATION
if ( (disp_fifo.head + (GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16)) / sizeof(u32) <= 0x6000) && (disp_fifo.head == (disp_fifo.head & ~(VECTORSIZE - 1))) )
{
#ifdef ENABLE_ALTIVEC
// Big-endian systems read the pixels in their correct bit order, but swap 16-bit chunks
// within 32-bit lanes, and so we can't use a standard buffer copy function here.
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16); i+=sizeof(v128u16))
{
v128u16 fifoColor = vec_ld(i, disp_fifo.buf + disp_fifo.head);
fifoColor = vec_perm( (v128u8)fifoColor, (v128u8)fifoColor, ((v128u8){2,3, 0,1, 6,7, 4,5, 10,11, 8,9, 14,15, 12,13}) );
vec_st(fifoColor, i, dst);
}
#else
buffer_copy_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16)>(dst, disp_fifo.buf + disp_fifo.head);
#endif // ENABLE_ALTIVEC
_DISP_FIFOrecv_LineAdvance();
}
else
@ -401,82 +472,11 @@ void DISP_FIFOrecv_Line16(u16 *__restrict dst)
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++)
{
const u32 src = DISP_FIFOrecv_u32();
#ifdef MSB_FIRST
((u32 *)dst)[i] = (src >> 16) | (src << 16);
#else
((u32 *)dst)[i] = src;
#endif
}
}
}
#ifdef USEMANUALVECTORIZATION
// Vectorized read of one native-width line of 16-bit pixels from the display FIFO.
// Reads GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) bytes starting at
// disp_fifo.buf + disp_fifo.head, writes them to dst with 0x8000 applied to each
// 16-bit value, then calls _DISP_FIFOrecv_LineAdvance().
//
// OUTPUTFORMAT is not referenced inside this body; both paths emit 16-bit pixels.
// dst is declared as u32* but is filled with 16-bit pixel data.
// NOTE(review): the vector loads/stores presumably require the alignment and
// no-wrap conditions to be checked by the caller -- confirm at the call site.
template <NDSColorFormat OUTPUTFORMAT>
void _DISP_FIFOrecv_LineOpaque16_vec(u32 *__restrict dst)
{
#ifdef ENABLE_ALTIVEC
// Big-endian systems read the pixels in their correct bit order, but swap 16-bit chunks
// within 32-bit lanes, and so we can't use a standard buffer copy function here.
// i is a byte offset; each iteration handles one 128-bit vector (eight 16-bit pixels).
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16); i+=sizeof(v128u16))
{
v128u16 fifoColor = vec_ld(i, disp_fifo.buf + disp_fifo.head);
// Permute mask {2,3, 0,1, ...} exchanges the two 16-bit halves of every 32-bit lane.
fifoColor = vec_perm( (v128u8)fifoColor, (v128u8)fifoColor, ((v128u8){2,3, 0,1, 6,7, 4,5, 10,11, 8,9, 14,15, 12,13}) );
// Set bit 15 of every 16-bit pixel (presumably the opaque/alpha bit, per the function name).
fifoColor = vec_or(fifoColor, ((v128u16){0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000}));
vec_st(fifoColor, i, dst);
}
#else
// Helper is defined elsewhere; presumably copies the line while OR-ing 0x8000 into
// each 16-bit element -- confirm against its definition.
buffer_copy_or_constant_s16_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16), false>(dst, disp_fifo.buf + disp_fifo.head, 0x8000);
#endif // ENABLE_ALTIVEC
// Neither path above modifies disp_fifo.head, so the advance is done once here
// (helper defined elsewhere; presumably moves head past the consumed line).
_DISP_FIFOrecv_LineAdvance();
}
// Vectorized read of one native-width line of 16-bit pixels from the display FIFO,
// converted to 32-bit pixels in dst. The conversion routine is selected by
// OUTPUTFORMAT: NDSColorFormat_BGR666_Rev uses the 555-to-6665 opaque conversion,
// NDSColorFormat_BGR888_Rev uses the 555-to-8888 opaque conversion.
// For any other OUTPUTFORMAT the AltiVec path stores zeros (dstLo/dstHi are never
// overwritten) and the generic path leaves dst untouched.
template <NDSColorFormat OUTPUTFORMAT>
void _DISP_FIFOrecv_LineOpaque32_vec(u32 *__restrict dst)
{
#ifdef ENABLE_ALTIVEC
// i counts source bytes (16 per iteration = eight 16-bit pixels);
// d counts destination bytes (32 per iteration = two 128-bit output vectors).
for (size_t i = 0, d = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16); i+=16, d+=32)
{
// Load offset is always 0 because disp_fifo.head itself is advanced every iteration.
v128u16 fifoColor = vec_ld(0, disp_fifo.buf + disp_fifo.head);
// Advance head by the number of u32 FIFO entries held in one 128-bit vector.
disp_fifo.head += (sizeof(v128u16)/sizeof(u32));
// Wrap head back into the 0x6000-entry FIFO range.
if (disp_fifo.head >= 0x6000)
{
disp_fifo.head -= 0x6000;
}
v128u32 dstLo = ((v128u32){0,0,0,0});
v128u32 dstHi = ((v128u32){0,0,0,0});
// Byte permute applied before conversion: exchanges the two 8-byte halves of the
// vector and, within each half, the two 16-bit halves of every 32-bit lane.
fifoColor = vec_perm( (v128u8)fifoColor, (v128u8)fifoColor, ((v128u8){10,11, 8,9, 14,15, 12,13, 2,3, 0,1, 6,7, 4,5}) );
if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
{
ColorspaceConvert555To6665Opaque_AltiVec<false, BESwapDst>(fifoColor, dstLo, dstHi);
}
else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
{
ColorspaceConvert555To8888Opaque_AltiVec<false, BESwapDst>(fifoColor, dstLo, dstHi);
}
// Store the two result vectors back to back in the destination line.
vec_st(dstLo, d + 0, dst);
vec_st(dstHi, d + 16, dst);
}
#else
// Generic path: convert the whole line with the buffer-level helpers (defined elsewhere).
if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
{
ColorspaceConvertBuffer555To6665Opaque<false, false, BESwapDst>((u16 *)(disp_fifo.buf + disp_fifo.head), dst, GPU_FRAMEBUFFER_NATIVE_WIDTH);
}
else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
{
ColorspaceConvertBuffer555To8888Opaque<false, false, BESwapDst>((u16 *)(disp_fifo.buf + disp_fifo.head), dst, GPU_FRAMEBUFFER_NATIVE_WIDTH);
}
// Only this path calls the line-advance helper; the AltiVec loop above already
// moved disp_fifo.head itself.
_DISP_FIFOrecv_LineAdvance();
#endif // ENABLE_ALTIVEC
}
#endif // USEMANUALVECTORIZATION
template <NDSColorFormat OUTPUTFORMAT>
void DISP_FIFOrecv_LineOpaque(u32 *__restrict dst)
{
@ -485,26 +485,28 @@ void DISP_FIFOrecv_LineOpaque(u32 *__restrict dst)
{
if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
{
_DISP_FIFOrecv_LineOpaque16_vec<OUTPUTFORMAT>(dst);
buffer_copy_or_constant_s16_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16), false>(dst, disp_fifo.buf + disp_fifo.head, 0x8000);
}
else
else if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
{
_DISP_FIFOrecv_LineOpaque32_vec<OUTPUTFORMAT>(dst);
ColorspaceConvertBuffer555To6665Opaque<false, false, BESwapDst>((u16 *)(disp_fifo.buf + disp_fifo.head), dst, GPU_FRAMEBUFFER_NATIVE_WIDTH);
}
else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
{
ColorspaceConvertBuffer555To8888Opaque<false, false, BESwapDst>((u16 *)(disp_fifo.buf + disp_fifo.head), dst, GPU_FRAMEBUFFER_NATIVE_WIDTH);
}
_DISP_FIFOrecv_LineAdvance();
}
else
#endif
#endif // USEMANUALVECTORIZATION
{
if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
{
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++)
{
const u32 src = DISP_FIFOrecv_u32();
#ifdef MSB_FIRST
dst[i] = (src >> 16) | (src << 16) | 0x80008000;
#else
dst[i] = src | 0x80008000;
#endif
}
}
else
@ -534,6 +536,14 @@ void DISP_FIFOreset()
disp_fifo.tail = 0;
}
// Explicit template instantiations.
// DISP_FIFOsend is only declared in the header and defined in this source file, so
// every <value type, byte offset> combination called from other translation units
// must be instantiated here: byte offsets 0-3 for u8 writes, offsets 0 and 2 for
// u16 writes, and offset 0 for u32 writes.
template void DISP_FIFOsend< u8, 0>(const u8 val);
template void DISP_FIFOsend< u8, 1>(const u8 val);
template void DISP_FIFOsend< u8, 2>(const u8 val);
template void DISP_FIFOsend< u8, 3>(const u8 val);
template void DISP_FIFOsend<u16, 0>(const u16 val);
template void DISP_FIFOsend<u16, 2>(const u16 val);
template void DISP_FIFOsend<u32, 0>(const u32 val);
// One DISP_FIFOrecv_LineOpaque instantiation per supported output color format.
template void DISP_FIFOrecv_LineOpaque<NDSColorFormat_BGR555_Rev>(u32 *__restrict dst);
template void DISP_FIFOrecv_LineOpaque<NDSColorFormat_BGR666_Rev>(u32 *__restrict dst);
template void DISP_FIFOrecv_LineOpaque<NDSColorFormat_BGR888_Rev>(u32 *__restrict dst);

View File

@ -1,7 +1,7 @@
/*
Copyright 2006 yopyop
Copyright 2007 shash
Copyright 2007-2021 DeSmuME team
Copyright 2007-2022 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -87,7 +87,7 @@ typedef struct
extern DISP_FIFO disp_fifo;
void DISP_FIFOinit();
void DISP_FIFOsend_u32(u32 val);
template<typename T, size_t ADDROFFSET> void DISP_FIFOsend(const T val);
u32 DISP_FIFOrecv_u32();
void DISP_FIFOrecv_Line16(u16 *__restrict dst);

View File

@ -3425,7 +3425,19 @@ void FASTCALL _MMU_ARM9_write08(u32 adr, u8 val)
return;
case REG_DISPA_DISPMMEMFIFO:
DISP_FIFOsend_u32(val);
DISP_FIFOsend<u8, 0>(val);
return;
case REG_DISPA_DISPMMEMFIFO+1:
DISP_FIFOsend<u8, 1>(val);
return;
case REG_DISPA_DISPMMEMFIFO+2:
DISP_FIFOsend<u8, 2>(val);
return;
case REG_DISPA_DISPMMEMFIFO+3:
DISP_FIFOsend<u8, 3>(val);
return;
case REG_DISPB_BG0HOFS:
@ -3992,7 +4004,11 @@ void FASTCALL _MMU_ARM9_write16(u32 adr, u16 val)
return;
case REG_DISPA_DISPMMEMFIFO:
DISP_FIFOsend_u32(val);
DISP_FIFOsend<u16, 0>(val);
return;
case REG_DISPA_DISPMMEMFIFO+2:
DISP_FIFOsend<u16, 2>(val);
return;
case REG_DISPA_MASTERBRIGHT:
@ -4635,7 +4651,7 @@ void FASTCALL _MMU_ARM9_write32(u32 adr, u32 val)
return;
case REG_DISPA_DISPMMEMFIFO:
DISP_FIFOsend_u32(val);
DISP_FIFOsend<u32, 0>(val);
return;
case REG_DISPA_MASTERBRIGHT:

View File

@ -1,7 +1,7 @@
/*
Copyright (C) 2005 Theo Berkau
Copyright (C) 2005-2006 Guillaume Duhamel
Copyright (C) 2008-2010 DeSmuME team
Copyright (C) 2008-2022 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -140,6 +140,11 @@ static INLINE u16 HostReadWord(u8* const mem, const u32 addr)
return *((u16 *) (mem + addr));
}
// Stores the single byte `val` at byte offset `addr` from the start of `mem`.
// No bounds checking is performed; a one-byte store needs no endian handling.
static INLINE void HostWriteByte(u8* const mem, const u32 addr, const u8 val)
{
	*(mem + addr) = val;
}
static INLINE void HostWriteWord(u8* const mem, const u32 addr, const u16 val)
{
*((u16 *) (mem + addr)) = val;