FIFO.cpp: Code cleanup; remove AltiVec-specific code from display FIFO, as it is no longer needed.

- The new code works by pre-swapping big-endian words on disp_fifo.buf write, rather than swapping the big-endian words during disp_fifo.buf read. - There is a behavior change here. Before, 8-bit and 16-bit writes to disp_fifo.buf would increment disp_fifo.tail. Now, 8-bit and 16-bit writes only increment disp_fifo.tail when the most significant bit within the FIFO value's 32-bit boundary is written to. - Behavior is unchanged when doing 32-bit writes. In practice, the rare games that use display FIFO have only ever done 32-bit writes, so this scenario is well tested.
2022-07-21 15:31:40 -07:00 · 2022-07-21 15:31:40 -07:00 · 5ab59eac86
parent f8a7723e86
commit 5ab59eac86
4 changed files with 132 additions and 101 deletions
--- a/desmume/src/FIFO.cpp
+++ b/desmume/src/FIFO.cpp
@ -44,7 +44,6 @@
 #elif defined(ENABLE_ALTIVEC)
 	#define USEVECTORSIZE_128
 	#define VECTORSIZE 16
-	#include "./utils/colorspacehandler/colorspacehandler_AltiVec.h"
 #endif

 #if defined(USEVECTORSIZE_512) || defined(USEVECTORSIZE_256) || defined(USEVECTORSIZE_128)
@ -340,12 +339,96 @@ void DISP_FIFOinit()
 	memset(&disp_fifo, 0, sizeof(DISP_FIFO));
 }

-void DISP_FIFOsend_u32(u32 val)
+template <typename T, size_t ADDROFFSET>
+void DISP_FIFOsend(const T val)
 {
 	//INFO("DISP_FIFO send value 0x%08X (head 0x%06X, tail 0x%06X)\n", val, disp_fifo.head, disp_fifo.tail);
-	disp_fifo.buf[disp_fifo.tail] = val;
 	
+	const size_t numBytes = sizeof(T);
+	const size_t baseWriteAddress = disp_fifo.tail * sizeof(u32);
+	const size_t finalWriteAddress = baseWriteAddress + ADDROFFSET;
+	
+	switch (numBytes)
+	{
+		case 1:
+		{
+#ifndef MSB_FIRST
+			HostWriteByte((u8 *)disp_fifo.buf, finalWriteAddress, val);
+#else
+			switch (ADDROFFSET)
+			{
+				case 0:
+					HostWriteByte((u8 *)disp_fifo.buf, baseWriteAddress + 2, val);
+					break;
+					
+				case 1:
+					HostWriteByte((u8 *)disp_fifo.buf, baseWriteAddress + 3, val);
+					break;
+					
+				case 2:
+					HostWriteByte((u8 *)disp_fifo.buf, baseWriteAddress + 0, val);
+					break;
+					
+				case 3:
+					HostWriteByte((u8 *)disp_fifo.buf, baseWriteAddress + 1, val);
+					break;
+					
+				default:
+					break;
+			}
+#endif
+				
+#ifndef MSB_FIRST
+			if (ADDROFFSET == 3)
+#else
+			if (ADDROFFSET == 1)
+#endif
+			{
 				disp_fifo.tail++;
+			}
+			break;
+		}
+			
+		case 2:
+		{
+#ifndef MSB_FIRST
+			HostWriteWord((u8 *)disp_fifo.buf, finalWriteAddress, val);
+#else
+			switch (ADDROFFSET)
+			{
+				case 0:
+					HostWriteWord((u8 *)disp_fifo.buf, baseWriteAddress + 2, val);
+					break;
+					
+				case 2:
+					HostWriteWord((u8 *)disp_fifo.buf, baseWriteAddress + 0, val);
+					break;
+					
+				default:
+					break;
+			}
+#endif
+				
+#ifndef MSB_FIRST
+			if (ADDROFFSET == 2)
+#else
+			if (ADDROFFSET == 0)
+#endif
+			{
+				disp_fifo.tail++;
+			}
+			break;
+		}
+			
+		case 4:
+			HostWriteTwoWords((u8 *)disp_fifo.buf, finalWriteAddress, val);
+			disp_fifo.tail++;
+			break;
+			
+		default:
+			break;
+	}
+	
 	if (disp_fifo.tail >= 0x6000)
 	{
 		disp_fifo.tail = 0;
@ -380,19 +463,7 @@ void DISP_FIFOrecv_Line16(u16 *__restrict dst)
 #ifdef USEMANUALVECTORIZATION
 	if ( (disp_fifo.head + (GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16)) / sizeof(u32) <= 0x6000) && (disp_fifo.head == (disp_fifo.head & ~(VECTORSIZE - 1))) )
 	{
-#ifdef ENABLE_ALTIVEC
-		// Big-endian systems read the pixels in their correct bit order, but swap 16-bit chunks
-		// within 32-bit lanes, and so we can't use a standard buffer copy function here.
-		for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16); i+=sizeof(v128u16))
-		{
-			v128u16 fifoColor = vec_ld(i, disp_fifo.buf + disp_fifo.head);
-			fifoColor = vec_perm( (v128u8)fifoColor, (v128u8)fifoColor, ((v128u8){2,3, 0,1, 6,7, 4,5, 10,11, 8,9, 14,15, 12,13}) );
-			vec_st(fifoColor, i, dst);
-		}
-#else
 		buffer_copy_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16)>(dst, disp_fifo.buf + disp_fifo.head);
-#endif // ENABLE_ALTIVEC
-		
 		_DISP_FIFOrecv_LineAdvance();
 	}
 	else
@ -401,82 +472,11 @@ void DISP_FIFOrecv_Line16(u16 *__restrict dst)
 		for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++)
 		{
 			const u32 src = DISP_FIFOrecv_u32();
-#ifdef MSB_FIRST
-			((u32 *)dst)[i] = (src >> 16) | (src << 16);
-#else
 			((u32 *)dst)[i] = src;
-#endif
 		}
 	}
 }

-#ifdef USEMANUALVECTORIZATION
-
-template <NDSColorFormat OUTPUTFORMAT>
-void _DISP_FIFOrecv_LineOpaque16_vec(u32 *__restrict dst)
-{
-#ifdef ENABLE_ALTIVEC
-	// Big-endian systems read the pixels in their correct bit order, but swap 16-bit chunks
-	// within 32-bit lanes, and so we can't use a standard buffer copy function here.
-	for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16); i+=sizeof(v128u16))
-	{
-		v128u16 fifoColor = vec_ld(i, disp_fifo.buf + disp_fifo.head);
-		fifoColor = vec_perm( (v128u8)fifoColor, (v128u8)fifoColor, ((v128u8){2,3, 0,1, 6,7, 4,5, 10,11, 8,9, 14,15, 12,13}) );
-		fifoColor = vec_or(fifoColor, ((v128u16){0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000}));
-		vec_st(fifoColor, i, dst);
-	}
-#else
-	buffer_copy_or_constant_s16_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16), false>(dst, disp_fifo.buf + disp_fifo.head, 0x8000);
-#endif // ENABLE_ALTIVEC
-	
-	_DISP_FIFOrecv_LineAdvance();
-}
-
-template <NDSColorFormat OUTPUTFORMAT>
-void _DISP_FIFOrecv_LineOpaque32_vec(u32 *__restrict dst)
-{
-#ifdef ENABLE_ALTIVEC
-	for (size_t i = 0, d = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16); i+=16, d+=32)
-	{
-		v128u16 fifoColor = vec_ld(0, disp_fifo.buf + disp_fifo.head);
-		
-		disp_fifo.head += (sizeof(v128u16)/sizeof(u32));
-		if (disp_fifo.head >= 0x6000)
-		{
-			disp_fifo.head -= 0x6000;
-		}
-		
-		v128u32 dstLo = ((v128u32){0,0,0,0});
-		v128u32 dstHi = ((v128u32){0,0,0,0});
-		fifoColor = vec_perm( (v128u8)fifoColor, (v128u8)fifoColor, ((v128u8){10,11, 8,9, 14,15, 12,13, 2,3, 0,1, 6,7, 4,5}) );
-		
-		if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
-		{
-			ColorspaceConvert555To6665Opaque_AltiVec<false, BESwapDst>(fifoColor, dstLo, dstHi);
-		}
-		else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
-		{
-			ColorspaceConvert555To8888Opaque_AltiVec<false, BESwapDst>(fifoColor, dstLo, dstHi);
-		}
-		
-		vec_st(dstLo, d +  0, dst);
-		vec_st(dstHi, d + 16, dst);
-	}
-#else
-	if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
-	{
-		ColorspaceConvertBuffer555To6665Opaque<false, false, BESwapDst>((u16 *)(disp_fifo.buf + disp_fifo.head), dst, GPU_FRAMEBUFFER_NATIVE_WIDTH);
-	}
-	else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
-	{
-		ColorspaceConvertBuffer555To8888Opaque<false, false, BESwapDst>((u16 *)(disp_fifo.buf + disp_fifo.head), dst, GPU_FRAMEBUFFER_NATIVE_WIDTH);
-	}
-	_DISP_FIFOrecv_LineAdvance();
-#endif // ENABLE_ALTIVEC
-}
-
-#endif // USEMANUALVECTORIZATION
-
 template <NDSColorFormat OUTPUTFORMAT>
 void DISP_FIFOrecv_LineOpaque(u32 *__restrict dst)
 {
@ -485,26 +485,28 @@ void DISP_FIFOrecv_LineOpaque(u32 *__restrict dst)
 	{
 		if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
 		{
-			_DISP_FIFOrecv_LineOpaque16_vec<OUTPUTFORMAT>(dst);
+			buffer_copy_or_constant_s16_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16), false>(dst, disp_fifo.buf + disp_fifo.head, 0x8000);
 		}
-		else
+		else if (OUTPUTFORMAT == NDSColorFormat_BGR666_Rev)
 		{
-			_DISP_FIFOrecv_LineOpaque32_vec<OUTPUTFORMAT>(dst);
+			ColorspaceConvertBuffer555To6665Opaque<false, false, BESwapDst>((u16 *)(disp_fifo.buf + disp_fifo.head), dst, GPU_FRAMEBUFFER_NATIVE_WIDTH);
 		}
+		else if (OUTPUTFORMAT == NDSColorFormat_BGR888_Rev)
+		{
+			ColorspaceConvertBuffer555To8888Opaque<false, false, BESwapDst>((u16 *)(disp_fifo.buf + disp_fifo.head), dst, GPU_FRAMEBUFFER_NATIVE_WIDTH);
+		}
+		
+		_DISP_FIFOrecv_LineAdvance();
 	}
 	else
-#endif
+#endif // USEMANUALVECTORIZATION
 	{
 		if (OUTPUTFORMAT == NDSColorFormat_BGR555_Rev)
 		{
 			for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++)
 			{
 				const u32 src = DISP_FIFOrecv_u32();
-#ifdef MSB_FIRST
-				dst[i] = (src >> 16) | (src << 16) | 0x80008000;
-#else
 				dst[i] = src | 0x80008000;
-#endif
 			}
 		}
 		else
@ -534,6 +536,14 @@ void DISP_FIFOreset()
 	disp_fifo.tail = 0;
 }

+template void DISP_FIFOsend< u8, 0>(const  u8 val);
+template void DISP_FIFOsend< u8, 1>(const  u8 val);
+template void DISP_FIFOsend< u8, 2>(const  u8 val);
+template void DISP_FIFOsend< u8, 3>(const  u8 val);
+template void DISP_FIFOsend<u16, 0>(const u16 val);
+template void DISP_FIFOsend<u16, 2>(const u16 val);
+template void DISP_FIFOsend<u32, 0>(const u32 val);
+
 template void DISP_FIFOrecv_LineOpaque<NDSColorFormat_BGR555_Rev>(u32 *__restrict dst);
 template void DISP_FIFOrecv_LineOpaque<NDSColorFormat_BGR666_Rev>(u32 *__restrict dst);
 template void DISP_FIFOrecv_LineOpaque<NDSColorFormat_BGR888_Rev>(u32 *__restrict dst);
--- a/desmume/src/FIFO.h
+++ b/desmume/src/FIFO.h
@ -1,7 +1,7 @@
 /*
 	Copyright 2006 yopyop
 	Copyright 2007 shash
-	Copyright 2007-2021 DeSmuME team
+	Copyright 2007-2022 DeSmuME team

 	This file is free software: you can redistribute it and/or modify
 	it under the terms of the GNU General Public License as published by
@ -87,7 +87,7 @@ typedef struct
 extern DISP_FIFO disp_fifo;
 void DISP_FIFOinit();

-void DISP_FIFOsend_u32(u32 val);
+template<typename T, size_t ADDROFFSET> void DISP_FIFOsend(const T val);
 u32 DISP_FIFOrecv_u32();

 void DISP_FIFOrecv_Line16(u16 *__restrict dst);
--- a/desmume/src/MMU.cpp
+++ b/desmume/src/MMU.cpp
@ -3425,7 +3425,19 @@ void FASTCALL _MMU_ARM9_write08(u32 adr, u8 val)
 					return;
 					
 				case REG_DISPA_DISPMMEMFIFO:
-					DISP_FIFOsend_u32(val);
+					DISP_FIFOsend<u8, 0>(val);
+					return;
+					
+				case REG_DISPA_DISPMMEMFIFO+1:
+					DISP_FIFOsend<u8, 1>(val);
+					return;
+					
+				case REG_DISPA_DISPMMEMFIFO+2:
+					DISP_FIFOsend<u8, 2>(val);
+					return;
+					
+				case REG_DISPA_DISPMMEMFIFO+3:
+					DISP_FIFOsend<u8, 3>(val);
 					return;
 					
 				case REG_DISPB_BG0HOFS:
@ -3992,7 +4004,11 @@ void FASTCALL _MMU_ARM9_write16(u32 adr, u16 val)
 					return;
 					
 				case REG_DISPA_DISPMMEMFIFO:
-					DISP_FIFOsend_u32(val);
+					DISP_FIFOsend<u16, 0>(val);
+					return;
+					
+				case REG_DISPA_DISPMMEMFIFO+2:
+					DISP_FIFOsend<u16, 2>(val);
 					return;
 					
 				case REG_DISPA_MASTERBRIGHT:
@ -4635,7 +4651,7 @@ void FASTCALL _MMU_ARM9_write32(u32 adr, u32 val)
 					return;
 					
 				case REG_DISPA_DISPMMEMFIFO:
-					DISP_FIFOsend_u32(val);
+					DISP_FIFOsend<u32, 0>(val);
 					return;
 					
 				case REG_DISPA_MASTERBRIGHT:
--- a/desmume/src/mem.h
+++ b/desmume/src/mem.h
@ -1,7 +1,7 @@
 /*
 	Copyright (C) 2005 Theo Berkau
 	Copyright (C) 2005-2006 Guillaume Duhamel
-	Copyright (C) 2008-2010 DeSmuME team
+	Copyright (C) 2008-2022 DeSmuME team

 	This file is free software: you can redistribute it and/or modify
 	it under the terms of the GNU General Public License as published by
@ -140,6 +140,11 @@ static INLINE u16 HostReadWord(u8* const mem, const u32 addr)
   return *((u16 *) (mem + addr));
 }

+static INLINE void HostWriteByte(u8* const mem, const u32 addr, const u8 val)
+{
+   mem[addr] = val;
+}
+
 static INLINE void HostWriteWord(u8* const mem, const u32 addr, const u16 val)
 {
   *((u16 *) (mem + addr)) = val;