GPU:

- More code cleanup, and various small optimizations.
- Optimize the OpenGL 3D renderer’s framebuffer flush. (Requires SSSE3.)
2015-06-17 07:15:22 +00:00 · 2015-06-17 07:15:22 +00:00 · d3ec1dedfb
parent ccf2e76c32
commit d3ec1dedfb
5 changed files with 842 additions and 706 deletions
--- a/desmume/src/GPU.cpp
+++ b/desmume/src/GPU.cpp
--- a/desmume/src/GPU.h
+++ b/desmume/src/GPU.h
@ -32,6 +32,9 @@ struct MMU_struct;
 //#undef FORCEINLINE
 //#define FORCEINLINE

+#define GPU_FRAMEBUFFER_NATIVE_WIDTH	256
+#define GPU_FRAMEBUFFER_NATIVE_HEIGHT	192
+
 void gpu_savestate(EMUFILE* os);
 bool gpu_loadstate(EMUFILE* is, int size);

@ -544,14 +547,14 @@ struct _OAM_
 	u16 attr3;
 };

-void SlurpOAM(_OAM_* oam_output, void* oam_buffer, int oam_index);
-u16 SlurpOAMAffineParam(void* oam_buffer, int oam_index);
+void SlurpOAM(_OAM_ *oam_output, void *oam_buffer, const size_t oam_index);
+u16 SlurpOAMAffineParam(void *oam_buffer, const size_t oam_index);

 typedef struct
 {
 	 s16 x;
 	 s16 y;
-} size;
+} SpriteSize;


 #define NB_PRIORITIES	4
@ -620,7 +623,7 @@ struct GPU

 	_BGxCNT & bgcnt(int num) { return (dispx_st)->dispx_BGxCNT[num].bits; }
 	_DISPCNT & dispCnt() { return dispx_st->dispx_DISPCNT.bits; }
-	template<bool MOSAIC> void modeRender(int layer);
+	template<bool MOSAIC> void modeRender(const size_t layer);

 	DISPCAPCNT dispCapCnt;
 	BOOL LayersEnable[5];
@ -647,18 +650,18 @@ struct GPU
 	} mosaicColors;

 	u8 sprNum[256];
-	//u8 h_win[2][256];
 	u8 *h_win[2];
 	const u8 *curr_win[2];
-	void update_winh(int WIN_NUM); 
 	bool need_update_winh[2];
 	
-	template<int WIN_NUM> void setup_windows();
+	template<size_t WIN_NUM> void update_winh();
+	template<size_t WIN_NUM> void setup_windows();

 	GPUCoreID core;
 	GPUDisplayMode dispMode;
 	u8 vramBlock;
-	u8 *VRAMaddr;
+	u16 *VRAMaddr;
+	u16 *VRAMBuffer;

 	//FIFO	fifo;

@ -714,7 +717,6 @@ struct GPU
 	GPUMasterBrightMode	MasterBrightMode;
 	u32 MasterBrightFactor;

-	//CACHE_ALIGN u8 bgPixels[1024]; //yes indeed, this is oversized. map debug tools try to write to it
 	u8 *bgPixels;

 	u32 currLine;
@ -779,9 +781,9 @@ struct GPU
 	template<bool MOSAIC, bool BACKDROP> FORCEINLINE void __setFinalColorBck(u16 color, const size_t x, const bool opaque);
 	template<bool MOSAIC, bool BACKDROP, int FUNCNUM> FORCEINLINE void ___setFinalColorBck(u16 color, const size_t x, const bool opaque);

-	void setAffineStart(int layer, int xy, u32 val);
-	void setAffineStartWord(int layer, int xy, u16 val, int word);
-	u32 getAffineStart(int layer, int xy);
+	void setAffineStart(const size_t layer, int xy, u32 val);
+	void setAffineStartWord(const size_t layer, int xy, u16 val, int word);
+	u32 getAffineStart(const size_t layer, int xy);
 	void refreshAffineStartRegs(const int num, const int xy);

 	struct AffineInfo {
@ -814,8 +816,8 @@ struct GPU
 		updateBLDALPHA();
 	}

-	u32 getHOFS(int bg);
-	u32 getVOFS(int bg);
+	u32 getHOFS(const size_t bg);
+	u32 getVOFS(const size_t bg);

 	typedef u8 TBlendTable[32][32];
 	TBlendTable *blendTable;
@ -865,7 +867,7 @@ namespace GPU_EXT
 void sprite1D(GPU *gpu, u16 l, u8 *dst, u8 *dst_alpha, u8 *typeTab, u8 *prioTab);
 void sprite2D(GPU *gpu, u16 l, u8 *dst, u8 *dst_alpha, u8 *typeTab, u8 *prioTab);

-extern const size sprSizeTab[4][4];
+extern const SpriteSize sprSizeTab[4][4];

 typedef struct
 {
@ -882,8 +884,8 @@ void Screen_DeInit(void);

 extern MMU_struct MMU;

-void GPU_setVideoProp(GPU *gpu, u32 p);
-void GPU_setBGProp(GPU *gpu, u16 num, u16 p);
+void GPU_setVideoProp(GPU *gpu, const u32 ctrlBits);
+void GPU_setBGProp(GPU *gpu, const size_t num, const u16 ctrlBits);

 void GPU_setBLDCNT(GPU *gpu, u16 v);
 void GPU_setBLDY(GPU *gpu, u16 v);
@ -907,7 +909,7 @@ inline void GPU_setWIN0_V(GPU *gpu, u16 val) { gpu->WIN0V0 = val >> 8; gpu->WIN0
 inline void GPU_setWIN0_V0(GPU *gpu, u8 val) { gpu->WIN0V0 = val; }
 inline void GPU_setWIN0_V1(GPU *gpu, u8 val) { gpu->WIN0V1 = val; }

-inline void GPU_setWIN1_H(GPU *gpu, u16 val) {gpu->WIN1H0 = val >> 8; gpu->WIN1H1 = val&0xFF;  gpu->need_update_winh[1] = true; }
+inline void GPU_setWIN1_H(GPU *gpu, u16 val) { gpu->WIN1H0 = val >> 8; gpu->WIN1H1 = val&0xFF;  gpu->need_update_winh[1] = true; }
 inline void GPU_setWIN1_H0(GPU *gpu, u8 val) { gpu->WIN1H0 = val;  gpu->need_update_winh[1] = true; }
 inline void GPU_setWIN1_H1(GPU *gpu, u8 val) { gpu->WIN1H1 = val;  gpu->need_update_winh[1] = true; }

--- a/desmume/src/OGLRender.cpp
+++ b/desmume/src/OGLRender.cpp
@ -28,6 +28,13 @@
 #include "NDSSystem.h"
 #include "texcache.h"

+#ifdef ENABLE_SSE2
+#include <emmintrin.h>
+#endif
+
+#ifdef ENABLE_SSSE3
+#include <tmmintrin.h>
+#endif

 typedef struct
 {
@ -885,6 +892,79 @@ void OpenGLRenderer::SetVersion(unsigned int major, unsigned int minor, unsigned
 	this->versionRevision = revision;
 }

+#if defined(ENABLE_SSE2) && defined(ENABLE_SSSE3) && defined(LOCAL_LE)
+// Copies the contents of this->_framebufferColor into both destination
+// buffers, flipping the image vertically on the way:
+//   - dstRGBA6665 receives each pixel converted from BGRA8888 to RGBA6665.
+//   - dstRGBA5551 receives each pixel converted to a 16-bit color whose top
+//     bit is set whenever the source alpha is non-zero.
+//
+// If the framebuffer width is a multiple of 4, pixels are converted four at a
+// time with SSE2/SSSE3. That path uses aligned 128-bit loads and stores, so
+// this->_framebufferColor and dstRGBA6665 are expected to be 16-byte aligned.
+// Any other width falls back to the per-pixel loop.
+//
+// Always returns RENDER3DERROR_NOERR.
+Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
+{
+	// Convert from 32-bit BGRA8888 format to 32-bit RGBA6665 reversed format. OpenGL
+	// stores pixels using a flipped Y-coordinate, so this needs to be flipped back
+	// to the DS Y-coordinate.
+	
+	if ((this->_framebufferWidth % 4) == 0)
+	{
+		// ir walks the source top-down; iw starts at the last destination row and
+		// steps back two rows after each row has advanced it forward by one.
+		for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
+		{
+			for (size_t x = 0; x < this->_framebufferWidth; x+=4, ir+=4, iw+=4)
+			{
+				// Load the four source pixels once; both conversions start from them.
+				const __m128i src = _mm_load_si128((__m128i *)(this->_framebufferColor + ir));
+				
+				// Convert to RGBA6665
+				__m128i v = _mm_srli_epi32(src, 2);
+				
+				__m128i a = _mm_srli_epi32(v, 1); // Special handling for 5-bit alpha
+				a = _mm_and_si128(a, _mm_set1_epi32(0x1F000000));
+				
+				v = _mm_and_si128(v, _mm_set1_epi32(0x003F3F3F));
+				v = _mm_or_si128(v, a);
+				v = _mm_shuffle_epi8(v, _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2)); // Swizzle RGBA to BGRA
+				_mm_store_si128((__m128i *)(dstRGBA6665 + iw), v);
+				
+				// Convert to RGBA5551
+				__m128i b = _mm_and_si128(src, _mm_set1_epi32(0x000000F8));	// Read from R
+				b = _mm_slli_epi32(b, 7);									// Shift to B
+				
+				__m128i g = _mm_and_si128(src, _mm_set1_epi32(0x0000F800));	// Read from G
+				g = _mm_srli_epi32(g, 6);									// Shift in G
+				
+				__m128i r = _mm_and_si128(src, _mm_set1_epi32(0x00F80000));	// Read from B
+				r = _mm_srli_epi32(r, 19);									// Shift to R
+				
+				// The alpha byte sits in bits 24-31, so a signed "greater than zero"
+				// compare would see every alpha >= 0x80 as negative and mark the pixel
+				// transparent. Test for "equal to zero" and invert instead, which
+				// matches the (a == 0) ? 0x0000 : 0x8000 rule of the scalar path.
+				a = _mm_and_si128(src, _mm_set1_epi32(0xFF000000));			// Read from A
+				a = _mm_cmpeq_epi32(a, _mm_setzero_si128());				// All ones where A == 0
+				a = _mm_andnot_si128(a, _mm_set1_epi32(0x00008000));		// Set the A bit where A != 0
+				
+				v = b;
+				v = _mm_add_epi32(v, g);
+				v = _mm_add_epi32(v, r);
+				v = _mm_add_epi32(v, a);
+				
+				// All the colors are currently placed every other 16 bits, so we need to swizzle them
+				// to the lower 64 bits of our vector before we store them back to memory.
+				v = _mm_shuffle_epi8(v, _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
+				_mm_storel_epi64((__m128i *)(dstRGBA5551 + iw), v);
+			}
+		}
+	}
+	else
+	{
+		// Width is not a multiple of 4: convert one pixel at a time.
+		for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
+		{
+			for (size_t x = 0; x < this->_framebufferWidth; x++, ir++, iw++)
+			{
+				dstRGBA6665[iw].color = BGRA8888_32Rev_To_RGBA6665_32Rev(this->_framebufferColor[ir].color);
+				dstRGBA5551[iw] = R5G5B5TORGB15((this->_framebufferColor[ir].b >> 3) & 0x1F,
+												(this->_framebufferColor[ir].g >> 3) & 0x1F,
+												(this->_framebufferColor[ir].r >> 3) & 0x1F) |
+												((this->_framebufferColor[ir].a == 0) ? 0x0000 : 0x8000);
+			}
+		}
+	}
+	
+	return RENDER3DERROR_NOERR;
+}
+
+#else // Code path where SSE2, SSSE3, or little-endian is not supported
+
 Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
 {
 	// Convert from 32-bit BGRA8888 format to 32-bit RGBA6665 reversed format. OpenGL
@ -915,6 +995,8 @@ Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA
 	return RENDER3DERROR_NOERR;
 }

+#endif // defined(ENABLE_SSE2) && defined(ENABLE_SSSE3) && defined(LOCAL_LE)
+
 OpenGLRenderer_1_2::~OpenGLRenderer_1_2()
 {
 	glFinish();
--- a/desmume/src/matrix.h
+++ b/desmume/src/matrix.h
@ -120,30 +120,75 @@ FORCEINLINE s32 s32floor(double d)
 //-------------
 #ifdef ENABLE_SSE2

-FORCEINLINE void memset_u16_le(void* dst, const size_t length, u16 val)
+// Fills `length` consecutive 16-bit elements starting at dst with val.
+// The bulk of the fill is done with 128-bit vector stores (eight elements per
+// store), so dst is expected to be 16-byte aligned. Elements left over when
+// length is not a multiple of eight are written one at a time afterwards.
+static void memset_u16(void *dst, const u16 val, const size_t length)
 {
-	u32 u32val;
-	//just for the endian safety
-	T1WriteWord((u8*)&u32val, 0, val);
-	T1WriteWord((u8*)&u32val, 2, val);
-	////const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
-	
 #if defined(__GNUC__) || defined(__INTEL_COMPILER)
-	const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
-	MACRODO_N(length/8,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), temp));
+	__m128i *dst_vec128 = (__m128i *)dst;
+	const __m128i val_vec128 = _mm_set1_epi16(val);
+	const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val));
+	//MACRODO_N(length_vec128, (dst_vec128[X] = val_vec128));
+	
+	for (size_t i = 0; i < length_vec128; i++)
+		dst_vec128[i] = val_vec128;
 #else
-	__m128 temp; temp.m128_i32[0] = u32val;
-	//MACRODO_N(length/8,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), temp));
-	MACRODO_N(length/8,_mm_store_ps1((float*)((u8*)dst+(X)*16), temp));
+	const u32 val_u32 = ((u32)val << 16) | (u32)val;
+	__m128 val_vec128; val_vec128.m128_i32[0] = val_u32;
+	const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val));
+	//MACRODO_N(length_vec128,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), val_vec128));
+	MACRODO_N(length_vec128, _mm_store_ps1((float*)((u8*)dst+(X)*16), val_vec128));
+#endif
+	
+	// The vector stores only cover whole 16-byte groups; fill whatever is left.
+	for (size_t i = length_vec128 * (sizeof(val_vec128) / sizeof(val)); i < length; i++)
+		((u16 *)dst)[i] = val;
+}
+
+// Fills `length` consecutive 32-bit elements starting at dst with val.
+// The bulk of the fill is done with 128-bit vector stores (four elements per
+// store), so dst is expected to be 16-byte aligned. Elements left over when
+// length is not a multiple of four are written one at a time afterwards.
+static void memset_u32(void *dst, const u32 val, const size_t length)
+{
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+	__m128i *dst_vec128 = (__m128i *)dst;
+	const __m128i val_vec128 = _mm_set1_epi32(val);
+	const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val));
+	//MACRODO_N(length_vec128, (dst_vec128[X] = val_vec128));
+	
+	for (size_t i = 0; i < length_vec128; i++)
+		dst_vec128[i] = val_vec128;
+#else
+	__m128 val_vec128; val_vec128.m128_i32[0] = val;
+	const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val));
+	//MACRODO_N(length_vec128,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), val_vec128));
+	MACRODO_N(length_vec128, _mm_store_ps1((float*)((u8*)dst+(X)*16), val_vec128));
 #endif
+	
+	// The vector stores only cover whole 16-byte groups; fill whatever is left.
+	for (size_t i = length_vec128 * (sizeof(val_vec128) / sizeof(val)); i < length; i++)
+		((u32 *)dst)[i] = val;
 }

 #else //no sse2

-static FORCEINLINE void memset_u16_le(void *dst, const size_t length, const u16 val)
+// Fills `length` consecutive 16-bit elements starting at dst with val.
+// When HOST_64 is defined, the bulk of the fill is written through a u64
+// pointer (four elements per store), so dst should be suitably aligned for
+// 64-bit access; any elements left over are then written one at a time.
+// Otherwise every element is written individually.
+static void memset_u16(void *dst, const u16 val, const size_t length)
 {
+#ifdef HOST_64
+	u64 *dst_u64 = (u64 *)dst;
+	const u64 val_u64 = ((u64)val << 48) | ((u64)val << 32) | ((u64)val << 16) | (u64)val;
+	const size_t length_u64 = length / (sizeof(val_u64) / sizeof(val));
+	//MACRODO_N(length_u64, (dst_u64[X] = val_u64));
+	
+	for (size_t i = 0; i < length_u64; i++)
+		dst_u64[i] = val_u64;
+	
+	// The 64-bit stores only cover whole groups of four; fill whatever is left
+	// so the result matches the element-by-element path below.
+	for (size_t i = length_u64 * (sizeof(val_u64) / sizeof(val)); i < length; i++)
+		((u16 *)dst)[i] = val;
+#else
 	for (size_t i = 0; i < length; i++)
-		T1WriteWord((u8*)dst, i << 1, val);
+		((u16 *)dst)[i] = val;
+#endif
+}
+
+// Fills `length` consecutive 32-bit elements starting at dst with val.
+// When HOST_64 is defined, the bulk of the fill is written through a u64
+// pointer (two elements per store), so dst should be suitably aligned for
+// 64-bit access; a trailing odd element is then written on its own.
+// Otherwise every element is written individually.
+static void memset_u32(void *dst, const u32 val, const size_t length)
+{
+#ifdef HOST_64
+	u64 *dst_u64 = (u64 *)dst;
+	const u64 val_u64 = ((u64)val << 32) | (u64)val;
+	const size_t length_u64 = length / (sizeof(val_u64) / sizeof(val));
+	//MACRODO_N(length_u64, (dst_u64[X] = val_u64));
+	
+	for (size_t i = 0; i < length_u64; i++)
+		dst_u64[i] = val_u64;
+	
+	// The 64-bit stores only cover whole pairs; fill the last element of an
+	// odd-length request so the result matches the element-by-element path below.
+	for (size_t i = length_u64 * (sizeof(val_u64) / sizeof(val)); i < length; i++)
+		((u32 *)dst)[i] = val;
+#else
+	for (size_t i = 0; i < length; i++)
+		((u32 *)dst)[i] = val;
+#endif
 }

 #endif
--- a/desmume/src/types.h
+++ b/desmume/src/types.h
@ -61,12 +61,21 @@
 #endif

 #ifdef __GNUC__
-#ifdef __SSE__
-#define ENABLE_SSE
-#endif
-#ifdef __SSE2__
-#define ENABLE_SSE2
-#endif
+	#ifdef __SSE__
+		#define ENABLE_SSE
+	#endif
+
+	#ifdef __SSE2__
+		#define ENABLE_SSE2
+	#endif
+
+	#ifdef __SSE3__
+		#define ENABLE_SSE3
+	#endif
+
+	#ifdef __SSSE3__
+		#define ENABLE_SSSE3
+	#endif
 #endif

 #ifdef NOSSE
@ -392,7 +401,7 @@ char (*BLAHBLAHBLAH( UNALIGNED T (&)[N] ))[N];


 //fairly standard for loop macros
-#define MACRODO1(TRICK,TODO) { const int X = TRICK; TODO; }
+#define MACRODO1(TRICK,TODO) { const size_t X = TRICK; TODO; }
 #define MACRODO2(X,TODO)   { MACRODO1((X),TODO)   MACRODO1(((X)+1),TODO) }
 #define MACRODO4(X,TODO)   { MACRODO2((X),TODO)   MACRODO2(((X)+2),TODO) }
 #define MACRODO8(X,TODO)   { MACRODO4((X),TODO)   MACRODO4(((X)+4),TODO) }