GPU:
- More code cleanup, and various small optimizations. - Optimize the OpenGL 3D renderer’s framebuffer flush. (Requires SSSE3.)
This commit is contained in:
parent
ccf2e76c32
commit
d3ec1dedfb
1332
desmume/src/GPU.cpp
1332
desmume/src/GPU.cpp
File diff suppressed because it is too large
Load Diff
|
@ -32,6 +32,9 @@ struct MMU_struct;
|
|||
//#undef FORCEINLINE
|
||||
//#define FORCEINLINE
|
||||
|
||||
#define GPU_FRAMEBUFFER_NATIVE_WIDTH 256
|
||||
#define GPU_FRAMEBUFFER_NATIVE_HEIGHT 192
|
||||
|
||||
void gpu_savestate(EMUFILE* os);
|
||||
bool gpu_loadstate(EMUFILE* is, int size);
|
||||
|
||||
|
@ -544,14 +547,14 @@ struct _OAM_
|
|||
u16 attr3;
|
||||
};
|
||||
|
||||
void SlurpOAM(_OAM_* oam_output, void* oam_buffer, int oam_index);
|
||||
u16 SlurpOAMAffineParam(void* oam_buffer, int oam_index);
|
||||
void SlurpOAM(_OAM_ *oam_output, void *oam_buffer, const size_t oam_index);
|
||||
u16 SlurpOAMAffineParam(void *oam_buffer, const size_t oam_index);
|
||||
|
||||
typedef struct
|
||||
{
|
||||
s16 x;
|
||||
s16 y;
|
||||
} size;
|
||||
} SpriteSize;
|
||||
|
||||
|
||||
#define NB_PRIORITIES 4
|
||||
|
@ -620,7 +623,7 @@ struct GPU
|
|||
|
||||
_BGxCNT & bgcnt(int num) { return (dispx_st)->dispx_BGxCNT[num].bits; }
|
||||
_DISPCNT & dispCnt() { return dispx_st->dispx_DISPCNT.bits; }
|
||||
template<bool MOSAIC> void modeRender(int layer);
|
||||
template<bool MOSAIC> void modeRender(const size_t layer);
|
||||
|
||||
DISPCAPCNT dispCapCnt;
|
||||
BOOL LayersEnable[5];
|
||||
|
@ -647,18 +650,18 @@ struct GPU
|
|||
} mosaicColors;
|
||||
|
||||
u8 sprNum[256];
|
||||
//u8 h_win[2][256];
|
||||
u8 *h_win[2];
|
||||
const u8 *curr_win[2];
|
||||
void update_winh(int WIN_NUM);
|
||||
bool need_update_winh[2];
|
||||
|
||||
template<int WIN_NUM> void setup_windows();
|
||||
template<size_t WIN_NUM> void update_winh();
|
||||
template<size_t WIN_NUM> void setup_windows();
|
||||
|
||||
GPUCoreID core;
|
||||
GPUDisplayMode dispMode;
|
||||
u8 vramBlock;
|
||||
u8 *VRAMaddr;
|
||||
u16 *VRAMaddr;
|
||||
u16 *VRAMBuffer;
|
||||
|
||||
//FIFO fifo;
|
||||
|
||||
|
@ -714,7 +717,6 @@ struct GPU
|
|||
GPUMasterBrightMode MasterBrightMode;
|
||||
u32 MasterBrightFactor;
|
||||
|
||||
//CACHE_ALIGN u8 bgPixels[1024]; //yes indeed, this is oversized. map debug tools try to write to it
|
||||
u8 *bgPixels;
|
||||
|
||||
u32 currLine;
|
||||
|
@ -779,9 +781,9 @@ struct GPU
|
|||
template<bool MOSAIC, bool BACKDROP> FORCEINLINE void __setFinalColorBck(u16 color, const size_t x, const bool opaque);
|
||||
template<bool MOSAIC, bool BACKDROP, int FUNCNUM> FORCEINLINE void ___setFinalColorBck(u16 color, const size_t x, const bool opaque);
|
||||
|
||||
void setAffineStart(int layer, int xy, u32 val);
|
||||
void setAffineStartWord(int layer, int xy, u16 val, int word);
|
||||
u32 getAffineStart(int layer, int xy);
|
||||
void setAffineStart(const size_t layer, int xy, u32 val);
|
||||
void setAffineStartWord(const size_t layer, int xy, u16 val, int word);
|
||||
u32 getAffineStart(const size_t layer, int xy);
|
||||
void refreshAffineStartRegs(const int num, const int xy);
|
||||
|
||||
struct AffineInfo {
|
||||
|
@ -814,8 +816,8 @@ struct GPU
|
|||
updateBLDALPHA();
|
||||
}
|
||||
|
||||
u32 getHOFS(int bg);
|
||||
u32 getVOFS(int bg);
|
||||
u32 getHOFS(const size_t bg);
|
||||
u32 getVOFS(const size_t bg);
|
||||
|
||||
typedef u8 TBlendTable[32][32];
|
||||
TBlendTable *blendTable;
|
||||
|
@ -865,7 +867,7 @@ namespace GPU_EXT
|
|||
void sprite1D(GPU *gpu, u16 l, u8 *dst, u8 *dst_alpha, u8 *typeTab, u8 *prioTab);
|
||||
void sprite2D(GPU *gpu, u16 l, u8 *dst, u8 *dst_alpha, u8 *typeTab, u8 *prioTab);
|
||||
|
||||
extern const size sprSizeTab[4][4];
|
||||
extern const SpriteSize sprSizeTab[4][4];
|
||||
|
||||
typedef struct
|
||||
{
|
||||
|
@ -882,8 +884,8 @@ void Screen_DeInit(void);
|
|||
|
||||
extern MMU_struct MMU;
|
||||
|
||||
void GPU_setVideoProp(GPU *gpu, u32 p);
|
||||
void GPU_setBGProp(GPU *gpu, u16 num, u16 p);
|
||||
void GPU_setVideoProp(GPU *gpu, const u32 ctrlBits);
|
||||
void GPU_setBGProp(GPU *gpu, const size_t num, const u16 ctrlBits);
|
||||
|
||||
void GPU_setBLDCNT(GPU *gpu, u16 v);
|
||||
void GPU_setBLDY(GPU *gpu, u16 v);
|
||||
|
@ -907,7 +909,7 @@ inline void GPU_setWIN0_V(GPU *gpu, u16 val) { gpu->WIN0V0 = val >> 8; gpu->WIN0
|
|||
inline void GPU_setWIN0_V0(GPU *gpu, u8 val) { gpu->WIN0V0 = val; }
|
||||
inline void GPU_setWIN0_V1(GPU *gpu, u8 val) { gpu->WIN0V1 = val; }
|
||||
|
||||
inline void GPU_setWIN1_H(GPU *gpu, u16 val) {gpu->WIN1H0 = val >> 8; gpu->WIN1H1 = val&0xFF; gpu->need_update_winh[1] = true; }
|
||||
inline void GPU_setWIN1_H(GPU *gpu, u16 val) { gpu->WIN1H0 = val >> 8; gpu->WIN1H1 = val&0xFF; gpu->need_update_winh[1] = true; }
|
||||
inline void GPU_setWIN1_H0(GPU *gpu, u8 val) { gpu->WIN1H0 = val; gpu->need_update_winh[1] = true; }
|
||||
inline void GPU_setWIN1_H1(GPU *gpu, u8 val) { gpu->WIN1H1 = val; gpu->need_update_winh[1] = true; }
|
||||
|
||||
|
|
|
@ -28,6 +28,13 @@
|
|||
#include "NDSSystem.h"
|
||||
#include "texcache.h"
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSSE3
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
|
||||
typedef struct
|
||||
{
|
||||
|
@ -885,6 +892,79 @@ void OpenGLRenderer::SetVersion(unsigned int major, unsigned int minor, unsigned
|
|||
this->versionRevision = revision;
|
||||
}
|
||||
|
||||
#if defined(ENABLE_SSE2) && defined(ENABLE_SSSE3) && defined(LOCAL_LE)
|
||||
// Converts the renderer's 32-bit color framebuffer (this->_framebufferColor) into
// the two output buffers, flipping the image vertically along the way.
//
// dstRGBA6665 - receives one 32-bit pixel per source pixel: 6-bit color channels,
//               5-bit alpha, with bytes 0 and 2 of each pixel swapped.
// dstRGBA5551 - receives one 16-bit pixel per source pixel: 5-bit color channels,
//               with bit 15 set whenever the source alpha is non-zero.
//
// Always returns RENDER3DERROR_NOERR.
//
// When the framebuffer width is a multiple of 4, four pixels are processed per
// iteration using SSE2/SSSE3. That path uses aligned 128-bit loads and stores, so
// this->_framebufferColor and dstRGBA6665 must be 16-byte aligned. Any other width
// falls back to the scalar per-pixel loop.
Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
{
	// Convert from 32-bit BGRA8888 format to 32-bit RGBA6665 reversed format. OpenGL
	// stores pixels using a flipped Y-coordinate, so this needs to be flipped back
	// to the DS Y-coordinate.
	
	if ((this->_framebufferWidth % 4) == 0)
	{
		// ir walks the source top to bottom. iw starts at the last destination row
		// and steps back two rows after each row (one to undo the row just written,
		// one to move up). After the final row iw wraps around, but it is never
		// dereferenced again.
		for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
		{
			for (size_t x = 0; x < this->_framebufferWidth; x+=4, ir+=4, iw+=4)
			{
				// Both conversions read the same four source pixels, so load them once.
				const __m128i src = _mm_load_si128((__m128i *)(this->_framebufferColor + ir));
				
				// Convert to RGBA6665
				__m128i v = _mm_srli_epi32(src, 2);
				
				__m128i a = _mm_srli_epi32(v, 1); // Special handling for 5-bit alpha
				a = _mm_and_si128(a, _mm_set1_epi32(0x1F000000));
				
				// The 32-bit shift leaks bits across byte boundaries, so mask each channel to 6 bits.
				v = _mm_and_si128(v, _mm_set1_epi32(0x003F3F3F));
				v = _mm_or_si128(v, a);
				v = _mm_shuffle_epi8(v, _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2)); // Swizzle RGBA to BGRA
				_mm_store_si128((__m128i *)(dstRGBA6665 + iw), v);
				
				// Convert to RGBA5551
				__m128i b = _mm_and_si128(src, _mm_set1_epi32(0x000000F8)); // Read from R
				b = _mm_slli_epi32(b, 7); // Shift to B
				
				__m128i g = _mm_and_si128(src, _mm_set1_epi32(0x0000F800)); // Read from G
				g = _mm_srli_epi32(g, 6); // Shift in G
				
				__m128i r = _mm_and_si128(src, _mm_set1_epi32(0x00F80000)); // Read from B
				r = _mm_srli_epi32(r, 19); // Shift to R
				
				// Determine A. The alpha byte sits in the sign bit's byte, so a signed
				// "greater than zero" compare would treat every alpha >= 0x80 as negative
				// and mark opaque pixels as transparent. Test for equality with zero and
				// invert instead, matching the scalar path's (a == 0) ? 0x0000 : 0x8000.
				a = _mm_and_si128(src, _mm_set1_epi32((int)0xFF000000)); // Read from A
				a = _mm_cmpeq_epi32(a, _mm_setzero_si128()); // All ones where alpha == 0
				a = _mm_andnot_si128(a, _mm_set1_epi32(0x00008000)); // 0x8000 where alpha != 0
				
				// The four fields occupy disjoint bits, so OR-ing them assembles the pixel.
				v = _mm_or_si128(_mm_or_si128(b, g), _mm_or_si128(r, a));
				
				// All the colors are currently placed every other 16 bits, so we need to swizzle them
				// to the lower 64 bits of our vector before we store them back to memory.
				v = _mm_shuffle_epi8(v, _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
				_mm_storel_epi64((__m128i *)(dstRGBA5551 + iw), v);
			}
		}
	}
	else
	{
		// Scalar fallback for widths that are not a multiple of 4 pixels.
		for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
		{
			for (size_t x = 0; x < this->_framebufferWidth; x++, ir++, iw++)
			{
				dstRGBA6665[iw].color = BGRA8888_32Rev_To_RGBA6665_32Rev(this->_framebufferColor[ir].color);
				dstRGBA5551[iw] = R5G5B5TORGB15((this->_framebufferColor[ir].b >> 3) & 0x1F,
				                                (this->_framebufferColor[ir].g >> 3) & 0x1F,
				                                (this->_framebufferColor[ir].r >> 3) & 0x1F) |
				                                ((this->_framebufferColor[ir].a == 0) ? 0x0000 : 0x8000);
			}
		}
	}
	
	return RENDER3DERROR_NOERR;
}
|
||||
|
||||
#else // Code path where SSE2, SSSE3, or little-endian is not supported
|
||||
|
||||
Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
|
||||
{
|
||||
// Convert from 32-bit BGRA8888 format to 32-bit RGBA6665 reversed format. OpenGL
|
||||
|
@ -915,6 +995,8 @@ Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA
|
|||
return RENDER3DERROR_NOERR;
|
||||
}
|
||||
|
||||
#endif // defined(ENABLE_SSE2) && defined(ENABLE_SSSE3) && defined(LOCAL_LE)
|
||||
|
||||
OpenGLRenderer_1_2::~OpenGLRenderer_1_2()
|
||||
{
|
||||
glFinish();
|
||||
|
|
|
@ -120,30 +120,75 @@ FORCEINLINE s32 s32floor(double d)
|
|||
//-------------
|
||||
#ifdef ENABLE_SSE2
|
||||
|
||||
FORCEINLINE void memset_u16_le(void* dst, const size_t length, u16 val)
|
||||
static void memset_u16(void *dst, const u16 val, const size_t length)
|
||||
{
|
||||
u32 u32val;
|
||||
//just for the endian safety
|
||||
T1WriteWord((u8*)&u32val, 0, val);
|
||||
T1WriteWord((u8*)&u32val, 2, val);
|
||||
////const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
|
||||
|
||||
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
|
||||
const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
|
||||
MACRODO_N(length/8,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), temp));
|
||||
__m128i *dst_vec128 = (__m128i *)dst;
|
||||
const __m128i val_vec128 = _mm_set1_epi16(val);
|
||||
const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val));
|
||||
//MACRODO_N(length_vec128, (dst_vec128[X] = val_vec128));
|
||||
|
||||
for (size_t i = 0; i < length_vec128; i++)
|
||||
dst_vec128[i] = val_vec128;
|
||||
#else
|
||||
__m128 temp; temp.m128_i32[0] = u32val;
|
||||
//MACRODO_N(length/8,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), temp));
|
||||
MACRODO_N(length/8,_mm_store_ps1((float*)((u8*)dst+(X)*16), temp));
|
||||
const u32 val_u32 = ((u32)val << 16) | (u32)val;
|
||||
__m128 val_vec128; val_vec128.m128_i32[0] = val_u32;
|
||||
const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val));
|
||||
//MACRODO_N(length_vec128,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), val_vec128));
|
||||
MACRODO_N(length_vec128, _mm_store_ps1((float*)((u8*)dst+(X)*16), val_vec128));
|
||||
#endif
|
||||
}
|
||||
|
||||
static void memset_u32(void *dst, const u32 val, const size_t length)
|
||||
{
|
||||
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
|
||||
__m128i *dst_vec128 = (__m128i *)dst;
|
||||
const __m128i val_vec128 = _mm_set1_epi32(val);
|
||||
const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val));
|
||||
//MACRODO_N(length_vec128, (dst_vec128[X] = val_vec128));
|
||||
|
||||
for (size_t i = 0; i < length_vec128; i++)
|
||||
dst_vec128[i] = val_vec128;
|
||||
#else
|
||||
__m128 val_vec128; val_vec128.m128_i32[0] = val;
|
||||
const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val));
|
||||
//MACRODO_N(length_vec128,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), val_vec128));
|
||||
MACRODO_N(length_vec128, _mm_store_ps1((float*)((u8*)dst+(X)*16), val_vec128));
|
||||
#endif
|
||||
}
|
||||
|
||||
#else //no sse2
|
||||
|
||||
static FORCEINLINE void memset_u16_le(void *dst, const size_t length, const u16 val)
|
||||
static void memset_u16(void *dst, const u16 val, const size_t length)
|
||||
{
|
||||
#ifdef HOST_64
|
||||
u64 *dst_u64 = (u64 *)dst;
|
||||
const u64 val_u64 = ((u64)val << 48) | ((u64)val << 32) | ((u64)val << 16) | (u64)val;
|
||||
const size_t length_u64 = length / (sizeof(val_u64) / sizeof(val));
|
||||
//MACRODO_N(length_u64, (dst_u64[X] = val_u64));
|
||||
|
||||
for (size_t i = 0; i < length_u64; i++)
|
||||
dst_u64[i] = val_u64;
|
||||
#else
|
||||
for (size_t i = 0; i < length; i++)
|
||||
T1WriteWord((u8*)dst, i << 1, val);
|
||||
((u16 *)dst)[i] = val;
|
||||
#endif
|
||||
}
|
||||
|
||||
static void memset_u32(void *dst, const u32 val, const size_t length)
|
||||
{
|
||||
#ifdef HOST_64
|
||||
u64 *dst_u64 = (u64 *)dst;
|
||||
const u64 val_u64 = ((u64)val << 32) | (u64)val;
|
||||
const size_t length_u64 = length / (sizeof(val_u64) / sizeof(val));
|
||||
//MACRODO_N(length_u64, (dst_u64[X] = val_u64));
|
||||
|
||||
for (size_t i = 0; i < length_u64; i++)
|
||||
dst_u64[i] = val_u64;
|
||||
#else
|
||||
for (size_t i = 0; i < length; i++)
|
||||
((u32 *)dst)[i] = val;
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -61,12 +61,21 @@
|
|||
#endif
|
||||
|
||||
#ifdef __GNUC__
|
||||
#ifdef __SSE__
|
||||
#define ENABLE_SSE
|
||||
#endif
|
||||
#ifdef __SSE2__
|
||||
#define ENABLE_SSE2
|
||||
#endif
|
||||
#ifdef __SSE__
|
||||
#define ENABLE_SSE
|
||||
#endif
|
||||
|
||||
#ifdef __SSE2__
|
||||
#define ENABLE_SSE2
|
||||
#endif
|
||||
|
||||
#ifdef __SSE3__
|
||||
#define ENABLE_SSE3
|
||||
#endif
|
||||
|
||||
#ifdef __SSSE3__
|
||||
#define ENABLE_SSSE3
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef NOSSE
|
||||
|
@ -392,7 +401,7 @@ char (*BLAHBLAHBLAH( UNALIGNED T (&)[N] ))[N];
|
|||
|
||||
|
||||
//fairly standard for loop macros
|
||||
#define MACRODO1(TRICK,TODO) { const int X = TRICK; TODO; }
|
||||
#define MACRODO1(TRICK,TODO) { const size_t X = TRICK; TODO; }
|
||||
#define MACRODO2(X,TODO) { MACRODO1((X),TODO) MACRODO1(((X)+1),TODO) }
|
||||
#define MACRODO4(X,TODO) { MACRODO2((X),TODO) MACRODO2(((X)+2),TODO) }
|
||||
#define MACRODO8(X,TODO) { MACRODO4((X),TODO) MACRODO4(((X)+4),TODO) }
|
||||
|
|
Loading…
Reference in New Issue