- More code cleanup, and various small optimizations.
- Optimize the OpenGL 3D renderer’s framebuffer flush. (Requires SSSE3.)
This commit is contained in:
rogerman 2015-06-17 07:15:22 +00:00
parent ccf2e76c32
commit d3ec1dedfb
5 changed files with 842 additions and 706 deletions

File diff suppressed because it is too large Load Diff

View File

@ -32,6 +32,9 @@ struct MMU_struct;
//#undef FORCEINLINE
//#define FORCEINLINE
#define GPU_FRAMEBUFFER_NATIVE_WIDTH 256
#define GPU_FRAMEBUFFER_NATIVE_HEIGHT 192
void gpu_savestate(EMUFILE* os);
bool gpu_loadstate(EMUFILE* is, int size);
@ -544,14 +547,14 @@ struct _OAM_
u16 attr3;
};
void SlurpOAM(_OAM_* oam_output, void* oam_buffer, int oam_index);
u16 SlurpOAMAffineParam(void* oam_buffer, int oam_index);
void SlurpOAM(_OAM_ *oam_output, void *oam_buffer, const size_t oam_index);
u16 SlurpOAMAffineParam(void *oam_buffer, const size_t oam_index);
typedef struct
{
s16 x;
s16 y;
} size;
} SpriteSize;
#define NB_PRIORITIES 4
@ -620,7 +623,7 @@ struct GPU
_BGxCNT & bgcnt(int num) { return (dispx_st)->dispx_BGxCNT[num].bits; }
_DISPCNT & dispCnt() { return dispx_st->dispx_DISPCNT.bits; }
template<bool MOSAIC> void modeRender(int layer);
template<bool MOSAIC> void modeRender(const size_t layer);
DISPCAPCNT dispCapCnt;
BOOL LayersEnable[5];
@ -647,18 +650,18 @@ struct GPU
} mosaicColors;
u8 sprNum[256];
//u8 h_win[2][256];
u8 *h_win[2];
const u8 *curr_win[2];
void update_winh(int WIN_NUM);
bool need_update_winh[2];
template<int WIN_NUM> void setup_windows();
template<size_t WIN_NUM> void update_winh();
template<size_t WIN_NUM> void setup_windows();
GPUCoreID core;
GPUDisplayMode dispMode;
u8 vramBlock;
u8 *VRAMaddr;
u16 *VRAMaddr;
u16 *VRAMBuffer;
//FIFO fifo;
@ -714,7 +717,6 @@ struct GPU
GPUMasterBrightMode MasterBrightMode;
u32 MasterBrightFactor;
//CACHE_ALIGN u8 bgPixels[1024]; //yes indeed, this is oversized. map debug tools try to write to it
u8 *bgPixels;
u32 currLine;
@ -779,9 +781,9 @@ struct GPU
template<bool MOSAIC, bool BACKDROP> FORCEINLINE void __setFinalColorBck(u16 color, const size_t x, const bool opaque);
template<bool MOSAIC, bool BACKDROP, int FUNCNUM> FORCEINLINE void ___setFinalColorBck(u16 color, const size_t x, const bool opaque);
void setAffineStart(int layer, int xy, u32 val);
void setAffineStartWord(int layer, int xy, u16 val, int word);
u32 getAffineStart(int layer, int xy);
void setAffineStart(const size_t layer, int xy, u32 val);
void setAffineStartWord(const size_t layer, int xy, u16 val, int word);
u32 getAffineStart(const size_t layer, int xy);
void refreshAffineStartRegs(const int num, const int xy);
struct AffineInfo {
@ -814,8 +816,8 @@ struct GPU
updateBLDALPHA();
}
u32 getHOFS(int bg);
u32 getVOFS(int bg);
u32 getHOFS(const size_t bg);
u32 getVOFS(const size_t bg);
typedef u8 TBlendTable[32][32];
TBlendTable *blendTable;
@ -865,7 +867,7 @@ namespace GPU_EXT
void sprite1D(GPU *gpu, u16 l, u8 *dst, u8 *dst_alpha, u8 *typeTab, u8 *prioTab);
void sprite2D(GPU *gpu, u16 l, u8 *dst, u8 *dst_alpha, u8 *typeTab, u8 *prioTab);
extern const size sprSizeTab[4][4];
extern const SpriteSize sprSizeTab[4][4];
typedef struct
{
@ -882,8 +884,8 @@ void Screen_DeInit(void);
extern MMU_struct MMU;
void GPU_setVideoProp(GPU *gpu, u32 p);
void GPU_setBGProp(GPU *gpu, u16 num, u16 p);
void GPU_setVideoProp(GPU *gpu, const u32 ctrlBits);
void GPU_setBGProp(GPU *gpu, const size_t num, const u16 ctrlBits);
void GPU_setBLDCNT(GPU *gpu, u16 v);
void GPU_setBLDY(GPU *gpu, u16 v);
@ -907,7 +909,7 @@ inline void GPU_setWIN0_V(GPU *gpu, u16 val) { gpu->WIN0V0 = val >> 8; gpu->WIN0
// Stores val in the WIN0V0 field of gpu.
inline void GPU_setWIN0_V0(GPU *gpu, u8 val)
{
	gpu->WIN0V0 = val;
}
// Stores val in the WIN0V1 field of gpu.
inline void GPU_setWIN0_V1(GPU *gpu, u8 val)
{
	gpu->WIN0V1 = val;
}
inline void GPU_setWIN1_H(GPU *gpu, u16 val) {gpu->WIN1H0 = val >> 8; gpu->WIN1H1 = val&0xFF; gpu->need_update_winh[1] = true; }
inline void GPU_setWIN1_H(GPU *gpu, u16 val) { gpu->WIN1H0 = val >> 8; gpu->WIN1H1 = val&0xFF; gpu->need_update_winh[1] = true; }
// Stores val in the WIN1H0 field of gpu, then raises need_update_winh[1] so the
// change is flagged for window 1.
inline void GPU_setWIN1_H0(GPU *gpu, u8 val)
{
	gpu->WIN1H0 = val;
	gpu->need_update_winh[1] = true;
}
// Stores val in the WIN1H1 field of gpu, then raises need_update_winh[1] so the
// change is flagged for window 1.
inline void GPU_setWIN1_H1(GPU *gpu, u8 val)
{
	gpu->WIN1H1 = val;
	gpu->need_update_winh[1] = true;
}

View File

@ -28,6 +28,13 @@
#include "NDSSystem.h"
#include "texcache.h"
#ifdef ENABLE_SSE2
#include <emmintrin.h>
#endif
#ifdef ENABLE_SSSE3
#include <tmmintrin.h>
#endif
typedef struct
{
@ -885,6 +892,79 @@ void OpenGLRenderer::SetVersion(unsigned int major, unsigned int minor, unsigned
this->versionRevision = revision;
}
#if defined(ENABLE_SSE2) && defined(ENABLE_SSSE3) && defined(LOCAL_LE)
// Copies the rendered frame from this->_framebufferColor into two caller-supplied
// buffers: dstRGBA6665 (32 bits per pixel, 6-bit color and 5-bit alpha) and
// dstRGBA5551 (16 bits per pixel, 5-bit color and a 1-bit alpha flag).
//
// Source row y is written to destination row (height - 1 - y), which undoes OpenGL's
// flipped Y-coordinate. Always returns RENDER3DERROR_NOERR.
//
// When the framebuffer width is a multiple of 4, pixels are processed four at a time
// with SSE2/SSSE3; otherwise a per-pixel scalar loop is used.
// NOTE(review): the vector path uses aligned 16-byte loads from _framebufferColor and
// aligned 16-byte stores to dstRGBA6665 -- confirm both buffers are 16-byte aligned.
Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
{
	// Convert from 32-bit BGRA8888 format to 32-bit RGBA6665 reversed format. OpenGL
	// stores pixels using a flipped Y-coordinate, so this needs to be flipped back
	// to the DS Y-coordinate.
	if ((this->_framebufferWidth % 4) == 0)
	{
		// Byte-shuffle patterns; these do not change between iterations.
		const __m128i swapBytes0And2 = _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2);
		const __m128i packLow16 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0);
		
		// ir walks the source top-down; iw starts on the last destination row and steps
		// back two row widths after each row (one to undo the row just written, one to
		// move up a row).
		for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
		{
			for (size_t x = 0; x < this->_framebufferWidth; x+=4, ir+=4, iw+=4)
			{
				// The same four source pixels feed both output formats.
				const __m128i src = _mm_load_si128((__m128i *)(this->_framebufferColor + ir));
				
				// Convert to RGBA6665
				__m128i v = _mm_srli_epi32(src, 2);
				__m128i a = _mm_srli_epi32(v, 1); // Special handling for 5-bit alpha
				a = _mm_and_si128(a, _mm_set1_epi32(0x1F000000));
				v = _mm_and_si128(v, _mm_set1_epi32(0x003F3F3F));
				v = _mm_or_si128(v, a);
				v = _mm_shuffle_epi8(v, swapBytes0And2); // Swizzle RGBA to BGRA
				_mm_store_si128((__m128i *)(dstRGBA6665 + iw), v);
				
				// Convert to RGBA5551
				__m128i b = _mm_and_si128(src, _mm_set1_epi32(0x000000F8)); // Read from R
				b = _mm_slli_epi32(b, 7); // Shift to B
				
				__m128i g = _mm_and_si128(src, _mm_set1_epi32(0x0000F800)); // Read from G
				g = _mm_srli_epi32(g, 6); // Shift in G
				
				__m128i r = _mm_and_si128(src, _mm_set1_epi32(0x00F80000)); // Read from B
				r = _mm_srli_epi32(r, 19); // Shift to R
				
				// The alpha flag must be set for every nonzero source alpha, matching the
				// scalar path. _mm_cmpgt_epi32 is a signed compare, so the alpha byte is
				// first moved down to bits 0-7; left in bits 24-31, any alpha of 0x80 or
				// more would look negative and be reported as transparent.
				a = _mm_srli_epi32(src, 24); // Read from A
				a = _mm_cmpgt_epi32(a, _mm_set1_epi32(0x00000000)); // Determine A
				a = _mm_and_si128(a, _mm_set1_epi32(0x00008000)); // Mask to A
				
				v = b;
				v = _mm_add_epi32(v, g);
				v = _mm_add_epi32(v, r);
				v = _mm_add_epi32(v, a);
				
				// All the colors are currently placed every other 16 bits, so we need to swizzle them
				// to the lower 64 bits of our vector before we store them back to memory.
				v = _mm_shuffle_epi8(v, packLow16);
				_mm_storel_epi64((__m128i *)(dstRGBA5551 + iw), v);
			}
		}
	}
	else
	{
		// Per-pixel fallback for widths that are not a multiple of 4.
		for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
		{
			for (size_t x = 0; x < this->_framebufferWidth; x++, ir++, iw++)
			{
				dstRGBA6665[iw].color = BGRA8888_32Rev_To_RGBA6665_32Rev(this->_framebufferColor[ir].color);
				dstRGBA5551[iw] = R5G5B5TORGB15((this->_framebufferColor[ir].b >> 3) & 0x1F,
				                                (this->_framebufferColor[ir].g >> 3) & 0x1F,
				                                (this->_framebufferColor[ir].r >> 3) & 0x1F) |
				                  ((this->_framebufferColor[ir].a == 0) ? 0x0000 : 0x8000);
			}
		}
	}
	
	return RENDER3DERROR_NOERR;
}
#else // Code path where SSE2, SSSE3, or little-endian is not supported
Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
{
// Convert from 32-bit BGRA8888 format to 32-bit RGBA6665 reversed format. OpenGL
@ -915,6 +995,8 @@ Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA
return RENDER3DERROR_NOERR;
}
#endif // defined(ENABLE_SSE2) && defined(ENABLE_SSSE3) && defined(LOCAL_LE)
OpenGLRenderer_1_2::~OpenGLRenderer_1_2()
{
glFinish();

View File

@ -120,30 +120,75 @@ FORCEINLINE s32 s32floor(double d)
//-------------
#ifdef ENABLE_SSE2
FORCEINLINE void memset_u16_le(void* dst, const size_t length, u16 val)
static void memset_u16(void *dst, const u16 val, const size_t length)
{
u32 u32val;
//just for the endian safety
T1WriteWord((u8*)&u32val, 0, val);
T1WriteWord((u8*)&u32val, 2, val);
////const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
MACRODO_N(length/8,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), temp));
__m128i *dst_vec128 = (__m128i *)dst;
const __m128i val_vec128 = _mm_set1_epi16(val);
const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val));
//MACRODO_N(length_vec128, (dst_vec128[X] = val_vec128));
for (size_t i = 0; i < length_vec128; i++)
dst_vec128[i] = val_vec128;
#else
__m128 temp; temp.m128_i32[0] = u32val;
//MACRODO_N(length/8,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), temp));
MACRODO_N(length/8,_mm_store_ps1((float*)((u8*)dst+(X)*16), temp));
const u32 val_u32 = ((u32)val << 16) | (u32)val;
__m128 val_vec128; val_vec128.m128_i32[0] = val_u32;
const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val));
//MACRODO_N(length_vec128,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), val_vec128));
MACRODO_N(length_vec128, _mm_store_ps1((float*)((u8*)dst+(X)*16), val_vec128));
#endif
}
// Fills `length` consecutive 32-bit elements starting at dst with val.
//
// dst    - destination buffer. The vector stores below require it to be 16-byte aligned.
// val    - the 32-bit value written to every element.
// length - number of 32-bit elements (not bytes) to write.
//
// Whole groups of four elements are written with 128-bit stores. Any remaining
// elements (length not a multiple of 4) are written one at a time so that no
// trailing element is left unset.
static void memset_u32(void *dst, const u32 val, const size_t length)
{
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
	__m128i *dst_vec128 = (__m128i *)dst;
	const __m128i val_vec128 = _mm_set1_epi32(val);
	const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val));
	
	for (size_t i = 0; i < length_vec128; i++)
		dst_vec128[i] = val_vec128;
#else
	// Only lane 0 needs a value: _mm_store_ps1 replicates the lowest lane into all four.
	__m128 val_vec128; val_vec128.m128_i32[0] = val;
	const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val));
	MACRODO_N(length_vec128, _mm_store_ps1((float*)((u8*)dst+(X)*16), val_vec128));
#endif
	
	// Scalar tail for the elements that do not fill a whole 128-bit vector.
	for (size_t i = length_vec128 * (sizeof(val_vec128) / sizeof(val)); i < length; i++)
		((u32 *)dst)[i] = val;
}
#else //no sse2
static FORCEINLINE void memset_u16_le(void *dst, const size_t length, const u16 val)
static void memset_u16(void *dst, const u16 val, const size_t length)
{
#ifdef HOST_64
u64 *dst_u64 = (u64 *)dst;
const u64 val_u64 = ((u64)val << 48) | ((u64)val << 32) | ((u64)val << 16) | (u64)val;
const size_t length_u64 = length / (sizeof(val_u64) / sizeof(val));
//MACRODO_N(length_u64, (dst_u64[X] = val_u64));
for (size_t i = 0; i < length_u64; i++)
dst_u64[i] = val_u64;
#else
for (size_t i = 0; i < length; i++)
T1WriteWord((u8*)dst, i << 1, val);
((u16 *)dst)[i] = val;
#endif
}
// Fills `length` consecutive 32-bit elements starting at dst with val.
//
// dst    - destination buffer.
// val    - the 32-bit value written to every element.
// length - number of 32-bit elements (not bytes) to write.
//
// On 64-bit hosts two elements are written per 64-bit store, followed by a single
// 32-bit store when length is odd, so both build variants write exactly `length`
// elements.
// NOTE(review): the 64-bit stores assume dst is 8-byte aligned -- confirm at call sites.
static void memset_u32(void *dst, const u32 val, const size_t length)
{
#ifdef HOST_64
	u64 *dst_u64 = (u64 *)dst;
	const u64 val_u64 = ((u64)val << 32) | (u64)val;
	const size_t elems_per_u64 = sizeof(val_u64) / sizeof(val);
	const size_t length_u64 = length / elems_per_u64;
	
	for (size_t i = 0; i < length_u64; i++)
		dst_u64[i] = val_u64;
	
	// An odd length leaves one element that does not fill a whole u64.
	for (size_t i = length_u64 * elems_per_u64; i < length; i++)
		((u32 *)dst)[i] = val;
#else
	for (size_t i = 0; i < length; i++)
		((u32 *)dst)[i] = val;
#endif
}
#endif

View File

@ -61,12 +61,21 @@
#endif
// On GCC-compatible compilers, translate the compiler's SIMD feature macros
// (__SSE__, __SSE2__, __SSE3__, __SSSE3__) into the project's ENABLE_* switches.
// NOTE(review): the __SSE__ and __SSE2__ checks appear twice below. Repeating an
// identical empty #define is harmless, but the duplicate pair looks unintended.
#ifdef __GNUC__
#ifdef __SSE__
#define ENABLE_SSE
#endif
#ifdef __SSE2__
#define ENABLE_SSE2
#endif
#ifdef __SSE__
#define ENABLE_SSE
#endif
#ifdef __SSE2__
#define ENABLE_SSE2
#endif
#ifdef __SSE3__
#define ENABLE_SSE3
#endif
// SSSE3 is a separate switch from SSE3.
#ifdef __SSSE3__
#define ENABLE_SSSE3
#endif
#endif
#ifdef NOSSE
@ -392,7 +401,7 @@ char (*BLAHBLAHBLAH( UNALIGNED T (&)[N] ))[N];
//fairly standard for loop macros
#define MACRODO1(TRICK,TODO) { const int X = TRICK; TODO; }
#define MACRODO1(TRICK,TODO) { const size_t X = TRICK; TODO; }
// Each MACRODOn pastes TODO n times by expanding the next-smaller macro twice:
// once starting at X and once starting at X plus half of n. MACRODO1 is the base
// case; it binds its first argument to a local constant named X, so TODO can use
// X as the index of the current repetition.
#define MACRODO2(X,TODO) { MACRODO1((X),TODO) MACRODO1(((X)+1),TODO) }
#define MACRODO4(X,TODO) { MACRODO2((X),TODO) MACRODO2(((X)+2),TODO) }
#define MACRODO8(X,TODO) { MACRODO4((X),TODO) MACRODO4(((X)+4),TODO) }