Grab bag of optimizations: many optimizations to the SSE functions, which now use intrinsic functions. This would enable these functions to work on gcc (I think) if our configuration supported it, but it will fail for some VC++ Express installations. Those users will have to either #define SSE2_NOINTRIN or help me figure out which installations are broken, why, and how to fix them. Also, collapse the BG layer pixel blenders into a single function, so watch for regressions there.

This commit is contained in:
zeromus 2009-07-18 09:15:41 +00:00
parent baef153e05
commit 062a228877
6 changed files with 258 additions and 241 deletions

View File

@ -39,6 +39,7 @@
//#undef FORCEINLINE
//#define FORCEINLINE
//#define SSE2_NOINTRIN
ARM9_struct ARM9Mem;
@ -491,29 +492,6 @@ FORCEINLINE void GPU::setFinal3DColorSpecialDecreaseWnd(int dstX, int srcX)
}
}
enum OBJFunc
{
None, Blend, Increase, Decrease
};
template<OBJFunc FUNC, bool WINDOW>
static void _master_setFinalOBJColor(GPU *gpu, u32 passing, u8 *dst, u16 color, u8 alpha, u8 type, u16 x);
static void setFinalOBJColorSpecialNoneWnd (GPU *gpu, u32 passing, u8 *dst, u16 color, u8 alpha, u8 type, u16 x);
static void setFinalOBJColorSpecialBlendWnd (GPU *gpu, u32 passing, u8 *dst, u16 color, u8 alpha, u8 type, u16 x);
static void setFinalOBJColorSpecialIncreaseWnd (GPU *gpu, u32 passing, u8 *dst, u16 color, u8 alpha, u8 type, u16 x);
static void setFinalOBJColorSpecialDecreaseWnd (GPU *gpu, u32 passing, u8 *dst, u16 color, u8 alpha, u8 type, u16 x);
const GPU::FinalOBJColFunct pixelBlittersOBJ[8] = {
_master_setFinalOBJColor<None,false>,
_master_setFinalOBJColor<Blend,false>,
_master_setFinalOBJColor<Increase,false>,
_master_setFinalOBJColor<Decrease,false>,
_master_setFinalOBJColor<None,true>,
_master_setFinalOBJColor<Blend,true>,
_master_setFinalOBJColor<Increase,true>,
_master_setFinalOBJColor<Decrease,true> };
/*****************************************************************************/
// INITIALIZATION
/*****************************************************************************/
@ -591,7 +569,7 @@ GPU * GPU_Init(u8 l)
g->need_update_winh[1] = true;
g->setFinalColorBck_funcNum = 0;
g->setFinalColor3d_funcNum = 0;
g->setFinalColorSpr = _master_setFinalOBJColor<None,false>;
g->setFinalColorSpr_funcNum = 0;
return g;
}
@ -602,7 +580,7 @@ void GPU_Reset(GPU *g, u8 l)
g->setFinalColorBck_funcNum = 0;
g->setFinalColor3d_funcNum = 0;
g->setFinalColorSpr = _master_setFinalOBJColor<None,false>;
g->setFinalColorSpr_funcNum = 0;
g->core = l;
g->BGSize[0][0] = g->BGSize[1][0] = g->BGSize[2][0] = g->BGSize[3][0] = 256;
g->BGSize[0][1] = g->BGSize[1][1] = g->BGSize[2][1] = g->BGSize[3][1] = 256;
@ -729,7 +707,7 @@ void SetupFinalPixelBlitter (GPU *gpu)
u8 windowUsed = (gpu->WIN0_ENABLED | gpu->WIN1_ENABLED | gpu->WINOBJ_ENABLED);
u8 blendMode = (gpu->BLDCNT >> 6)&3;
gpu->setFinalColorSpr = pixelBlittersOBJ[windowUsed*4 + blendMode];
gpu->setFinalColorSpr_funcNum = windowUsed*4 + blendMode;
gpu->setFinalColorBck_funcNum = windowUsed*4 + blendMode;
gpu->setFinalColor3d_funcNum = windowUsed*4 + blendMode;
@ -961,128 +939,47 @@ FORCEINLINE void GPU::renderline_checkWindows(u16 x, bool &draw, bool &effect) c
// PIXEL RENDERING - BGS
/*****************************************************************************/
template<bool BACKDROP> FORCEINLINE void GPU::setFinalBGColorSpecialNone(u16 &color, const u32 x)
{
}
template<bool BACKDROP> FORCEINLINE void GPU::setFinalBGColorSpecialBlend(u16 &color, const u32 x)
template<bool BACKDROP, BlendFunc FUNC, bool WINDOW>
FORCEINLINE FASTCALL bool GPU::_master_setFinalBGColor(u16 &color, const u32 x)
{
//no further analysis for no special effects. just draw it.
if(FUNC == None) return true;
//blend backdrop with what?? this doesn't make sense
if(BACKDROP) return;
if(blend1)
if(FUNC==Blend && BACKDROP) return true;
bool windowEffect = true;
if(WINDOW)
{
//If the layer we are drawing on is selected as 2nd source, we can blend
int bg_under = bgPixels[x];
if(blend2[bg_under])
color = blend(color,T2ReadWord(currDst, x<<1));
bool windowDraw;
renderline_checkWindows(x, windowDraw, windowEffect);
//backdrop must always be drawn
if(BACKDROP) windowDraw = true;
//we never have anything more to do if the window rejected us
if(!windowDraw) return false;
}
}
template<bool BACKDROP> FORCEINLINE void GPU::setFinalBGColorSpecialIncrease (u16 &color, const u32 x)
{
if(blend1) // the bg to draw has a special color effect
{
color = currentFadeInColors[color];
}
}
template<bool BACKDROP> FORCEINLINE void GPU::setFinalBGColorSpecialDecrease(u16 &color, const u32 x)
{
if(blend1) // the bg to draw has a special color effect
{
color = currentFadeOutColors[color];
}
}
template<bool BACKDROP> FORCEINLINE bool GPU::setFinalBGColorSpecialNoneWnd(u16 &color, const u32 x)
{
bool windowDraw = true, windowEffect = true;
renderline_checkWindows(x, windowDraw, windowEffect);
if(BACKDROP) windowDraw = true; //backdrop must always be drawn
if (blend1 && windowEffect) // the bg to draw has a special color effect
{
//special effects rejected. just draw it.
if(!(blend1 && windowEffect))
return true;
const u8 bg_under = bgPixels[x];
//perform the special effect
switch(FUNC) {
case Blend: if(blend2[bg_under]) color = blend(color,T2ReadWord(currDst, x<<1)); break;
case Increase: color = currentFadeInColors[color]; break;
case Decrease: color = currentFadeOutColors[color]; break;
}
else
{
if ((windowEffect && (BLDCNT & (0x100 << currBgNum))) || windowDraw)
{
return true;
}
}
return false;
return true;
}
template<bool BACKDROP> FORCEINLINE bool GPU::setFinalBGColorSpecialBlendWnd(u16 &color, const u32 x)
{
bool windowDraw = true, windowEffect = true;
renderline_checkWindows(x, windowDraw, windowEffect);
if(BACKDROP) windowDraw = true; //backdrop must always be drawn
if(windowDraw)
{
if(blend1 && windowEffect)
{
int bg_under = bgPixels[x];
// If the layer we are drawing on is selected as 2nd source, we can blend
if(blend2[bg_under])
color = blend(color,T2ReadWord(currDst, x<<1));
}
return true;
}
return false;
}
template<bool BACKDROP> FORCEINLINE bool GPU::setFinalBGColorSpecialIncreaseWnd(u16 &color, const u32 x)
{
bool windowDraw = true, windowEffect = true;
renderline_checkWindows(x, windowDraw, windowEffect);
if(BACKDROP) windowDraw = true; //backdrop must always be drawn
if(windowDraw)
{
if(blend1 && windowEffect)
{
color = currentFadeInColors[color];
}
return true;
}
return false;
}
template<bool BACKDROP> FORCEINLINE bool GPU::setFinalBGColorSpecialDecreaseWnd(u16 &color, const u32 x)
{
bool windowDraw = true, windowEffect = true;
renderline_checkWindows(x, windowDraw, windowEffect);
if(BACKDROP) windowDraw = true; //backdrop must always be drawn
if(windowDraw)
{
if(blend1 && windowEffect)
{
color = currentFadeOutColors[color];
}
return true;
}
return false;
}
/*****************************************************************************/
// PIXEL RENDERING - OBJS
/*****************************************************************************/
template<OBJFunc FUNC, bool WINDOW>
static void _master_setFinalOBJColor(GPU *gpu, u32 passing, u8 *dst, u16 color, u8 alpha, u8 type, u16 x)
template<BlendFunc FUNC, bool WINDOW>
static FORCEINLINE void _master_setFinalOBJColor(GPU *gpu, u8 *dst, u16 color, u8 alpha, u8 type, u16 x)
{
bool windowDraw = true, windowEffect = true;
@ -1094,19 +991,19 @@ static void _master_setFinalOBJColor(GPU *gpu, u32 passing, u8 *dst, u16 color,
}
//this inspects the layer beneath the sprite to see if the current blend flags make it a candidate for blending
int bg_under = gpu->bgPixels[x];
bool allowBlend = ((bg_under != 4) && (gpu->BLDCNT & (0x100 << bg_under)));
const int bg_under = gpu->bgPixels[x];
const bool allowBlend = (bg_under != 4) && gpu->blend2[bg_under];
bool sourceEffectSelected = (gpu->BLDCNT & 0x10)!=0;
const bool sourceEffectSelected = gpu->blend1;
//note that the fadein and fadeout is done here before blending,
//so that a fade and blending can be applied at the same time
//so that a fade and blending can be applied at the same time (actually, I don't think that is legal..)
bool forceBlendingForNormal = false;
if(windowEffect && sourceEffectSelected)
switch(FUNC)
{
case Increase: if(!allowBlend) color = fadeInColors[gpu->BLDY_EVY][color&0x7FFF]; break;
case Decrease: if(!allowBlend) color = fadeOutColors[gpu->BLDY_EVY][color&0x7FFF]; break;
case Increase: if(!allowBlend) color = gpu->currentFadeInColors[color&0x7FFF]; break;
case Decrease: if(!allowBlend) color = gpu->currentFadeOutColors[color&0x7FFF]; break;
//only when blend color effect is selected, ordinarily opaque sprites are blended with the color effect params
case Blend: forceBlendingForNormal = true; break;
@ -1115,7 +1012,7 @@ static void _master_setFinalOBJColor(GPU *gpu, u32 passing, u8 *dst, u16 color,
if(allowBlend)
{
u16 backColor = T2ReadWord(dst,passing);
u16 backColor = T2ReadWord(dst,x<<1);
//this hasn't been tested: this blending occurs without regard to the color effect,
//but rather purely from the sprite's alpha
if(type == GPU_OBJ_MODE_Bitmap)
@ -1124,11 +1021,13 @@ static void _master_setFinalOBJColor(GPU *gpu, u32 passing, u8 *dst, u16 color,
color = gpu->blend(color,backColor);
}
T2WriteWord(dst, passing, (color | 0x8000));
T2WriteWord(dst, x<<1, (color | 0x8000));
gpu->bgPixels[x] = 4;
}
template<bool BACKDROP> FORCEINLINE void GPU::setFinalColorBG(u16 color, const u32 x)
//FUNCNUM is only set for backdrop, for an optimization of looking it up early
template<bool BACKDROP, int FUNCNUM>
FORCEINLINE void GPU::setFinalColorBG(u16 color, const u32 x)
{
//It is not safe to assert this here.
//This is probably the best place to enforce it, since almost every single color that comes in here
@ -1136,17 +1035,19 @@ template<bool BACKDROP> FORCEINLINE void GPU::setFinalColorBG(u16 color, const u
//assert((color&0x8000)==0);
if(!BACKDROP) color &= 0x7FFF; //but for the backdrop we can easily guarantee earlier that theres no bit here
bool draw=true;
switch(setFinalColorBck_funcNum)
bool draw;
const int test = BACKDROP?FUNCNUM:setFinalColorBck_funcNum;
switch(test)
{
case 0x0: setFinalBGColorSpecialNone<BACKDROP>(color,x); break;
case 0x1: setFinalBGColorSpecialBlend<BACKDROP>(color,x); break;
case 0x2: setFinalBGColorSpecialIncrease<BACKDROP>(color,x); break;
case 0x3: setFinalBGColorSpecialDecrease<BACKDROP>(color,x); break;
case 0x4: draw=setFinalBGColorSpecialNoneWnd<BACKDROP>(color,x); break;
case 0x5: draw=setFinalBGColorSpecialBlendWnd<BACKDROP>(color,x); break;
case 0x6: draw=setFinalBGColorSpecialIncreaseWnd<BACKDROP>(color,x); break;
case 0x7: draw=setFinalBGColorSpecialDecreaseWnd<BACKDROP>(color,x); break;
case 0: draw = _master_setFinalBGColor<BACKDROP,None,false>(color,x); break;
case 1: draw = _master_setFinalBGColor<BACKDROP,Blend,false>(color,x); break;
case 2: draw = _master_setFinalBGColor<BACKDROP,Increase,false>(color,x); break;
case 3: draw = _master_setFinalBGColor<BACKDROP,Decrease,false>(color,x); break;
case 4: draw = _master_setFinalBGColor<BACKDROP,None,true>(color,x); break;
case 5: draw = _master_setFinalBGColor<BACKDROP,Blend,true>(color,x); break;
case 6: draw = _master_setFinalBGColor<BACKDROP,Increase,true>(color,x); break;
case 7: draw = _master_setFinalBGColor<BACKDROP,Decrease,true>(color,x); break;
};
if(BACKDROP || draw) //backdrop must always be drawn
@ -1159,7 +1060,6 @@ template<bool BACKDROP> FORCEINLINE void GPU::setFinalColorBG(u16 color, const u
FORCEINLINE void GPU::setFinalColor3d(int dstX, int srcX)
{
//if someone disagrees with these, they could be reimplemented as a function pointer easily
switch(setFinalColor3d_funcNum)
{
case 0x0: setFinal3DColorSpecialNone(dstX,srcX); break;
@ -1173,9 +1073,31 @@ FORCEINLINE void GPU::setFinalColor3d(int dstX, int srcX)
};
}
//Draws one sprite pixel through the _master_setFinalOBJColor instantiation selected by
//gpu->setFinalColorSpr_funcNum:
//  0..3 -> None / Blend / Increase / Decrease with WINDOW=false
//  4..7 -> the same four effects with WINDOW=true
//any other selector value matches no case and nothing is drawn.
//a switch is used instead of a function pointer so that the FORCEINLINE blenders can be expanded in place.
FORCEINLINE void setFinalColorSpr(GPU* gpu, u8 *dst, u16 color, u8 alpha, u8 type, u16 x)
{
	const int funcNum = gpu->setFinalColorSpr_funcNum;
	switch(funcNum)
	{
	//blenders that skip the window test
	case 0: _master_setFinalOBJColor<None,false>(gpu, dst, color, alpha, type, x); return;
	case 1: _master_setFinalOBJColor<Blend,false>(gpu, dst, color, alpha, type, x); return;
	case 2: _master_setFinalOBJColor<Increase,false>(gpu, dst, color, alpha, type, x); return;
	case 3: _master_setFinalOBJColor<Decrease,false>(gpu, dst, color, alpha, type, x); return;
	//blenders that perform the window test
	case 4: _master_setFinalOBJColor<None,true>(gpu, dst, color, alpha, type, x); return;
	case 5: _master_setFinalOBJColor<Blend,true>(gpu, dst, color, alpha, type, x); return;
	case 6: _master_setFinalOBJColor<Increase,true>(gpu, dst, color, alpha, type, x); return;
	case 7: _master_setFinalOBJColor<Decrease,true>(gpu, dst, color, alpha, type, x); return;
	}
}
//Entry point for callers that do not pick the blend function at compile time:
//forwards unchanged to ___setFinalColorBck with FUNCNUM fixed at 0.
//(setFinalColorBG only consults FUNCNUM when BACKDROP is true; otherwise it uses
//the runtime setFinalColorBck_funcNum.)
template<bool MOSAIC, bool BACKDROP>
FORCEINLINE void GPU::__setFinalColorBck(u16 color, const u32 x, const int opaque)
{
	___setFinalColorBck<MOSAIC, BACKDROP, 0>(color,x,opaque);
}
//this was forced inline because most of the time it just falls through to setFinalColorBck() and the function call
//overhead was ridiculous and terrible
template<bool MOSAIC, bool BACKDROP> FORCEINLINE void GPU::__setFinalColorBck(u16 color, const u32 x, const bool opaque)
template<bool MOSAIC, bool BACKDROP, int FUNCNUM>
FORCEINLINE void GPU::___setFinalColorBck(u16 color, const u32 x, const int opaque)
{
//I commented out this line to make a point.
//under ordinary circumstances, nobody should pass in something >=256
@ -1206,7 +1128,7 @@ template<bool MOSAIC, bool BACKDROP> FORCEINLINE void GPU::__setFinalColorBck(u1
if(color != 0xFFFF)
{
finish:
setFinalColorBG<BACKDROP>(color,x);
setFinalColorBG<BACKDROP,FUNCNUM>(color,x);
}
}
@ -1244,7 +1166,7 @@ static void mosaicSpriteLinePixel(GPU * gpu, int x, u16 l, u8 * dst, u8 * dst_al
if(!objColor.opaque) prioTab[x] = 0xFF;
}
static void mosaicSpriteLine(GPU * gpu, u16 l, u8 * dst, u8 * dst_alpha, u8 * typeTab, u8 * prioTab)
FORCEINLINE static void mosaicSpriteLine(GPU * gpu, u16 l, u8 * dst, u8 * dst_alpha, u8 * typeTab, u8 * prioTab)
{
//don't even try this unless the mosaic is effective
if(gpu->mosaicLookup.widthValue != 0 || gpu->mosaicLookup.heightValue != 0)
@ -1281,7 +1203,7 @@ template<bool MOSAIC> void lineLarge8bpp(GPU * gpu)
XBG &= wmask;
u8 pixel = map[XBG];
u16 color = T1ReadWord(pal, pixel<<1);
gpu->__setFinalColorBck<MOSAIC,false>(color,x,color!=0);
gpu->__setFinalColorBck<MOSAIC,false>(color,x,color);
}
}
@ -1821,14 +1743,6 @@ FORCEINLINE BOOL compute_sprite_vars(_OAM_ * spriteInfo, u16 l,
// SPRITE RENDERING
/*****************************************************************************/
void GPU::spriteRender(u8 * dst, u8 * dst_alpha, u8 * typeTab, u8 * prioTab)
{
if(spriteRenderMode == SPRITE_1D)
_spriteRender<SPRITE_1D>(dst,dst_alpha,typeTab, prioTab);
else
_spriteRender<SPRITE_2D>(dst,dst_alpha,typeTab, prioTab);
}
//TODO - refactor this so there isnt as much duped code between rotozoomed and non-rotozoomed versions
template<GPU::SpriteRenderMode MODE>
@ -2324,13 +2238,6 @@ void GPU_set_DISPCAPCNT(u32 val)
gpu->dispCapCnt.srcB = (val >> 25) & 0x01;
gpu->dispCapCnt.capSrc = (val >> 29) & 0x03;
//gpu->dispCapCnt.dstBlock = = (gpu->dispCapCnt.writeBlock * 0x20000) +
// (gpu->dispCapCnt.writeOffset * 0x8000);
//
//gpu->dispCapCnt.src = (gpu->dispCapCnt.readBlock * 0x20000) +
// (gpu->dispCapCnt.readOffset * 0x8000);
//
switch((val >> 20) & 0x03)
{
case 0:
@ -2357,33 +2264,48 @@ void GPU_set_DISPCAPCNT(u32 val)
gpu->dispCapCnt.capSrc, gpu->dispCapCnt.dst - ARM9Mem.ARM9_LCD, gpu->dispCapCnt.src - ARM9Mem.ARM9_LCD,
gpu->dispCapCnt.srcA, gpu->dispCapCnt.srcB);*/
}
// #define BRIGHT_TABLES
static void GPU_ligne_layer(NDS_Screen * screen, u16 l)
{
CACHE_ALIGN u8 spr[512];
CACHE_ALIGN u8 sprAlpha[256];
CACHE_ALIGN u8 sprType[256];
CACHE_ALIGN u8 sprPrio[256];
GPU * gpu = screen->gpu;
struct _DISPCNT * dispCnt = &(gpu->dispx_st)->dispx_DISPCNT.bits;
itemsForPriority_t * item;
u8 spr[512];
u8 sprAlpha[256];
u8 sprType[256];
u8 sprPrio[256];
u8 prio;
u16 i16;
BOOL BG_enabled = TRUE;
gpu->currentFadeInColors = &fadeInColors[gpu->BLDY_EVY][0];
gpu->currentFadeOutColors = &fadeOutColors[gpu->BLDY_EVY][0];
u16 backdrop_color = T1ReadWord(ARM9Mem.ARM9_VMEM, gpu->core * 0x400) & 0x7FFF;
//we need to write backdrop colors in the same way as we do BG pixels in order to do correct window processing
//this is currently eating up 2fps or so. it is a reasonable candidate for optimization.
gpu->currBgNum = 5;
for(int x=0;x<256;x++) {
gpu->__setFinalColorBck<false,true>(backdrop_color,x,1);
}
memset(gpu->bgPixels,5,256);
switch(gpu->setFinalColorBck_funcNum) {
case 0: case 1: //for backdrops, (even with window enabled) none and blend are both the same: just copy the color
case 4: case 5:
memset_u16_le<256>(gpu->currDst,backdrop_color);
break;
case 2:
//for non-windowed fade, we can just fade the color and fill
memset_u16_le<256>(gpu->currDst,gpu->currentFadeInColors[backdrop_color]);
break;
case 3:
//likewise for non-windowed fadeout
memset_u16_le<256>(gpu->currDst,gpu->currentFadeOutColors[backdrop_color]);
break;
//this check isnt really helpful. it just slows us down in the cases where we need the most speed
//if (!gpu->LayersEnable[0] && !gpu->LayersEnable[1] && !gpu->LayersEnable[2] && !gpu->LayersEnable[3] && !gpu->LayersEnable[4]) return;
//windowed fades need special treatment
case 6: for(int x=0;x<256;x++) gpu->___setFinalColorBck<false,true,6>(backdrop_color,x,1); break;
case 7: for(int x=0;x<256;x++) gpu->___setFinalColorBck<false,true,7>(backdrop_color,x,1); break;
}
memset(gpu->bgPixels,5,256);
// init background color & priorities
memset(sprAlpha, 0, 256);
@ -2392,9 +2314,11 @@ static void GPU_ligne_layer(NDS_Screen * screen, u16 l)
memset(sprWin, 0, 256);
// init pixels priorities
for (int i=0; i<NB_PRIORITIES; i++) {
gpu->itemsForPriority[i].nbPixelsX = 0;
}
assert(NB_PRIORITIES==4);
gpu->itemsForPriority[0].nbPixelsX = 0;
gpu->itemsForPriority[1].nbPixelsX = 0;
gpu->itemsForPriority[2].nbPixelsX = 0;
gpu->itemsForPriority[3].nbPixelsX = 0;
// for all the pixels in the line
if (gpu->LayersEnable[4])
@ -2405,7 +2329,6 @@ static void GPU_ligne_layer(NDS_Screen * screen, u16 l)
//zero 06-may-09: I properly supported window color effects for backdrop, but I am not sure
//how it interacts with this. I wish we knew why we needed this
gpu->spriteRender(spr, sprAlpha, sprType, sprPrio);
mosaicSpriteLine(gpu, l, spr, sprAlpha, sprType, sprPrio);
@ -2413,7 +2336,7 @@ static void GPU_ligne_layer(NDS_Screen * screen, u16 l)
for(int i = 0; i<256; i++)
{
// assign them to the good priority item
prio = sprPrio[i];
int prio = sprPrio[i];
if (prio >=4) continue;
item = &(gpu->itemsForPriority[prio]);
@ -2426,9 +2349,12 @@ static void GPU_ligne_layer(NDS_Screen * screen, u16 l)
if (!gpu->LayersEnable[0] && !gpu->LayersEnable[1] && !gpu->LayersEnable[2] && !gpu->LayersEnable[3])
BG_enabled = FALSE;
for(int j=0;j<8;j++)
gpu->blend2[j] = (gpu->BLDCNT & (0x100 << j));
// paint lower priorities first
// then higher priorities on top
for(prio=NB_PRIORITIES; prio > 0; )
for(int prio=NB_PRIORITIES; prio > 0; )
{
prio--;
item = &(gpu->itemsForPriority[prio]);
@ -2442,18 +2368,10 @@ static void GPU_ligne_layer(NDS_Screen * screen, u16 l)
{
gpu->currBgNum = i16;
gpu->blend1 = gpu->BLDCNT & (1 << gpu->currBgNum);
for(int j=0;j<8;j++)
gpu->blend2[j] = (gpu->BLDCNT & (0x100 << j));
gpu->currentFadeInColors = &fadeInColors[gpu->BLDY_EVY][0];
gpu->currentFadeOutColors = &fadeOutColors[gpu->BLDY_EVY][0];
//gpu->bgFunc = gpu->setFinalColorBck_funcNum;
struct _BGxCNT *bgCnt = &(gpu->dispx_st)->dispx_BGxCNT[i16].bits;
gpu->curr_mosaic_enabled = bgCnt->Mosaic_Enable;
//mosaic test hacks
//gpu->curr_mosaic_enabled = true;
if (gpu->core == GPU_MAIN)
{
if (i16 == 0 && dispCnt->BG0_3D)
@ -2495,16 +2413,12 @@ static void GPU_ligne_layer(NDS_Screen * screen, u16 l)
if (gpu->LayersEnable[4])
{
gpu->currBgNum = 4;
////analyze mosaic configuration
//u16 mosaic_control = T1ReadWord((u8 *)&gpu->dispx_st->dispx_MISC.MOSAIC, 0);
//gpu->curr_mosaic_enabled
gpu->blend1 = gpu->BLDCNT & (1 << gpu->currBgNum);
for (int i=0; i < item->nbPixelsX; i++)
{
i16=item->PixelsX[i];
// T2WriteWord(dst, i16 << 1, T2ReadWord(spr, i16 << 1));
// gpu->bgPixels[i16] = 4;
gpu->setFinalColorSpr(gpu, (i16<<1), gpu->currDst, T2ReadWord(spr, (i16<<1)), sprAlpha[i16], sprType[i16], i16);
setFinalColorSpr(gpu, gpu->currDst, T2ReadWord(spr, (i16<<1)), sprAlpha[i16], sprType[i16], i16);
}
}
}
@ -2686,10 +2600,8 @@ static INLINE void GPU_ligne_MasterBrightness(NDS_Screen * screen, u16 l)
if(factor>16) factor=16;
// Apply final brightness adjust (MASTER_BRIGHT)
// Reference: http://nocash.emubase.de/gbatek.htm#dsvideo (Under MASTER_BRIGHTNESS)
/* Mightymax> it should be more effective if the windowmanager applies brightness when drawing */
/* it will most likly take acceleration, while we are stuck here with CPU power */
//Apply final brightness adjust (MASTER_BRIGHT)
//http://nocash.emubase.de/gbatek.htm#dsvideo (Under MASTER_BRIGHTNESS)
switch (gpu->MasterBrightMode)
{
@ -2841,9 +2753,6 @@ void GPU_ligne(NDS_Screen * screen, u16 l, bool skip)
return;
}
//if(gpu->core == 1)
// printf("%d\n",l);
//blacken the screen if it is turned off by the user
if(!CommonSettings.showGpu.screens[gpu->core])
{
@ -2852,12 +2761,6 @@ void GPU_ligne(NDS_Screen * screen, u16 l, bool skip)
return;
}
//{
// extern int currFrameCounter;
// u8 * dst = GPU_screen + (screen->offset + l) * 512;
// memset(dst,currFrameCounter,512);
//}
//cache some parameters which are assumed to be stable throughout the rendering of the entire line
gpu->currLine = l;
u16 mosaic_control = T1ReadWord((u8 *)&gpu->dispx_st->dispx_MISC.MOSAIC, 0);

View File

@ -123,6 +123,10 @@ typedef union
#define BGxENABLED(cnt,num) ((num<8)? ((cnt.val>>8) & num):0)
//Color special effect applied by a pixel blender; used as a template argument
//of the _master_setFinal*Color functions.
//The values follow the 0..3 numbering of the funcNum switches that select the blenders
//(0 = no effect, 1 = blend, 2 = fade in / increase, 3 = fade out / decrease).
enum BlendFunc
{
	None = 0,
	Blend = 1,
	Increase = 2,
	Decrease = 3
};
/*******************************************************************************
@ -601,10 +605,19 @@ typedef struct
#define NB_BG 4
typedef struct
{
u8 BGs[NB_BG], nbBGs;
u8 PixelsX[256];
// doh ! yoda says : 256 pixels we can have...
u8 BGs[NB_BG], nbBGs;
u8 pad[1];
u16 nbPixelsX;
//256+8:
u8 pad2[248];
//things were slower when i organized this struct this way. whatever.
//u8 PixelsX[256];
//int BGs[NB_BG], nbBGs;
//int nbPixelsX;
////<-- 256 + 24
//u8 pad2[256-24];
} itemsForPriority_t;
#define ARM9MEM_ABG 0x06000000
#define ARM9MEM_BBG 0x06200000
@ -761,13 +774,13 @@ struct GPU
u16 blend(u16 colA, u16 colB);
typedef void (*FinalOBJColFunct)(GPU *gpu, u32 passing, u8 *dst, u16 color, u8 alpha, u8 type, u16 x);
typedef void (*Final3DColFunct)(GPU *gpu, int dstX, int srcX);
template<bool BACKDROP, BlendFunc FUNC, bool WINDOW>
FORCEINLINE FASTCALL bool _master_setFinalBGColor(u16 &color, const u32 x);
int setFinalColorBck_funcNum;
int bgFunc;
int setFinalColor3d_funcNum;
FinalOBJColFunct setFinalColorSpr;
int setFinalColorSpr_funcNum;
//Final3DColFunct setFinalColor3D;
enum SpriteRenderMode {
SPRITE_1D, SPRITE_2D
@ -775,9 +788,17 @@ struct GPU
template<GPU::SpriteRenderMode MODE>
void _spriteRender(u8 * dst, u8 * dst_alpha, u8 * typeTab, u8 * prioTab);
void spriteRender(u8 * dst, u8 * dst_alpha, u8 * typeTab, u8 * prioTab);
//Renders the sprites for the current line by forwarding all four buffers to the
//_spriteRender instantiation that matches spriteRenderMode.
//Defined inline in the header so the runtime mode test can be folded into the caller.
inline void spriteRender(u8 * dst, u8 * dst_alpha, u8 * typeTab, u8 * prioTab)
{
	//anything other than SPRITE_1D is handled by the 2D renderer
	if(spriteRenderMode != SPRITE_1D)
	{
		_spriteRender<SPRITE_2D>(dst,dst_alpha,typeTab, prioTab);
		return;
	}
	_spriteRender<SPRITE_1D>(dst,dst_alpha,typeTab, prioTab);
}
template<bool BACKDROP> void setFinalColorBG(u16 color, const u32 x);
template<bool BACKDROP, int FUNCNUM> void setFinalColorBG(u16 color, const u32 x);
void setFinalColor3d(int dstX, int srcX);
template<bool BACKDROP> FORCEINLINE void setFinalBGColorSpecialNone(u16 &color, const u32 x);
@ -799,7 +820,8 @@ struct GPU
FORCEINLINE void setFinal3DColorSpecialDecreaseWnd(int dstX, int srcX);
template<bool MOSAIC, bool BACKDROP> void __setFinalColorBck(u16 color, const u32 x, const bool opaque);
template<bool MOSAIC, bool BACKDROP> FORCEINLINE void __setFinalColorBck(u16 color, const u32 x, const int opaque);
template<bool MOSAIC, bool BACKDROP, int FUNCNUM> FORCEINLINE void ___setFinalColorBck(u16 color, const u32 x, const int opaque);
void setAffineStart(int layer, int xy, u32 val);
void setAffineStartWord(int layer, int xy, u16 val, int word);
u32 getAffineStart(int layer, int xy);

View File

@ -24,6 +24,16 @@
#include <math.h>
#include "types.h"
#include "mem.h"
//SSE2_INTRIN is the switch tested by the intrinsic-based code paths in this header.
//it is defined only when SSE2 is allowed (NOSSE2 is not defined) and the user has not
//opted out of intrinsics (SSE2_NOINTRIN is not defined).
#if !defined(NOSSE2) && !defined(SSE2_NOINTRIN)
#define SSE2_INTRIN
#endif
//pull in the intrinsic headers (xmmintrin.h, emmintrin.h) only when they will be used
#ifdef SSE2_INTRIN
#include <xmmintrin.h>
#include <emmintrin.h>
#endif
extern "C" {
@ -108,7 +118,9 @@ void Vector4Copy(float *dst, const float *src);
//this isnt as fast as it could be if we used a visual c++ intrinsic, but those appear not to be universally available
FORCEINLINE u32 u32floor(float f)
{
#ifndef NOSSE2
#if defined(SSE2_INTRIN)
return (u32)_mm_cvttss_si32(_mm_set_ss(f));
#elif !defined(NOSSE2)
__asm cvttss2si eax, f;
#else
return (u32)f;
@ -116,7 +128,9 @@ FORCEINLINE u32 u32floor(float f)
}
FORCEINLINE u32 u32floor(double d)
{
#ifndef NOSSE2
#if defined(SSE2_INTRIN)
return (u32)_mm_cvttsd_si32(_mm_set_sd(d));
#elif !defined(NOSSE2)
__asm cvttsd2si eax, d;
#else
return (u32)d;
@ -127,7 +141,9 @@ FORCEINLINE u32 u32floor(double d)
//be sure that the results are the same thing as floorf!
FORCEINLINE s32 s32floor(float f)
{
#ifndef NOSSE2
#if defined(SSE2_INTRIN)
return _mm_cvttss_si32( _mm_add_ss(_mm_set_ss(-0.5f),_mm_add_ss(_mm_set_ss(f), _mm_set_ss(f))) ) >> 1;
#elif !defined(NOSSE2)
static const float c = -0.5f;
__asm
{
@ -142,5 +158,49 @@ FORCEINLINE s32 s32floor(float f)
#endif
}
//now comes some sse2 functions coded solely with intrinsics.
//let's wait and see how many people this upsets.
//they can always #define SSE2_NOINTRIN in their userconfig.h....
#ifdef SSE2_INTRIN
//Fills NUM consecutive 16-bit elements starting at dst with val (SSE2 intrinsic version).
//  NUM - number of u16 elements, known at compile time. the vector stores are unrolled with
//        MACRODO_N(NUM/8,...), so NUM/8 must stay within what MACRODO_N supports.
//  dst - destination buffer. it must be 16-byte aligned, because _mm_store_si128 is an aligned store.
//  val - the value written to every element, laid out in memory by T1WriteWord
//        (presumably little-endian, going by the _le suffix -- confirm against mem.h).
template<int NUM>
static FORCEINLINE void memset_u16_le(void* dst, u16 val)
{
	//build a 32-bit pattern holding two copies of val in the byte order T1WriteWord produces
	//(just for the endian safety)
	u32 u32val;
	T1WriteWord((u8*)&u32val,0,val);
	T1WriteWord((u8*)&u32val,2,val);

	//replicate the pattern into all four lanes: each 16-byte store covers 8 elements
	const __m128i temp = _mm_set1_epi32((int)u32val);
	MACRODO_N(NUM/8,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), temp));

	//the vector stores only cover whole groups of 8 elements. write any remainder the same
	//way the non-intrinsic version does, so both versions fill exactly NUM elements.
	//(for NUM%8==0 this loop has a constant zero trip count and compiles away)
	for(int i=(NUM/8)*8;i<NUM;i++)
		T1WriteWord((u8*)dst,i<<1,val);
}
#else
//Fills NUM consecutive 16-bit elements starting at dst with val (portable version).
//each element is written with T1WriteWord, so there is no alignment requirement here.
template<int NUM>
static FORCEINLINE void memset_u16_le(void* dst, u16 val)
{
	for(int i=0;i<NUM;i++)
		T1WriteWord((u8*)dst,i<<1,val);
}
#endif
//WARNING: I do not think this is as fast as a memset, for some reason.
//at least in vc2005 with sse enabled. better figure out why before using it
#ifdef SSE2_INTRIN
//Fills NUM bytes starting at _dst with val (SSE2 intrinsic version).
//  NUM  - number of bytes, known at compile time. the vector stores are unrolled with
//         MACRODO_N(NUM/16,...), so NUM/16 must stay within what MACRODO_N supports.
//  _dst - destination buffer. it must be 16-byte aligned, because _mm_store_si128 is an aligned store.
//  val  - the byte value to write
template<int NUM>
static FORCEINLINE void memset_u8(void* _dst, u8 val)
{
	//the destination is written through, so the pointer must not be pointer-to-const
	u8* const dst = (u8*)_dst;

	//broadcast the byte into all 16 lanes directly. this avoids assembling the pattern with
	//(val<<24), which shifts into the sign bit of a promoted int when val >= 0x80
	const __m128i temp = _mm_set1_epi8((char)val);
	MACRODO_N(NUM/16,_mm_store_si128((__m128i*)(dst+(X)*16), temp));

	//the vector stores only cover whole 16-byte groups. finish any remainder bytewise so that
	//exactly NUM bytes are set, matching the non-intrinsic version.
	//(for NUM%16==0 this loop has a constant zero trip count and compiles away)
	for(int i=(NUM/16)*16;i<NUM;i++)
		dst[i] = val;
}
#else
//Fills NUM bytes starting at dst with val (portable version): a plain memset.
template<int NUM>
static FORCEINLINE void memset_u8(void* dst, u8 val)
{
	memset(dst,val,NUM);
}
#endif
#endif

View File

@ -627,7 +627,7 @@ typedef int fixed28_4;
static bool failure;
// handle floor divides and mods correctly
INLINE void FloorDivMod(long Numerator, long Denominator, long &Floor, long &Mod)
FORCEINLINE void FloorDivMod(long Numerator, long Denominator, long &Floor, long &Mod)
{
//These must be caused by invalid or degenerate shapes.. not sure yet.
//check it out in the mario face intro of SM64
@ -658,10 +658,10 @@ INLINE void FloorDivMod(long Numerator, long Denominator, long &Floor, long &Mod
}
}
INLINE fixed28_4 FloatToFixed28_4( float Value ) {
FORCEINLINE fixed28_4 FloatToFixed28_4( float Value ) {
return (fixed28_4)(Value * 16);
}
INLINE float Fixed28_4ToFloat( fixed28_4 Value ) {
FORCEINLINE float Fixed28_4ToFloat( fixed28_4 Value ) {
return Value / 16.0;
}
//inline fixed16_16 FloatToFixed16_16( float Value ) {
@ -670,11 +670,11 @@ INLINE float Fixed28_4ToFloat( fixed28_4 Value ) {
//inline float Fixed16_16ToFloat( fixed16_16 Value ) {
// return Value / 65536.0;
//}
INLINE fixed28_4 Fixed28_4Mul( fixed28_4 A, fixed28_4 B ) {
FORCEINLINE fixed28_4 Fixed28_4Mul( fixed28_4 A, fixed28_4 B ) {
// could make this asm to prevent overflow
return (A * B) / 16; // 28.4 * 28.4 = 24.8 / 16 = 28.4
}
INLINE int Ceil28_4( fixed28_4 Value ) {
FORCEINLINE int Ceil28_4( fixed28_4 Value ) {
int ReturnValue;
int Numerator = Value - 1 + 16;
if(Numerator >= 0) {
@ -700,7 +700,7 @@ struct edge_fx_fl {
float curr, step, stepExtra;
FORCEINLINE void doStep() { curr += step; }
FORCEINLINE void doStepExtra() { curr += stepExtra; }
void initialize(float top, float bottom, float dx, float dy, long XStep, float XPrestep, float YPrestep) {
FORCEINLINE void initialize(float top, float bottom, float dx, float dy, long XStep, float XPrestep, float YPrestep) {
dx = 0;
dy *= (bottom-top);
curr = top + YPrestep * dy + XPrestep * dx;
@ -764,7 +764,7 @@ FORCEINLINE int edge_fx_fl::Step() {
}
//draws a single scanline
static void drawscanline(edge_fx_fl *pLeft, edge_fx_fl *pRight)
FORCEINLINE static void drawscanline(edge_fx_fl *pLeft, edge_fx_fl *pRight)
{
int XStart = pLeft->X;
int width = pRight->X - XStart;

View File

@ -30,6 +30,11 @@
#define NOSSE2
#endif
//if there's no sse2, also enforce no intrinsics:
//defining NOSSE2 automatically defines SSE2_NOINTRIN as well, so code only has to test one of them
#if defined(NOSSE2)
#define SSE2_NOINTRIN
#endif
#ifdef _WIN32
#define strcasecmp(x,y) _stricmp(x,y)
#else
@ -331,5 +336,31 @@ char (*BLAHBLAHBLAH( UNALIGNED T (&)[N] ))[N];
#endif
//fairly standard for loop macros, used to unroll a loop at preprocessing time.
//MACRODOn(base,TODO) pastes the statement TODO n times in a row. each copy sits in its own
//scope in which `const int X` holds the iteration index, counting up from base to base+n-1.
//TODO must therefore refer to the current index by the name X.
#define MACRODO1(TRICK,TODO) { const int X = TRICK; TODO; }
#define MACRODO2(X,TODO) { MACRODO1((X),TODO) MACRODO1(((X)+1),TODO) }
#define MACRODO4(X,TODO) { MACRODO2((X),TODO) MACRODO2(((X)+2),TODO) }
#define MACRODO8(X,TODO) { MACRODO4((X),TODO) MACRODO4(((X)+4),TODO) }
#define MACRODO16(X,TODO) { MACRODO8((X),TODO) MACRODO8(((X)+8),TODO) }
#define MACRODO32(X,TODO) { MACRODO16((X),TODO) MACRODO16(((X)+16),TODO) }
#define MACRODO64(X,TODO) { MACRODO32((X),TODO) MACRODO32(((X)+32),TODO) }
#define MACRODO128(X,TODO) { MACRODO64((X),TODO) MACRODO64(((X)+64),TODO) }
#define MACRODO256(X,TODO) { MACRODO128((X),TODO) MACRODO128(((X)+128),TODO) }
//this one lets you loop any number of times (as long as N<512: only bits 0x001..0x100 of N are examined)
//N is split into powers of two, largest first. each block starts at the sum of the larger
//powers already emitted ((N) masked with the higher bits), so X runs from 0 to N-1 in ascending order.
//note that the text of TODO is pasted 511 times per use regardless of N. with a compile-time
//constant N the branches whose bit is clear are expected to be discarded by the compiler.
#define MACRODO_N(N,TODO) {\
if((N)&0x100) MACRODO256(0,TODO); \
if((N)&0x080) MACRODO128((N)&(0x100),TODO); \
if((N)&0x040) MACRODO64((N)&(0x100|0x080),TODO); \
if((N)&0x020) MACRODO32((N)&(0x100|0x080|0x040),TODO); \
if((N)&0x010) MACRODO16((N)&(0x100|0x080|0x040|0x020),TODO); \
if((N)&0x008) MACRODO8((N)&(0x100|0x080|0x040|0x020|0x010),TODO); \
if((N)&0x004) MACRODO4((N)&(0x100|0x080|0x040|0x020|0x010|0x008),TODO); \
if((N)&0x002) MACRODO2((N)&(0x100|0x080|0x040|0x020|0x010|0x008|0x004),TODO); \
if((N)&0x001) MACRODO1((N)&(0x100|0x080|0x040|0x020|0x010|0x008|0x004|0x002),TODO); \
}
#endif

View File

@ -8,6 +8,7 @@
//#define NOSSE2 //disables SSE2 optimizations (better change it in the vc++ codegen options too)
//#define DEVELOPER //enables dev+ features
//#define GDB_STUB //enables the gdb stub. for some reason this is separate from dev+ for now
//#define SSE2_NOINTRIN //indicates that you have a crippled compiler with no sse2 intrinsics (only relevant for SSE2 builds)
#endif //_USERCONFIG_H