Grab bag of optimizations: many optimizations to the SSE functions, which now use intrinsic functions. This would enable these functions to work on GCC (I think) if our configuration supported it, but it will fail for some VC++ Express installations. Those users will have to either #define SSE2_NOINTRIN or help me figure out which installations are broken, why, and how to fix them. Also, collapse the BG layer pixel blenders into a single function, so watch for regressions there.
This commit is contained in:
parent
baef153e05
commit
062a228877
|
@ -39,6 +39,7 @@
|
||||||
|
|
||||||
//#undef FORCEINLINE
|
//#undef FORCEINLINE
|
||||||
//#define FORCEINLINE
|
//#define FORCEINLINE
|
||||||
|
//#define SSE2_NOINTRIN
|
||||||
|
|
||||||
ARM9_struct ARM9Mem;
|
ARM9_struct ARM9Mem;
|
||||||
|
|
||||||
|
@ -491,29 +492,6 @@ FORCEINLINE void GPU::setFinal3DColorSpecialDecreaseWnd(int dstX, int srcX)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
enum OBJFunc
|
|
||||||
{
|
|
||||||
None, Blend, Increase, Decrease
|
|
||||||
};
|
|
||||||
template<OBJFunc FUNC, bool WINDOW>
|
|
||||||
static void _master_setFinalOBJColor(GPU *gpu, u32 passing, u8 *dst, u16 color, u8 alpha, u8 type, u16 x);
|
|
||||||
|
|
||||||
static void setFinalOBJColorSpecialNoneWnd (GPU *gpu, u32 passing, u8 *dst, u16 color, u8 alpha, u8 type, u16 x);
|
|
||||||
static void setFinalOBJColorSpecialBlendWnd (GPU *gpu, u32 passing, u8 *dst, u16 color, u8 alpha, u8 type, u16 x);
|
|
||||||
static void setFinalOBJColorSpecialIncreaseWnd (GPU *gpu, u32 passing, u8 *dst, u16 color, u8 alpha, u8 type, u16 x);
|
|
||||||
static void setFinalOBJColorSpecialDecreaseWnd (GPU *gpu, u32 passing, u8 *dst, u16 color, u8 alpha, u8 type, u16 x);
|
|
||||||
|
|
||||||
const GPU::FinalOBJColFunct pixelBlittersOBJ[8] = {
|
|
||||||
_master_setFinalOBJColor<None,false>,
|
|
||||||
_master_setFinalOBJColor<Blend,false>,
|
|
||||||
_master_setFinalOBJColor<Increase,false>,
|
|
||||||
_master_setFinalOBJColor<Decrease,false>,
|
|
||||||
_master_setFinalOBJColor<None,true>,
|
|
||||||
_master_setFinalOBJColor<Blend,true>,
|
|
||||||
_master_setFinalOBJColor<Increase,true>,
|
|
||||||
_master_setFinalOBJColor<Decrease,true> };
|
|
||||||
|
|
||||||
/*****************************************************************************/
|
/*****************************************************************************/
|
||||||
// INITIALIZATION
|
// INITIALIZATION
|
||||||
/*****************************************************************************/
|
/*****************************************************************************/
|
||||||
|
@ -591,7 +569,7 @@ GPU * GPU_Init(u8 l)
|
||||||
g->need_update_winh[1] = true;
|
g->need_update_winh[1] = true;
|
||||||
g->setFinalColorBck_funcNum = 0;
|
g->setFinalColorBck_funcNum = 0;
|
||||||
g->setFinalColor3d_funcNum = 0;
|
g->setFinalColor3d_funcNum = 0;
|
||||||
g->setFinalColorSpr = _master_setFinalOBJColor<None,false>;
|
g->setFinalColorSpr_funcNum = 0;
|
||||||
|
|
||||||
return g;
|
return g;
|
||||||
}
|
}
|
||||||
|
@ -602,7 +580,7 @@ void GPU_Reset(GPU *g, u8 l)
|
||||||
|
|
||||||
g->setFinalColorBck_funcNum = 0;
|
g->setFinalColorBck_funcNum = 0;
|
||||||
g->setFinalColor3d_funcNum = 0;
|
g->setFinalColor3d_funcNum = 0;
|
||||||
g->setFinalColorSpr = _master_setFinalOBJColor<None,false>;
|
g->setFinalColorSpr_funcNum = 0;
|
||||||
g->core = l;
|
g->core = l;
|
||||||
g->BGSize[0][0] = g->BGSize[1][0] = g->BGSize[2][0] = g->BGSize[3][0] = 256;
|
g->BGSize[0][0] = g->BGSize[1][0] = g->BGSize[2][0] = g->BGSize[3][0] = 256;
|
||||||
g->BGSize[0][1] = g->BGSize[1][1] = g->BGSize[2][1] = g->BGSize[3][1] = 256;
|
g->BGSize[0][1] = g->BGSize[1][1] = g->BGSize[2][1] = g->BGSize[3][1] = 256;
|
||||||
|
@ -729,7 +707,7 @@ void SetupFinalPixelBlitter (GPU *gpu)
|
||||||
u8 windowUsed = (gpu->WIN0_ENABLED | gpu->WIN1_ENABLED | gpu->WINOBJ_ENABLED);
|
u8 windowUsed = (gpu->WIN0_ENABLED | gpu->WIN1_ENABLED | gpu->WINOBJ_ENABLED);
|
||||||
u8 blendMode = (gpu->BLDCNT >> 6)&3;
|
u8 blendMode = (gpu->BLDCNT >> 6)&3;
|
||||||
|
|
||||||
gpu->setFinalColorSpr = pixelBlittersOBJ[windowUsed*4 + blendMode];
|
gpu->setFinalColorSpr_funcNum = windowUsed*4 + blendMode;
|
||||||
gpu->setFinalColorBck_funcNum = windowUsed*4 + blendMode;
|
gpu->setFinalColorBck_funcNum = windowUsed*4 + blendMode;
|
||||||
gpu->setFinalColor3d_funcNum = windowUsed*4 + blendMode;
|
gpu->setFinalColor3d_funcNum = windowUsed*4 + blendMode;
|
||||||
|
|
||||||
|
@ -961,128 +939,47 @@ FORCEINLINE void GPU::renderline_checkWindows(u16 x, bool &draw, bool &effect) c
|
||||||
// PIXEL RENDERING - BGS
|
// PIXEL RENDERING - BGS
|
||||||
/*****************************************************************************/
|
/*****************************************************************************/
|
||||||
|
|
||||||
template<bool BACKDROP> FORCEINLINE void GPU::setFinalBGColorSpecialNone(u16 &color, const u32 x)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
template<bool BACKDROP> FORCEINLINE void GPU::setFinalBGColorSpecialBlend(u16 &color, const u32 x)
|
template<bool BACKDROP, BlendFunc FUNC, bool WINDOW>
|
||||||
|
FORCEINLINE FASTCALL bool GPU::_master_setFinalBGColor(u16 &color, const u32 x)
|
||||||
{
|
{
|
||||||
|
//no further analysis for no special effects. just draw it.
|
||||||
|
if(FUNC == None) return true;
|
||||||
|
|
||||||
//blend backdrop with what?? this doesn't make sense
|
//blend backdrop with what?? this doesn't make sense
|
||||||
if(BACKDROP) return;
|
if(FUNC==Blend && BACKDROP) return true;
|
||||||
if(blend1)
|
|
||||||
{
|
|
||||||
//If the layer we are drawing on is selected as 2nd source, we can blend
|
|
||||||
int bg_under = bgPixels[x];
|
|
||||||
if(blend2[bg_under])
|
|
||||||
color = blend(color,T2ReadWord(currDst, x<<1));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template<bool BACKDROP> FORCEINLINE void GPU::setFinalBGColorSpecialIncrease (u16 &color, const u32 x)
|
bool windowEffect = true;
|
||||||
{
|
|
||||||
if(blend1) // the bg to draw has a special color effect
|
|
||||||
{
|
|
||||||
color = currentFadeInColors[color];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template<bool BACKDROP> FORCEINLINE void GPU::setFinalBGColorSpecialDecrease(u16 &color, const u32 x)
|
if(WINDOW)
|
||||||
{
|
{
|
||||||
if(blend1) // the bg to draw has a special color effect
|
bool windowDraw;
|
||||||
{
|
|
||||||
color = currentFadeOutColors[color];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template<bool BACKDROP> FORCEINLINE bool GPU::setFinalBGColorSpecialNoneWnd(u16 &color, const u32 x)
|
|
||||||
{
|
|
||||||
bool windowDraw = true, windowEffect = true;
|
|
||||||
|
|
||||||
renderline_checkWindows(x, windowDraw, windowEffect);
|
renderline_checkWindows(x, windowDraw, windowEffect);
|
||||||
|
|
||||||
if(BACKDROP) windowDraw = true; //backdrop must always be drawn
|
//backdrop must always be drawn
|
||||||
|
if(BACKDROP) windowDraw = true;
|
||||||
|
|
||||||
if (blend1 && windowEffect) // the bg to draw has a special color effect
|
//we never have anything more to do if the window rejected us
|
||||||
{
|
if(!windowDraw) return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
//special effects rejected. just draw it.
|
||||||
|
if(!(blend1 && windowEffect))
|
||||||
return true;
|
return true;
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if ((windowEffect && (BLDCNT & (0x100 << currBgNum))) || windowDraw)
|
|
||||||
{
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<bool BACKDROP> FORCEINLINE bool GPU::setFinalBGColorSpecialBlendWnd(u16 &color, const u32 x)
|
const u8 bg_under = bgPixels[x];
|
||||||
{
|
|
||||||
bool windowDraw = true, windowEffect = true;
|
|
||||||
|
|
||||||
renderline_checkWindows(x, windowDraw, windowEffect);
|
//perform the special effect
|
||||||
|
switch(FUNC) {
|
||||||
if(BACKDROP) windowDraw = true; //backdrop must always be drawn
|
case Blend: if(blend2[bg_under]) color = blend(color,T2ReadWord(currDst, x<<1)); break;
|
||||||
|
case Increase: color = currentFadeInColors[color]; break;
|
||||||
if(windowDraw)
|
case Decrease: color = currentFadeOutColors[color]; break;
|
||||||
{
|
|
||||||
if(blend1 && windowEffect)
|
|
||||||
{
|
|
||||||
int bg_under = bgPixels[x];
|
|
||||||
|
|
||||||
// If the layer we are drawing on is selected as 2nd source, we can blend
|
|
||||||
if(blend2[bg_under])
|
|
||||||
color = blend(color,T2ReadWord(currDst, x<<1));
|
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<bool BACKDROP> FORCEINLINE bool GPU::setFinalBGColorSpecialIncreaseWnd(u16 &color, const u32 x)
|
template<BlendFunc FUNC, bool WINDOW>
|
||||||
{
|
static FORCEINLINE void _master_setFinalOBJColor(GPU *gpu, u8 *dst, u16 color, u8 alpha, u8 type, u16 x)
|
||||||
bool windowDraw = true, windowEffect = true;
|
|
||||||
|
|
||||||
renderline_checkWindows(x, windowDraw, windowEffect);
|
|
||||||
|
|
||||||
if(BACKDROP) windowDraw = true; //backdrop must always be drawn
|
|
||||||
|
|
||||||
if(windowDraw)
|
|
||||||
{
|
|
||||||
if(blend1 && windowEffect)
|
|
||||||
{
|
|
||||||
color = currentFadeInColors[color];
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<bool BACKDROP> FORCEINLINE bool GPU::setFinalBGColorSpecialDecreaseWnd(u16 &color, const u32 x)
|
|
||||||
{
|
|
||||||
bool windowDraw = true, windowEffect = true;
|
|
||||||
|
|
||||||
renderline_checkWindows(x, windowDraw, windowEffect);
|
|
||||||
|
|
||||||
if(BACKDROP) windowDraw = true; //backdrop must always be drawn
|
|
||||||
|
|
||||||
if(windowDraw)
|
|
||||||
{
|
|
||||||
if(blend1 && windowEffect)
|
|
||||||
{
|
|
||||||
color = currentFadeOutColors[color];
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*****************************************************************************/
|
|
||||||
// PIXEL RENDERING - OBJS
|
|
||||||
/*****************************************************************************/
|
|
||||||
|
|
||||||
template<OBJFunc FUNC, bool WINDOW>
|
|
||||||
static void _master_setFinalOBJColor(GPU *gpu, u32 passing, u8 *dst, u16 color, u8 alpha, u8 type, u16 x)
|
|
||||||
{
|
{
|
||||||
bool windowDraw = true, windowEffect = true;
|
bool windowDraw = true, windowEffect = true;
|
||||||
|
|
||||||
|
@ -1094,19 +991,19 @@ static void _master_setFinalOBJColor(GPU *gpu, u32 passing, u8 *dst, u16 color,
|
||||||
}
|
}
|
||||||
|
|
||||||
//this inspects the layer beneath the sprite to see if the current blend flags make it a candidate for blending
|
//this inspects the layer beneath the sprite to see if the current blend flags make it a candidate for blending
|
||||||
int bg_under = gpu->bgPixels[x];
|
const int bg_under = gpu->bgPixels[x];
|
||||||
bool allowBlend = ((bg_under != 4) && (gpu->BLDCNT & (0x100 << bg_under)));
|
const bool allowBlend = (bg_under != 4) && gpu->blend2[bg_under];
|
||||||
|
|
||||||
bool sourceEffectSelected = (gpu->BLDCNT & 0x10)!=0;
|
const bool sourceEffectSelected = gpu->blend1;
|
||||||
|
|
||||||
//note that the fadein and fadeout is done here before blending,
|
//note that the fadein and fadeout is done here before blending,
|
||||||
//so that a fade and blending can be applied at the same time
|
//so that a fade and blending can be applied at the same time (actually, I don't think that is legal..)
|
||||||
bool forceBlendingForNormal = false;
|
bool forceBlendingForNormal = false;
|
||||||
if(windowEffect && sourceEffectSelected)
|
if(windowEffect && sourceEffectSelected)
|
||||||
switch(FUNC)
|
switch(FUNC)
|
||||||
{
|
{
|
||||||
case Increase: if(!allowBlend) color = fadeInColors[gpu->BLDY_EVY][color&0x7FFF]; break;
|
case Increase: if(!allowBlend) color = gpu->currentFadeInColors[color&0x7FFF]; break;
|
||||||
case Decrease: if(!allowBlend) color = fadeOutColors[gpu->BLDY_EVY][color&0x7FFF]; break;
|
case Decrease: if(!allowBlend) color = gpu->currentFadeOutColors[color&0x7FFF]; break;
|
||||||
|
|
||||||
//only when blend color effect is selected, ordinarily opaque sprites are blended with the color effect params
|
//only when blend color effect is selected, ordinarily opaque sprites are blended with the color effect params
|
||||||
case Blend: forceBlendingForNormal = true; break;
|
case Blend: forceBlendingForNormal = true; break;
|
||||||
|
@ -1115,7 +1012,7 @@ static void _master_setFinalOBJColor(GPU *gpu, u32 passing, u8 *dst, u16 color,
|
||||||
|
|
||||||
if(allowBlend)
|
if(allowBlend)
|
||||||
{
|
{
|
||||||
u16 backColor = T2ReadWord(dst,passing);
|
u16 backColor = T2ReadWord(dst,x<<1);
|
||||||
//this hasn't been tested: this blending occurs without regard to the color effect,
|
//this hasn't been tested: this blending occurs without regard to the color effect,
|
||||||
//but rather purely from the sprite's alpha
|
//but rather purely from the sprite's alpha
|
||||||
if(type == GPU_OBJ_MODE_Bitmap)
|
if(type == GPU_OBJ_MODE_Bitmap)
|
||||||
|
@ -1124,11 +1021,13 @@ static void _master_setFinalOBJColor(GPU *gpu, u32 passing, u8 *dst, u16 color,
|
||||||
color = gpu->blend(color,backColor);
|
color = gpu->blend(color,backColor);
|
||||||
}
|
}
|
||||||
|
|
||||||
T2WriteWord(dst, passing, (color | 0x8000));
|
T2WriteWord(dst, x<<1, (color | 0x8000));
|
||||||
gpu->bgPixels[x] = 4;
|
gpu->bgPixels[x] = 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<bool BACKDROP> FORCEINLINE void GPU::setFinalColorBG(u16 color, const u32 x)
|
//FUNCNUM is only set for backdrop, for an optimization of looking it up early
|
||||||
|
template<bool BACKDROP, int FUNCNUM>
|
||||||
|
FORCEINLINE void GPU::setFinalColorBG(u16 color, const u32 x)
|
||||||
{
|
{
|
||||||
//It is not safe to assert this here.
|
//It is not safe to assert this here.
|
||||||
//This is probably the best place to enforce it, since almost every single color that comes in here
|
//This is probably the best place to enforce it, since almost every single color that comes in here
|
||||||
|
@ -1136,17 +1035,19 @@ template<bool BACKDROP> FORCEINLINE void GPU::setFinalColorBG(u16 color, const u
|
||||||
//assert((color&0x8000)==0);
|
//assert((color&0x8000)==0);
|
||||||
if(!BACKDROP) color &= 0x7FFF; //but for the backdrop we can easily guarantee earlier that theres no bit here
|
if(!BACKDROP) color &= 0x7FFF; //but for the backdrop we can easily guarantee earlier that theres no bit here
|
||||||
|
|
||||||
bool draw=true;
|
bool draw;
|
||||||
switch(setFinalColorBck_funcNum)
|
|
||||||
|
const int test = BACKDROP?FUNCNUM:setFinalColorBck_funcNum;
|
||||||
|
switch(test)
|
||||||
{
|
{
|
||||||
case 0x0: setFinalBGColorSpecialNone<BACKDROP>(color,x); break;
|
case 0: draw = _master_setFinalBGColor<BACKDROP,None,false>(color,x); break;
|
||||||
case 0x1: setFinalBGColorSpecialBlend<BACKDROP>(color,x); break;
|
case 1: draw = _master_setFinalBGColor<BACKDROP,Blend,false>(color,x); break;
|
||||||
case 0x2: setFinalBGColorSpecialIncrease<BACKDROP>(color,x); break;
|
case 2: draw = _master_setFinalBGColor<BACKDROP,Increase,false>(color,x); break;
|
||||||
case 0x3: setFinalBGColorSpecialDecrease<BACKDROP>(color,x); break;
|
case 3: draw = _master_setFinalBGColor<BACKDROP,Decrease,false>(color,x); break;
|
||||||
case 0x4: draw=setFinalBGColorSpecialNoneWnd<BACKDROP>(color,x); break;
|
case 4: draw = _master_setFinalBGColor<BACKDROP,None,true>(color,x); break;
|
||||||
case 0x5: draw=setFinalBGColorSpecialBlendWnd<BACKDROP>(color,x); break;
|
case 5: draw = _master_setFinalBGColor<BACKDROP,Blend,true>(color,x); break;
|
||||||
case 0x6: draw=setFinalBGColorSpecialIncreaseWnd<BACKDROP>(color,x); break;
|
case 6: draw = _master_setFinalBGColor<BACKDROP,Increase,true>(color,x); break;
|
||||||
case 0x7: draw=setFinalBGColorSpecialDecreaseWnd<BACKDROP>(color,x); break;
|
case 7: draw = _master_setFinalBGColor<BACKDROP,Decrease,true>(color,x); break;
|
||||||
};
|
};
|
||||||
|
|
||||||
if(BACKDROP || draw) //backdrop must always be drawn
|
if(BACKDROP || draw) //backdrop must always be drawn
|
||||||
|
@ -1159,7 +1060,6 @@ template<bool BACKDROP> FORCEINLINE void GPU::setFinalColorBG(u16 color, const u
|
||||||
|
|
||||||
FORCEINLINE void GPU::setFinalColor3d(int dstX, int srcX)
|
FORCEINLINE void GPU::setFinalColor3d(int dstX, int srcX)
|
||||||
{
|
{
|
||||||
//if someone disagrees with these, they could be reimplemented as a function pointer easily
|
|
||||||
switch(setFinalColor3d_funcNum)
|
switch(setFinalColor3d_funcNum)
|
||||||
{
|
{
|
||||||
case 0x0: setFinal3DColorSpecialNone(dstX,srcX); break;
|
case 0x0: setFinal3DColorSpecialNone(dstX,srcX); break;
|
||||||
|
@ -1173,9 +1073,31 @@ FORCEINLINE void GPU::setFinalColor3d(int dstX, int srcX)
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
FORCEINLINE void setFinalColorSpr(GPU* gpu, u8 *dst, u16 color, u8 alpha, u8 type, u16 x)
|
||||||
|
{
|
||||||
|
switch(gpu->setFinalColorSpr_funcNum)
|
||||||
|
{
|
||||||
|
case 0x0: _master_setFinalOBJColor<None,false>(gpu, dst, color, alpha, type, x); break;
|
||||||
|
case 0x1: _master_setFinalOBJColor<Blend,false>(gpu, dst, color, alpha, type, x); break;
|
||||||
|
case 0x2: _master_setFinalOBJColor<Increase,false>(gpu, dst, color, alpha, type, x); break;
|
||||||
|
case 0x3: _master_setFinalOBJColor<Decrease,false>(gpu, dst, color, alpha, type, x); break;
|
||||||
|
case 0x4: _master_setFinalOBJColor<None,true>(gpu, dst, color, alpha, type, x); break;
|
||||||
|
case 0x5: _master_setFinalOBJColor<Blend,true>(gpu, dst, color, alpha, type, x); break;
|
||||||
|
case 0x6: _master_setFinalOBJColor<Increase,true>(gpu, dst, color, alpha, type, x); break;
|
||||||
|
case 0x7: _master_setFinalOBJColor<Decrease,true>(gpu, dst, color, alpha, type, x); break;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
template<bool MOSAIC, bool BACKDROP>
|
||||||
|
FORCEINLINE void GPU::__setFinalColorBck(u16 color, const u32 x, const int opaque)
|
||||||
|
{
|
||||||
|
return ___setFinalColorBck<MOSAIC, BACKDROP, 0>(color,x,opaque);
|
||||||
|
}
|
||||||
|
|
||||||
//this was forced inline because most of the time it just falls through to setFinalColorBck() and the function call
|
//this was forced inline because most of the time it just falls through to setFinalColorBck() and the function call
|
||||||
//overhead was ridiculous and terrible
|
//overhead was ridiculous and terrible
|
||||||
template<bool MOSAIC, bool BACKDROP> FORCEINLINE void GPU::__setFinalColorBck(u16 color, const u32 x, const bool opaque)
|
template<bool MOSAIC, bool BACKDROP, int FUNCNUM>
|
||||||
|
FORCEINLINE void GPU::___setFinalColorBck(u16 color, const u32 x, const int opaque)
|
||||||
{
|
{
|
||||||
//I commented out this line to make a point.
|
//I commented out this line to make a point.
|
||||||
//under ordinary circumstances, nobody should pass in something >=256
|
//under ordinary circumstances, nobody should pass in something >=256
|
||||||
|
@ -1206,7 +1128,7 @@ template<bool MOSAIC, bool BACKDROP> FORCEINLINE void GPU::__setFinalColorBck(u1
|
||||||
if(color != 0xFFFF)
|
if(color != 0xFFFF)
|
||||||
{
|
{
|
||||||
finish:
|
finish:
|
||||||
setFinalColorBG<BACKDROP>(color,x);
|
setFinalColorBG<BACKDROP,FUNCNUM>(color,x);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1244,7 +1166,7 @@ static void mosaicSpriteLinePixel(GPU * gpu, int x, u16 l, u8 * dst, u8 * dst_al
|
||||||
if(!objColor.opaque) prioTab[x] = 0xFF;
|
if(!objColor.opaque) prioTab[x] = 0xFF;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void mosaicSpriteLine(GPU * gpu, u16 l, u8 * dst, u8 * dst_alpha, u8 * typeTab, u8 * prioTab)
|
FORCEINLINE static void mosaicSpriteLine(GPU * gpu, u16 l, u8 * dst, u8 * dst_alpha, u8 * typeTab, u8 * prioTab)
|
||||||
{
|
{
|
||||||
//don't even try this unless the mosaic is effective
|
//don't even try this unless the mosaic is effective
|
||||||
if(gpu->mosaicLookup.widthValue != 0 || gpu->mosaicLookup.heightValue != 0)
|
if(gpu->mosaicLookup.widthValue != 0 || gpu->mosaicLookup.heightValue != 0)
|
||||||
|
@ -1281,7 +1203,7 @@ template<bool MOSAIC> void lineLarge8bpp(GPU * gpu)
|
||||||
XBG &= wmask;
|
XBG &= wmask;
|
||||||
u8 pixel = map[XBG];
|
u8 pixel = map[XBG];
|
||||||
u16 color = T1ReadWord(pal, pixel<<1);
|
u16 color = T1ReadWord(pal, pixel<<1);
|
||||||
gpu->__setFinalColorBck<MOSAIC,false>(color,x,color!=0);
|
gpu->__setFinalColorBck<MOSAIC,false>(color,x,color);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1821,14 +1743,6 @@ FORCEINLINE BOOL compute_sprite_vars(_OAM_ * spriteInfo, u16 l,
|
||||||
// SPRITE RENDERING
|
// SPRITE RENDERING
|
||||||
/*****************************************************************************/
|
/*****************************************************************************/
|
||||||
|
|
||||||
void GPU::spriteRender(u8 * dst, u8 * dst_alpha, u8 * typeTab, u8 * prioTab)
|
|
||||||
{
|
|
||||||
if(spriteRenderMode == SPRITE_1D)
|
|
||||||
_spriteRender<SPRITE_1D>(dst,dst_alpha,typeTab, prioTab);
|
|
||||||
else
|
|
||||||
_spriteRender<SPRITE_2D>(dst,dst_alpha,typeTab, prioTab);
|
|
||||||
}
|
|
||||||
|
|
||||||
//TODO - refactor this so there isnt as much duped code between rotozoomed and non-rotozoomed versions
|
//TODO - refactor this so there isnt as much duped code between rotozoomed and non-rotozoomed versions
|
||||||
|
|
||||||
template<GPU::SpriteRenderMode MODE>
|
template<GPU::SpriteRenderMode MODE>
|
||||||
|
@ -2324,13 +2238,6 @@ void GPU_set_DISPCAPCNT(u32 val)
|
||||||
gpu->dispCapCnt.srcB = (val >> 25) & 0x01;
|
gpu->dispCapCnt.srcB = (val >> 25) & 0x01;
|
||||||
gpu->dispCapCnt.capSrc = (val >> 29) & 0x03;
|
gpu->dispCapCnt.capSrc = (val >> 29) & 0x03;
|
||||||
|
|
||||||
//gpu->dispCapCnt.dstBlock = = (gpu->dispCapCnt.writeBlock * 0x20000) +
|
|
||||||
// (gpu->dispCapCnt.writeOffset * 0x8000);
|
|
||||||
//
|
|
||||||
//gpu->dispCapCnt.src = (gpu->dispCapCnt.readBlock * 0x20000) +
|
|
||||||
// (gpu->dispCapCnt.readOffset * 0x8000);
|
|
||||||
//
|
|
||||||
|
|
||||||
switch((val >> 20) & 0x03)
|
switch((val >> 20) & 0x03)
|
||||||
{
|
{
|
||||||
case 0:
|
case 0:
|
||||||
|
@ -2357,33 +2264,48 @@ void GPU_set_DISPCAPCNT(u32 val)
|
||||||
gpu->dispCapCnt.capSrc, gpu->dispCapCnt.dst - ARM9Mem.ARM9_LCD, gpu->dispCapCnt.src - ARM9Mem.ARM9_LCD,
|
gpu->dispCapCnt.capSrc, gpu->dispCapCnt.dst - ARM9Mem.ARM9_LCD, gpu->dispCapCnt.src - ARM9Mem.ARM9_LCD,
|
||||||
gpu->dispCapCnt.srcA, gpu->dispCapCnt.srcB);*/
|
gpu->dispCapCnt.srcA, gpu->dispCapCnt.srcB);*/
|
||||||
}
|
}
|
||||||
// #define BRIGHT_TABLES
|
|
||||||
|
|
||||||
static void GPU_ligne_layer(NDS_Screen * screen, u16 l)
|
static void GPU_ligne_layer(NDS_Screen * screen, u16 l)
|
||||||
{
|
{
|
||||||
|
CACHE_ALIGN u8 spr[512];
|
||||||
|
CACHE_ALIGN u8 sprAlpha[256];
|
||||||
|
CACHE_ALIGN u8 sprType[256];
|
||||||
|
CACHE_ALIGN u8 sprPrio[256];
|
||||||
|
|
||||||
GPU * gpu = screen->gpu;
|
GPU * gpu = screen->gpu;
|
||||||
struct _DISPCNT * dispCnt = &(gpu->dispx_st)->dispx_DISPCNT.bits;
|
struct _DISPCNT * dispCnt = &(gpu->dispx_st)->dispx_DISPCNT.bits;
|
||||||
itemsForPriority_t * item;
|
itemsForPriority_t * item;
|
||||||
u8 spr[512];
|
|
||||||
u8 sprAlpha[256];
|
|
||||||
u8 sprType[256];
|
|
||||||
u8 sprPrio[256];
|
|
||||||
u8 prio;
|
|
||||||
u16 i16;
|
u16 i16;
|
||||||
BOOL BG_enabled = TRUE;
|
BOOL BG_enabled = TRUE;
|
||||||
|
|
||||||
|
gpu->currentFadeInColors = &fadeInColors[gpu->BLDY_EVY][0];
|
||||||
|
gpu->currentFadeOutColors = &fadeOutColors[gpu->BLDY_EVY][0];
|
||||||
|
|
||||||
u16 backdrop_color = T1ReadWord(ARM9Mem.ARM9_VMEM, gpu->core * 0x400) & 0x7FFF;
|
u16 backdrop_color = T1ReadWord(ARM9Mem.ARM9_VMEM, gpu->core * 0x400) & 0x7FFF;
|
||||||
|
|
||||||
//we need to write backdrop colors in the same way as we do BG pixels in order to do correct window processing
|
//we need to write backdrop colors in the same way as we do BG pixels in order to do correct window processing
|
||||||
//this is currently eating up 2fps or so. it is a reasonable candidate for optimization.
|
//this is currently eating up 2fps or so. it is a reasonable candidate for optimization.
|
||||||
gpu->currBgNum = 5;
|
gpu->currBgNum = 5;
|
||||||
for(int x=0;x<256;x++) {
|
switch(gpu->setFinalColorBck_funcNum) {
|
||||||
gpu->__setFinalColorBck<false,true>(backdrop_color,x,1);
|
case 0: case 1: //for backdrops, (even with window enabled) none and blend are both the same: just copy the color
|
||||||
}
|
case 4: case 5:
|
||||||
memset(gpu->bgPixels,5,256);
|
memset_u16_le<256>(gpu->currDst,backdrop_color);
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
//for non-windowed fade, we can just fade the color and fill
|
||||||
|
memset_u16_le<256>(gpu->currDst,gpu->currentFadeInColors[backdrop_color]);
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
//likewise for non-windowed fadeout
|
||||||
|
memset_u16_le<256>(gpu->currDst,gpu->currentFadeOutColors[backdrop_color]);
|
||||||
|
break;
|
||||||
|
|
||||||
//this check isnt really helpful. it just slows us down in the cases where we need the most speed
|
//windowed fades need special treatment
|
||||||
//if (!gpu->LayersEnable[0] && !gpu->LayersEnable[1] && !gpu->LayersEnable[2] && !gpu->LayersEnable[3] && !gpu->LayersEnable[4]) return;
|
case 6: for(int x=0;x<256;x++) gpu->___setFinalColorBck<false,true,6>(backdrop_color,x,1); break;
|
||||||
|
case 7: for(int x=0;x<256;x++) gpu->___setFinalColorBck<false,true,7>(backdrop_color,x,1); break;
|
||||||
|
}
|
||||||
|
|
||||||
|
memset(gpu->bgPixels,5,256);
|
||||||
|
|
||||||
// init background color & priorities
|
// init background color & priorities
|
||||||
memset(sprAlpha, 0, 256);
|
memset(sprAlpha, 0, 256);
|
||||||
|
@ -2392,9 +2314,11 @@ static void GPU_ligne_layer(NDS_Screen * screen, u16 l)
|
||||||
memset(sprWin, 0, 256);
|
memset(sprWin, 0, 256);
|
||||||
|
|
||||||
// init pixels priorities
|
// init pixels priorities
|
||||||
for (int i=0; i<NB_PRIORITIES; i++) {
|
assert(NB_PRIORITIES==4);
|
||||||
gpu->itemsForPriority[i].nbPixelsX = 0;
|
gpu->itemsForPriority[0].nbPixelsX = 0;
|
||||||
}
|
gpu->itemsForPriority[1].nbPixelsX = 0;
|
||||||
|
gpu->itemsForPriority[2].nbPixelsX = 0;
|
||||||
|
gpu->itemsForPriority[3].nbPixelsX = 0;
|
||||||
|
|
||||||
// for all the pixels in the line
|
// for all the pixels in the line
|
||||||
if (gpu->LayersEnable[4])
|
if (gpu->LayersEnable[4])
|
||||||
|
@ -2405,7 +2329,6 @@ static void GPU_ligne_layer(NDS_Screen * screen, u16 l)
|
||||||
//zero 06-may-09: I properly supported window color effects for backdrop, but I am not sure
|
//zero 06-may-09: I properly supported window color effects for backdrop, but I am not sure
|
||||||
//how it interacts with this. I wish we knew why we needed this
|
//how it interacts with this. I wish we knew why we needed this
|
||||||
|
|
||||||
|
|
||||||
gpu->spriteRender(spr, sprAlpha, sprType, sprPrio);
|
gpu->spriteRender(spr, sprAlpha, sprType, sprPrio);
|
||||||
mosaicSpriteLine(gpu, l, spr, sprAlpha, sprType, sprPrio);
|
mosaicSpriteLine(gpu, l, spr, sprAlpha, sprType, sprPrio);
|
||||||
|
|
||||||
|
@ -2413,7 +2336,7 @@ static void GPU_ligne_layer(NDS_Screen * screen, u16 l)
|
||||||
for(int i = 0; i<256; i++)
|
for(int i = 0; i<256; i++)
|
||||||
{
|
{
|
||||||
// assign them to the good priority item
|
// assign them to the good priority item
|
||||||
prio = sprPrio[i];
|
int prio = sprPrio[i];
|
||||||
if (prio >=4) continue;
|
if (prio >=4) continue;
|
||||||
|
|
||||||
item = &(gpu->itemsForPriority[prio]);
|
item = &(gpu->itemsForPriority[prio]);
|
||||||
|
@ -2426,9 +2349,12 @@ static void GPU_ligne_layer(NDS_Screen * screen, u16 l)
|
||||||
if (!gpu->LayersEnable[0] && !gpu->LayersEnable[1] && !gpu->LayersEnable[2] && !gpu->LayersEnable[3])
|
if (!gpu->LayersEnable[0] && !gpu->LayersEnable[1] && !gpu->LayersEnable[2] && !gpu->LayersEnable[3])
|
||||||
BG_enabled = FALSE;
|
BG_enabled = FALSE;
|
||||||
|
|
||||||
|
for(int j=0;j<8;j++)
|
||||||
|
gpu->blend2[j] = (gpu->BLDCNT & (0x100 << j));
|
||||||
|
|
||||||
// paint lower priorities fist
|
// paint lower priorities fist
|
||||||
// then higher priorities on top
|
// then higher priorities on top
|
||||||
for(prio=NB_PRIORITIES; prio > 0; )
|
for(int prio=NB_PRIORITIES; prio > 0; )
|
||||||
{
|
{
|
||||||
prio--;
|
prio--;
|
||||||
item = &(gpu->itemsForPriority[prio]);
|
item = &(gpu->itemsForPriority[prio]);
|
||||||
|
@ -2442,18 +2368,10 @@ static void GPU_ligne_layer(NDS_Screen * screen, u16 l)
|
||||||
{
|
{
|
||||||
gpu->currBgNum = i16;
|
gpu->currBgNum = i16;
|
||||||
gpu->blend1 = gpu->BLDCNT & (1 << gpu->currBgNum);
|
gpu->blend1 = gpu->BLDCNT & (1 << gpu->currBgNum);
|
||||||
for(int j=0;j<8;j++)
|
|
||||||
gpu->blend2[j] = (gpu->BLDCNT & (0x100 << j));
|
|
||||||
gpu->currentFadeInColors = &fadeInColors[gpu->BLDY_EVY][0];
|
|
||||||
gpu->currentFadeOutColors = &fadeOutColors[gpu->BLDY_EVY][0];
|
|
||||||
//gpu->bgFunc = gpu->setFinalColorBck_funcNum;
|
|
||||||
|
|
||||||
struct _BGxCNT *bgCnt = &(gpu->dispx_st)->dispx_BGxCNT[i16].bits;
|
struct _BGxCNT *bgCnt = &(gpu->dispx_st)->dispx_BGxCNT[i16].bits;
|
||||||
gpu->curr_mosaic_enabled = bgCnt->Mosaic_Enable;
|
gpu->curr_mosaic_enabled = bgCnt->Mosaic_Enable;
|
||||||
|
|
||||||
//mosaic test hacks
|
|
||||||
//gpu->curr_mosaic_enabled = true;
|
|
||||||
|
|
||||||
if (gpu->core == GPU_MAIN)
|
if (gpu->core == GPU_MAIN)
|
||||||
{
|
{
|
||||||
if (i16 == 0 && dispCnt->BG0_3D)
|
if (i16 == 0 && dispCnt->BG0_3D)
|
||||||
|
@ -2495,16 +2413,12 @@ static void GPU_ligne_layer(NDS_Screen * screen, u16 l)
|
||||||
if (gpu->LayersEnable[4])
|
if (gpu->LayersEnable[4])
|
||||||
{
|
{
|
||||||
gpu->currBgNum = 4;
|
gpu->currBgNum = 4;
|
||||||
////analyze mosaic configuration
|
gpu->blend1 = gpu->BLDCNT & (1 << gpu->currBgNum);
|
||||||
//u16 mosaic_control = T1ReadWord((u8 *)&gpu->dispx_st->dispx_MISC.MOSAIC, 0);
|
|
||||||
//gpu->curr_mosaic_enabled
|
|
||||||
|
|
||||||
for (int i=0; i < item->nbPixelsX; i++)
|
for (int i=0; i < item->nbPixelsX; i++)
|
||||||
{
|
{
|
||||||
i16=item->PixelsX[i];
|
i16=item->PixelsX[i];
|
||||||
// T2WriteWord(dst, i16 << 1, T2ReadWord(spr, i16 << 1));
|
setFinalColorSpr(gpu, gpu->currDst, T2ReadWord(spr, (i16<<1)), sprAlpha[i16], sprType[i16], i16);
|
||||||
// gpu->bgPixels[i16] = 4;
|
|
||||||
gpu->setFinalColorSpr(gpu, (i16<<1), gpu->currDst, T2ReadWord(spr, (i16<<1)), sprAlpha[i16], sprType[i16], i16);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2687,9 +2601,7 @@ static INLINE void GPU_ligne_MasterBrightness(NDS_Screen * screen, u16 l)
|
||||||
|
|
||||||
|
|
||||||
//Apply final brightness adjust (MASTER_BRIGHT)
|
//Apply final brightness adjust (MASTER_BRIGHT)
|
||||||
// Reference: http://nocash.emubase.de/gbatek.htm#dsvideo (Under MASTER_BRIGHTNESS)
|
//http://nocash.emubase.de/gbatek.htm#dsvideo (Under MASTER_BRIGHTNESS)
|
||||||
/* Mightymax> it should be more effective if the windowmanager applies brightness when drawing */
|
|
||||||
/* it will most likly take acceleration, while we are stuck here with CPU power */
|
|
||||||
|
|
||||||
switch (gpu->MasterBrightMode)
|
switch (gpu->MasterBrightMode)
|
||||||
{
|
{
|
||||||
|
@ -2841,9 +2753,6 @@ void GPU_ligne(NDS_Screen * screen, u16 l, bool skip)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
//if(gpu->core == 1)
|
|
||||||
// printf("%d\n",l);
|
|
||||||
|
|
||||||
//blacken the screen if it is turned off by the user
|
//blacken the screen if it is turned off by the user
|
||||||
if(!CommonSettings.showGpu.screens[gpu->core])
|
if(!CommonSettings.showGpu.screens[gpu->core])
|
||||||
{
|
{
|
||||||
|
@ -2852,12 +2761,6 @@ void GPU_ligne(NDS_Screen * screen, u16 l, bool skip)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
//{
|
|
||||||
// extern int currFrameCounter;
|
|
||||||
// u8 * dst = GPU_screen + (screen->offset + l) * 512;
|
|
||||||
// memset(dst,currFrameCounter,512);
|
|
||||||
//}
|
|
||||||
|
|
||||||
//cache some parameters which are assumed to be stable throughout the rendering of the entire line
|
//cache some parameters which are assumed to be stable throughout the rendering of the entire line
|
||||||
gpu->currLine = l;
|
gpu->currLine = l;
|
||||||
u16 mosaic_control = T1ReadWord((u8 *)&gpu->dispx_st->dispx_MISC.MOSAIC, 0);
|
u16 mosaic_control = T1ReadWord((u8 *)&gpu->dispx_st->dispx_MISC.MOSAIC, 0);
|
||||||
|
|
|
@ -123,6 +123,10 @@ typedef union
|
||||||
#define BGxENABLED(cnt,num) ((num<8)? ((cnt.val>>8) & num):0)
|
#define BGxENABLED(cnt,num) ((num<8)? ((cnt.val>>8) & num):0)
|
||||||
|
|
||||||
|
|
||||||
|
enum BlendFunc
|
||||||
|
{
|
||||||
|
None, Blend, Increase, Decrease
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
/*******************************************************************************
|
/*******************************************************************************
|
||||||
|
@ -601,10 +605,19 @@ typedef struct
|
||||||
#define NB_BG 4
|
#define NB_BG 4
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
u8 BGs[NB_BG], nbBGs;
|
|
||||||
u8 PixelsX[256];
|
u8 PixelsX[256];
|
||||||
// doh ! yoda says : 256 pixels we can have...
|
u8 BGs[NB_BG], nbBGs;
|
||||||
|
u8 pad[1];
|
||||||
u16 nbPixelsX;
|
u16 nbPixelsX;
|
||||||
|
//256+8:
|
||||||
|
u8 pad2[248];
|
||||||
|
|
||||||
|
//things were slower when i organized this struct this way. whatever.
|
||||||
|
//u8 PixelsX[256];
|
||||||
|
//int BGs[NB_BG], nbBGs;
|
||||||
|
//int nbPixelsX;
|
||||||
|
////<-- 256 + 24
|
||||||
|
//u8 pad2[256-24];
|
||||||
} itemsForPriority_t;
|
} itemsForPriority_t;
|
||||||
#define ARM9MEM_ABG 0x06000000
|
#define ARM9MEM_ABG 0x06000000
|
||||||
#define ARM9MEM_BBG 0x06200000
|
#define ARM9MEM_BBG 0x06200000
|
||||||
|
@ -761,13 +774,13 @@ struct GPU
|
||||||
|
|
||||||
u16 blend(u16 colA, u16 colB);
|
u16 blend(u16 colA, u16 colB);
|
||||||
|
|
||||||
typedef void (*FinalOBJColFunct)(GPU *gpu, u32 passing, u8 *dst, u16 color, u8 alpha, u8 type, u16 x);
|
template<bool BACKDROP, BlendFunc FUNC, bool WINDOW>
|
||||||
typedef void (*Final3DColFunct)(GPU *gpu, int dstX, int srcX);
|
FORCEINLINE FASTCALL bool _master_setFinalBGColor(u16 &color, const u32 x);
|
||||||
|
|
||||||
int setFinalColorBck_funcNum;
|
int setFinalColorBck_funcNum;
|
||||||
int bgFunc;
|
int bgFunc;
|
||||||
int setFinalColor3d_funcNum;
|
int setFinalColor3d_funcNum;
|
||||||
FinalOBJColFunct setFinalColorSpr;
|
int setFinalColorSpr_funcNum;
|
||||||
//Final3DColFunct setFinalColor3D;
|
//Final3DColFunct setFinalColor3D;
|
||||||
enum SpriteRenderMode {
|
enum SpriteRenderMode {
|
||||||
SPRITE_1D, SPRITE_2D
|
SPRITE_1D, SPRITE_2D
|
||||||
|
@ -775,9 +788,17 @@ struct GPU
|
||||||
|
|
||||||
template<GPU::SpriteRenderMode MODE>
|
template<GPU::SpriteRenderMode MODE>
|
||||||
void _spriteRender(u8 * dst, u8 * dst_alpha, u8 * typeTab, u8 * prioTab);
|
void _spriteRender(u8 * dst, u8 * dst_alpha, u8 * typeTab, u8 * prioTab);
|
||||||
void spriteRender(u8 * dst, u8 * dst_alpha, u8 * typeTab, u8 * prioTab);
|
|
||||||
|
|
||||||
template<bool BACKDROP> void setFinalColorBG(u16 color, const u32 x);
|
inline void spriteRender(u8 * dst, u8 * dst_alpha, u8 * typeTab, u8 * prioTab)
|
||||||
|
{
|
||||||
|
if(spriteRenderMode == SPRITE_1D)
|
||||||
|
_spriteRender<SPRITE_1D>(dst,dst_alpha,typeTab, prioTab);
|
||||||
|
else
|
||||||
|
_spriteRender<SPRITE_2D>(dst,dst_alpha,typeTab, prioTab);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template<bool BACKDROP, int FUNCNUM> void setFinalColorBG(u16 color, const u32 x);
|
||||||
void setFinalColor3d(int dstX, int srcX);
|
void setFinalColor3d(int dstX, int srcX);
|
||||||
|
|
||||||
template<bool BACKDROP> FORCEINLINE void setFinalBGColorSpecialNone(u16 &color, const u32 x);
|
template<bool BACKDROP> FORCEINLINE void setFinalBGColorSpecialNone(u16 &color, const u32 x);
|
||||||
|
@ -799,7 +820,8 @@ struct GPU
|
||||||
FORCEINLINE void setFinal3DColorSpecialDecreaseWnd(int dstX, int srcX);
|
FORCEINLINE void setFinal3DColorSpecialDecreaseWnd(int dstX, int srcX);
|
||||||
|
|
||||||
|
|
||||||
template<bool MOSAIC, bool BACKDROP> void __setFinalColorBck(u16 color, const u32 x, const bool opaque);
|
template<bool MOSAIC, bool BACKDROP> FORCEINLINE void __setFinalColorBck(u16 color, const u32 x, const int opaque);
|
||||||
|
template<bool MOSAIC, bool BACKDROP, int FUNCNUM> FORCEINLINE void ___setFinalColorBck(u16 color, const u32 x, const int opaque);
|
||||||
void setAffineStart(int layer, int xy, u32 val);
|
void setAffineStart(int layer, int xy, u32 val);
|
||||||
void setAffineStartWord(int layer, int xy, u16 val, int word);
|
void setAffineStartWord(int layer, int xy, u16 val, int word);
|
||||||
u32 getAffineStart(int layer, int xy);
|
u32 getAffineStart(int layer, int xy);
|
||||||
|
|
|
@ -24,6 +24,16 @@
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#include "types.h"
|
#include "types.h"
|
||||||
|
#include "mem.h"
|
||||||
|
|
||||||
|
#if !defined(NOSSE2) && !defined(SSE2_NOINTRIN)
|
||||||
|
#define SSE2_INTRIN
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef SSE2_INTRIN
|
||||||
|
#include <xmmintrin.h>
|
||||||
|
#include <emmintrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
||||||
|
@ -108,7 +118,9 @@ void Vector4Copy(float *dst, const float *src);
|
||||||
//this isnt as fast as it could be if we used a visual c++ intrinsic, but those appear not to be universally available
|
//this isnt as fast as it could be if we used a visual c++ intrinsic, but those appear not to be universally available
|
||||||
FORCEINLINE u32 u32floor(float f)
|
FORCEINLINE u32 u32floor(float f)
|
||||||
{
|
{
|
||||||
#ifndef NOSSE2
|
#if defined(SSE2_INTRIN)
|
||||||
|
return (u32)_mm_cvttss_si32(_mm_set_ss(f));
|
||||||
|
#elif !defined(NOSSE2)
|
||||||
__asm cvttss2si eax, f;
|
__asm cvttss2si eax, f;
|
||||||
#else
|
#else
|
||||||
return (u32)f;
|
return (u32)f;
|
||||||
|
@ -116,7 +128,9 @@ FORCEINLINE u32 u32floor(float f)
|
||||||
}
|
}
|
||||||
FORCEINLINE u32 u32floor(double d)
|
FORCEINLINE u32 u32floor(double d)
|
||||||
{
|
{
|
||||||
#ifndef NOSSE2
|
#if defined(SSE2_INTRIN)
|
||||||
|
return (u32)_mm_cvttsd_si32(_mm_set_sd(d));
|
||||||
|
#elif !defined(NOSSE2)
|
||||||
__asm cvttsd2si eax, d;
|
__asm cvttsd2si eax, d;
|
||||||
#else
|
#else
|
||||||
return (u32)d;
|
return (u32)d;
|
||||||
|
@ -127,7 +141,9 @@ FORCEINLINE u32 u32floor(double d)
|
||||||
//be sure that the results are the same thing as floorf!
|
//be sure that the results are the same thing as floorf!
|
||||||
FORCEINLINE s32 s32floor(float f)
|
FORCEINLINE s32 s32floor(float f)
|
||||||
{
|
{
|
||||||
#ifndef NOSSE2
|
#if defined(SSE2_INTRIN)
|
||||||
|
return _mm_cvttss_si32( _mm_add_ss(_mm_set_ss(-0.5f),_mm_add_ss(_mm_set_ss(f), _mm_set_ss(f))) ) >> 1;
|
||||||
|
#elif !defined(NOSSE2)
|
||||||
static const float c = -0.5f;
|
static const float c = -0.5f;
|
||||||
__asm
|
__asm
|
||||||
{
|
{
|
||||||
|
@ -142,5 +158,49 @@ FORCEINLINE s32 s32floor(float f)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//now come some sse2 functions coded solely with intrinsics.
|
||||||
|
//let's wait and see how many people this upsets.
|
||||||
|
//they can always #define SSE2_NOINTRIN in their userconfig.h....
|
||||||
|
|
||||||
|
#ifdef SSE2_INTRIN
|
||||||
|
|
||||||
|
template<int NUM>
|
||||||
|
static FORCEINLINE void memset_u16_le(void* dst, u16 val)
|
||||||
|
{
|
||||||
|
u32 u32val;
|
||||||
|
//just for the endian safety
|
||||||
|
T1WriteWord((u8*)&u32val,0,val);
|
||||||
|
T1WriteWord((u8*)&u32val,2,val);
|
||||||
|
const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
|
||||||
|
MACRODO_N(NUM/8,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), temp));
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
template<int NUM>
|
||||||
|
static FORCEINLINE void memset_u16_le(void* dst, u16 val)
|
||||||
|
{
|
||||||
|
for(int i=0;i<NUM;i++)
|
||||||
|
T1WriteWord((u8*)dst,i<<1,val);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//WARNING: I do not think this is as fast as a memset, for some reason.
|
||||||
|
//at least in vc2005 with sse enabled. better figure out why before using it
|
||||||
|
#ifdef SSE2_INTRIN
|
||||||
|
template<int NUM>
|
||||||
|
static FORCEINLINE void memset_u8(void* _dst, u8 val)
|
||||||
|
{
|
||||||
|
const u8* dst = (u8*)_dst;
|
||||||
|
u32 u32val = (val<<24)|(val<<16)|(val<<8)|val;
|
||||||
|
const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
|
||||||
|
MACRODO_N(NUM/16,_mm_store_si128((__m128i*)(dst+(X)*16), temp));
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
template<int NUM>
|
||||||
|
static FORCEINLINE void memset_u8(void* dst, u8 val)
|
||||||
|
{
|
||||||
|
memset(dst,val,NUM);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -627,7 +627,7 @@ typedef int fixed28_4;
|
||||||
static bool failure;
|
static bool failure;
|
||||||
|
|
||||||
// handle floor divides and mods correctly
|
// handle floor divides and mods correctly
|
||||||
INLINE void FloorDivMod(long Numerator, long Denominator, long &Floor, long &Mod)
|
FORCEINLINE void FloorDivMod(long Numerator, long Denominator, long &Floor, long &Mod)
|
||||||
{
|
{
|
||||||
//These must be caused by invalid or degenerate shapes.. not sure yet.
|
//These must be caused by invalid or degenerate shapes.. not sure yet.
|
||||||
//check it out in the mario face intro of SM64
|
//check it out in the mario face intro of SM64
|
||||||
|
@ -658,10 +658,10 @@ INLINE void FloorDivMod(long Numerator, long Denominator, long &Floor, long &Mod
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
INLINE fixed28_4 FloatToFixed28_4( float Value ) {
|
FORCEINLINE fixed28_4 FloatToFixed28_4( float Value ) {
|
||||||
return (fixed28_4)(Value * 16);
|
return (fixed28_4)(Value * 16);
|
||||||
}
|
}
|
||||||
INLINE float Fixed28_4ToFloat( fixed28_4 Value ) {
|
FORCEINLINE float Fixed28_4ToFloat( fixed28_4 Value ) {
|
||||||
return Value / 16.0;
|
return Value / 16.0;
|
||||||
}
|
}
|
||||||
//inline fixed16_16 FloatToFixed16_16( float Value ) {
|
//inline fixed16_16 FloatToFixed16_16( float Value ) {
|
||||||
|
@ -670,11 +670,11 @@ INLINE float Fixed28_4ToFloat( fixed28_4 Value ) {
|
||||||
//inline float Fixed16_16ToFloat( fixed16_16 Value ) {
|
//inline float Fixed16_16ToFloat( fixed16_16 Value ) {
|
||||||
// return Value / 65536.0;
|
// return Value / 65536.0;
|
||||||
//}
|
//}
|
||||||
INLINE fixed28_4 Fixed28_4Mul( fixed28_4 A, fixed28_4 B ) {
|
FORCEINLINE fixed28_4 Fixed28_4Mul( fixed28_4 A, fixed28_4 B ) {
|
||||||
// could make this asm to prevent overflow
|
// could make this asm to prevent overflow
|
||||||
return (A * B) / 16; // 28.4 * 28.4 = 24.8 / 16 = 28.4
|
return (A * B) / 16; // 28.4 * 28.4 = 24.8 / 16 = 28.4
|
||||||
}
|
}
|
||||||
INLINE int Ceil28_4( fixed28_4 Value ) {
|
FORCEINLINE int Ceil28_4( fixed28_4 Value ) {
|
||||||
int ReturnValue;
|
int ReturnValue;
|
||||||
int Numerator = Value - 1 + 16;
|
int Numerator = Value - 1 + 16;
|
||||||
if(Numerator >= 0) {
|
if(Numerator >= 0) {
|
||||||
|
@ -700,7 +700,7 @@ struct edge_fx_fl {
|
||||||
float curr, step, stepExtra;
|
float curr, step, stepExtra;
|
||||||
FORCEINLINE void doStep() { curr += step; }
|
FORCEINLINE void doStep() { curr += step; }
|
||||||
FORCEINLINE void doStepExtra() { curr += stepExtra; }
|
FORCEINLINE void doStepExtra() { curr += stepExtra; }
|
||||||
void initialize(float top, float bottom, float dx, float dy, long XStep, float XPrestep, float YPrestep) {
|
FORCEINLINE void initialize(float top, float bottom, float dx, float dy, long XStep, float XPrestep, float YPrestep) {
|
||||||
dx = 0;
|
dx = 0;
|
||||||
dy *= (bottom-top);
|
dy *= (bottom-top);
|
||||||
curr = top + YPrestep * dy + XPrestep * dx;
|
curr = top + YPrestep * dy + XPrestep * dx;
|
||||||
|
@ -764,7 +764,7 @@ FORCEINLINE int edge_fx_fl::Step() {
|
||||||
}
|
}
|
||||||
|
|
||||||
//draws a single scanline
|
//draws a single scanline
|
||||||
static void drawscanline(edge_fx_fl *pLeft, edge_fx_fl *pRight)
|
FORCEINLINE static void drawscanline(edge_fx_fl *pLeft, edge_fx_fl *pRight)
|
||||||
{
|
{
|
||||||
int XStart = pLeft->X;
|
int XStart = pLeft->X;
|
||||||
int width = pRight->X - XStart;
|
int width = pRight->X - XStart;
|
||||||
|
|
|
@ -30,6 +30,11 @@
|
||||||
#define NOSSE2
|
#define NOSSE2
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
//if there's no sse2, also enforce no intrinsics
|
||||||
|
#if defined(NOSSE2)
|
||||||
|
#define SSE2_NOINTRIN
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
#define strcasecmp(x,y) _stricmp(x,y)
|
#define strcasecmp(x,y) _stricmp(x,y)
|
||||||
#else
|
#else
|
||||||
|
@ -331,5 +336,31 @@ char (*BLAHBLAHBLAH( UNALIGNED T (&)[N] ))[N];
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
//fairly standard for loop macros
|
||||||
|
#define MACRODO1(TRICK,TODO) { const int X = TRICK; TODO; }
|
||||||
|
#define MACRODO2(X,TODO) { MACRODO1((X),TODO) MACRODO1(((X)+1),TODO) }
|
||||||
|
#define MACRODO4(X,TODO) { MACRODO2((X),TODO) MACRODO2(((X)+2),TODO) }
|
||||||
|
#define MACRODO8(X,TODO) { MACRODO4((X),TODO) MACRODO4(((X)+4),TODO) }
|
||||||
|
#define MACRODO16(X,TODO) { MACRODO8((X),TODO) MACRODO8(((X)+8),TODO) }
|
||||||
|
#define MACRODO32(X,TODO) { MACRODO16((X),TODO) MACRODO16(((X)+16),TODO) }
|
||||||
|
#define MACRODO64(X,TODO) { MACRODO32((X),TODO) MACRODO32(((X)+32),TODO) }
|
||||||
|
#define MACRODO128(X,TODO) { MACRODO64((X),TODO) MACRODO64(((X)+64),TODO) }
|
||||||
|
#define MACRODO256(X,TODO) { MACRODO128((X),TODO) MACRODO128(((X)+128),TODO) }
|
||||||
|
|
||||||
|
//this one lets you loop any number of times (as long as N<512: only bits 0x100 through 0x001 of N are examined)
|
||||||
|
#define MACRODO_N(N,TODO) {\
|
||||||
|
if((N)&0x100) MACRODO256(0,TODO); \
|
||||||
|
if((N)&0x080) MACRODO128((N)&(0x100),TODO); \
|
||||||
|
if((N)&0x040) MACRODO64((N)&(0x100|0x080),TODO); \
|
||||||
|
if((N)&0x020) MACRODO32((N)&(0x100|0x080|0x040),TODO); \
|
||||||
|
if((N)&0x010) MACRODO16((N)&(0x100|0x080|0x040|0x020),TODO); \
|
||||||
|
if((N)&0x008) MACRODO8((N)&(0x100|0x080|0x040|0x020|0x010),TODO); \
|
||||||
|
if((N)&0x004) MACRODO4((N)&(0x100|0x080|0x040|0x020|0x010|0x008),TODO); \
|
||||||
|
if((N)&0x002) MACRODO2((N)&(0x100|0x080|0x040|0x020|0x010|0x008|0x004),TODO); \
|
||||||
|
if((N)&0x001) MACRODO1((N)&(0x100|0x080|0x040|0x020|0x010|0x008|0x004|0x002),TODO); \
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -8,6 +8,7 @@
|
||||||
//#define NOSSE2 //disables SSE2 optimizations (better change it in the vc++ codegen options too)
|
//#define NOSSE2 //disables SSE2 optimizations (better change it in the vc++ codegen options too)
|
||||||
//#define DEVELOPER //enables dev+ features
|
//#define DEVELOPER //enables dev+ features
|
||||||
//#define GDB_STUB //enables the gdb stub. for some reason this is separate from dev+ for now
|
//#define GDB_STUB //enables the gdb stub. for some reason this is separate from dev+ for now
|
||||||
|
//#define SSE2_NOINTRIN //indicates that you have a crippled compiler with no sse2 intrinsics (only relevant for SSE2 builds)
|
||||||
|
|
||||||
|
|
||||||
#endif //_USERCONFIG_H
|
#endif //_USERCONFIG_H
|
||||||
|
|
Loading…
Reference in New Issue