A number of small graphical speedups, and a more intelligent optimization of ARM7 memory access patterns. Good for a few fps.

This commit is contained in:
zeromus 2009-05-13 07:44:06 +00:00
parent 3844e866e0
commit 29297644ac
4 changed files with 77 additions and 49 deletions

View File

@ -674,12 +674,6 @@ static void GPU_resortBGs(GPU *gpu)
struct _DISPCNT * cnt = &gpu->dispx_st->dispx_DISPCNT.bits;
itemsForPriority_t * item;
//zero 29-dec-2008 - this really doesnt make sense to me.
//i changed the sprwin to be line by line,
//and resetting it here is pointless since line rendering is instantaneous
//and completely produces and consumes sprwin after which the contents of this buffer are useless
//memset(gpu->sprWin,0, 256*192);
// we don't need to check for windows here...
// if we tick boxes, invisible layers become invisible & vice versa
#define OP ^ !
@ -1387,7 +1381,7 @@ FORCEINLINE void GPU::setFinalColor3d(int dstX, int srcX)
//this was forced inline because most of the time it just falls through to setFinalColorBck() and the function call
//overhead was ridiculous and terrible
template<bool MOSAIC> FORCEINLINE void GPU::__setFinalColorBck(u16 color, u8 x, bool opaque)
template<bool MOSAIC> FORCEINLINE void GPU::__setFinalColorBck(u16 color, const u8 x, const bool opaque)
{
//I commented out this line to make a point.
//indeed, since x is a u8 we cannot pass in anything >=256
@ -1519,8 +1513,8 @@ template<bool MOSAIC> INLINE void renderline_textBG(GPU * gpu, u16 XBG, u16 YBG,
u16 color;
u16 xoff;
u16 yoff;
u16 x = 0;
u16 xfin;
u32 x = 0;
u32 xfin;
s8 line_dir = 1;
u32 mapinfo;
@ -2610,32 +2604,15 @@ static void GPU_ligne_layer(NDS_Screen * screen, u16 l)
u16 backdrop_color = T1ReadWord(ARM9Mem.ARM9_VMEM, gpu->core * 0x400) & 0x7FFF;
///* Apply fading to backdrop */
if((gpu->BLDCNT & 0x20) && (gpu->BLDY_EVY > 0))
{
switch(gpu->BLDCNT & 0xC0)
{
case 0x80: /* Fade in */
backdrop_color = fadeInColors[gpu->BLDY_EVY][backdrop_color];
break;
case 0xC0: /* Fade out */
backdrop_color = fadeOutColors[gpu->BLDY_EVY][backdrop_color];
break;
default: break;
}
}
//we need to write backdrop colors in the same way as we do BG pixels in order to
//do correct window processing
//memset(gpu->bgPixels,6,256); //dont know whether we need this...
//we need to write backdrop colors in the same way as we do BG pixels in order to do correct window processing
//this is currently eating up 2fps or so. it is a reasonable candidate for optimization.
gpu->currBgNum = 5;
for(int x=0;x<256;x++) {
gpu->__setFinalColorBck<false>(backdrop_color,x,1);
}
if (!gpu->LayersEnable[0] && !gpu->LayersEnable[1] &&
!gpu->LayersEnable[2] && !gpu->LayersEnable[3] &&
!gpu->LayersEnable[4]) return;
//this check isn't really helpful; it just slows us down in the cases where we need the most speed
//if (!gpu->LayersEnable[0] && !gpu->LayersEnable[1] && !gpu->LayersEnable[2] && !gpu->LayersEnable[3] && !gpu->LayersEnable[4]) return;
// init background color & priorities
memset(sprAlpha, 0, 256);

View File

@ -787,7 +787,7 @@ struct GPU
FORCEINLINE void setFinal3DColorSpecialDecreaseWnd(int dstX, int srcX);
template<bool MOSAIC> void __setFinalColorBck(u16 color, u8 x, bool opaque);
template<bool MOSAIC> void __setFinalColorBck(u16 color, const u8 x, const bool opaque);
void setAffineStart(int layer, int xy, u32 val);
void setAffineStartWord(int layer, int xy, u16 val, int word);
u32 getAffineStart(int layer, int xy);

View File

@ -310,17 +310,31 @@ FORCEINLINE u32 _MMU_read32(const int PROCNUM, const MMU_ACCESS_TYPE AT, const u
goto dunno;
}
//for other cases, we have to check from dtcm first because it is patched on top of the main memory range
//special handling for execution from arm7. try reading from main memory first
if(PROCNUM==ARMCPU_ARM7)
{
if ( (addr & 0x0F000000) == 0x02000000)
return T1ReadLong_guaranteedAligned( ARM9Mem.MAIN_MEM, addr & _MMU_MAIN_MEM_MASK);
else if((addr & 0xFF800000) == 0x03800000)
return T1ReadLong_guaranteedAligned(MMU.ARM7_ERAM, addr&0xFFFF);
else if((addr & 0xFF800000) == 0x03000000)
return T1ReadLong_guaranteedAligned(MMU.SWIRAM, addr&0x7FFF);
}
//for other arm9 cases, we have to check from dtcm first because it is patched on top of the main memory range
if(PROCNUM==ARMCPU_ARM9)
{
if((addr&(~0x3FFF)) == MMU.DTCMRegion)
{
//Returns data from DTCM (ARM9 only)
return T1ReadLong(ARM9Mem.ARM9_DTCM, addr & 0x3FFF);
}
if ( (addr & 0x0F000000) == 0x02000000)
return T1ReadLong( ARM9Mem.MAIN_MEM, addr & _MMU_MAIN_MEM_MASK);
if ( (addr & 0x0F000000) == 0x02000000)
return T1ReadLong( ARM9Mem.MAIN_MEM, addr & _MMU_MAIN_MEM_MASK);
}
dunno:
if(PROCNUM==ARMCPU_ARM9) return _MMU_ARM9_read32(addr);

View File

@ -52,6 +52,8 @@
//#undef FORCEINLINE
//#define FORCEINLINE
//#undef INLINE
//#define INLINE
using std::min;
using std::max;
@ -252,6 +254,37 @@ FORCEINLINE int iround(float f) {
return (int)f; //lol
}
//Truncating float -> u32 conversion used as a cheap, unreliable, inaccurate floor.
//It should only be used for positive numbers: truncation rounds toward zero, so
//negative inputs are not floored.
//The fast path uses MSVC inline-assembly syntax, so it is only selected for 32-bit x86
//MSVC builds with SSE enabled; every other configuration uses the plain cast.
//This isn't as fast as it could be if we used a Visual C++ intrinsic, but those appear
//not to be universally available.
FORCEINLINE u32 u32floor(float f)
{
#if !defined(NOSSE2) && defined(_MSC_VER) && defined(_M_IX86)
	u32 result;
	__asm
	{
		cvttss2si eax, f
		mov result, eax
	}
	//return the value explicitly instead of relying on whatever is left in eax
	return result;
#else
	return (u32)f;
#endif
}
//Floor of a float as a signed 32-bit integer; unlike u32floor this is meant to work
//for negative values too.
//Fast path (32-bit x86 MSVC with SSE only, because it uses MSVC inline-assembly syntax):
//converts (2*f - 0.5) to an integer with cvtss2si and then arithmetic-shifts right by one.
//NOTE(review): be sure that the results are the same thing as floorf! cvtss2si rounds
//according to the current SSE rounding mode, so this presumably assumes round-to-nearest.
//Every other configuration calls floorf directly.
FORCEINLINE s32 s32floor(float f)
{
#if !defined(NOSSE2) && defined(_MSC_VER) && defined(_M_IX86)
	static const float c = -0.5f;
	s32 result;
	__asm
	{
		movss xmm0, f
		addss xmm0, xmm0
		addss xmm0, c
		cvtss2si eax, xmm0
		sar eax, 1
		mov result, eax
	}
	//return the value explicitly instead of relying on whatever is left in eax
	return result;
#else
	return (s32)floorf(f);
#endif
}
static struct Sampler
{
int width, height;
@ -320,8 +353,8 @@ static struct Sampler
{
//finally, we can use floor here. but, it is slower than we want.
//the best solution is probably to wait until the pipeline is full of fixed point
int iu = (int)floorf(u);
int iv = (int)floorf(v);
s32 iu = s32floor(u);
s32 iv = s32floor(v);
dowrap(iu,iv);
FragmentColor color;
@ -460,12 +493,15 @@ static FORCEINLINE void pixel(int adr,float r, float g, float b, float invu, flo
if(gfx3d.wbuffer) {
//not sure about this
//this value was chosen to make the skybox, castle window decals, and water level render correctly in SM64
depth = (u32)(4096*w);
depth = u32floor(4096*w);
}
else
{
float test = -1.2f;
u32 test2 = u32floor(test);
//depth = fastFloor(z*0x7FFF)>>8;
depth = (u32)(z*0x7FFF);
//depth = (u32)(z*0x7FFF);
depth = u32floor(z*0x7FFF);
//depth = z*0xFFFFFF;
}
if(polyAttr.decalMode)
@ -495,9 +531,9 @@ static FORCEINLINE void pixel(int adr,float r, float g, float b, float invu, flo
//this is a HACK:
//we are being very sloppy with our interpolation precision right now
//and rather than fix it, i just want to clamp it
shader.materialColor.r = max(0,min(31,(int)r));
shader.materialColor.g = max(0,min(31,(int)g));
shader.materialColor.b = max(0,min(31,(int)b));
shader.materialColor.r = max(0U,min(31U,u32floor(r)));
shader.materialColor.g = max(0U,min(31U,u32floor(g)));
shader.materialColor.b = max(0U,min(31U,u32floor(b)));
shader.materialColor.a = polyAttr.alpha;
@ -600,12 +636,13 @@ static FORCEINLINE void pixel(int adr,float r, float g, float b, float invu, flo
;
}
typedef int fixed28_4;
static bool failure;
// handle floor divides and mods correctly
inline void FloorDivMod(long Numerator, long Denominator, long &Floor, long &Mod)
INLINE void FloorDivMod(long Numerator, long Denominator, long &Floor, long &Mod)
{
//These must be caused by invalid or degenerate shapes.. not sure yet.
//check it out in the mario face intro of SM64
@ -636,10 +673,10 @@ inline void FloorDivMod(long Numerator, long Denominator, long &Floor, long &Mod
}
}
inline fixed28_4 FloatToFixed28_4( float Value ) {
//Converts a float to 28.4 fixed point: scales by 16 (4 fractional bits) and lets the
//cast drop whatever fraction is left.
INLINE fixed28_4 FloatToFixed28_4( float Value ) {
	const float scaled = Value * 16;
	return (fixed28_4)scaled;
}
inline float Fixed28_4ToFloat( fixed28_4 Value ) {
//Converts a 28.4 fixed point value back to a float by dividing out the factor of 16.
//The division is carried out in double precision and then narrowed to float.
INLINE float Fixed28_4ToFloat( fixed28_4 Value ) {
	const double unscaled = Value / 16.0;
	return (float)unscaled;
}
//inline fixed16_16 FloatToFixed16_16( float Value ) {
@ -648,11 +685,11 @@ inline float Fixed28_4ToFloat( fixed28_4 Value ) {
//inline float Fixed16_16ToFloat( fixed16_16 Value ) {
// return Value / 65536.0;
//}
inline fixed28_4 Fixed28_4Mul( fixed28_4 A, fixed28_4 B ) {
//Multiplies two 28.4 fixed point values and returns a 28.4 result.
//28.4 * 28.4 gives a value with 8 fractional bits; dividing by 16 brings it back to 4.
//The intermediate product is widened to 64 bits so it cannot overflow a 32-bit int;
//results are unchanged for every input whose 32-bit product did not overflow.
INLINE fixed28_4 Fixed28_4Mul( fixed28_4 A, fixed28_4 B ) {
	const long long product = (long long)A * B;
	return (fixed28_4)(product / 16);
}
inline int Ceil28_4( fixed28_4 Value ) {
INLINE int Ceil28_4( fixed28_4 Value ) {
int ReturnValue;
int Numerator = Value - 1 + 16;
if(Numerator >= 0) {
@ -813,7 +850,7 @@ static void runscanlines(edge_fx_fl *left, edge_fx_fl *right)
//rotates verts counterclockwise
template<int type>
inline static void rot_verts() {
INLINE static void rot_verts() {
#define ROTSWAP(X) if(type>X) swap(verts[X-1],verts[X]);
ROTSWAP(1); ROTSWAP(2); ROTSWAP(3); ROTSWAP(4);
ROTSWAP(5); ROTSWAP(6); ROTSWAP(7);