a number of little graphical speedups, and more intelligent arm7 memory access pattern optimization. good for a few fps.
This commit is contained in:
parent
3844e866e0
commit
29297644ac
|
@ -674,12 +674,6 @@ static void GPU_resortBGs(GPU *gpu)
|
|||
struct _DISPCNT * cnt = &gpu->dispx_st->dispx_DISPCNT.bits;
|
||||
itemsForPriority_t * item;
|
||||
|
||||
//zero 29-dec-2008 - this really doesnt make sense to me.
|
||||
//i changed the sprwin to be line by line,
|
||||
//and resetting it here is pointless since line rendering is instantaneous
|
||||
//and completely produces and consumes sprwin after which the contents of this buffer are useless
|
||||
//memset(gpu->sprWin,0, 256*192);
|
||||
|
||||
// we don't need to check for windows here...
|
||||
// if we tick boxes, invisible layers become invisible & vice versa
|
||||
#define OP ^ !
|
||||
|
@ -1387,7 +1381,7 @@ FORCEINLINE void GPU::setFinalColor3d(int dstX, int srcX)
|
|||
|
||||
//this was forced inline because most of the time it just falls through to setFinalColorBck() and the function call
|
||||
//overhead was ridiculous and terrible
|
||||
template<bool MOSAIC> FORCEINLINE void GPU::__setFinalColorBck(u16 color, u8 x, bool opaque)
|
||||
template<bool MOSAIC> FORCEINLINE void GPU::__setFinalColorBck(u16 color, const u8 x, const bool opaque)
|
||||
{
|
||||
//I commented out this line to make a point.
|
||||
//indeed, since x is a u8 we cannot pass in anything >=256
|
||||
|
@ -1519,8 +1513,8 @@ template<bool MOSAIC> INLINE void renderline_textBG(GPU * gpu, u16 XBG, u16 YBG,
|
|||
u16 color;
|
||||
u16 xoff;
|
||||
u16 yoff;
|
||||
u16 x = 0;
|
||||
u16 xfin;
|
||||
u32 x = 0;
|
||||
u32 xfin;
|
||||
|
||||
s8 line_dir = 1;
|
||||
u32 mapinfo;
|
||||
|
@ -2610,32 +2604,15 @@ static void GPU_ligne_layer(NDS_Screen * screen, u16 l)
|
|||
|
||||
u16 backdrop_color = T1ReadWord(ARM9Mem.ARM9_VMEM, gpu->core * 0x400) & 0x7FFF;
|
||||
|
||||
///* Apply fading to backdrop */
|
||||
if((gpu->BLDCNT & 0x20) && (gpu->BLDY_EVY > 0))
|
||||
{
|
||||
switch(gpu->BLDCNT & 0xC0)
|
||||
{
|
||||
case 0x80: /* Fade in */
|
||||
backdrop_color = fadeInColors[gpu->BLDY_EVY][backdrop_color];
|
||||
break;
|
||||
case 0xC0: /* Fade out */
|
||||
backdrop_color = fadeOutColors[gpu->BLDY_EVY][backdrop_color];
|
||||
break;
|
||||
default: break;
|
||||
}
|
||||
}
|
||||
|
||||
//we need to write backdrop colors in the same way as we do BG pixels in order to
|
||||
//do correct window processing
|
||||
//memset(gpu->bgPixels,6,256); //dont know whether we need this...
|
||||
//we need to write backdrop colors in the same way as we do BG pixels in order to do correct window processing
|
||||
//this is currently eating up 2fps or so. it is a reasonable candidate for optimization.
|
||||
gpu->currBgNum = 5;
|
||||
for(int x=0;x<256;x++) {
|
||||
gpu->__setFinalColorBck<false>(backdrop_color,x,1);
|
||||
}
|
||||
|
||||
if (!gpu->LayersEnable[0] && !gpu->LayersEnable[1] &&
|
||||
!gpu->LayersEnable[2] && !gpu->LayersEnable[3] &&
|
||||
!gpu->LayersEnable[4]) return;
|
||||
//this check isnt really helpful. it just slows us down in the cases where we need the most speed
|
||||
//if (!gpu->LayersEnable[0] && !gpu->LayersEnable[1] && !gpu->LayersEnable[2] && !gpu->LayersEnable[3] && !gpu->LayersEnable[4]) return;
|
||||
|
||||
// init background color & priorities
|
||||
memset(sprAlpha, 0, 256);
|
||||
|
|
|
@ -787,7 +787,7 @@ struct GPU
|
|||
FORCEINLINE void setFinal3DColorSpecialDecreaseWnd(int dstX, int srcX);
|
||||
|
||||
|
||||
template<bool MOSAIC> void __setFinalColorBck(u16 color, u8 x, bool opaque);
|
||||
template<bool MOSAIC> void __setFinalColorBck(u16 color, const u8 x, const bool opaque);
|
||||
void setAffineStart(int layer, int xy, u32 val);
|
||||
void setAffineStartWord(int layer, int xy, u16 val, int word);
|
||||
u32 getAffineStart(int layer, int xy);
|
||||
|
|
|
@ -310,17 +310,31 @@ FORCEINLINE u32 _MMU_read32(const int PROCNUM, const MMU_ACCESS_TYPE AT, const u
|
|||
|
||||
goto dunno;
|
||||
}
|
||||
|
||||
//for other cases, we have to check from dtcm first because it is patched on top of the main memory range
|
||||
|
||||
//special handling for execution from arm7. try reading from main memory first
|
||||
if(PROCNUM==ARMCPU_ARM7)
|
||||
{
|
||||
if ( (addr & 0x0F000000) == 0x02000000)
|
||||
return T1ReadLong_guaranteedAligned( ARM9Mem.MAIN_MEM, addr & _MMU_MAIN_MEM_MASK);
|
||||
else if((addr & 0xFF800000) == 0x03800000)
|
||||
return T1ReadLong_guaranteedAligned(MMU.ARM7_ERAM, addr&0xFFFF);
|
||||
else if((addr & 0xFF800000) == 0x03000000)
|
||||
return T1ReadLong_guaranteedAligned(MMU.SWIRAM, addr&0x7FFF);
|
||||
}
|
||||
|
||||
|
||||
//for other arm9 cases, we have to check from dtcm first because it is patched on top of the main memory range
|
||||
if(PROCNUM==ARMCPU_ARM9)
|
||||
{
|
||||
if((addr&(~0x3FFF)) == MMU.DTCMRegion)
|
||||
{
|
||||
//Returns data from DTCM (ARM9 only)
|
||||
return T1ReadLong(ARM9Mem.ARM9_DTCM, addr & 0x3FFF);
|
||||
}
|
||||
|
||||
if ( (addr & 0x0F000000) == 0x02000000)
|
||||
return T1ReadLong( ARM9Mem.MAIN_MEM, addr & _MMU_MAIN_MEM_MASK);
|
||||
|
||||
if ( (addr & 0x0F000000) == 0x02000000)
|
||||
return T1ReadLong( ARM9Mem.MAIN_MEM, addr & _MMU_MAIN_MEM_MASK);
|
||||
}
|
||||
|
||||
dunno:
|
||||
if(PROCNUM==ARMCPU_ARM9) return _MMU_ARM9_read32(addr);
|
||||
|
|
|
@ -52,6 +52,8 @@
|
|||
|
||||
//#undef FORCEINLINE
|
||||
//#define FORCEINLINE
|
||||
//#undef INLINE
|
||||
//#define INLINE
|
||||
|
||||
using std::min;
|
||||
using std::max;
|
||||
|
@ -252,6 +254,37 @@ FORCEINLINE int iround(float f) {
|
|||
return (int)f; //lol
|
||||
}
|
||||
|
||||
//this function is an unreliable, inaccurate floor.
|
||||
//it should only be used for positive numbers
|
||||
//this isnt as fast as it could be if we used a visual c++ intrinsic, but those appear not to be universally available
|
||||
FORCEINLINE u32 u32floor(float f)
|
||||
{
|
||||
#ifndef NOSSE2
|
||||
__asm cvttss2si eax, f;
|
||||
#else
|
||||
return (u32)f;
|
||||
#endif
|
||||
}
|
||||
|
||||
//same as above but works for negative values too.
|
||||
//be sure that the results are the same thing as floorf!
|
||||
FORCEINLINE s32 s32floor(float f)
|
||||
{
|
||||
#ifndef NOSSE2
|
||||
static const float c = -0.5f;
|
||||
__asm
|
||||
{
|
||||
movss xmm0, f;
|
||||
addss xmm0, xmm0;
|
||||
addss xmm0, c;
|
||||
cvtss2si eax, xmm0
|
||||
sar eax, 1
|
||||
}
|
||||
#else
|
||||
return (s32)floorf(f);
|
||||
#endif
|
||||
}
|
||||
|
||||
static struct Sampler
|
||||
{
|
||||
int width, height;
|
||||
|
@ -320,8 +353,8 @@ static struct Sampler
|
|||
{
|
||||
//finally, we can use floor here. but, it is slower than we want.
|
||||
//the best solution is probably to wait until the pipeline is full of fixed point
|
||||
int iu = (int)floorf(u);
|
||||
int iv = (int)floorf(v);
|
||||
s32 iu = s32floor(u);
|
||||
s32 iv = s32floor(v);
|
||||
dowrap(iu,iv);
|
||||
|
||||
FragmentColor color;
|
||||
|
@ -460,12 +493,15 @@ static FORCEINLINE void pixel(int adr,float r, float g, float b, float invu, flo
|
|||
if(gfx3d.wbuffer) {
|
||||
//not sure about this
|
||||
//this value was chosen to make the skybox, castle window decals, and water level render correctly in SM64
|
||||
depth = (u32)(4096*w);
|
||||
depth = u32floor(4096*w);
|
||||
}
|
||||
else
|
||||
{
|
||||
float test = -1.2f;
|
||||
u32 test2 = u32floor(test);
|
||||
//depth = fastFloor(z*0x7FFF)>>8;
|
||||
depth = (u32)(z*0x7FFF);
|
||||
//depth = (u32)(z*0x7FFF);
|
||||
depth = u32floor(z*0x7FFF);
|
||||
//depth = z*0xFFFFFF;
|
||||
}
|
||||
if(polyAttr.decalMode)
|
||||
|
@ -495,9 +531,9 @@ static FORCEINLINE void pixel(int adr,float r, float g, float b, float invu, flo
|
|||
//this is a HACK:
|
||||
//we are being very sloppy with our interpolation precision right now
|
||||
//and rather than fix it, i just want to clamp it
|
||||
shader.materialColor.r = max(0,min(31,(int)r));
|
||||
shader.materialColor.g = max(0,min(31,(int)g));
|
||||
shader.materialColor.b = max(0,min(31,(int)b));
|
||||
shader.materialColor.r = max(0U,min(31U,u32floor(r)));
|
||||
shader.materialColor.g = max(0U,min(31U,u32floor(g)));
|
||||
shader.materialColor.b = max(0U,min(31U,u32floor(b)));
|
||||
|
||||
shader.materialColor.a = polyAttr.alpha;
|
||||
|
||||
|
@ -600,12 +636,13 @@ static FORCEINLINE void pixel(int adr,float r, float g, float b, float invu, flo
|
|||
;
|
||||
}
|
||||
|
||||
|
||||
typedef int fixed28_4;
|
||||
|
||||
static bool failure;
|
||||
|
||||
// handle floor divides and mods correctly
|
||||
inline void FloorDivMod(long Numerator, long Denominator, long &Floor, long &Mod)
|
||||
INLINE void FloorDivMod(long Numerator, long Denominator, long &Floor, long &Mod)
|
||||
{
|
||||
//These must be caused by invalid or degenerate shapes.. not sure yet.
|
||||
//check it out in the mario face intro of SM64
|
||||
|
@ -636,10 +673,10 @@ inline void FloorDivMod(long Numerator, long Denominator, long &Floor, long &Mod
|
|||
}
|
||||
}
|
||||
|
||||
inline fixed28_4 FloatToFixed28_4( float Value ) {
|
||||
INLINE fixed28_4 FloatToFixed28_4( float Value ) {
|
||||
return (fixed28_4)(Value * 16);
|
||||
}
|
||||
inline float Fixed28_4ToFloat( fixed28_4 Value ) {
|
||||
INLINE float Fixed28_4ToFloat( fixed28_4 Value ) {
|
||||
return Value / 16.0;
|
||||
}
|
||||
//inline fixed16_16 FloatToFixed16_16( float Value ) {
|
||||
|
@ -648,11 +685,11 @@ inline float Fixed28_4ToFloat( fixed28_4 Value ) {
|
|||
//inline float Fixed16_16ToFloat( fixed16_16 Value ) {
|
||||
// return Value / 65536.0;
|
||||
//}
|
||||
inline fixed28_4 Fixed28_4Mul( fixed28_4 A, fixed28_4 B ) {
|
||||
INLINE fixed28_4 Fixed28_4Mul( fixed28_4 A, fixed28_4 B ) {
|
||||
// could make this asm to prevent overflow
|
||||
return (A * B) / 16; // 28.4 * 28.4 = 24.8 / 16 = 28.4
|
||||
}
|
||||
inline int Ceil28_4( fixed28_4 Value ) {
|
||||
INLINE int Ceil28_4( fixed28_4 Value ) {
|
||||
int ReturnValue;
|
||||
int Numerator = Value - 1 + 16;
|
||||
if(Numerator >= 0) {
|
||||
|
@ -813,7 +850,7 @@ static void runscanlines(edge_fx_fl *left, edge_fx_fl *right)
|
|||
|
||||
//rotates verts counterclockwise
|
||||
template<int type>
|
||||
inline static void rot_verts() {
|
||||
INLINE static void rot_verts() {
|
||||
#define ROTSWAP(X) if(type>X) swap(verts[X-1],verts[X]);
|
||||
ROTSWAP(1); ROTSWAP(2); ROTSWAP(3); ROTSWAP(4);
|
||||
ROTSWAP(5); ROTSWAP(6); ROTSWAP(7);
|
||||
|
|
Loading…
Reference in New Issue