From 49bc297032eabe895c6d0842db85f91f2341d9c0 Mon Sep 17 00:00:00 2001 From: zeromus Date: Wed, 16 Sep 2009 07:38:28 +0000 Subject: [PATCH] optimizations (especially for 32bit systems) to sequencing logic in main emulation loop; few fps speedup for cpu heavy games --- desmume/src/NDSSystem.cpp | 59 ++++++++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 13 deletions(-) diff --git a/desmume/src/NDSSystem.cpp b/desmume/src/NDSSystem.cpp index 39eeb886d..92e123719 100644 --- a/desmume/src/NDSSystem.cpp +++ b/desmume/src/NDSSystem.cpp @@ -1535,8 +1535,7 @@ struct TSequenceItem FORCEINLINE u64 next() { - if(enabled) return timestamp; - else return kNever; + return timestamp; } }; @@ -1645,9 +1644,10 @@ template struct TSequenceItem_DMA : public TSequenceItem return (MMU.DMAing[procnum][chan])&&nds_timer>=(MMU.DMACycle[procnum][chan]); } + FORCEINLINE bool isEnabled() { return MMU.DMAing[procnum][chan]!=0; } + FORCEINLINE u64 next() { - if(!MMU.DMAing[procnum][chan]) return kNever; return MMU.DMACycle[procnum][chan]; } @@ -1673,9 +1673,10 @@ struct TSequenceItem_divider : public TSequenceItem return MMU.divRunning && nds_timer >= MMU.divCycles; } + bool isEnabled() { return MMU.divRunning!=0; } + FORCEINLINE u64 next() { - if(!MMU.divRunning) return kNever; return MMU.divCycles; } @@ -1698,9 +1699,10 @@ struct TSequenceItem_sqrtunit : public TSequenceItem return MMU.sqrtRunning && nds_timer >= MMU.sqrtCycles; } + bool isEnabled() { return MMU.sqrtRunning!=0; } + FORCEINLINE u64 next() { - if(!MMU.sqrtRunning) return kNever; return MMU.sqrtCycles; } @@ -1990,23 +1992,54 @@ void NDS_Reschedule() sequencer.reschedule = true; } +FORCEINLINE u32 _fast_min32(u32 a, u32 b, u32 c, u32 d) +{ + return ((( ((s32)(a-b)) >> (32-1)) & (c^d)) ^ d); +} + +FORCEINLINE u64 _fast_min(u64 a, u64 b) +{ + //you might find that this is faster on a 64bit system; someone should try it + //http://aggregate.org/MAGIC/#Integer%20Selection + //u64 ret = (((((s64)(a-b)) >> (64-1)) & 
(a^b)) ^ b); + //assert(ret==min(a,b)); + //return ret; + + //but this ends up being the fastest on 32bits + return a<b?a:b; + + //__m128i __a; __a.m128i_u64[0] = a; + //__m128i __b; __b.m128i_u64[0] = b; + //__m128i xorval = _mm_xor_si128(__a,__b); + //__m128i temp = _mm_sub_epi64(__a,__b); + //temp.m128i_i64[0] >>= 63; //no 64bit shra in sse2, what a disappointment + //temp = _mm_and_si128(temp,xorval); + //temp = _mm_xor_si128(temp,__b); + //return temp.m128i_u64[0]; +} + + + u64 Sequencer::findNext() { - u64 next = kNever; - next = min(next,dispcnt.next()); - next = min(next,divider.next()); - next = min(next,sqrtunit.next()); - next = min(next,gxfifo.next()); + //this one is always enabled so dont bother to check it + u64 next = dispcnt.next(); + + if(divider.isEnabled()) next = _fast_min(next,divider.next()); + if(sqrtunit.isEnabled()) next = _fast_min(next,sqrtunit.next()); + if(gxfifo.enabled) next = _fast_min(next,gxfifo.next()); #ifdef EXPERIMENTAL_WIFI - next = min(next,wifi.next()); + next = _fast_min(next,wifi.next()); #endif -#define test(X,Y) next = min(next,dma_##X##_##Y .next()); +#define test(X,Y) if(dma_##X##_##Y .isEnabled()) next = _fast_min(next,dma_##X##_##Y .next()); test(0,0); test(0,1); test(0,2); test(0,3); test(1,0); test(1,1); test(1,2); test(1,3); #undef test -#define test(X,Y) if(timer_##X##_##Y .enabled) next = min(next,timer_##X##_##Y .next()); +#define test(X,Y) if(timer_##X##_##Y .enabled) next = _fast_min(next,timer_##X##_##Y .next()); test(0,0); test(0,1); test(0,2); test(0,3); test(1,0); test(1,1); test(1,2); test(1,3); #undef test