From e5f35a5f3ec41139ab8d01a3cdfcff2580232701 Mon Sep 17 00:00:00 2001 From: zeromus Date: Thu, 14 May 2009 08:08:27 +0000 Subject: [PATCH] a few minor speedups to spu. the doubles have been changed to float. i think this should be safe... --- desmume/src/SPU.cpp | 146 ++++++++++++++++++-------------------- desmume/src/SPU.h | 10 +-- desmume/src/matrix.h | 32 +++++++++ desmume/src/rasterize.cpp | 30 -------- desmume/src/types.h | 18 +++++ 5 files changed, 125 insertions(+), 111 deletions(-) diff --git a/desmume/src/SPU.cpp b/desmume/src/SPU.cpp index bcb4d5d2d..ec7649c68 100644 --- a/desmume/src/SPU.cpp +++ b/desmume/src/SPU.cpp @@ -33,6 +33,10 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #include "readwrite.h" #include "armcpu.h" #include "NDSSystem.h" +#include "matrix.h" + +//#undef FORCEINLINE +//#define FORCEINLINE SPU_struct *SPU_core = 0; SPU_struct *SPU_user = 0; @@ -43,6 +47,8 @@ extern SoundInterface_struct *SNDCoreList[]; #define CHANSTAT_STOPPED 0 #define CHANSTAT_PLAY 1 +static FORCEINLINE u32 sputrunc(float f) { return u32floor(f); } + const s8 indextbl[8] = { -1, -1, -1, -1, 2, 4, 6, 8 @@ -81,7 +87,7 @@ FILE *spufp=NULL; ////////////////////////////////////////////////////////////////////////////// template -static INLINE T MinMax(T val, T min, T max) +static FORCEINLINE T MinMax(T val, T min, T max) { if (val < min) return min; @@ -269,7 +275,7 @@ void SPU_struct::KeyOn(int channel) { channel_struct &thischan = channels[channel]; - thischan.sampinc = (16777216 / (0x10000 - (double)thischan.timer)) / 44100; + thischan.sampinc = (16777216 / (0x10000 - (float)thischan.timer)) / 44100; // LOG("Channel %d key on: vol = %d, datashift = %d, hold = %d, pan = %d, waveduty = %d, repeat = %d, format = %d, source address = %07X, timer = %04X, loop start = %04X, length = %06X, MMU.ARM7_REG[0x501] = %02X\n", channel, chan->vol, chan->datashift, chan->hold, chan->pan, chan->waveduty, chan->repeat, chan->format, chan->addr, chan->timer, chan->loopstart, chan->length, T1ReadByte(MMU.ARM7_REG, 0x501)); switch(thischan.format) @@ -386,7 +392,7 @@ void SPU_struct::WriteWord(u32 addr, u16 val) break; case 0x8: thischan.timer = val & 0xFFFF; - thischan.sampinc = (16777216 / (0x10000 - (double)thischan.timer)) / 44100; + thischan.sampinc = (16777216 / (0x10000 - (float)thischan.timer)) / 44100; break; case 0xA: thischan.loopstart = val; @@ -435,7 +441,7 @@ void SPU_struct::WriteLong(u32 addr, u32 val) case 0x8: thischan.timer = val & 0xFFFF; thischan.loopstart = val >> 16; - thischan.sampinc = (16777216 / (0x10000 - (double)thischan.timer)) / 44100; + thischan.sampinc = (16777216 / (0x10000 - (float)thischan.timer)) / 44100; break; case 0xC: thischan.length = val & 0x3FFFFF; @@ -457,7 +463,7 @@ void SPU_WriteLong(u32 addr, u32 val) } #ifdef SPU_INTERPOLATE -static s32 Interpolate(s32 a, s32 b, double ratio) +static FORCEINLINE s32 Interpolate(s32 a, s32 b, float ratio) { //ratio = ratio - (int)ratio; //double ratio2 = ((1.0f - cos(ratio * 3.14f)) / 2.0f); @@ -465,17 +471,17 @@ static s32 Interpolate(s32 a, s32 b, double ratio) //return (((1-ratio2)*a) + (ratio2*b)); //linear interpolation - ratio = ratio - (int)ratio; + ratio = ratio - sputrunc(ratio); return (1-ratio)*a + ratio*b; } #endif ////////////////////////////////////////////////////////////////////////////// -static INLINE void Fetch8BitData(channel_struct *chan, s32 *data) +static FORCEINLINE void Fetch8BitData(channel_struct *chan, s32 *data) { #ifdef SPU_INTERPOLATE - int loc = (int)chan->sampcnt; + u32 loc = sputrunc(chan->sampcnt); s32 a = (s32)(chan->buf8[loc] << 8); if(loc < (chan->length << 2) - 1) { /* double ratio = chan->sampcnt-loc;*/ @@ -485,16 +491,16 @@ static INLINE void Fetch8BitData(channel_struct *chan, s32 *data) } *data = a; #else - *data = (s32)chan->buf8[(int)chan->sampcnt] << 8; + *data = (s32)chan->buf8[sputrunc(chan->sampcnt)] << 8; #endif } ////////////////////////////////////////////////////////////////////////////// -static INLINE void Fetch16BitData(channel_struct *chan, s32 *data) +static FORCEINLINE void Fetch16BitData(channel_struct *chan, s32 *data) { #ifdef SPU_INTERPOLATE - int loc = (int)chan->sampcnt; + int loc = sputrunc(chan->sampcnt); s32 a = (s32)chan->buf16[loc]; if(loc < (chan->length << 1) - 1) { //double ratio = chan->sampcnt-loc; @@ -504,49 +510,36 @@ static INLINE void Fetch16BitData(channel_struct *chan, s32 *data) } *data = a; #else - *data = (s32)chan->buf16[(int)chan->sampcnt]; + *data = (s32)chan->buf16[sputrunc(chan->sampcnt)]; #endif } + + ////////////////////////////////////////////////////////////////////////////// -static INLINE void FetchADPCMData(channel_struct *chan, s32 *data) +static FORCEINLINE void FetchADPCMData(channel_struct * const chan, s32 * const data) { - u8 data4bit; - int diff; - int i; + // No sense decoding, just return the last sample + if (chan->lastsampcnt == sputrunc(chan->sampcnt)) + goto end; - if (chan->lastsampcnt == (int)chan->sampcnt) + const u32 endExclusive = sputrunc(chan->sampcnt+1); + for (u32 i = chan->lastsampcnt+1; i < endExclusive; i++) { - // No sense decoding, just return the last sample -#ifdef SPU_INTERPOLATE - *data = Interpolate((s32)chan->pcm16b_last,(s32)chan->pcm16b,chan->sampcnt); -#else - *data = (s32)chan->pcm16b; -#endif - return; - } + const u32 shift = (i&1)<<2; + const u32 data4bit = (((u32)chan->buf8[i >> 1]) >> shift); - for (i = chan->lastsampcnt+1; i < (int)chan->sampcnt+1; i++) - { - if (i & 0x1) - data4bit = (chan->buf8[i >> 1] >> 4) & 0xF; - else - data4bit = chan->buf8[i >> 1] & 0xF; - - /*diff = ((data4bit & 0x7) * 2 + 1) * adpcmtbl[chan->index] / 8; - if (data4bit & 0x8) - diff = -diff;*/ - diff = precalcdifftbl[chan->index][data4bit]; + const s32 diff = precalcdifftbl[chan->index][data4bit & 0xF]; + chan->index = precalcindextbl[chan->index][data4bit & 0x7]; chan->pcm16b_last = chan->pcm16b; chan->pcm16b = MinMax(chan->pcm16b+diff, -0x8000, 0x7FFF); - //chan->index = MinMax(chan->index+indextbl[data4bit & 0x7], 0, 88); - chan->index = precalcindextbl[chan->index][data4bit & 0x7]; } - chan->lastsampcnt = (int)chan->sampcnt; + chan->lastsampcnt = sputrunc(chan->sampcnt); +end: #ifdef SPU_INTERPOLATE *data = Interpolate((s32)chan->pcm16b_last,(s32)chan->pcm16b,chan->sampcnt); #else @@ -556,7 +549,7 @@ static INLINE void FetchADPCMData(channel_struct *chan, s32 *data) ////////////////////////////////////////////////////////////////////////////// -static INLINE void FetchPSGData(channel_struct *chan, s32 *data) +static FORCEINLINE void FetchPSGData(channel_struct *chan, s32 *data) { if(chan->num < 8) { @@ -564,17 +557,18 @@ static INLINE void FetchPSGData(channel_struct *chan, s32 *data) } else if(chan->num < 14) { - *data = (s32)wavedutytbl[chan->waveduty][((int)chan->sampcnt) & 0x7]; + *data = (s32)wavedutytbl[chan->waveduty][(sputrunc(chan->sampcnt)) & 0x7]; } else { - if(chan->lastsampcnt == (int)chan->sampcnt) + if(chan->lastsampcnt == sputrunc(chan->sampcnt)) { *data = (s32)chan->psgnoise_last; return; } - for(int i = chan->lastsampcnt; i < (int)chan->sampcnt; i++) + u32 max = sputrunc(chan->sampcnt); + for(u32 i = chan->lastsampcnt; i < max; i++) { if(chan->x & 0x1) { @@ -588,7 +582,7 @@ static INLINE void FetchPSGData(channel_struct *chan, s32 *data) } } - chan->lastsampcnt = (int)chan->sampcnt; + chan->lastsampcnt = sputrunc(chan->sampcnt); *data = (s32)chan->psgnoise_last; } @@ -596,51 +590,42 @@ static INLINE void FetchPSGData(channel_struct *chan, s32 *data) ////////////////////////////////////////////////////////////////////////////// -static INLINE void MixL(SPU_struct* SPU, channel_struct *chan, s32 data) +static FORCEINLINE void MixL(SPU_struct* SPU, channel_struct *chan, s32 data) { - if (data) - { - data = (data * chan->vol / 127) >> chan->datashift; - SPU->sndbuf[SPU->bufpos<<1] += data; - } + data = (data * chan->vol / 127) >> chan->datashift; + SPU->sndbuf[SPU->bufpos<<1] += data; } ////////////////////////////////////////////////////////////////////////////// -static INLINE void MixR(SPU_struct* SPU, channel_struct *chan, s32 data) +static FORCEINLINE void MixR(SPU_struct* SPU, channel_struct *chan, s32 data) { - if (data) - { - data = (data * chan->vol / 127) >> chan->datashift; - SPU->sndbuf[(SPU->bufpos<<1)+1] += data; - } + data = (data * chan->vol / 127) >> chan->datashift; + SPU->sndbuf[(SPU->bufpos<<1)+1] += data; } ////////////////////////////////////////////////////////////////////////////// -static INLINE void MixLR(SPU_struct* SPU, channel_struct *chan, s32 data) +static FORCEINLINE void MixLR(SPU_struct* SPU, channel_struct *chan, s32 data) { - if (data) - { - data = ((data * chan->vol) / 127) >> chan->datashift; - SPU->sndbuf[SPU->bufpos<<1] += data * (127 - chan->pan) / 127; - SPU->sndbuf[(SPU->bufpos<<1)+1] += data * chan->pan / 127; - } + data = ((data * chan->vol) / 127) >> chan->datashift; + SPU->sndbuf[SPU->bufpos<<1] += data * (127 - chan->pan) / 127; + SPU->sndbuf[(SPU->bufpos<<1)+1] += data * chan->pan / 127; } ////////////////////////////////////////////////////////////////////////////// -static INLINE void TestForLoop(SPU_struct *SPU, channel_struct *chan) +static FORCEINLINE void TestForLoop(SPU_struct *SPU, channel_struct *chan) { int shift = (chan->format == 0 ? 2 : 1); chan->sampcnt += chan->sampinc; - if (chan->sampcnt > (double)((chan->length + chan->loopstart) << shift)) + if (chan->sampcnt > (float)((chan->length + chan->loopstart) << shift)) { // Do we loop? Or are we done? if (chan->repeat == 1) - chan->sampcnt = (double)(chan->loopstart << shift); // Is this correct? + chan->sampcnt = (float)(chan->loopstart << shift); // Is this correct? else { chan->status = CHANSTAT_STOPPED; @@ -654,16 +639,16 @@ static INLINE void TestForLoop(SPU_struct *SPU, channel_struct *chan) ////////////////////////////////////////////////////////////////////////////// -static INLINE void TestForLoop2(SPU_struct *SPU, channel_struct *chan) +static FORCEINLINE void TestForLoop2(SPU_struct *SPU, channel_struct *chan) { chan->sampcnt += chan->sampinc; - if (chan->sampcnt > (double)((chan->length + chan->loopstart) << 3)) + if (chan->sampcnt > (float)((chan->length + chan->loopstart) << 3)) { // Do we loop? Or are we done? if (chan->repeat == 1) { - chan->sampcnt = (double)(chan->loopstart << 3); // Is this correct? + chan->sampcnt = (float)(chan->loopstart << 3); // Is this correct? chan->pcm16b = (s16)((chan->buf8[1] << 8) | chan->buf8[0]); chan->index = chan->buf8[2] & 0x7F; chan->lastsampcnt = 7; @@ -1274,7 +1259,7 @@ void SNDFileSetVolume(int volume) void spu_savestate(std::ostream* os) { //version - write32le(0,os); + write32le(1,os); SPU_struct *spu = SPU_core; @@ -1293,8 +1278,8 @@ void spu_savestate(std::ostream* os) write16le(chan.timer,os); write16le(chan.loopstart,os); write32le(chan.length,os); - write64le(double_to_u64(chan.sampcnt),os); - write64le(double_to_u64(chan.sampinc),os); + write32le(float_to_u32(chan.sampcnt),os); + write32le(float_to_u32(chan.sampinc),os); write32le(chan.lastsampcnt,os); write16le(chan.pcm16b,os); write16le(chan.pcm16b_last,os); @@ -1309,7 +1294,7 @@ bool spu_loadstate(std::istream* is, int size) //read version int version; if(read32le(&version,is) != 1) return false; - if(version != 0) return false; + SPU_struct *spu = SPU_core; @@ -1328,9 +1313,17 @@ bool spu_loadstate(std::istream* is, int size) read16le(&chan.timer,is); read16le(&chan.loopstart,is); read32le(&chan.length,is); - u64 temp; - read64le(&temp,is); chan.sampcnt = u64_to_double(temp); - read64le(&temp,is); chan.sampinc = u64_to_double(temp); + if(version == 0) + { + u64 temp; + read64le(&temp,is); chan.sampcnt = (float)u64_to_double(temp); + read64le(&temp,is); chan.sampinc = (float)u64_to_double(temp); + } + else + { + read32le((u32*)&chan.sampcnt,is); + read32le((u32*)&chan.sampinc,is); + } read32le(&chan.lastsampcnt,is); read16le(&chan.pcm16b,is); read16le(&chan.pcm16b_last,is); @@ -1340,7 +1333,6 @@ bool spu_loadstate(std::istream* is, int size) //fixup the pointers which we had are supposed to keep cached chan.buf8 = (s8*)&MMU.MMU_MEM[1][(chan.addr>>20)&0xFF][(chan.addr & MMU.MMU_MASK[1][(chan.addr >> 20) & 0xFF])]; - chan.buf16 = (s16*)chan.buf8; } //copy the core spu (the more accurate) to the user spu diff --git a/desmume/src/SPU.h b/desmume/src/SPU.h index 6fc34b217..3506c2209 100644 --- a/desmume/src/SPU.h +++ b/desmume/src/SPU.h @@ -58,10 +58,12 @@ struct channel_struct u16 timer; u16 loopstart; u32 length; - s8 *buf8; - s16 *buf16; - double sampcnt; - double sampinc; + union { + s8 *buf8; + s16 *buf16; + }; + float sampcnt; + float sampinc; // ADPCM specific int lastsampcnt; s16 pcm16b, pcm16b_last; diff --git a/desmume/src/matrix.h b/desmume/src/matrix.h index bbb3ee6f8..d28a87a47 100644 --- a/desmume/src/matrix.h +++ b/desmume/src/matrix.h @@ -98,4 +98,36 @@ void Vector4Copy(float *dst, const float *src); } //extern "C" +//this function is an unreliable, inaccurate floor. +//it should only be used for positive numbers +//this isnt as fast as it could be if we used a visual c++ intrinsic, but those appear not to be universally available +FORCEINLINE u32 u32floor(float f) +{ +#ifndef NOSSE2 + __asm cvttss2si eax, f; +#else + return (u32)f; +#endif +} + +//same as above but works for negative values too. +//be sure that the results are the same thing as floorf! +FORCEINLINE s32 s32floor(float f) +{ +#ifndef NOSSE2 + static const float c = -0.5f; + __asm + { + movss xmm0, f; + addss xmm0, xmm0; + addss xmm0, c; + cvtss2si eax, xmm0 + sar eax, 1 + } +#else + return (s32)floorf(f); +#endif +} + + #endif diff --git a/desmume/src/rasterize.cpp b/desmume/src/rasterize.cpp index 6f9ffc6b0..6aac05639 100644 --- a/desmume/src/rasterize.cpp +++ b/desmume/src/rasterize.cpp @@ -254,36 +254,6 @@ FORCEINLINE int iround(float f) { return (int)f; //lol } -//this function is an unreliable, inaccurate floor. -//it should only be used for positive numbers -//this isnt as fast as it could be if we used a visual c++ intrinsic, but those appear not to be universally available -FORCEINLINE u32 u32floor(float f) -{ -#ifndef NOSSE2 - __asm cvttss2si eax, f; -#else - return (u32)f; -#endif -} - -//same as above but works for negative values too. -//be sure that the results are the same thing as floorf! -FORCEINLINE s32 s32floor(float f) -{ -#ifndef NOSSE2 - static const float c = -0.5f; - __asm - { - movss xmm0, f; - addss xmm0, xmm0; - addss xmm0, c; - cvtss2si eax, xmm0 - sar eax, 1 - } -#else - return (s32)floorf(f); -#endif -} static struct Sampler { diff --git a/desmume/src/types.h b/desmume/src/types.h index 0b3d4ab3e..2799666ed 100644 --- a/desmume/src/types.h +++ b/desmume/src/types.h @@ -320,6 +320,24 @@ inline double u64_to_double(u64 u) { return fuxor.b; } +inline u32 float_to_u32(float f) { + union { + u32 a; + float b; + } fuxor; + fuxor.b = f; + return fuxor.a; +} + +inline float u32_to_float(u32 u) { + union { + u32 a; + float b; + } fuxor; + fuxor.a = u; + return fuxor.b; +} + ///stores a 32bit value into the provided byte array in guaranteed little endian form inline void en32lsb(u8 *buf, u32 morp)