//GiGaHeRz's SPU2 Driver //Copyright (c) 2003-2008, David Quintana // //This library is free software; you can redistribute it and/or //modify it under the terms of the GNU Lesser General Public //License as published by the Free Software Foundation; either //version 2.1 of the License, or (at your option) any later version. // //This library is distributed in the hope that it will be useful, //but WITHOUT ANY WARRANTY; without even the implied warranty of //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU //Lesser General Public License for more details. // //You should have received a copy of the GNU Lesser General Public //License along with this library; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // [TODO] : The layout of this code file is now a complete hackish mess after // numerous timestretch-related additions. The whole thing should really be // rethought and redone at this point. #include "spu2.h" #include "SoundTouch/SoundTouch.h" #include "SoundTouch/WavFile.h" #include static int ts_stats_stretchblocks = 0; static int ts_stats_normalblocks = 0; static int ts_stats_logcounter = 0; class NullOutModule: public SndOutModule { public: s32 Init(SndBuffer *) { return 0; } void Close() { } s32 Test() const { return 0; } void Configure(HWND parent) { } bool Is51Out() const { return false; } int GetEmptySampleCount() const { return 0; } const char* GetIdent() const { return "nullout"; } const char* GetLongName() const { return "No Sound (Emulate SPU2 only)"; } } NullOut; SndOutModule* mods[]= { &NullOut, WaveOut, DSoundOut, //DSound51Out, //ASIOOut, XAudio2Out, NULL // signals the end of our list }; int FindOutputModuleById( const char* omodid ) { int modcnt = 0; while( mods[modcnt] != NULL ) { if( strcmp( mods[modcnt]->GetIdent(), omodid ) == 0 ) break; ++modcnt; } return modcnt; } // Overall master volume shift. // Converts the mixer's 32 bit value into a 16 bit value. int SndOutVolumeShift = SndOutVolumeShiftBase + 1; static __forceinline s16 SndScaleVol( s32 inval ) { return inval >> SndOutVolumeShift; } // records last buffer status (fill %, range -100 to 100, with 0 being 50% full) float lastPct; float lastEmergencyAdj; float cTempo=1; float eTempo = 1; int freezeTempo = 0; soundtouch::SoundTouch* pSoundTouch=NULL; //usefull when timestretch isn't available //#define DYNAMIC_BUFFER_LIMITING class SndBufferImpl: public SndBuffer { private: s32 *buffer; s32 size; s32 rpos; s32 wpos; s32 data; // data prediction amount, used to "commit" data that hasn't // finished timestretch processing. s32 predictData; bool pw; bool underrun_freeze; HANDLE hSyncEvent; CRITICAL_SECTION cs; protected: int GetAlignedBufferSize( int comp ) { return (comp + SndOutPacketSize-1) & ~(SndOutPacketSize-1); } public: SndBufferImpl( float latencyMS ) { rpos=0; wpos=0; data=0; size=GetAlignedBufferSize( (int)(latencyMS * SampleRate / 500.0f ) ); buffer = new s32[size]; pw=false; underrun_freeze = false; predictData = 0; #ifdef DYNAMIC_BUFFER_LIMITING overflows=0; underflows=0; writewaits=0; buffer_limit=size; #endif InitializeCriticalSection(&cs); hSyncEvent = CreateEvent(NULL,FALSE,FALSE,NULL); } virtual ~SndBufferImpl() { pw=false; PulseEvent(hSyncEvent); Sleep(10); EnterCriticalSection(&cs); LeaveCriticalSection(&cs); DeleteCriticalSection(&cs); CloseHandle(hSyncEvent); delete buffer; } virtual void WriteSamples(s32 *bData, int nSamples) { EnterCriticalSection(&cs); int free = size-data; predictData = 0; jASSUME( data <= size ); if( pw && ( free < nSamples ) ) { // Wait for a ReadSamples to pull some stuff out of the buffer. // One SyncEvent will do the trick. ResetEvent( hSyncEvent ); LeaveCriticalSection(&cs); WaitForSingleObject(hSyncEvent,20); EnterCriticalSection(&cs); } // Problem: // If the SPU2 gets out of sync with the SndOut device, the writepos of the // circular buffer will overtake the readpos, leading to a prolonged period // of hopscotching read/write accesses (ie, lots of staticy crap sound for // several seconds). // // Compromise: // When an overrun occurs, we adapt by discarding a portion of the buffer. // The older portion of the buffer is discarded rather than incoming data, // so that the overall audio synchronization is better. if( free < nSamples ) { // Buffer overrun! // Dump samples from the read portion of the buffer instead of dropping // the newly written stuff. s32 comp; if( timeStretchEnabled ) { // If we overran it means the timestretcher failed. We need to speed // up audio playback. cTempo += cTempo * 0.12f; eTempo += eTempo * 0.40f; if( eTempo > 7.5f ) eTempo = 7.5f; pSoundTouch->setTempo( eTempo ); // Throw out just a little bit (two packets worth) to help // give the TS some room to work: comp = SndOutPacketSize*2; } else { // Toss half the buffer plus whatever's being written anew: comp = GetAlignedBufferSize( (size + nSamples ) / 2 ); if( comp > (size-SndOutPacketSize) ) comp = size-SndOutPacketSize; } data-=comp; rpos=(rpos+comp)%size; if( MsgOverruns() ) ConLog(" * SPU2 > Overrun Compensation (%d packets tossed)\n", comp / SndOutPacketSize ); lastPct = 0.0; // normalize the timestretcher } // copy in two phases, since there's a chance the packet // wraps around the buffer (it'd be nice to deal in packets only, but // the timestretcher and DSP options require flexibility). const int endPos = wpos + nSamples; const int secondCopyLen = endPos - size; s32* wposbuffer = &buffer[wpos]; data += nSamples; if( secondCopyLen > 0 ) { nSamples -= secondCopyLen; memcpy( buffer, &bData[nSamples], secondCopyLen * sizeof( *bData ) ); wpos = secondCopyLen; } else wpos += nSamples; memcpy( wposbuffer, bData, nSamples * sizeof( *bData ) ); LeaveCriticalSection(&cs); } protected: // Returns TRUE if there is data to be output, or false if no data // is available to be copied. bool CheckUnderrunStatus( int& nSamples, int& quietSampleCount ) { quietSampleCount = 0; if( underrun_freeze ) { int toFill = (int)(size * ( timeStretchEnabled ? 0.1f : 0.50f ) ); toFill = GetAlignedBufferSize( toFill ); // toFill is now aligned to a SndOutPacket if( data < toFill ) { quietSampleCount = nSamples; return false; } underrun_freeze = false; if( MsgOverruns() ) ConLog(" * SPU2 > Underrun compensation (%d packets buffered)\n", toFill / SndOutPacketSize ); lastPct = 0.0; // normalize timestretcher } else if( data < nSamples ) { nSamples = data; quietSampleCount = SndOutPacketSize - data; underrun_freeze = true; if( timeStretchEnabled ) { // timeStretcher failed it's job. We need to slow down the audio some. cTempo -= (cTempo * 0.12f); eTempo -= (eTempo * 0.30f); if( eTempo < 0.1f ) eTempo = 0.1f; pSoundTouch->setTempo( eTempo ); } return nSamples != 0; } return true; } public: void ReadSamples( s16* bData ) { int nSamples = SndOutPacketSize; EnterCriticalSection(&cs); // Problem: // If the SPU2 gets even the least bit out of sync with the SndOut device, // the readpos of the circular buffer will overtake the writepos, // leading to a prolonged period of hopscotching read/write accesses (ie, // lots of staticy crap sound for several seconds). // // Fix: // If the read position overtakes the write position, abort the // transfer immediately and force the SndOut driver to wait until // the read buffer has filled up again before proceeding. // This will cause one brief hiccup that can never exceed the user's // set buffer length in duration. int quietSamples; if( CheckUnderrunStatus( nSamples, quietSamples ) ) { jASSUME( nSamples <= SndOutPacketSize ); // [Air] [TODO]: This loop is probably a candidiate for SSE2 optimization. const int endPos = rpos + nSamples; const int secondCopyLen = endPos - size; const s32* rposbuffer = &buffer[rpos]; data -= nSamples; if( secondCopyLen > 0 ) { nSamples -= secondCopyLen; for( int i=0; i 0 ) { nSamples -= secondCopyLen; memcpy( &bData[nSamples], buffer, secondCopyLen * sizeof( *bData ) ); rpos = secondCopyLen; } else rpos += nSamples; memcpy( bData, &buffer[oldrpos], nSamples * sizeof( *bData ) ); } // If quietSamples != 0 it means we have an underrun... // Let's just dull out some silence, because that's usually the least // painful way of dealing with underruns: memset( bData, 0, quietSamples * sizeof(*bData) ); PulseEvent(hSyncEvent); LeaveCriticalSection(&cs); } void PredictDataWrite( int samples ) { predictData += samples; } virtual void PauseOnWrite(bool doPause) { pw = doPause; } // Calculate the buffer status percentage. // Returns range from -1.0 to 1.0 // 1.0 = buffer overflow! // 0.0 = buffer nominal (50% full) // -1.0 = buffer underflow! float GetStatusPct() { EnterCriticalSection(&cs); // Get the buffer status of the output driver too, so that we can // obtain a more accurate overall buffer status. int drvempty = mods[OutputModule]->GetEmptySampleCount(); // / 2; //ConLog( "Data %d >>> driver: %d predict: %d\n", data, drvempty, predictData ); float result = (float)(data + predictData - drvempty) - (size/2); result /= (size/2); LeaveCriticalSection(&cs); return result; } }; SndBufferImpl *sndBuffer=NULL; s32* sndTempBuffer=NULL; s32 sndTempProgress=NULL; s16* sndTempBuffer16=NULL; void UpdateTempoChange() { if( --freezeTempo > 0 ) { return; } float statusPct = sndBuffer->GetStatusPct(); float pctChange = statusPct - lastPct; float tempoChange; float emergencyAdj = 0; float newcee = cTempo; // workspace var. for cTempo // IMPORTANT! // If you plan to tweak these values, make sure you're using a release build // OUTSIDE THE DEBUGGER to test it! The Visual Studio debugger can really cause // erratic behavior in the audio buffers, and makes the timestretcher seem a // lot more inconsistent than it really is. // We have two factors. // * Distance from nominal buffer status (50% full) // * The change from previous update to this update. // Prediction based on the buffer change: // (linear seems to work better here) tempoChange = pctChange * 0.75f; if( statusPct * tempoChange < 0.0f ) { // only apply tempo change if it is in synch with the buffer status. // In other words, if the buffer is high (over 0%), and is decreasing, // ignore it. It'll just muck things up. tempoChange = 0; } // Sudden spikes in framerate can cause the nominal buffer status // to go critical, in which case we have to enact an emergency // stretch. The following cubic formulas do that. Values near // the extremeites give much larger results than those near 0. // And the value is added only this time, and does not accumulate. // (otherwise a large value like this would cause problems down the road) // Constants: // Weight - weights the statusPct's "emergency" consideration. // higher values here will make the buffer perform more drastic // compensations at the outter edges of the buffer (at -75 or +75% // or beyond, for example). // Range - scales the adjustment to the given range (more or less). // The actual range is dependent on the weight used, so if you increase // Weight you'll usually want to decrease Range somewhat to compensate. // Prediction based on the buffer fill status: const float statusWeight = 2.99f; const float statusRange = 0.068f; // "non-emergency" deadzone: In this area stretching will be strongly discouraged. // Note: due tot he nature of timestretch latency, it's always a wee bit harder to // cope with low fps (underruns) tha it is high fps (overruns). So to help out a // little, the low-end portions of this check are less forgiving than the high-sides. if( cTempo < 0.965f || cTempo > 1.060f || pctChange < -0.38f || pctChange > 0.54f || statusPct < -0.32f || statusPct > 0.39f || eTempo < 0.89f || eTempo > 1.19f ) { emergencyAdj = ( pow( statusPct*statusWeight, 3.0f ) * statusRange); } // Smooth things out by factoring our previous adjustment into this one. // It helps make the system 'feel' a little smarter by giving it at least // one packet worth of history to help work off of: emergencyAdj = (emergencyAdj * 0.75f) + (lastEmergencyAdj * 0.25f ); lastEmergencyAdj = emergencyAdj; lastPct = statusPct; // Accumulate a fraction of the tempo change into the tempo itself. // This helps the system run "smarter" to games that run consistently // fast or slow by altering the base tempo to something closer to the // game's active speed. In tests most games normalize within 2 seconds // at 100ms latency, which is pretty good (larger buffers normalize even // quicker). newcee += newcee * (tempoChange+emergencyAdj) * 0.03f; // Apply tempoChange as a scale of cTempo. That way the effect is proportional // to the current tempo. (otherwise tempos rate of change at the extremes would // be too drastic) float newTempo = newcee + ( emergencyAdj * cTempo ); // ... and as a final optimization, only stretch if the new tempo is outside // a nominal threshold. Keep this threshold check small, because it could // cause some serious side effects otherwise. (enlarging the cTempo check above // is usually better/safer) if( newTempo < 0.970f || newTempo > 1.045f ) { cTempo = (float)newcee; if( newTempo < 0.10f ) newTempo = 0.10f; else if( newTempo > 10.0f ) newTempo = 10.0f; if( cTempo < 0.15f ) cTempo = 0.15f; else if( cTempo > 7.5f ) cTempo = 7.5f; pSoundTouch->setTempo( eTempo = (float)newTempo ); ts_stats_stretchblocks++; /*ConLog(" * SPU2: [Nominal %d%%] [Emergency: %d%%] (baseTempo: %d%% ) (newTempo: %d%%) (buffer: %d%%)\n", //(relation < 0.0) ? "Normalize" : "", (int)(tempoChange * 100.0 * 0.03), (int)(emergencyAdj * 100.0), (int)(cTempo * 100.0), (int)(newTempo * 100.0), (int)(statusPct * 100.0) );*/ } else { // Nominal operation -- turn off stretching. // note: eTempo 'slides' toward 1.0 for smoother audio and better // protection against spikes. if( cTempo != 1.0f ) { cTempo = 1.0f; eTempo = ( 1.0f + eTempo ) * 0.5f; pSoundTouch->setTempo( eTempo ); } else { if( eTempo != cTempo ) pSoundTouch->setTempo( eTempo=cTempo ); ts_stats_normalblocks++; } } } void soundtouchInit() { pSoundTouch = new soundtouch::SoundTouch(); pSoundTouch->setSampleRate(SampleRate); pSoundTouch->setChannels(2); pSoundTouch->setSetting(SETTING_USE_QUICKSEEK, 0); pSoundTouch->setSetting(SETTING_USE_AA_FILTER, 0); pSoundTouch->setTempo(1); // some timestretch management vars: cTempo = 1.0; eTempo = 1.0; lastPct = 0; lastEmergencyAdj = 0; // just freeze tempo changes for a while at startup. // the driver buffers are bogus anyway. freezeTempo = 8; } static void _sndInitFail() { // If a failure occurs, just initialize the NoSound driver. This'll allow // the game to emulate properly (hopefully), albeit without sound. OutputModule = FindOutputModuleById( NullOut.GetIdent() ); mods[OutputModule]->Init( sndBuffer ); } s32 SndInit() { if( mods[OutputModule] == NULL ) { _sndInitFail(); return 0; } // initialize sound buffer // Buffer actually attempts to run ~50%, so allocate near double what // the requested latency is: try { sndBuffer = new SndBufferImpl( SndOutLatencyMS * (timeStretchEnabled ? 2.0f : 1.5f) ); sndTempBuffer = new s32[SndOutPacketSize]; sndTempBuffer16 = new s16[SndOutPacketSize]; } catch( std::bad_alloc& ) { // out of memory exception (most likely) SysMessage( "Out of memory error occured while initializing SPU2." ); _sndInitFail(); return 0; } // clear buffers! // Fixes loopy sounds on emu resets. memset( sndTempBuffer, 0, sizeof(s32) * SndOutPacketSize ); memset( sndTempBuffer16, 0, sizeof(s16) * SndOutPacketSize ); sndTempProgress = 0; soundtouchInit(); // initializes the timestretching if(LimitMode!=0) { sndBuffer->PauseOnWrite(true); } // some crap spdif_set51(mods[OutputModule]->Is51Out()); // initialize module if( mods[OutputModule]->Init(sndBuffer) == -1 ) { _sndInitFail(); } return 0; } void SndClose() { mods[OutputModule]->Close(); SAFE_DELETE_OBJ( sndBuffer ); SAFE_DELETE_ARRAY( sndTempBuffer ); SAFE_DELETE_ARRAY( sndTempBuffer16 ); SAFE_DELETE_OBJ( pSoundTouch ); } void SndUpdateLimitMode() { //sndBuffer->PauseOnWrite(LimitMode!=0); if(LimitMode!=0) { timeStretchEnabled = true; //printf(" * SPU2 limiter is now ON.\n"); printf(" * SPU2 timestretch is now ON.\n"); } else { //printf(" * SPU2 limiter is now OFF.\n"); printf(" * SPU2 timestretch is now OFF.\n"); timeStretchEnabled = false; } } s32 SndWrite(s32 ValL, s32 ValR) { #ifndef PUBLIC if(WaveLog() && wavedump_ok) { wavedump_write(SndScaleVol(ValL),SndScaleVol(ValR)); } #endif if(recording!=0) RecordWrite(SndScaleVol(ValL),SndScaleVol(ValR)); if(mods[OutputModule] == &NullOut) // null output doesn't need buffering or stretching! :p return 0; sndTempBuffer[sndTempProgress++] = ValL; sndTempBuffer[sndTempProgress++] = ValR; // If we haven't accumulated a full packet yet, do nothing more: if(sndTempProgress < SndOutPacketSize) return 1; if(dspPluginEnabled) { for(int i=0;i>1)<<1; for(int i=0;iPredictDataWrite( (int)( sndTempProgress / eTempo ) ); for(int i=0;iputSamples((float*)sndTempBuffer, sndTempProgress>>1); while( ( sndTempProgress = pSoundTouch->receiveSamples((float*)sndTempBuffer, sndTempProgress>>1)<<1 ) != 0 ) { // [Air] [TODO] : Implement an SSE downsampler to int. for(int i=0;iWriteSamples(sndTempBuffer, sndTempProgress); progress = true; } UpdateTempoChange(); if( MsgOverruns() ) { if( progress ) { if( ++ts_stats_logcounter > 300 ) { ts_stats_logcounter = 0; ConLog( " * SPU2 > Timestretch Stats > %d%% of packets stretched.\n", ( ts_stats_stretchblocks * 100 ) / ( ts_stats_normalblocks + ts_stats_stretchblocks ) ); ts_stats_normalblocks = 0; ts_stats_stretchblocks = 0; } } } } else { sndBuffer->WriteSamples(sndTempBuffer, sndTempProgress); sndTempProgress=0; } return 1; } s32 SndTest() { if( mods[OutputModule] == NULL ) return -1; return mods[OutputModule]->Test(); } void SndConfigure(HWND parent, u32 module ) { if( mods[module] == NULL ) return; mods[module]->Configure(parent); } #if 0 ////////////////////////////////////////////////////////////// // Basic Timestretcher (50% to 150%) const s32 StretchBufferSize = 2048; s32 stretchBufferL[StretchBufferSize*2]; s32 stretchBufferR[StretchBufferSize*2]; s32 stretchPosition=0; s32 stretchOutputSize = 2048; // valid values from 1024 to 3072 s32 blah; extern float cspeed; void TimestretchUpdate(int bufferusage,int buffersize) { if(cspeed>1.01) { stretchOutputSize+=10; } else if (cspeed<0.99) { stretchOutputSize-=10; } blah++; if(blah>=2) { blah=0; printf(" * Stretch = %d of %d\n",stretchOutputSize,StretchBufferSize); } } s32 SndWriteStretch(s32 ValL, s32 ValR) { // TODO: update stretchOutputSize according to speed :P stretchBufferL[stretchPosition] = ValL; stretchBufferR[stretchPosition] = ValR; stretchPosition++; if(stretchPosition>=StretchBufferSize) { stretchPosition=0; if(stretchOutputSize < (StretchBufferSize/2)) stretchOutputSize=(StretchBufferSize/2); if(stretchOutputSize > (StretchBufferSize*3/2)) stretchOutputSize=(StretchBufferSize*3/2); if(stretchOutputSize>StretchBufferSize) { int K = (stretchOutputSize-StretchBufferSize); int J = StretchBufferSize - K; // K samples offset for(int i=StretchBufferSize;i