SPU2ghz: Re-optimized the DMA write code so that DMA writes no longer incur as much of a cache-miss penalty. Also made a couple of minor tweaks to the timestretcher's overrun handler (which only affects people who like their games to run at 120 fps). ;)

git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@264 a6443dda-0b58-4228-96e9-037be469359c
2008-11-01 07:34:34 +00:00 · 2008-11-01 07:34:34 +00:00 · d8bd81f00d
parent 2b93aa0709
commit d8bd81f00d
5 changed files with 80 additions and 30 deletions
--- a/plugins/spu2ghz/dma.cpp
+++ b/plugins/spu2ghz/dma.cpp
@ -90,14 +90,16 @@ void DMALogClose() {
 }


-u16 DmaRead(u32 core) {
+__forceinline u16 DmaRead(u32 core)
+{
+	Cores[core].TDA&=0xfffff;
 	const u16 ret = (u16)spu2M_Read(Cores[core].TDA);
 	Cores[core].TDA++;
-	Cores[core].TDA&=0xfffff;
 	return ret;
 }

-void DmaWrite(u32 core, u16 value) {
+__forceinline void DmaWrite(u32 core, u16 value)
+{
 	spu2M_Write( Cores[core].TSA, value );
 	Cores[core].TSA++;
 	Cores[core].TSA&=0xfffff;
@ -189,14 +191,21 @@ void DoDMAWrite(int core,u16 *pMem,u32 size)
 {
 	u32 i;

-	u32 pa = ((u32)pMem)&7;
-	u32 pm = Cores[core].TSA&0x7;
-
-	if(pa || pm)
 	{
-		printf("* Missaligned addr in DMA write!\n");
+		// Perform an alignment check.
+		// Not really important.  Everything should work regardless,
+		// but it could be indicative of an emulation faux pas elsewhere.
+
+		uptr pa = ((uptr)pMem)&7;
+		uptr pm = Cores[core].TSA&0x7;
+
+		if(pa || pm)
+		{
+			fprintf(stderr, "* SPU2 : Missaligned addr in DMA write!\n");
+		}
 	}

+
 	if(core==0)
 		DMA4LogWrite(pMem,size<<1);
 	else
@ -204,9 +213,36 @@ void DoDMAWrite(int core,u16 *pMem,u32 size)

 	if(MsgDMA()) ConLog(" * SPU2: DMA%c Transfer of %d bytes to %x (%02x %x %04x).\n",(core==0)?'4':'7',size<<1,Cores[core].TSA,Cores[core].DMABits,Cores[core].AutoDMACtrl,(~Cores[core].Regs.ATTR)&0x7fff);

-	Cores[core].TDA=Cores[core].TSA;
-	for (i=0;i<size;i++) {
-		spu2M_Write( Cores[core].TDA, pMem[i] );
+	// Optimized!
+	// Instead of checking the adpcm cache for every word, we check for every block.
+	// That way we can use the optimized fast write instruction to commit the memory.
+
+	Cores[core].TDA = Cores[core].TSA & 0xfffff;
+
+	{
+		u32 nexta = Cores[core].TDA >> 3;
+		u32 flagbitmask = 1ul << ( nexta & 31 );
+		nexta >>= 5;
+
+		// Traverse from start to finish in 8 word blocks,
+		// and clear the pcm cache flag for each block.
+		u32 stmp = ( size + 7 ) >> 3;		// round up
+		for( i=0; i<stmp; i++ )
+		{
+			pcm_cache_flags[nexta] &= ~flagbitmask;
+			flagbitmask <<= 1;
+			if( flagbitmask == 0 )
+			{
+				nexta++;
+				flagbitmask = 1;
+			}
+		}
+	}
+
+	for(i=0;i<size;i++)
+	{
+		*GetMemPtr( Cores[core].TDA ) = pMem[i];
+		//spu2M_Write( Cores[core].TDA, pMem[i] );
 		Cores[core].TDA++;
 		Cores[core].TDA&=0xfffff;
 	}
--- a/plugins/spu2ghz/dma.h
+++ b/plugins/spu2ghz/dma.h
@ -23,9 +23,9 @@ void DMA4LogWrite(void *lpData, u32 ulSize);
 void DMA7LogWrite(void *lpData, u32 ulSize);
 void DMALogClose();

-void DmaWrite(u32 core, u16 data);
-u16 DmaRead(u32 core);
+extern void DmaWrite(u32 core, u16 data);
+extern u16 DmaRead(u32 core);

-void AutoDMAReadBuffer(int core, int mode);
+extern void AutoDMAReadBuffer(int core, int mode);

 #endif // DMA_H_INCLUDED //
--- a/plugins/spu2ghz/mixer.cpp
+++ b/plugins/spu2ghz/mixer.cpp
@ -998,7 +998,7 @@ double rfactor=1;
 double cfactor=1;
 double diff=0;

-static s32 __forceinline ApplyVolume(s32 data, s32 volume)
+static __forceinline s32 ApplyVolume(s32 data, s32 volume)
 {
 	return (volume * data);
 }
@ -1006,7 +1006,7 @@ static s32 __forceinline ApplyVolume(s32 data, s32 volume)
 // writes a signed value to the SPU2 ram
 // Performs no cache invalidation -- use only for dynamic memory ranges
 // of the SPU2 (between 0x0000 and SPU2_DYN_MEMLINE)
-static void __forceinline spu2M_WriteFast( u32 addr, s16 value )
+static __forceinline void spu2M_WriteFast( u32 addr, s16 value )
 {
 	// throw an assertion if the memory range is invalid:
 	jASSUME( addr < SPU2_DYN_MEMLINE );
@ -1014,7 +1014,7 @@ static void __forceinline spu2M_WriteFast( u32 addr, s16 value )
 }


-static void __forceinline MixVoice( V_Core& thiscore, V_Voice& vc, s32& VValL, s32& VValR )
+static __forceinline void MixVoice( V_Core& thiscore, V_Voice& vc, s32& VValL, s32& VValR )
 {
 	s32 Value=0;

--- a/plugins/spu2ghz/sndout.cpp
+++ b/plugins/spu2ghz/sndout.cpp
@ -187,9 +187,7 @@ public:
 			// Dump samples from the read portion of the buffer instead of dropping
 			// the newly written stuff.

-			// Toss half the buffer plus whatever's being written anew:
-			s32 comp = GetAlignedBufferSize( (size + nSamples ) / 2 );
-			if( comp > (size-SndOutPacketSize) ) comp = size-SndOutPacketSize;
+			s32 comp;

 			if( timeStretchEnabled )
 			{
@ -199,8 +197,18 @@ public:
 				eTempo += eTempo * 0.25f;
 				if( eTempo > 7.5f ) eTempo = 5.0f;
 				pSoundTouch->setTempo( eTempo );
-				freezeTempo = (comp / SndOutPacketSize) - 1;
-				if( freezeTempo < 1 ) freezeTempo = 1;
+				freezeTempo = 0;		// disabled tempo freeze for now.  May not be needed anymore.
+
+				// Throw out just a little bit (one packet worth) to help
+				// give the TS some room to work:
+
+				comp = SndOutPacketSize;
+			}
+			else
+			{
+				// Toss half the buffer plus whatever's being written anew:
+				comp = GetAlignedBufferSize( (size + nSamples ) / 2 );	// assign the outer 'comp'; redeclaring it here would shadow it and leave it uninitialized
+				if( comp > (size-SndOutPacketSize) ) comp = size-SndOutPacketSize;
 			}

 			data-=comp;
@ -241,7 +249,7 @@ public:
 		quietSampleCount = 0;
 		if( underrun_freeze )
 		{			
-			int toFill = (int)(size * ( timeStretchEnabled ? 0.1 : 0.70 ) );
+			int toFill = (int)(size * ( timeStretchEnabled ? 0.1 : 0.50 ) );
 			toFill = GetAlignedBufferSize( toFill );

 			// toFill is now aligned to a SndOutPacket
@ -599,6 +607,11 @@ s32 SndInit()
 	sndTempBuffer = new s32[SndOutPacketSize];
 	sndTempBuffer16 = new s16[SndOutPacketSize];

+	// clear buffers!
+	// Fixes loopy sounds on emu resets.
+	memset( sndTempBuffer, 0, sizeof(s32) * SndOutPacketSize );
+	memset( sndTempBuffer16, 0, sizeof(s16) * SndOutPacketSize );
+
 	cTempo = 1.0;
 	eTempo = 1.0;

--- a/plugins/spu2ghz/spu2.cpp
+++ b/plugins/spu2ghz/spu2.cpp
@ -153,13 +153,15 @@ void SysMessage(char *fmt, ...)
 	MessageBox(0, tmp, "SPU2ghz Msg", 0);
 }

-s16 __forceinline * __fastcall GetMemPtr(u32 addr)
+__forceinline s16 * __fastcall GetMemPtr(u32 addr)
 {
+	// In case you're wondering, this assert is the reason spu2ghz
+	// runs so incredibly slow in Debug mode. :P
 	assert(addr<0x100000);
 	return (_spu2mem+addr);
 }

-s16 __forceinline __fastcall spu2M_Read( u32 addr )
+__forceinline s16 __fastcall spu2M_Read( u32 addr )
 {
 	return *GetMemPtr( addr & 0xfffff );
 }
@ -168,7 +170,7 @@ s16 __forceinline __fastcall spu2M_Read( u32 addr )
 // Invalidates the ADPCM cache in the process.
 // Optimization note: don't use __forceinline because the footprint of this
 // function is a little too heavy now.  Better to let the compiler decide.
-void __inline __fastcall spu2M_Write( u32 addr, s16 value )
+__inline void __fastcall spu2M_Write( u32 addr, s16 value )
 {
 	// Make sure the cache is invalidated:
 	// (note to self : addr address WORDs, not bytes)
@ -182,7 +184,7 @@ void __inline __fastcall spu2M_Write( u32 addr, s16 value )
 }

 // writes an unsigned value to the SPU2 ram
-void __inline __fastcall spu2M_Write( u32 addr, u16 value )
+__inline void __fastcall spu2M_Write( u32 addr, u16 value )
 {
 	spu2M_Write( addr, (s16)value );
 }
@ -620,7 +622,6 @@ void UpdateDebugDialog()
 }
 #endif

-//SHOULD be 768, but 751/752 seems to get better results
 #define TickInterval 768

 u32 TicksCore=0;
@ -655,10 +656,10 @@ void __fastcall TimeUpdate(u32 cClocks, u32 syncType)
 	//  If for some reason our clock value seems way off base, just mix
 	//  out a little bit, skip the rest, and hope the ship "rights" itself later on.

-	if( dClocks > TickInterval*48 )
+	if( dClocks > TickInterval*72 )
 	{
 		ConLog( " * SPU2 > TimeUpdate Sanity Check (Tick Delta: %d) (PS2 Ticks: %d)\n", dClocks/TickInterval, cClocks/TickInterval );
-		dClocks = TickInterval*48;
+		dClocks = TickInterval*72;
 		lClocks = cClocks-dClocks;
 	}