SPU2ghz: Third try's the charm? This is an even better fix for the crackling sound in SO3. Note: This revision upgrades the spu2ghz savestate version to 0x101. Old states should still load fine for the most part.

git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@682 a6443dda-0b58-4228-96e9-037be469359c
2009-02-03 00:55:37 +00:00 · 2009-02-03 00:55:37 +00:00 · a92fc9900f
parent ef489b45bd
commit a92fc9900f
4 changed files with 158 additions and 122 deletions
--- a/plugins/spu2ghz/dma.cpp
+++ b/plugins/spu2ghz/dma.cpp
@ -220,27 +220,6 @@ void DoDMAWrite(int core,u16 *pMem,u32 size)
 	Cores[core].TSA &= 0xfffff;

 	u32 buff1end = Cores[core].TSA + size;
-
-	// Pcm Cache Invalidation!
-	// Ideally we would only mask bits actually written to, but it's a complex algorithm
-	// that is way more work than it's worth.  Masking out bytes would in theory work a
-	// little more effiently, but was buggy in practice for some reason.  So a dumb and
-	// dirty 32-bit mask will suffice.
-
-	// Note: When clearing cache flags, the *endpoint* needs to be rounded upward.
-	// just rounding the count upward could cause problems if both start and end
-	// points are mis-aligned.
-
-	// indexer scalar - 8 addresses per block, and 32 bits per dword:
-	const u32 indexer_scalar = 8*32;
-
-	const int roundUp = indexer_scalar-1;
-	const int flagTSA = Cores[core].TSA / indexer_scalar;
-	int flagTDA = (buff1end + roundUp) / indexer_scalar;	// endpoint, rounded up
-	u8* cache = (u8*)pcm_cache_flags;
-
-	memset( &pcm_cache_flags[flagTSA], 0, (flagTDA - flagTSA) * 4 );
-
 	u32 buff2end=0;
 	if( buff1end > 0x100000 )
 	{
@ -248,6 +227,60 @@ void DoDMAWrite(int core,u16 *pMem,u32 size)
 		buff1end = 0x100000;
 	}

+	const int cacheIdxStart = Cores[core].TSA / pcm_WordsPerBlock;
+	const int cacheIdxEnd = (buff1end+pcm_WordsPerBlock-1) / pcm_WordsPerBlock;
+	PcmCacheEntry* cacheLine = &pcm_cache_data[cacheIdxStart];
+	PcmCacheEntry& cacheEnd = pcm_cache_data[cacheIdxEnd];
+
+	do 
+	{
+		cacheLine->Validated = false;
+		cacheLine++;
+	} while ( cacheLine != &cacheEnd );
+
+#if 0
+	// Pcm Cache Invalidation!
+	// It's a requirement that we mask bits for the blocks that are written to *only*,
+	// because doing anything else can cause the cache to fail, thanks to the progressive
+	// nature of the SPU2's ADPCM encoding.  (the same thing that makes it impossible
+	// to use SSE optimizations on it).
+
+	u8* cache = (u8*)pcm_cache_flags;
+
+	// Step 1: Clear bits in the front remainder.
+
+	const int pcmTSA = Cores[core].TSA / pcm_WordsPerBlock;
+	const int pcmTDA = buff1end / pcm_WordsPerBlock;
+	const int remFront = pcmTSA & 31;
+	const int remBack = ((buff1end+pcm_WordsPerBlock-1)/pcm_WordsPerBlock) & 31;	// round up to get the end remainder
+
+	int flagTSA = pcmTSA / 32;
+
+	if( remFront )
+	{
+		// need to clear some upper bits of this u32
+		uint mask = (1ul<<remFront)-1;
+		cache[flagTSA++] &= mask;
+	}
+
+	// Step 2: Clear the middle run
+	const int flagClearLen = pcmTDA-pcmTSA;
+	memset( &cache[flagTSA], 0, flagClearLen );
+
+	// Step 3: Clear bits in the end remainder.
+
+	if( remBack )
+	{
+		// need to clear some lower bits in this u32
+		uint mask = ~(1ul<<remBack)-1;
+		cache[flagTSA + flagClearLen] &= mask;
+	}
+#endif
+
+	//ConLog( " * SPU2 : Cache Clear Range!  TSA=0x%x, TDA=0x%x (low8=0x%x, high8=0x%x, len=0x%x)\n",
+	//	Cores[core].TSA, buff1end, flagTSA, flagTDA, clearLen );
+
+
 	// First Branch needs cleared:
 	// It starts at TSA and goes to buff1end.

--- a/plugins/spu2ghz/mixer.cpp
+++ b/plugins/spu2ghz/mixer.cpp
@ -128,7 +128,7 @@ static void __forceinline XA_decode_block(s16* buffer, const s16* block, s32& pr
 			pcm = data>>shift;
 			pcm+=((pred1*prev1)+(pred2*prev2))>>6;
 			if(pcm> 32767) pcm= 32767;
-			if(pcm<-32768) pcm=-32768;
+			else if(pcm<-32768) pcm=-32768;
 			*(buffer++) = pcm;
 		}

@ -140,7 +140,7 @@ static void __forceinline XA_decode_block(s16* buffer, const s16* block, s32& pr
 			pcm2 = data>>shift;
 			pcm2+=((pred1*pcm)+(pred2*prev1))>>6;
 			if(pcm2> 32767) pcm2= 32767;
-			if(pcm2<-32768) pcm2=-32768;
+			else if(pcm2<-32768) pcm2=-32768;
 			*(buffer++) = pcm2;
 		}

@ -203,9 +203,10 @@ static void __forceinline IncrementNextA( const V_Core& thiscore, V_Voice& vc )
 	vc.NextA&=0xFFFFF;
 }

-
-u32 *pcm_cache_flags = NULL;
-s16 *pcm_cache_data = NULL;
+// decoded pcm data, used to cache the decoded data so that it needn't be decoded
+// multiple times.  Cache chunks are decoded when the mixer requests the blocks, and
+// invalidated when DMA transfers and memory writes are performed.
+PcmCacheEntry *pcm_cache_data = NULL;

 #ifndef PUBLIC
 int g_counter_cache_hits=0;
@ -249,16 +250,20 @@ static void __forceinline __fastcall GetNextDataBuffered( V_Core& thiscore, V_Vo

 		s16* memptr = GetMemPtr(vc.NextA&0xFFFFF);
 		vc.LoopFlags = *memptr >> 8;	// grab loop flags from the upper byte.
-		int nexta = vc.NextA / 8;		// 8 words per encoded block.
-		
-		vc.SBuffer = &pcm_cache_data[nexta * 28];

-		const u32 flagbitmask = 1ul<<(nexta & 31);  // 32 flags per array entry
-		nexta /= 32;
+		const int cacheIdx = vc.NextA / pcm_WordsPerBlock;
+		PcmCacheEntry& cacheLine = pcm_cache_data[cacheIdx];
+		vc.SBuffer = cacheLine.Sampledata;

-		if( pcm_cache_flags[nexta] & flagbitmask )
+		if( cacheLine.Validated )
 		{
-			// Cached block!  Read from the cache directly (ie, do nothing)
+			// Cached block!  Read from the cache directly.
+			// Make sure to propagate the prev1/prev2 ADPCM:
+
+			vc.Prev1 = vc.SBuffer[27];
+			vc.Prev2 = vc.SBuffer[26];
+
+			//ConLog( " * SPU2 : Cache Hit! NextA=0x%x, cacheIdx=0x%x\n", vc.NextA, cacheIdx );

 			#ifndef PUBLIC
 			g_counter_cache_hits++;
@ -267,19 +272,20 @@ static void __forceinline __fastcall GetNextDataBuffered( V_Core& thiscore, V_Vo
 		else
 		{
 			// Only flag the cache if it's a non-dynamic memory range.
-			if( nexta >= (SPU2_DYN_MEMLINE / (8*32)) )
-				pcm_cache_flags[nexta] |= flagbitmask;
+			if( vc.NextA >= SPU2_DYN_MEMLINE )
+				cacheLine.Validated = true;

 			#ifndef PUBLIC
-			if( nexta < (SPU2_DYN_MEMLINE / (8*32)) )
+			if( vc.NextA < SPU2_DYN_MEMLINE )
 				g_counter_cache_ignores++;
 			else
 				g_counter_cache_misses++;
 			#endif

-			// saturated decoder
+			s16* sbuffer = cacheLine.Sampledata;

-			XA_decode_block(vc.SBuffer, memptr, vc.Prev1, vc.Prev2);
+			// saturated decoder
+			XA_decode_block( sbuffer, memptr, vc.Prev1, vc.Prev2 );

 			// [Air]: Testing use of a new unsaturated decoder. (benchmark needed)
 			//   Chances are the saturation isn't needed, but for a very few exception games.
@ -288,7 +294,6 @@ static void __forceinline __fastcall GetNextDataBuffered( V_Core& thiscore, V_Vo
 			//   heavy use of the SPU2 via music or sfx will mostly use the cache anyway.

 			//XA_decode_block_unsaturated( vc.SBuffer, memptr, vc.Prev1, vc.Prev2 );
-
 		}

 		vc.SCurrent = 0;
@ -1173,8 +1178,8 @@ void __fastcall Mix()
 	}

 	// Commit Core 0 output to ram before mixing Core 1:
-	ExtL>>=13;
-	ExtR>>=13;
+	ExtL>>=14;
+	ExtR>>=14;
 	spu2M_WriteFast( 0x800 + OutPos, ExtL>>3 );
 	spu2M_WriteFast( 0xA00 + OutPos, ExtR>>3 );

--- a/plugins/spu2ghz/spu2.cpp
+++ b/plugins/spu2ghz/spu2.cpp
@ -226,11 +226,10 @@ __inline void __fastcall spu2M_Write( u32 addr, s16 value )
 	addr &= 0xfffff;
 	if( addr >= SPU2_DYN_MEMLINE )
 	{
-		const u32 nexta = addr >> 3;		// 8 words per encoded block.
-		const u32 flagbitmask = 1ul<<(nexta & 31);  // 31 flags per array entry
-		pcm_cache_flags[nexta/32] &= ~flagbitmask;
+		const int cacheIdx = addr / pcm_WordsPerBlock;
+		pcm_cache_data[cacheIdx].Validated = false;

-		ConLog( " * SPU2 : PcmCache Block Clear at 0x%x (idx=0x%x, bit=%d)\n", addr, nexta, nexta & 31);
+		ConLog( " * SPU2 : PcmCache Block Clear at 0x%x (cacheIdx=0x%x)\n", addr, cacheIdx);
 	}
 	*GetMemPtr( addr ) = value;
 }
@ -327,11 +326,6 @@ void CoreReset(int c)

 extern void LowPassFilterInit();

-// number of cachable ADPCM blocks (any blocks above the SPU2_DYN_MEMLINE)
-static const int pcm_BlockCount = 0x100000 / 8; // (0x100000-SPU2_DYN_MEMLINE) / 8;
-
-static const int pcm_DecodedSamplesPerBlock = 28;
-
 EXPORT_C_(s32) SPU2init() 
 {
 #define MAKESURE(a,b) \
@ -374,11 +368,10 @@ EXPORT_C_(s32) SPU2init()
 	//  Expanded: 16 bytes expands to 56 bytes [3.5:1 ratio]
 	//    Resulting in 2MB * 3.5.

-	pcm_cache_flags = (u32*)calloc( pcm_BlockCount / 32, sizeof(u32) );
-	pcm_cache_data = (s16*)calloc( pcm_BlockCount * pcm_DecodedSamplesPerBlock, sizeof(s16) );
+	pcm_cache_data = (PcmCacheEntry*)calloc( pcm_BlockCount, sizeof(PcmCacheEntry) );

 	if( (spu2regs == NULL) || (_spu2mem == NULL) ||
-		(pcm_cache_data == NULL) || (pcm_cache_flags == NULL) )
+		(pcm_cache_data == NULL) )
 	{
 		SysMessage("SPU2: Error allocating Memory\n"); return -1;
 	}
@ -543,12 +536,10 @@ EXPORT_C_(void) SPU2shutdown()
 	SAFE_FREE(spu2regs);
 	SAFE_FREE(_spu2mem);

-	SAFE_FREE( pcm_cache_flags );
 	SAFE_FREE( pcm_cache_data );

 	spu2regs = NULL;
 	_spu2mem = NULL;
-	pcm_cache_flags = NULL;
 	pcm_cache_data = NULL;

 #ifdef SPU2_LOG
@ -1705,12 +1696,6 @@ EXPORT_C_(u16) SPU2read(u32 rmem)
 	return ret;
 }

-struct cacheFreezeData
-{
-	u32 flags[pcm_BlockCount/32];
-	s16 startData;
-};
-
 typedef struct 
 {
 	// compatibility with zerospu2 removed...
@ -1734,7 +1719,7 @@ typedef struct

 	int lClocks;

-	cacheFreezeData cacheData;
+	PcmCacheEntry cacheData;

 } SPU2freezeData;

@ -1747,7 +1732,7 @@ typedef struct
 // Increment this if changes to V_Core or V_Voice structs are made.
 // Chances are we'll never explicitly support older save versions,
 // but might as well version them anyway.  Could come in handly someday!
-#define SAVE_VERSION 0x0100
+#define SAVE_VERSION 0x0101

 static int getFreezeSize()
 {
@ -1759,9 +1744,8 @@ static int getFreezeSize()

 	for( int bidx=0; bidx<pcm_BlockCount; bidx++ )
 	{
-		const u32 flagmask = 1ul << (bidx & 31);
-		if( pcm_cache_flags[bidx/32] & flagmask )
-			size += pcm_DecodedSamplesPerBlock*sizeof(s16);
+		if( pcm_cache_data[bidx].Validated )
+			size += pcm_DecodedSamplesPerBlock*sizeof(PcmCacheEntry);
 	}
 	return size;
 }
@ -1769,17 +1753,19 @@ static int getFreezeSize()

 static void wipe_the_cache()
 {
-	memset( pcm_cache_flags, 0, pcm_BlockCount/32 * sizeof(u32) );
-	memset( pcm_cache_data, 0, pcm_BlockCount * pcm_DecodedSamplesPerBlock * sizeof(s16) );
+	memset( pcm_cache_data, 0, pcm_BlockCount * sizeof(PcmCacheEntry) );
 }

+
+static s16 old_state_sBuffer[pcm_DecodedSamplesPerBlock] = {0};
+
 EXPORT_C_(s32) SPU2freeze(int mode, freezeData *data)
 {
 	if (mode == FREEZE_LOAD)
 	{
 		const SPU2freezeData *spud = (SPU2freezeData*)data->data;

-		if( spud->id != SAVE_ID || spud->version != SAVE_VERSION )
+		if( spud->id != SAVE_ID || spud->version < 0x100 )
 		{
 			printf("\n*** SPU2Ghz Warning:\n");
 			printf("  The savestate you are trying to load was not made with this plugin.\n");
@ -1826,36 +1812,51 @@ EXPORT_C_(s32) SPU2freeze(int mode, freezeData *data)

 			// Load the ADPCM cache:

-			const cacheFreezeData &cfd = spud->cacheData;
-			const s16* pcmSrc = &cfd.startData;
-
-			memcpy( pcm_cache_flags, cfd.flags, (pcm_BlockCount/32) * sizeof(u32) );
-
-			int blksLoaded=0;
-
-			for( int bidx=0; bidx<pcm_BlockCount; bidx++ )
+			wipe_the_cache();
+			if( spud->version == 0x100 )		// don't support 0x100 cache anymore.
 			{
-				const u32 flagmask = 1ul << (bidx & 31);
-				if( cfd.flags[bidx/32] & flagmask )
+				printf("\n*** SPU2Ghz Warning:\n");
+				printf("\tSavestate version is from an older version of this plugin.\n");
+				printf("\tAudio may not recover correctly.");
+
+				const PcmCacheEntry* pcmSrc = &spud->cacheData;
+				int blksLoaded=0;
+
+				for( int bidx=0; bidx<pcm_BlockCount; bidx++ )
 				{
-					// load a cache block!
-					memcpy( &pcm_cache_data[bidx*pcm_DecodedSamplesPerBlock],
-						pcmSrc, pcm_DecodedSamplesPerBlock*sizeof(s16) );
-					pcmSrc += pcm_DecodedSamplesPerBlock;
-					blksLoaded++;
+					if( pcm_cache_data[bidx].Validated )
+					{
+						// load a cache block!
+						memcpy( &pcm_cache_data[bidx], pcmSrc, sizeof(PcmCacheEntry) );
+						pcmSrc++;
+						blksLoaded++;
+					}
+				}
+
+				// Go through the V_Voice structs and recalculate SBuffer pointer from
+				// the NextA setting.
+
+				for( int c=0; c<2; c++ )
+				{
+					for( int v=0; v<24; v++ )
+					{
+						const int cacheIdx = Cores[c].Voices[v].NextA / pcm_WordsPerBlock;
+						Cores[c].Voices[v].SBuffer = pcm_cache_data[cacheIdx].Sampledata;
+					}
+				}
+			}
+			else
+			{
+				// We don't support the cache, so make sure the SBuffer pointers
+				// are safe (don't want any GPFs reading bad data)
+
+				for( int c=0; c<2; c++ )
+				{
+					for( int v=0; v<24; v++ )
+						Cores[c].Voices[v].SBuffer = old_state_sBuffer;
 				}
 			}

-			// Go through the V_Voice structs and replace the SBuffer pointer
-			// with an absolute address into our cache buffer this session.
-
-			for( int c=0; c<2; c++ )
-			{
-				for( int v=0; v<24; v++ )
-				{
-					Cores[c].Voices[v].SBuffer = (s16*) ((uptr)spud->Cores[c].Voices[v].SBuffer + (uptr)pcm_cache_data );
-				}
-			}

 			//printf( " * SPU2 > FreezeLoad > Loaded %d cache blocks.\n", blksLoaded++ );
 		}
@ -1905,36 +1906,20 @@ EXPORT_C_(s32) SPU2freeze(int mode, freezeData *data)
 		//   decoded blocks currently in use by active voices.  This allows
 		//   voices to resume seamlessly on load.

-		cacheFreezeData &cfd = spud->cacheData;
-		s16* pcmDst = &cfd.startData;
-
-		memcpy( cfd.flags, pcm_cache_flags, sizeof(cfd.flags) );
-
+		PcmCacheEntry* pcmDst = &spud->cacheData;
 		int blksSaved=0;
+
 		for( int bidx=0; bidx<pcm_BlockCount; bidx++ )
 		{
-			const u32 flagmask = 1ul << (bidx & 31);
-			if( cfd.flags[bidx/32] & flagmask )
+			if( pcm_cache_data[bidx].Validated )
 			{
 				// save a cache block!
-				memcpy( pcmDst, &pcm_cache_data[bidx*pcm_DecodedSamplesPerBlock],
-					pcm_DecodedSamplesPerBlock*sizeof(s16) );
-				pcmDst += pcm_DecodedSamplesPerBlock;
+				memcpy( pcmDst, &pcm_cache_data[bidx], sizeof(PcmCacheEntry) );
+				pcmDst++;
 				blksSaved++;
 			}
 		}

-		// Time to go through the V_Voice structs and replace the SBuffer pointer
-		// with a relative address that can be applied later on when the state is loaded.
-
-		for( int c=0; c<2; c++ )
-		{
-			for( int v=0; v<24; v++ )
-			{
-				spud->Cores[c].Voices[v].SBuffer = 
-					(s16*) ((uptr)spud->Cores[c].Voices[v].SBuffer - (uptr)pcm_cache_data );
-			}
-		}
 		//printf( " * SPU2 > FreezeSave > Saved %d cache blocks.\n", blksSaved++ );

 	}
--- a/plugins/spu2ghz/spu2.h
+++ b/plugins/spu2ghz/spu2.h
@ -138,22 +138,35 @@ default: \
 #	define SAFE_RELEASE(p)      { if(p) { (p)->Release(); (p)=NULL; } }
 #endif

+// The SPU2 has a dynamic memory range which is used for several internal operations, such as
+// registers, CORE 1/2 mixing, AutoDMAs, and some other fancy stuff.  We exclude this range
+// from the cache here:
+static const s32 SPU2_DYN_MEMLINE = 0x2800;
+
+// 8 short words per encoded PCM block. (as stored in SPU2 ram)
+static const int pcm_WordsPerBlock = 8;
+
+// number of cachable ADPCM blocks (any blocks above the SPU2_DYN_MEMLINE)
+static const int pcm_BlockCount = 0x100000 / pcm_WordsPerBlock;
+
+// 28 samples per decoded PCM block (as stored in our cache)
+static const int pcm_DecodedSamplesPerBlock = 28;
+
+struct PcmCacheEntry
+{
+	bool Validated; 	// set once the block has been decoded into Sampledata; cleared when DMA or memory writes touch the block
+	s16 Sampledata[pcm_DecodedSamplesPerBlock];	// decoded PCM samples for one encoded block
+};

 extern void spdif_set51(u32 is_5_1_out);
 extern u32  spdif_init();
 extern void spdif_shutdown();
 extern void spdif_get_samples(s32 *samples); // fills the buffer with [l,r,c,lfe,sl,sr] if using 5.1 output, or [l,r] if using stereo

-// The SPU2 has a dynamic memory range which is used for several internal operations, such as
-// registers, CORE 1/2 mixing, AutoDMAs, and some other fancy stuff.  We exclude this range
-// from the cache here:
-static const s32 SPU2_DYN_MEMLINE = 0x2800;
-
 extern short *spu2regs;
 extern short *_spu2mem;

-extern u32 *pcm_cache_flags;
-extern s16 *pcm_cache_data;
+extern PcmCacheEntry* pcm_cache_data;

 extern s16 __forceinline * __fastcall GetMemPtr(u32 addr);
 extern s16 __forceinline __fastcall spu2M_Read( u32 addr );