SPU2ghz: Third try's the charm? This is an even better fix for the crackling sound in SO3. Note: This revision upgrades spu2ghz savestates to version 0x101. Old states should still load fine for the most part.

git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@682 a6443dda-0b58-4228-96e9-037be469359c
This commit is contained in:
Jake.Stine 2009-02-03 00:55:37 +00:00 committed by Gregory Hainaut
parent ef489b45bd
commit a92fc9900f
4 changed files with 158 additions and 122 deletions

View File

@ -220,27 +220,6 @@ void DoDMAWrite(int core,u16 *pMem,u32 size)
Cores[core].TSA &= 0xfffff;
u32 buff1end = Cores[core].TSA + size;
// Pcm Cache Invalidation!
// Ideally we would only mask bits actually written to, but it's a complex algorithm
// that is way more work than it's worth. Masking out bytes would in theory work a
// little more efficiently, but was buggy in practice for some reason. So a dumb and
// dirty 32-bit mask will suffice.
// Note: When clearing cache flags, the *endpoint* needs to be rounded upward.
// just rounding the count upward could cause problems if both start and end
// points are mis-aligned.
// indexer scalar - 8 addresses per block, and 32 bits per dword:
const u32 indexer_scalar = 8*32;
const int roundUp = indexer_scalar-1;
const int flagTSA = Cores[core].TSA / indexer_scalar;
int flagTDA = (buff1end + roundUp) / indexer_scalar; // endpoint, rounded up
u8* cache = (u8*)pcm_cache_flags;
memset( &pcm_cache_flags[flagTSA], 0, (flagTDA - flagTSA) * 4 );
u32 buff2end=0;
if( buff1end > 0x100000 )
{
@ -248,6 +227,60 @@ void DoDMAWrite(int core,u16 *pMem,u32 size)
buff1end = 0x100000;
}
const int cacheIdxStart = Cores[core].TSA / pcm_WordsPerBlock;
const int cacheIdxEnd = (buff1end+pcm_WordsPerBlock-1) / pcm_WordsPerBlock;
PcmCacheEntry* cacheLine = &pcm_cache_data[cacheIdxStart];
PcmCacheEntry& cacheEnd = pcm_cache_data[cacheIdxEnd];
do
{
cacheLine->Validated = false;
cacheLine++;
} while ( cacheLine != &cacheEnd );
#if 0
// Pcm Cache Invalidation!
// It's a requirement that we mask bits for the blocks that are written to *only*,
// because doing anything else can cause the cache to fail, thanks to the progressive
// nature of the SPU2's ADPCM encoding. (the same thing that makes it impossible
// to use SSE optimizations on it).
u8* cache = (u8*)pcm_cache_flags;
// Step 1: Clear bits in the front remainder.
const int pcmTSA = Cores[core].TSA / pcm_WordsPerBlock;
const int pcmTDA = buff1end / pcm_WordsPerBlock;
const int remFront = pcmTSA & 31;
const int remBack = ((buff1end+pcm_WordsPerBlock-1)/pcm_WordsPerBlock) & 31; // round up to get the end remainder
int flagTSA = pcmTSA / 32;
if( remFront )
{
// need to clear some upper bits of this u32
uint mask = (1ul<<remFront)-1;
cache[flagTSA++] &= mask;
}
// Step 2: Clear the middle run
const int flagClearLen = pcmTDA-pcmTSA;
memset( &cache[flagTSA], 0, flagClearLen );
// Step 3: Clear bits in the end remainder.
if( remBack )
{
// need to clear some lower bits in this u32
uint mask = ~(1ul<<remBack)-1;
cache[flagTSA + flagClearLen] &= mask;
}
#endif
//ConLog( " * SPU2 : Cache Clear Range! TSA=0x%x, TDA=0x%x (low8=0x%x, high8=0x%x, len=0x%x)\n",
// Cores[core].TSA, buff1end, flagTSA, flagTDA, clearLen );
// First Branch needs cleared:
// It starts at TSA and goes to buff1end.

View File

@ -128,7 +128,7 @@ static void __forceinline XA_decode_block(s16* buffer, const s16* block, s32& pr
pcm = data>>shift;
pcm+=((pred1*prev1)+(pred2*prev2))>>6;
if(pcm> 32767) pcm= 32767;
if(pcm<-32768) pcm=-32768;
else if(pcm<-32768) pcm=-32768;
*(buffer++) = pcm;
}
@ -140,7 +140,7 @@ static void __forceinline XA_decode_block(s16* buffer, const s16* block, s32& pr
pcm2 = data>>shift;
pcm2+=((pred1*pcm)+(pred2*prev1))>>6;
if(pcm2> 32767) pcm2= 32767;
if(pcm2<-32768) pcm2=-32768;
else if(pcm2<-32768) pcm2=-32768;
*(buffer++) = pcm2;
}
@ -203,9 +203,10 @@ static void __forceinline IncrementNextA( const V_Core& thiscore, V_Voice& vc )
vc.NextA&=0xFFFFF;
}
u32 *pcm_cache_flags = NULL;
s16 *pcm_cache_data = NULL;
// decoded pcm data, used to cache the decoded data so that it needn't be decoded
// multiple times. Cache chunks are decoded when the mixer requests the blocks, and
// invalidated when DMA transfers and memory writes are performed.
PcmCacheEntry *pcm_cache_data = NULL;
#ifndef PUBLIC
int g_counter_cache_hits=0;
@ -249,16 +250,20 @@ static void __forceinline __fastcall GetNextDataBuffered( V_Core& thiscore, V_Vo
s16* memptr = GetMemPtr(vc.NextA&0xFFFFF);
vc.LoopFlags = *memptr >> 8; // grab loop flags from the upper byte.
int nexta = vc.NextA / 8; // 8 words per encoded block.
vc.SBuffer = &pcm_cache_data[nexta * 28];
const u32 flagbitmask = 1ul<<(nexta & 31); // 32 flags per array entry
nexta /= 32;
const int cacheIdx = vc.NextA / pcm_WordsPerBlock;
PcmCacheEntry& cacheLine = pcm_cache_data[cacheIdx];
vc.SBuffer = cacheLine.Sampledata;
if( pcm_cache_flags[nexta] & flagbitmask )
if( cacheLine.Validated )
{
// Cached block! Read from the cache directly (ie, do nothing)
// Cached block! Read from the cache directly.
// Make sure to propagate the prev1/prev2 ADPCM:
vc.Prev1 = vc.SBuffer[27];
vc.Prev2 = vc.SBuffer[26];
//ConLog( " * SPU2 : Cache Hit! NextA=0x%x, cacheIdx=0x%x\n", vc.NextA, cacheIdx );
#ifndef PUBLIC
g_counter_cache_hits++;
@ -267,19 +272,20 @@ static void __forceinline __fastcall GetNextDataBuffered( V_Core& thiscore, V_Vo
else
{
// Only flag the cache if it's a non-dynamic memory range.
if( nexta >= (SPU2_DYN_MEMLINE / (8*32)) )
pcm_cache_flags[nexta] |= flagbitmask;
if( vc.NextA >= SPU2_DYN_MEMLINE )
cacheLine.Validated = true;
#ifndef PUBLIC
if( nexta < (SPU2_DYN_MEMLINE / (8*32)) )
if( vc.NextA < SPU2_DYN_MEMLINE )
g_counter_cache_ignores++;
else
g_counter_cache_misses++;
#endif
// saturated decoder
s16* sbuffer = cacheLine.Sampledata;
XA_decode_block(vc.SBuffer, memptr, vc.Prev1, vc.Prev2);
// saturated decoder
XA_decode_block( sbuffer, memptr, vc.Prev1, vc.Prev2 );
// [Air]: Testing use of a new unsaturated decoder. (benchmark needed)
// Chances are the saturation isn't needed, but for a very few exception games.
@ -288,7 +294,6 @@ static void __forceinline __fastcall GetNextDataBuffered( V_Core& thiscore, V_Vo
// heavy use of the SPU2 via music or sfx will mostly use the cache anyway.
//XA_decode_block_unsaturated( vc.SBuffer, memptr, vc.Prev1, vc.Prev2 );
}
vc.SCurrent = 0;
@ -1173,8 +1178,8 @@ void __fastcall Mix()
}
// Commit Core 0 output to ram before mixing Core 1:
ExtL>>=13;
ExtR>>=13;
ExtL>>=14;
ExtR>>=14;
spu2M_WriteFast( 0x800 + OutPos, ExtL>>3 );
spu2M_WriteFast( 0xA00 + OutPos, ExtR>>3 );

View File

@ -226,11 +226,10 @@ __inline void __fastcall spu2M_Write( u32 addr, s16 value )
addr &= 0xfffff;
if( addr >= SPU2_DYN_MEMLINE )
{
const u32 nexta = addr >> 3; // 8 words per encoded block.
const u32 flagbitmask = 1ul<<(nexta & 31); // 31 flags per array entry
pcm_cache_flags[nexta/32] &= ~flagbitmask;
const int cacheIdx = addr / pcm_WordsPerBlock;
pcm_cache_data[cacheIdx].Validated = false;
ConLog( " * SPU2 : PcmCache Block Clear at 0x%x (idx=0x%x, bit=%d)\n", addr, nexta, nexta & 31);
ConLog( " * SPU2 : PcmCache Block Clear at 0x%x (cacheIdx=0x%x)\n", addr, cacheIdx);
}
*GetMemPtr( addr ) = value;
}
@ -327,11 +326,6 @@ void CoreReset(int c)
extern void LowPassFilterInit();
// number of cachable ADPCM blocks (any blocks above the SPU2_DYN_MEMLINE)
static const int pcm_BlockCount = 0x100000 / 8; // (0x100000-SPU2_DYN_MEMLINE) / 8;
static const int pcm_DecodedSamplesPerBlock = 28;
EXPORT_C_(s32) SPU2init()
{
#define MAKESURE(a,b) \
@ -374,11 +368,10 @@ EXPORT_C_(s32) SPU2init()
// Expanded: 16 bytes expands to 56 bytes [3.5:1 ratio]
// Resulting in 2MB * 3.5.
pcm_cache_flags = (u32*)calloc( pcm_BlockCount / 32, sizeof(u32) );
pcm_cache_data = (s16*)calloc( pcm_BlockCount * pcm_DecodedSamplesPerBlock, sizeof(s16) );
pcm_cache_data = (PcmCacheEntry*)calloc( pcm_BlockCount, sizeof(PcmCacheEntry) );
if( (spu2regs == NULL) || (_spu2mem == NULL) ||
(pcm_cache_data == NULL) || (pcm_cache_flags == NULL) )
(pcm_cache_data == NULL) )
{
SysMessage("SPU2: Error allocating Memory\n"); return -1;
}
@ -543,12 +536,10 @@ EXPORT_C_(void) SPU2shutdown()
SAFE_FREE(spu2regs);
SAFE_FREE(_spu2mem);
SAFE_FREE( pcm_cache_flags );
SAFE_FREE( pcm_cache_data );
spu2regs = NULL;
_spu2mem = NULL;
pcm_cache_flags = NULL;
pcm_cache_data = NULL;
#ifdef SPU2_LOG
@ -1705,12 +1696,6 @@ EXPORT_C_(u16) SPU2read(u32 rmem)
return ret;
}
struct cacheFreezeData
{
u32 flags[pcm_BlockCount/32];
s16 startData;
};
typedef struct
{
// compatibility with zerospu2 removed...
@ -1734,7 +1719,7 @@ typedef struct
int lClocks;
cacheFreezeData cacheData;
PcmCacheEntry cacheData;
} SPU2freezeData;
@ -1747,7 +1732,7 @@ typedef struct
// Increment this if changes to V_Core or V_Voice structs are made.
// Chances are we'll never explicitly support older save versions,
// but might as well version them anyway. Could come in handy someday!
#define SAVE_VERSION 0x0100
#define SAVE_VERSION 0x0101
static int getFreezeSize()
{
@ -1759,9 +1744,8 @@ static int getFreezeSize()
for( int bidx=0; bidx<pcm_BlockCount; bidx++ )
{
const u32 flagmask = 1ul << (bidx & 31);
if( pcm_cache_flags[bidx/32] & flagmask )
size += pcm_DecodedSamplesPerBlock*sizeof(s16);
if( pcm_cache_data[bidx].Validated )
size += pcm_DecodedSamplesPerBlock*sizeof(PcmCacheEntry);
}
return size;
}
@ -1769,17 +1753,19 @@ static int getFreezeSize()
static void wipe_the_cache()
{
memset( pcm_cache_flags, 0, pcm_BlockCount/32 * sizeof(u32) );
memset( pcm_cache_data, 0, pcm_BlockCount * pcm_DecodedSamplesPerBlock * sizeof(s16) );
memset( pcm_cache_data, 0, pcm_BlockCount * sizeof(PcmCacheEntry) );
}
static s16 old_state_sBuffer[pcm_DecodedSamplesPerBlock] = {0};
EXPORT_C_(s32) SPU2freeze(int mode, freezeData *data)
{
if (mode == FREEZE_LOAD)
{
const SPU2freezeData *spud = (SPU2freezeData*)data->data;
if( spud->id != SAVE_ID || spud->version != SAVE_VERSION )
if( spud->id != SAVE_ID || spud->version < 0x100 )
{
printf("\n*** SPU2Ghz Warning:\n");
printf(" The savestate you are trying to load was not made with this plugin.\n");
@ -1826,36 +1812,51 @@ EXPORT_C_(s32) SPU2freeze(int mode, freezeData *data)
// Load the ADPCM cache:
const cacheFreezeData &cfd = spud->cacheData;
const s16* pcmSrc = &cfd.startData;
memcpy( pcm_cache_flags, cfd.flags, (pcm_BlockCount/32) * sizeof(u32) );
int blksLoaded=0;
for( int bidx=0; bidx<pcm_BlockCount; bidx++ )
wipe_the_cache();
if( spud->version == 0x100 ) // don't support 0x100 cache anymore.
{
const u32 flagmask = 1ul << (bidx & 31);
if( cfd.flags[bidx/32] & flagmask )
printf("\n*** SPU2Ghz Warning:\n");
printf("\tSavestate version is from an older version of this plugin.\n");
printf("\tAudio may not recover correctly.");
const PcmCacheEntry* pcmSrc = &spud->cacheData;
int blksLoaded=0;
for( int bidx=0; bidx<pcm_BlockCount; bidx++ )
{
// load a cache block!
memcpy( &pcm_cache_data[bidx*pcm_DecodedSamplesPerBlock],
pcmSrc, pcm_DecodedSamplesPerBlock*sizeof(s16) );
pcmSrc += pcm_DecodedSamplesPerBlock;
blksLoaded++;
if( pcm_cache_data[bidx].Validated )
{
// load a cache block!
memcpy( &pcm_cache_data[bidx], pcmSrc, sizeof(PcmCacheEntry) );
pcmSrc++;
blksLoaded++;
}
}
// Go through the V_Voice structs and recalculate SBuffer pointer from
// the NextA setting.
for( int c=0; c<2; c++ )
{
for( int v=0; v<24; v++ )
{
const int cacheIdx = Cores[c].Voices[v].NextA / pcm_WordsPerBlock;
Cores[c].Voices[v].SBuffer = pcm_cache_data[cacheIdx].Sampledata;
}
}
}
else
{
// We don't support the cache, so make sure the SBuffer pointers
// are safe (don't want any GPFs reading bad data)
for( int c=0; c<2; c++ )
{
for( int v=0; v<24; v++ )
Cores[c].Voices[v].SBuffer = old_state_sBuffer;
}
}
// Go through the V_Voice structs and replace the SBuffer pointer
// with an absolute address into our cache buffer this session.
for( int c=0; c<2; c++ )
{
for( int v=0; v<24; v++ )
{
Cores[c].Voices[v].SBuffer = (s16*) ((uptr)spud->Cores[c].Voices[v].SBuffer + (uptr)pcm_cache_data );
}
}
//printf( " * SPU2 > FreezeLoad > Loaded %d cache blocks.\n", blksLoaded++ );
}
@ -1905,36 +1906,20 @@ EXPORT_C_(s32) SPU2freeze(int mode, freezeData *data)
// decoded blocks currently in use by active voices. This allows
// voices to resume seamlessly on load.
cacheFreezeData &cfd = spud->cacheData;
s16* pcmDst = &cfd.startData;
memcpy( cfd.flags, pcm_cache_flags, sizeof(cfd.flags) );
PcmCacheEntry* pcmDst = &spud->cacheData;
int blksSaved=0;
for( int bidx=0; bidx<pcm_BlockCount; bidx++ )
{
const u32 flagmask = 1ul << (bidx & 31);
if( cfd.flags[bidx/32] & flagmask )
if( pcm_cache_data[bidx].Validated )
{
// save a cache block!
memcpy( pcmDst, &pcm_cache_data[bidx*pcm_DecodedSamplesPerBlock],
pcm_DecodedSamplesPerBlock*sizeof(s16) );
pcmDst += pcm_DecodedSamplesPerBlock;
memcpy( pcmDst, &pcm_cache_data[bidx], sizeof(PcmCacheEntry) );
pcmDst++;
blksSaved++;
}
}
// Time to go through the V_Voice structs and replace the SBuffer pointer
// with a relative address that can be applied later on when the state is loaded.
for( int c=0; c<2; c++ )
{
for( int v=0; v<24; v++ )
{
spud->Cores[c].Voices[v].SBuffer =
(s16*) ((uptr)spud->Cores[c].Voices[v].SBuffer - (uptr)pcm_cache_data );
}
}
//printf( " * SPU2 > FreezeSave > Saved %d cache blocks.\n", blksSaved++ );
}

View File

@ -138,22 +138,35 @@ default: \
# define SAFE_RELEASE(p) { if(p) { (p)->Release(); (p)=NULL; } }
#endif
// The SPU2 has a dynamic memory range which is used for several internal operations, such as
// registers, CORE 1/2 mixing, AutoDMAs, and some other fancy stuff. We exclude this range
// from the cache here:
static const s32 SPU2_DYN_MEMLINE = 0x2800;
// 8 short words per encoded PCM block. (as stored in SPU2 ram)
static const int pcm_WordsPerBlock = 8;
// number of cachable ADPCM blocks (any blocks above the SPU2_DYN_MEMLINE)
static const int pcm_BlockCount = 0x100000 / pcm_WordsPerBlock;
// 28 samples per decoded PCM block (as stored in our cache)
static const int pcm_DecodedSamplesPerBlock = 28;
// One slot of the decoded-PCM cache. The cache array (pcm_cache_data) holds
// pcm_BlockCount of these and is indexed by (SPU2 word address / pcm_WordsPerBlock),
// so each entry corresponds to one encoded ADPCM block in SPU2 ram.
//
// NOTE(review): savestates memcpy whole entries (sizeof(PcmCacheEntry)), so changing
// this layout presumably changes the savestate format -- confirm and bump SAVE_VERSION.
struct PcmCacheEntry
{
// True when Sampledata holds a usable decode of this block. Set by the mixer after
// decoding a block at or above SPU2_DYN_MEMLINE; cleared by DMA writes and by
// spu2M_Write when the underlying memory is overwritten.
bool Validated;
// The decoded samples for this block (pcm_DecodedSamplesPerBlock of them). On a cache
// hit the mixer reads the last two entries back as the voice's Prev1/Prev2 values.
s16 Sampledata[pcm_DecodedSamplesPerBlock];
};
extern void spdif_set51(u32 is_5_1_out);
extern u32 spdif_init();
extern void spdif_shutdown();
extern void spdif_get_samples(s32 *samples); // fills the buffer with [l,r,c,lfe,sl,sr] if using 5.1 output, or [l,r] if using stereo
// The SPU2 has a dynamic memory range which is used for several internal operations, such as
// registers, CORE 1/2 mixing, AutoDMAs, and some other fancy stuff. We exclude this range
// from the cache here:
static const s32 SPU2_DYN_MEMLINE = 0x2800;
extern short *spu2regs;
extern short *_spu2mem;
extern u32 *pcm_cache_flags;
extern s16 *pcm_cache_data;
extern PcmCacheEntry* pcm_cache_data;
extern s16 __forceinline * __fastcall GetMemPtr(u32 addr);
extern s16 __forceinline __fastcall spu2M_Read( u32 addr );