pcsx2/plugins/spu2-x/src/Mixer.cpp

/* SPU2-X, A plugin for Emulating the Sound Processing Unit of the Playstation 2
 * Developed and maintained by the Pcsx2 Development Team.
 *
 * Original portions from SPU2ghz are (c) 2008 by David Quintana [gigaherz]
 *
 * SPU2-X is free software: you can redistribute it and/or modify it under the terms
 * of the GNU Lesser General Public License as published by the Free Software Found-
 * ation, either version 3 of the License, or (at your option) any later version.
 *
 * SPU2-X is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE.  See the GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with SPU2-X.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "Global.h"

// Forward declaration only: neither a definition of this function nor a call
// to it appears in the code visible here.
void ADMAOutLogWrite(void *lpData, u32 ulSize);

// ADPCM prediction filter coefficients, indexed by the 4-bit filter field of a
// block header (see XA_decode_block).  Column 0 is multiplied with the most
// recent decoded sample and column 1 with the one before it; the decoder shifts
// the summed products right by 6 bits.
//
// Only filters 0-4 carry coefficients, but the header field can hold any value
// from 0 to 15.  The table is therefore declared with 16 rows: the rows without
// an initializer are zero-filled, so an out-of-range filter decodes with no
// prediction instead of reading past the end of the array.
static const s32 tbl_XA_Factor[16][2] =
{
	{    0,   0 },
	{   60,   0 },
	{  115, -52 },
	{   98, -55 },
	{  122, -60 }
};


// Multiplies two signed 32-bit values through a 64-bit intermediate and returns
// the upper 32 bits of the product; the lower (fractional) 32 bits are dropped.
// For the result to be scaled correctly the fractional bits of the two inputs
// must add up to 32.
//
// Intended as a drop-in replacement for a MulDiv whose divisor is a constant
// power of two (1<<8, 4096, etc).
//
// [Air] Original author's note: measured at over 10 times faster than MulDiv in
//   a worst case scenario, and more accurate because callers are forced to
//   extend their inputs to use all 32 bits of precision.
//
static __forceinline s32 MulShr32( s32 srcval, s32 mulval )
{
	const s64 product = (s64)srcval * mulval;
	return product >> 32;
}

// Saturates x to the signed 16-bit range (-0x8000 .. 0x7fff) after that range
// has been scaled up by 'bitshift' bits.
__forceinline s32 clamp_mix( s32 x, u8 bitshift )
{
	const s32 lowest	= -0x8000 << bitshift;
	const s32 highest	=  0x7fff << bitshift;

	return GetClamped( x, lowest, highest );
}

// Stereo version of clamp_mix: saturates both channels of 'sample'.
//
// Forced inlining is only requested on MSVC.  Without the keyword static, gcc
// compilation fails on the inlining, and the function cannot be made static
// because it is also used in Reverb.cpp; so to keep the code clean the inline
// request is simply left out for other compilers.  A clean inline would need
// link-time code generation / whole program optimization (gcc 4.5 offers the
// experimental options -flto, -fwhopr and -fwhole-program for that).
#if _MSC_VER
__forceinline
#endif
StereoOut32 clamp_mix( const StereoOut32& sample, u8 bitshift )
{
	// Strictly the range should be -0x8000 .. 0x7fff, but some audio output
	// modules or sound drivers overshoot with that, so a small safety margin is
	// left on both ends.
	const s32 lo = -0x7f00 << bitshift;
	const s32 hi =  0x7f00 << bitshift;

	const s32 left	= GetClamped( sample.Left, lo, hi );
	const s32 right	= GetClamped( sample.Right, lo, hi );

	return StereoOut32( left, right );
}

// Decodes one ADPCM block into 28 signed 16-bit PCM samples.
//
//   buffer - receives the 28 decoded samples.
//   block  - the encoded block: one header word followed by 14 bytes that each
//            pack two 4-bit samples (low nibble first).
//   prev1  - most recently decoded sample of the stream (in/out).
//   prev2  - sample decoded before prev1 (in/out).
//
// The low byte of the header holds the shift amount in bits 0-3 and the
// prediction filter number in bits 4-7.
static void __forceinline XA_decode_block(s16* buffer, const s16* block, s32& prev1, s32& prev2)
{
	const s32 header	= *block;
	const s32 filter	= (header >> 4) & 0xF;
	const s32 shift		= (header & 0xF) + 16;
	const s32 pred1		= tbl_XA_Factor[filter][0];
	const s32 pred2		= tbl_XA_Factor[filter][1];

	const s8* const packed = (const s8*)&block[1];

	for( int i=0; i<14; ++i )
	{
		// Low nibble: moved to the top of the word so that the arithmetic
		// shift below sign-extends it and applies the block's scale at once.
		const s32 lowNibble = (packed[i] << 28) & 0xF0000000;
		s32 first = (lowNibble >> shift) + (((pred1*prev1)+(pred2*prev2)) >> 6);

		Clampify( first, -0x8000, 0x7fff );
		buffer[2*i] = first;

		// High nibble: predicted from the sample just decoded plus prev1.
		const s32 highNibble = (packed[i] << 24) & 0xF0000000;
		s32 second = (highNibble >> shift) + (((pred1*first)+(pred2*prev1)) >> 6);

		Clampify( second, -0x8000, 0x7fff );
		buffer[2*i + 1] = second;

		prev2 = first;
		prev1 = second;
	}
}

// Advances a voice's read address (NextA) by one word, wrapping at 20 bits.
// Before advancing, the current address is compared against the IRQ address of
// each core and an IRQ is raised for every core that matches.
static void __forceinline IncrementNextA(V_Core& thiscore, uint voiceidx)
{
	V_Voice& voice = thiscore.Voices[voiceidx];

	// Important!  Both cores signal IRQ when an address is read, regardless of
	// which core actually reads the address.
	for( uint core=0; core<2; ++core )
	{
		V_Core& candidate = Cores[core];

		if( !candidate.IRQEnable || (voice.NextA != candidate.IRQA) )
			continue;

		if( IsDevBuild )
			ConLog(" * SPU2 Core %d: IRQ Called (IRQA (%05X) passed; voice %d).\n", core, candidate.IRQA, thiscore.Index * 24 + voiceidx);

		SetIrqCall(core);
	}

	++voice.NextA;
	voice.NextA &= 0xFFFFF;
}

// Decoded PCM data, cached so that a block needn't be decoded multiple times.
// Cache chunks are decoded when the mixer requests the blocks, and
// invalidated when DMA transfers and memory writes are performed.
// Indexed by (word address / pcm_WordsPerBlock); see GetNextDataBuffered.
PcmCacheEntry *pcm_cache_data = NULL;

// Cache statistics.  Incremented by GetNextDataBuffered in dev builds, and
// periodically reported and reset by Mix().
int g_counter_cache_hits = 0;
int g_counter_cache_misses = 0;
int g_counter_cache_ignores = 0;

// Bits of the loop-flag byte, which is read from the upper byte of the first
// word of every sample block:
//   LOOP_END   - when the block finishes, the voice's ENDX bit is set and the
//                voice either jumps to its loop start or is stopped.
//   LOOP       - combined with LOOP_END: jump to the loop start instead of stopping.
//   LOOP_START - the block's address becomes the voice's loop start address
//                (unless the voice's LoopMode is set).
#define XAFLAG_LOOP_END		(1ul<<0)
#define XAFLAG_LOOP			(1ul<<1)
#define XAFLAG_LOOP_START	(1ul<<2)

// Returns the next decoded sample of a voice.
//
// When all 28 samples of the current block have been consumed, the loop flags
// of the finished block are acted upon, the next block is located (NextA
// addresses WORDS, not bytes) and its samples are taken either from the PCM
// cache or from a fresh decode.  NextA is advanced once for the block header
// and once more for every fourth sample read.
static __forceinline s32 GetNextDataBuffered( V_Core& thiscore, uint voiceidx )
{
	V_Voice& vc( thiscore.Voices[voiceidx] );

	const bool needNewBlock = ( vc.SCurrent == 28 );

	if( needNewBlock )
	{
		// The block that just finished was flagged as an end point.
		if( vc.LoopFlags & XAFLAG_LOOP_END )
		{
			thiscore.Regs.ENDX |= (1 << voiceidx);

			if( vc.LoopFlags & XAFLAG_LOOP )
			{
				vc.NextA = vc.LoopStartA;
			}
			else
			{
				vc.Stop();
				if( IsDevBuild && MsgVoiceOff() )
					ConLog("* SPU2-X: Voice Off by EndPoint: %d \n", voiceidx);
			}
		}

		// The loop flags and buffer pointer are needed regardless of cache status.
		s16* const blockptr = GetMemPtr(vc.NextA&0xFFFFF);
		vc.LoopFlags = *blockptr >> 8;	// loop flags live in the upper byte.

		const int cacheIdx = vc.NextA / pcm_WordsPerBlock;
		PcmCacheEntry& entry = pcm_cache_data[cacheIdx];
		vc.SBuffer = entry.Sampledata;

		if( !entry.Validated )
		{
			// Not cached: decode now.  The entry is only marked valid when the
			// block lies outside the dynamic memory range.
			const bool cacheable = ( vc.NextA >= SPU2_DYN_MEMLINE );

			if( cacheable )
				entry.Validated = true;

			if( IsDevBuild )
			{
				if( cacheable )
					g_counter_cache_misses++;
				else
					g_counter_cache_ignores++;
			}

			XA_decode_block( vc.SBuffer, blockptr, vc.Prev1, vc.Prev2 );
		}
		else
		{
			// Cached block: nothing to decode, but the ADPCM history must still
			// be carried forward from the last two samples of the block.
			vc.Prev1 = vc.SBuffer[27];
			vc.Prev2 = vc.SBuffer[26];

			if( IsDevBuild )
				g_counter_cache_hits++;
		}

		vc.SCurrent = 0;
		if( (vc.LoopFlags & XAFLAG_LOOP_START) && !vc.LoopMode )
			vc.LoopStartA = vc.NextA;
	}

	// Step past the block header, or past a data word once its fourth sample
	// is about to be returned.
	if( needNewBlock || ((vc.SCurrent&3) == 3) )
		IncrementNextA( thiscore, voiceidx );

	return vc.SBuffer[vc.SCurrent++];
}

// Advances a silent voice through sample memory without decoding anything.
// Block flags, loop points and address increments (and therefore IRQ checks)
// are processed as usual, and the remainder of the current data word is
// skipped in a single step.
static __forceinline void GetNextDataDummy(V_Core& thiscore, uint voiceidx)
{
	V_Voice& voice = thiscore.Voices[voiceidx];

	if( voice.SCurrent == 28 )
	{
		if( voice.LoopFlags & XAFLAG_LOOP_END )
		{
			thiscore.Regs.ENDX |= (1 << voiceidx);

			// No stop handling needed for the non-looping case: the voice is
			// already stopped.
			if( voice.LoopFlags & XAFLAG_LOOP )
				voice.NextA = voice.LoopStartA;
		}

		// Loop flags come from the upper byte of the block header.
		voice.LoopFlags = *GetMemPtr(voice.NextA&0xFFFFF) >> 8;

		if( (voice.LoopFlags & XAFLAG_LOOP_START) && !voice.LoopMode )
			voice.LoopStartA = voice.NextA;

		IncrementNextA(thiscore, voiceidx);

		voice.SCurrent = 0;
	}

	// Number of samples left in the current word (1..4).
	const int skipped = 4 - (voice.SCurrent & 3);

	voice.SP -= 4096 * skipped;
	voice.SCurrent += skipped;
	IncrementNextA(thiscore, voiceidx);
}

/////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////
//                                                                                     //

// Returns the next value of the pseudo-random noise generator.
//
// The generator state (Seed) is a function-local static, so it persists across
// calls and is shared by every caller.  The return value is derived from the
// state *before* it is advanced:
//   - bit 8 set                  : (low byte of Seed) << 8
//   - otherwise, low 16 bits != 0: 0x7fff
//   - otherwise                  : 0x8000
//
// The state update used to be written as inline assembly (separate MSVC and
// GCC variants), which only builds for x86 targets and not at all with 64-bit
// MSVC.  It is expressed here with plain integer arithmetic that performs the
// same 32-bit operation sequence.
static s32 __forceinline GetNoiseValues()
{
	static s32 Seed = 0x41595321;
	s32 retval = 0x8000;

	if( Seed&0x100 )
		retval = (Seed&0xff) << 8;
	else if( Seed&0xffff )
		retval = 0x7fff;

	// All steps are masked to 32 bits so the result does not depend on the
	// width of 'unsigned long'.
	const unsigned long mask32 = 0xFFFFFFFFul;

	unsigned long state = (unsigned long)Seed & mask32;

	state = ((state >> 5) | (state << 27)) & mask32;	// ROR 5
	state ^= 0x9a;

	const unsigned long saved = state;

	state = ((state << 2) | (state >> 30)) & mask32;	// ROL 2
	state = (state + saved) & mask32;					// ADD
	state ^= saved;										// XOR
	state = ((state >> 3) | (state << 29)) & mask32;	// ROR 3

	Seed = (s32)state;

	return retval;
}
/////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////
//                                                                                     //

// Scales a sample by a volume.
//   data   - expected to be 16 bit signed.
//   volume - expected to be 32 bit signed (31 bits with reverse phase).
// The sample is shifted up by one bit before the multiply so that the upper 32
// bits of the product have an effective 16 bit range.
static __forceinline s32 ApplyVolume(s32 data, s32 volume)
{
	const s32 doubled = data << 1;
	return MulShr32( doubled, volume );
}

// Applies a fixed left/right volume pair to a stereo sample, one channel at a time.
static __forceinline StereoOut32 ApplyVolume( const StereoOut32& data, const V_VolumeLR& volume )
{
	const s32 left	= ApplyVolume( data.Left, volume.Left );
	const s32 right	= ApplyVolume( data.Right, volume.Right );

	return StereoOut32( left, right );
}

// Applies a sliding left/right volume pair to a stereo sample, using the
// current Value of each channel's slide.
static __forceinline StereoOut32 ApplyVolume( const StereoOut32& data, const V_VolumeSlideLR& volume )
{
	const s32 left	= ApplyVolume( data.Left, volume.Left.Value );
	const s32 right	= ApplyVolume( data.Right, volume.Right.Value );

	return StereoOut32( left, right );
}

// Adds the voice's pitch step to its sample position counter (SP).
// When the voice is modulated, and is not voice 0, the step is scaled by the
// OutX value of the previous voice on the same core.
static void __forceinline UpdatePitch( uint coreidx, uint voiceidx )
{
	V_Voice& vc( Cores[coreidx].Voices[voiceidx] );

	// [Air] : Modulated is tested first because it is much more likely to be
	//   zero than the voice index, so the second test is usually skipped.
	const bool modulate = (vc.Modulated != 0) && (voiceidx != 0);

	s32 step;
	if( modulate )
		step = (vc.Pitch*(32768 + Cores[coreidx].Voices[voiceidx-1].OutX))>>15;
	else
		step = vc.Pitch;

	vc.SP += step;
}


// Steps the voice's ADSR envelope.
// A voice in phase 0 simply has its envelope value forced to zero.  Otherwise
// the envelope is advanced, and if ADSR.Calculate() returns false the voice is
// stopped and its ENDX bit is set.
static __forceinline void CalculateADSR( V_Core& thiscore, uint voiceidx )
{
	V_Voice& vc( thiscore.Voices[voiceidx] );

	if( vc.ADSR.Phase==0 )
	{
		vc.ADSR.Value = 0;
		return;
	}

	const bool stillRunning = vc.ADSR.Calculate();

	if( !stillRunning )
	{
		if( IsDevBuild && MsgVoiceOff() )
			ConLog("* SPU2-X: Voice Off by ADSR: %d \n", voiceidx);

		vc.Stop();
		thiscore.Regs.ENDX |= (1 << voiceidx);
	}

	jASSUME( vc.ADSR.Value >= 0 );	// ADSR should never be negative...
}

/*
   Hermite interpolation between y1 and y2, with y0 and y3 as outer neighbours.
   mu is the fractional position in 0.12 fixed point.
   The result is scaled like (y1<<1), i.e. twice the sample range.

   Tension: 65535 is high, 32768 is normal, 0 is low
*/
template<s32 i_tension>
 __forceinline
static s32 HermiteInterpolate(
	s32 y0, // 16.0
	s32 y1, // 16.0
	s32 y2, // 16.0
	s32 y3, // 16.0
	s32 mu  //  0.12
	)
{
	// Tension-scaled differences between neighbouring samples.
	const s32 d01 = ((y1-y0)*i_tension) >> 16; // 16.0
	const s32 d12 = ((y2-y1)*i_tension) >> 16; // 16.0
	const s32 d23 = ((y3-y2)*i_tension) >> 16; // 16.0

	// Tangents at y1 and y2.
	const s32 m0 = d01 + d12;
	const s32 m1 = d12 + d23;

	// Polynomial evaluated Horner-style, one multiply by mu per step.
	s32 acc = 2*y1 + m0 + m1 - 2*y2;
	acc = (acc * mu) >> 12; // 16.0

	acc = acc - 3*y1 - 2*m0 - m1 + 3*y2;
	acc = (acc * mu) >> 12; // 16.0

	acc = acc + m0;
	acc = (acc * mu) >> 11; // 16.0

	return acc + (y1<<1);
}

// Catmull-Rom interpolation between y1 and y2, with y0 and y3 as outer
// neighbours.  mu is the fractional position in 0.12 fixed point.
//
// Reference formula:
//   q(t) = 0.5 * ( (2 * P1) +
//                  (-P0 + P2) * t +
//                  (2*P0 - 5*P1 + 4*P2 - P3) * t^2 +
//                  (-P0 + 3*P1 - 3*P2 + P3) * t^3 )
//
// The 0.5 factor is not applied, so the result is twice q(t).
__forceinline
static s32 CatmullRomInterpolate(
	s32 y0, // 16.0
	s32 y1, // 16.0
	s32 y2, // 16.0
	s32 y3, // 16.0
	s32 mu  //  0.12
	)
{
	const s32 cubicTerm		= -y0 + 3*y1 - 3*y2 + y3;
	const s32 squareTerm	= 2*y0 - 5*y1 + 4*y2 - y3;
	const s32 linearTerm	= -y0 + y2;
	const s32 constantTerm	= 2*y1;

	// Horner evaluation, highest power first.
	s32 acc = (cubicTerm * mu) >> 12;
	acc = ((squareTerm + acc) * mu) >> 12;
	acc = ((linearTerm + acc) * mu) >> 12;

	return constantTerm + acc;
}

// Cubic interpolation between y1 and y2, with y0 and y3 as outer neighbours.
// mu is the fractional position in 0.12 fixed point.  The result is scaled
// like (y1<<1), i.e. twice the sample range.
__forceinline
static s32 CubicInterpolate(
	s32 y0, // 16.0
	s32 y1, // 16.0
	s32 y2, // 16.0
	s32 y3, // 16.0
	s32 mu  //  0.12
	)
{
	const s32 cubicCoef		= y3 - y2 - y0 + y1;
	const s32 squareCoef	= y0 - y1 - cubicCoef;
	const s32 linearCoef	= y2 - y0;

	// Horner evaluation; the last step shifts one bit less to double the result.
	s32 acc = (cubicCoef * mu) >> 12;
	acc = ((acc + squareCoef) * mu) >> 12;
	acc = ((acc + linearCoef) * mu) >> 11;

	return acc + (y1<<1);
}

// Fetches as many new samples as the voice's position counter (SP) calls for
// and returns the interpolated output value, scaled like (sample<<1).
//
// InterpType selects the interpolation at compile time, so a separate version
// of the function is generated for each of the five modes:
//   0 = nearest, 1 = linear, 2 = cubic, 3 = hermite, 4 = catmull-rom.
// Only modes 2 and up keep the two extra history samples (PV3/PV4).
template< int InterpType >
static __forceinline s32 GetVoiceValues( V_Core& thiscore, uint voiceidx )
{
	V_Voice& vc( thiscore.Voices[voiceidx] );

	// One whole sample corresponds to 4096 units of SP.
	for( ; vc.SP > 0; vc.SP -= 4096 )
	{
		if( InterpType >= 2 )
		{
			vc.PV4 = vc.PV3;
			vc.PV3 = vc.PV2;
		}
		vc.PV2 = vc.PV1;
		vc.PV1 = GetNextDataBuffered( thiscore, voiceidx );
	}

	// Fractional position between the two newest samples, 0.12 fixed point.
	const s32 mu = vc.SP + 4096;

	switch( InterpType )
	{
		case 0:
			return vc.PV1<<1;

		case 1:
			return (vc.PV1<<1) - (( (vc.PV2 - vc.PV1) * vc.SP)>>11);

		case 2:
			return CubicInterpolate( vc.PV4, vc.PV3, vc.PV2, vc.PV1, mu );

		case 3:
			return HermiteInterpolate<16384>( vc.PV4, vc.PV3, vc.PV2, vc.PV1, mu );

		case 4:
			return CatmullRomInterpolate( vc.PV4, vc.PV3, vc.PV2, vc.PV1, mu );

		jNO_DEFAULT;
	}

	return 0;		// technically unreachable!
}

// Per-voice noise source.  Noise is mixed without going through interpolation,
// since interpolating it can wreak havoc on the noise (muffling or popping).
// Not that the generator is accurate in its own right.. but eh, ah well :)
//
// Exactly one value is drawn from the generator per call; the voice's position
// counter is not consulted.
static __forceinline s32 GetNoiseValues( V_Core& thiscore, uint voiceidx )
{
	V_Voice& vc( thiscore.Voices[voiceidx] );

	const s32 noise = GetNoiseValues();

	// The noise path can't set the phase to zero unexpectedly the way
	// GetVoiceValues can.  Better assert just in case though..
	jASSUME( vc.ADSR.Phase != 0 );

	return noise;
}

/////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////
//                                                                                     //

// Writes a signed 16-bit value to SPU2 ram.
// Performs no cache invalidation -- use only for the dynamic memory range of
// the SPU2 (between 0x0000 and SPU2_DYN_MEMLINE).
//
// Any core whose IRQ is enabled and whose IRQ address equals 'addr' gets its
// IRQ raised before the write takes place.
static __forceinline void spu2M_WriteFast( u32 addr, s16 value )
{
	// Fixes some of the oldest hangs in pcsx2's history! :p
	for( uint core=0; core<2; ++core )
	{
		const bool irqHit = Cores[core].IRQEnable && (Cores[core].IRQA == addr);

		if( irqHit )
			SetIrqCall(core);
	}

	// Assert that the address lies inside the dynamic range:
#ifndef DEBUG_FAST
	jASSUME( addr < SPU2_DYN_MEMLINE );
#endif

	s16* const dest = GetMemPtr( addr );
	*dest = value;
}


// Processes one voice of one core for a single output sample and returns its
// stereo contribution (post ADSR and post voice volume).
//
// Active voices (ADSR phase > 0) advance their pitch counter, produce a sample
// from either the noise generator or the interpolated sample data, run the
// envelope, and update OutX (the value UpdatePitch reads to modulate the next
// voice).  Inactive voices return silence but still step through sample memory
// so that address-triggered IRQs keep firing.
//
// For voices 1 and 3 a value is also written back to SPU2 ram at a fixed
// per-core base address plus OutPos: OutX for active voices, 0 otherwise.
static __forceinline StereoOut32 MixVoice( uint coreidx, uint voiceidx )
{
	V_Core& thiscore( Cores[coreidx] );
	V_Voice& vc( thiscore.Voices[voiceidx] );

	// If this assertion fails, it means SCurrent is being corrupted somewhere, or is not initialized
	// properly.  Invalid values in SCurrent will cause errant IRQs and corrupted audio.
	pxAssumeMsg( (vc.SCurrent <= 28) && (vc.SCurrent != 0), "Current sample should always range from 1->28" );

	// Volume slides are updated unconditionally on every call; no "is a slide
	// active" flag is checked here.
	// (Note: Ys 6 : Ark of Nephistm uses volume slide effects)

	vc.Volume.Update();

	// SPU2 Note: The spu2 continues to process voices for eternity, always, so we
	// have to run through all the motions of updating the voice regardless of its
	// audible status.  Otherwise IRQs might not trigger and emulation might fail.

	if( vc.ADSR.Phase > 0 )
	{
		UpdatePitch( coreidx, voiceidx );

		s32 Value;

		if( vc.Noise )
			Value = GetNoiseValues( thiscore, voiceidx );
		else
		{
			// Optimization : Forceinline'd Templated Dispatch Table.  Any halfwit compiler will
			// turn this into a clever jump dispatch table (no call/rets, no compares, uber-efficient!)

			switch( Interpolation )
			{
				case 0: Value = GetVoiceValues<0>( thiscore, voiceidx ); break;
				case 1: Value = GetVoiceValues<1>( thiscore, voiceidx ); break;
				case 2: Value = GetVoiceValues<2>( thiscore, voiceidx ); break;
				case 3: Value = GetVoiceValues<3>( thiscore, voiceidx ); break;
				case 4: Value = GetVoiceValues<4>( thiscore, voiceidx ); break;

				jNO_DEFAULT;
			}
		}

		// Update and Apply ADSR  (applies to normal and noise sources)
		//
		// Note!  It's very important that ADSR stay as accurate as possible.  By the way
		// it is used, various sound effects can end prematurely if we truncate more than
		// one or two bits.  Best result comes from no truncation at all, which is why we
		// use a full 64-bit multiply/result here.

		CalculateADSR( thiscore, voiceidx );
		Value	= MulShr32( Value, vc.ADSR.Value );

		// Store Value for eventual modulation later
		// Pseudonym's Crest calculation idea. Actually calculates a crest, unlike the old code which was just peak.
		//
		// NextCrest holds the most recent amplitude that was larger than the
		// amplitude before it.  Once the amplitude drops below that value, the
		// candidate is published to OutX and cleared.
		u32 Amplitude = std::abs(Value);
		if(Amplitude < vc.NextCrest)
		{
			vc.OutX = vc.NextCrest;
			vc.NextCrest = 0;
		}
		if(Amplitude > vc.PrevAmp)
		{
			vc.NextCrest = Amplitude;
		}
		vc.PrevAmp = Amplitude;

		if( IsDevBuild )
			DebugCores[coreidx].Voices[voiceidx].displayPeak = std::max(DebugCores[coreidx].Voices[voiceidx].displayPeak,(s32)vc.OutX);

		// Write-back of raw voice data (post ADSR applied)
		// Note: spu2M_WriteFast takes an s16, so OutX is narrowed to 16 bits here.

		if (voiceidx==1)      spu2M_WriteFast( ( (0==coreidx) ? 0x400 : 0xc00 ) + OutPos, vc.OutX );
		else if (voiceidx==3) spu2M_WriteFast( ( (0==coreidx) ? 0x600 : 0xe00 ) + OutPos, vc.OutX );

		return ApplyVolume( StereoOut32( Value, Value ), vc.Volume );
	}
	else
	{
		// Continue processing voice, even if it's "off". Or else we miss interrupts! (Fatal Frame engine died because of this.)
		// The advance is skipped when both low loop-flag bits are set and the
		// loop start equals the current address rounded down to a multiple of 8
		// words -- presumably a voice parked in a loop on its own block.
		if ((vc.LoopFlags & 3) != 3 || vc.LoopStartA != (vc.NextA & ~7)) {
			UpdatePitch(coreidx, voiceidx);

			while (vc.SP > 0)
				GetNextDataDummy(thiscore, voiceidx); // Dummy is enough
		}

		// Write-back of raw voice data (some zeros since the voice is "dead")
		if (voiceidx==1)      spu2M_WriteFast( ( (0==coreidx) ? 0x400 : 0xc00 ) + OutPos, 0 );
		else if (voiceidx==3) spu2M_WriteFast( ( (0==coreidx) ? 0x600 : 0xe00 ) + OutPos, 0 );

		return StereoOut32( 0, 0 );
	}
}

// Shared VoiceMixSet whose Dry and Wet members are default-constructed.
// Don't use StereoOut32::Empty here, because C++ doesn't make any dependency/order
// checks on global initializers.
const VoiceMixSet VoiceMixSet::Empty( (StereoOut32()), (StereoOut32()) );

// Mixes every voice of one core and accumulates the results into 'dest'.
// Each voice's output is masked by that voice's dry/wet gate values before it
// is added to the corresponding Dry/Wet channel.
static __forceinline void MixCoreVoices( VoiceMixSet& dest, const uint coreidx )
{
	V_Core& core = Cores[coreidx];

	for( uint v=0; v<V_Core::NumVoices; ++v )
	{
		// Note: Results from MixVoice are ranged at 16 bits.
		const StereoOut32 sample( MixVoice( coreidx, v ) );

		dest.Dry.Left	+= sample.Left	& core.VoiceGates[v].DryL;
		dest.Wet.Left	+= sample.Left	& core.VoiceGates[v].WetL;

		dest.Dry.Right	+= sample.Right	& core.VoiceGates[v].DryR;
		dest.Wet.Right	+= sample.Right	& core.VoiceGates[v].WetR;
	}
}

// Combines this core's voice mix with its input and external sources and
// returns the core's output sample (before master volume is applied).
//
//   inVoices - summed dry/wet voice output for this core (see MixCoreVoices).
//   Input    - the core's input stream sample.
//   Ext      - external source sample (the caller passes nothing for core 0
//              and core 0's output for core 1).
//
// Side effects: updates the master volume slide, writes the clamped dry/wet
// voice mixes to SPU2 ram at per-core addresses offset by OutPos, feeds the
// wave dump, and advances the reverb buffer unless effects are disabled.
StereoOut32 V_Core::Mix( const VoiceMixSet& inVoices, const StereoOut32& Input, const StereoOut32& Ext )
{
	MasterVol.Update();

	// Saturate final result to standard 16 bit range.
	const VoiceMixSet Voices( clamp_mix( inVoices.Dry ), clamp_mix( inVoices.Wet ) );

	// Write Mixed results To Output Area
	spu2M_WriteFast( ( (0==Index) ? 0x1000 : 0x1800 ) + OutPos, Voices.Dry.Left );
	spu2M_WriteFast( ( (0==Index) ? 0x1200 : 0x1A00 ) + OutPos, Voices.Dry.Right );
	spu2M_WriteFast( ( (0==Index) ? 0x1400 : 0x1C00 ) + OutPos, Voices.Wet.Left );
	spu2M_WriteFast( ( (0==Index) ? 0x1600 : 0x1E00 ) + OutPos, Voices.Wet.Right );

	// Write mixed results to logfile (if enabled)

	WaveDump::WriteCore( Index, CoreSrc_DryVoiceMix, Voices.Dry );
	WaveDump::WriteCore( Index, CoreSrc_WetVoiceMix, Voices.Wet );

	// Build the dry mix (TD).  Each source is masked by its gate value before
	// being added.

	// Mix in the Input data

	StereoOut32 TD(
		Input.Left	& DryGate.InpL,
		Input.Right	& DryGate.InpR
	);

	// Mix in the Voice data
	TD.Left		+= Voices.Dry.Left	& DryGate.SndL;
	TD.Right	+= Voices.Dry.Right	& DryGate.SndR;

	// Mix in the External (nothing/core0) data
	TD.Left		+= Ext.Left			& DryGate.ExtL;
	TD.Right	+= Ext.Right		& DryGate.ExtR;

	// User-level Effects disabling.  Nice speedup but breaks games that depend on
	// reverb IRQs (very few -- if you find one name it here!).
	if( EffectsDisabled ) return TD;

	// ----------------------------------------------------------------------------
	//    Reverberation Effects Processing
	// ----------------------------------------------------------------------------
	// SPU2 has an FxEnable bit which seems to disable all reverb processing *and*
	// output, but does *not* disable the advancing buffers.  IRQs are not triggered
	// and reverb is rendered silent.
	//
	// Technically we should advance the buffers even when fx are disabled.  However
	// there are two things that make this very unlikely to matter:
	//
	//  1. Any SPU2 app wanting to avoid noise or pops needs to clear the reverb buffers
	//     when adjusting settings anyway; so the read/write positions in the reverb
	//     buffer after FxEnabled is set back to 1 doesn't really matter.
	//
	//  2. Writes to ESA (and possibly EEA) reset the buffer pointers to 0.
	//
	// On the other hand, updating the buffer is cheap and easy, so might as well. ;)

	Reverb_AdvanceBuffer(); // Updates the reverb work area as well, if needed.
	if (!FxEnable) return TD;

	// Build the wet mix (TW), the input to the reverb unit, from the same three
	// sources using the wet gates.
	StereoOut32 TW;

	// Mix Input, Voice, and External data:

	TW.Left		 = Input.Left		& WetGate.InpL;
	TW.Right	 = Input.Right		& WetGate.InpR;

	TW.Left		+= Voices.Wet.Left	& WetGate.SndL;
	TW.Right	+= Voices.Wet.Right	& WetGate.SndR;
	TW.Left		+= Ext.Left			& WetGate.ExtL;
	TW.Right	+= Ext.Right		& WetGate.ExtR;

	WaveDump::WriteCore( Index, CoreSrc_PreReverb, TW );

	StereoOut32 RV( DoReverb( TW ) );

	WaveDump::WriteCore( Index, CoreSrc_PostReverb, RV );

	// Boost reverb volume: ReverbBoost 0..3 selects a multiplier of 1, 2, 4 or 8.
	// Any other value leaves the multiplier at 1.
	int temp = 1;
	switch (ReverbBoost)
	{
		case 0: break;
		case 1: temp = 2; break;
		case 2: temp = 4; break;
		case 3: temp = 8; break;
	}
	// Mix Dry + Wet
	// (master volume is applied later to the result of both outputs added together).
	return TD + ApplyVolume( RV*temp, FxVol );
}

// used to throttle the output rate of cache stat reports
static int p_cachestat_counter=0;

// Produces one stereo output sample for the whole SPU2 and hands it to the
// sound buffer.
//
// Order of operations:
//   1. Read the input stream of each core (silenced for core 0 when PlayMode
//      bit 2 is set, and for core 1 when PlayMode bit 3 is set).
//   2. Mix the voices of both cores.
//   3. Run core 0's mixer; its output, after master volume and clamping,
//      becomes the external source ("Ext") of core 1.  That value is written
//      to SPU2 ram, then scaled by core 1's external input volume.
//   4. Run core 1's mixer and apply its master volume, or replace the output
//      with the raw CDDA feed when PlayMode bit 3 is set.
//   5. Advance OutPos (wraps at 0x200) and, in dev builds, periodically report
//      the PCM cache statistics.
__forceinline void Mix()
{
	// Note: Playmode 4 is SPDIF, which overrides other inputs.
	StereoOut32 InputData[2] =
	{
		// SPDIF is on Core 0:
		(PlayMode&4) ? StereoOut32::Empty : ApplyVolume( Cores[0].ReadInput(), Cores[0].InpVol ),

		// CDDA is on Core 1:
		(PlayMode&8) ? StereoOut32::Empty : ApplyVolume( Cores[1].ReadInput(), Cores[1].InpVol )
	};

	WaveDump::WriteCore( 0, CoreSrc_Input, InputData[0] );
	WaveDump::WriteCore( 1, CoreSrc_Input, InputData[1] );

	// Todo: Replace me with memzero initializer!
	VoiceMixSet VoiceData[2] = { VoiceMixSet::Empty, VoiceMixSet::Empty };	// mixed voice data for each core.
	MixCoreVoices( VoiceData[0], 0 );
	MixCoreVoices( VoiceData[1], 1 );

	StereoOut32 Ext( Cores[0].Mix( VoiceData[0], InputData[0], StereoOut32::Empty ) );

	if( (PlayMode & 4) || (Cores[0].Mute!=0) )
		Ext = StereoOut32::Empty;
	else
	{
		Ext = clamp_mix( ApplyVolume( Ext, Cores[0].MasterVol ) );
	}

	// Commit Core 0 output to ram before mixing Core 1:

	spu2M_WriteFast( 0x800 + OutPos, Ext.Left );
	spu2M_WriteFast( 0xA00 + OutPos, Ext.Right );
	WaveDump::WriteCore( 0, CoreSrc_External, Ext );

	// Scale core 0's output by core 1's external input volume.  ApplyVolume
	// returns the scaled sample rather than modifying its argument, so the
	// result has to be assigned back; discarding it leaves the volume unapplied.
	Ext = ApplyVolume( Ext, Cores[1].ExtVol );
	StereoOut32 Out( Cores[1].Mix( VoiceData[1], InputData[1], Ext ) );

	if( PlayMode & 8 )
	{
		// Experimental CDDA support
		// The CDDA overrides all other mixer output.  It's a direct feed!

		Out = Cores[1].ReadInput_HiFi();
		//WaveLog::WriteCore( 1, "CDDA-32", OutL, OutR );
	}
	else
	{
		Out.Left = MulShr32( Out.Left<<(SndOutVolumeShift+1), Cores[1].MasterVol.Left.Value );
		Out.Right = MulShr32( Out.Right<<(SndOutVolumeShift+1), Cores[1].MasterVol.Right.Value );

		// Final Clamp!
		// Like any good audio system, the PS2 pumps the volume and incurs some distortion in its
		// output, giving us a nice thumpy sound at times.  So we add 1 above (2x volume pump) and
		// then clamp it all here.

		Out = clamp_mix( Out, SndOutVolumeShift );
	}

	SndBuffer::Write( Out );

	// Update AutoDMA output positioning
	OutPos++;
	if (OutPos>=0x200) OutPos=0;

	if( IsDevBuild )
	{
		// Report roughly once per (48000*10) calls, then start counting afresh.
		p_cachestat_counter++;
		if(p_cachestat_counter > (48000*10) )
		{
			p_cachestat_counter = 0;
			if( MsgCache() ) ConLog( " * SPU2 > CacheStats > Hits: %d  Misses: %d  Ignores: %d\n",
				g_counter_cache_hits,
				g_counter_cache_misses,
				g_counter_cache_ignores );

			g_counter_cache_hits =
			g_counter_cache_misses =
			g_counter_cache_ignores = 0;
		}
	}
}

/////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////
//                                                                                     //

/*
-----------------------------------------------------------------------------
PSX reverb hardware notes
by Neill Corlett
-----------------------------------------------------------------------------

Yadda yadda disclaimer yadda probably not perfect yadda well it's okay anyway
yadda yadda.

-----------------------------------------------------------------------------

Basics
------

- The reverb buffer is 22khz 16-bit mono PCM.
- It starts at the reverb address given by 1DA2, extends to
  the end of sound RAM, and wraps back to the 1DA2 address.

Setting the address at 1DA2 resets the current reverb work address.

This work address ALWAYS increments every 1/22050 sec., regardless of
whether reverb is enabled (bit 7 of 1DAA set).

And the contents of the reverb buffer ALWAYS play, scaled by the
"reverberation depth left/right" volumes (1D84/1D86).
(which, by the way, appear to be scaled so 3FFF=approx. 1.0, 4000=-1.0)

-----------------------------------------------------------------------------

Register names
--------------

These are probably not their real names.
These are probably not even correct names.
We will use them anyway, because we can.

1DC0: FB_SRC_A       (offset)
1DC2: FB_SRC_B       (offset)
1DC4: IIR_ALPHA      (coef.)
1DC6: ACC_COEF_A     (coef.)
1DC8: ACC_COEF_B     (coef.)
1DCA: ACC_COEF_C     (coef.)
1DCC: ACC_COEF_D     (coef.)
1DCE: IIR_COEF       (coef.)
1DD0: FB_ALPHA       (coef.)
1DD2: FB_X           (coef.)
1DD4: IIR_DEST_A0    (offset)
1DD6: IIR_DEST_A1    (offset)
1DD8: ACC_SRC_A0     (offset)
1DDA: ACC_SRC_A1     (offset)
1DDC: ACC_SRC_B0     (offset)
1DDE: ACC_SRC_B1     (offset)
1DE0: IIR_SRC_A0     (offset)
1DE2: IIR_SRC_A1     (offset)
1DE4: IIR_DEST_B0    (offset)
1DE6: IIR_DEST_B1    (offset)
1DE8: ACC_SRC_C0     (offset)
1DEA: ACC_SRC_C1     (offset)
1DEC: ACC_SRC_D0     (offset)
1DEE: ACC_SRC_D1     (offset)
1DF0: IIR_SRC_B1     (offset)
1DF2: IIR_SRC_B0     (offset)
1DF4: MIX_DEST_A0    (offset)
1DF6: MIX_DEST_A1    (offset)
1DF8: MIX_DEST_B0    (offset)
1DFA: MIX_DEST_B1    (offset)
1DFC: IN_COEF_L      (coef.)
1DFE: IN_COEF_R      (coef.)

The coefficients are signed fractional values.
-32768 would be -1.0
 32768 would be  1.0 (if it were possible... the highest is of course 32767)

The offsets are (byte/8) offsets into the reverb buffer.
i.e. you multiply them by 8, you get byte offsets.
You can also think of them as (samples/4) offsets.
They appear to be signed.  They can be negative.
None of the documented presets make them negative, though.

Yes, 1DF0 and 1DF2 appear to be backwards.  Not a typo.

-----------------------------------------------------------------------------

What it does
------------

We take all reverb sources:
- regular channels that have the reverb bit on
- cd and external sources, if their reverb bits are on
and mix them into one stereo 44100hz signal.

Lowpass/downsample that to 22050hz.  The PSX uses a proper bandlimiting
algorithm here, but I haven't figured out the hysterically exact specifics.
I use an 8-tap filter with these coefficients, which are nice but probably
not the real ones:

0.037828187894
0.157538631280
0.321159685278
0.449322115345
0.449322115345
0.321159685278
0.157538631280
0.037828187894

So we have two input samples (INPUT_SAMPLE_L, INPUT_SAMPLE_R) every 22050hz.

* IN MY EMULATION, I divide these by 2 to make it clip less.
  (and of course the L/R output coefficients are adjusted to compensate)
  The real thing appears to not do this.

At every 22050hz tick:
- If the reverb bit is enabled (bit 7 of 1DAA), execute the reverb
  steady-state algorithm described below
- AFTERWARDS, retrieve the "wet out" L and R samples from the reverb buffer
  (This part may not be exactly right and I guessed at the coefs. TODO: check later.)
  L is: 0.333 * (buffer[MIX_DEST_A0] + buffer[MIX_DEST_B0])
  R is: 0.333 * (buffer[MIX_DEST_A1] + buffer[MIX_DEST_B1])
- Advance the current buffer position by 1 sample

The wet out L and R are then upsampled to 44100hz and played at the
"reverberation depth left/right" (1D84/1D86) volume, independent of the main
volume.

-----------------------------------------------------------------------------

Reverb steady-state
-------------------

The reverb steady-state algorithm is fairly clever, and of course by
"clever" I mean "batshit insane".

buffer[x] is relative to the current buffer position, not the beginning of
the buffer.  Note that all buffer offsets must wrap around so they're
contained within the reverb work area.

Clipping is performed at the end... maybe also sooner, but definitely at
the end.

IIR_INPUT_A0 = buffer[IIR_SRC_A0] * IIR_COEF + INPUT_SAMPLE_L * IN_COEF_L;
IIR_INPUT_A1 = buffer[IIR_SRC_A1] * IIR_COEF + INPUT_SAMPLE_R * IN_COEF_R;
IIR_INPUT_B0 = buffer[IIR_SRC_B0] * IIR_COEF + INPUT_SAMPLE_L * IN_COEF_L;
IIR_INPUT_B1 = buffer[IIR_SRC_B1] * IIR_COEF + INPUT_SAMPLE_R * IN_COEF_R;

IIR_A0 = IIR_INPUT_A0 * IIR_ALPHA + buffer[IIR_DEST_A0] * (1.0 - IIR_ALPHA);
IIR_A1 = IIR_INPUT_A1 * IIR_ALPHA + buffer[IIR_DEST_A1] * (1.0 - IIR_ALPHA);
IIR_B0 = IIR_INPUT_B0 * IIR_ALPHA + buffer[IIR_DEST_B0] * (1.0 - IIR_ALPHA);
IIR_B1 = IIR_INPUT_B1 * IIR_ALPHA + buffer[IIR_DEST_B1] * (1.0 - IIR_ALPHA);

buffer[IIR_DEST_A0 + 1sample] = IIR_A0;
buffer[IIR_DEST_A1 + 1sample] = IIR_A1;
buffer[IIR_DEST_B0 + 1sample] = IIR_B0;
buffer[IIR_DEST_B1 + 1sample] = IIR_B1;

ACC0 = buffer[ACC_SRC_A0] * ACC_COEF_A +
       buffer[ACC_SRC_B0] * ACC_COEF_B +
       buffer[ACC_SRC_C0] * ACC_COEF_C +
       buffer[ACC_SRC_D0] * ACC_COEF_D;
ACC1 = buffer[ACC_SRC_A1] * ACC_COEF_A +
       buffer[ACC_SRC_B1] * ACC_COEF_B +
       buffer[ACC_SRC_C1] * ACC_COEF_C +
       buffer[ACC_SRC_D1] * ACC_COEF_D;

FB_A0 = buffer[MIX_DEST_A0 - FB_SRC_A];
FB_A1 = buffer[MIX_DEST_A1 - FB_SRC_A];
FB_B0 = buffer[MIX_DEST_B0 - FB_SRC_B];
FB_B1 = buffer[MIX_DEST_B1 - FB_SRC_B];

buffer[MIX_DEST_A0] = ACC0 - FB_A0 * FB_ALPHA;
buffer[MIX_DEST_A1] = ACC1 - FB_A1 * FB_ALPHA;
buffer[MIX_DEST_B0] = (FB_ALPHA * ACC0) - FB_A0 * (FB_ALPHA^0x8000) - FB_B0 * FB_X;
buffer[MIX_DEST_B1] = (FB_ALPHA * ACC1) - FB_A1 * (FB_ALPHA^0x8000) - FB_B1 * FB_X;

Air notes:
  The above is effectively the same as:
    buffer[MIX_DEST_B0] = (ACC0 * FB_ALPHA) + (FB_A0 * (1.0-FB_ALPHA)) - FB_B0 * FB_X;
    buffer[MIX_DEST_B1] = (ACC1 * FB_ALPHA) + (FB_A1 * (1.0-FB_ALPHA)) - FB_B1 * FB_X;

  Which reduces to:
    buffer[MIX_DEST_B0] = ACC0 + ((FB_A0-ACC0) * FB_ALPHA) - FB_B0 * FB_X;
    buffer[MIX_DEST_B1] = ACC1 + ((FB_A1-ACC1) * FB_ALPHA) - FB_B1 * FB_X;


-----------------------------------------------------------------------------
*/