mirror of https://github.com/PCSX2/pcsx2.git
940 lines
27 KiB
C++
940 lines
27 KiB
C++
/* SPU2-X, A plugin for Emulating the Sound Processing Unit of the Playstation 2
|
|
* Developed and maintained by the Pcsx2 Development Team.
|
|
*
|
|
* Original portions from SPU2ghz are (c) 2008 by David Quintana [gigaherz]
|
|
*
|
|
* SPU2-X is free software: you can redistribute it and/or modify it under the terms
|
|
* of the GNU Lesser General Public License as published by the Free Software Found-
|
|
* ation, either version 3 of the License, or (at your option) any later version.
|
|
*
|
|
* SPU2-X is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
|
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
|
* PURPOSE. See the GNU Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public License
|
|
* along with SPU2-X. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#include "Spu2.h"
|
|
#include <float.h>
|
|
|
|
extern void spdif_update();
|
|
|
|
void ADMAOutLogWrite(void *lpData, u32 ulSize);
|
|
|
|
static const s32 tbl_XA_Factor[5][2] =
|
|
{
|
|
{ 0, 0 },
|
|
{ 60, 0 },
|
|
{ 115, -52 },
|
|
{ 98, -55 },
|
|
{ 122, -60 }
|
|
};
|
|
|
|
|
|
// Performs a 64-bit multiplication between two values and returns the
|
|
// high 32 bits as a result (discarding the fractional 32 bits).
|
|
// The combined fractional bits of both inputs must be 32 bits for this
|
|
// to work properly.
|
|
//
|
|
// This is meant to be a drop-in replacement for times when the 'div' part
|
|
// of a MulDiv is a constant. (example: 1<<8, or 4096, etc)
|
|
//
|
|
// [Air] Performance breakdown: This is over 10 times faster than MulDiv in
|
|
// a *worst case* scenario. It's also more accurate since it forces the
|
|
// caller to extend the inputs so that they make use of all 32 bits of
|
|
// precision.
|
|
//
|
|
__forceinline s32 MulShr32( s32 srcval, s32 mulval )
|
|
{
|
|
s64 tmp = ((s64)srcval * mulval );
|
|
return ((s32*)&tmp)[1];
|
|
|
|
// Performance note: Using the temp var and memory reference
|
|
// actually ends up being roughly 2x faster than using a bitshift.
|
|
// It won't fly on big endian machines though... :)
|
|
}
|
|
|
|
__forceinline s32 clamp_mix( s32 x, u8 bitshift )
|
|
{
|
|
return GetClamped( x, -0x8000<<bitshift, 0x7fff<<bitshift );
|
|
}
|
|
|
|
__forceinline StereoOut32 clamp_mix( const StereoOut32& sample, u8 bitshift )
|
|
{
|
|
return StereoOut32(
|
|
GetClamped( sample.Left, -0x8000<<bitshift, 0x7fff<<bitshift ),
|
|
GetClamped( sample.Right, -0x8000<<bitshift, 0x7fff<<bitshift )
|
|
);
|
|
}
|
|
|
|
static void __forceinline XA_decode_block(s16* buffer, const s16* block, s32& prev1, s32& prev2)
|
|
{
|
|
const s32 header = *block;
|
|
const s32 shift = ((header>> 0)&0xF)+16;
|
|
const s32 pred1 = tbl_XA_Factor[(header>> 4)&0xF][0];
|
|
const s32 pred2 = tbl_XA_Factor[(header>> 4)&0xF][1];
|
|
|
|
const s8* blockbytes = (s8*)&block[1];
|
|
|
|
for(int i=0; i<14; i++, blockbytes++)
|
|
{
|
|
s32 pcm, pcm2;
|
|
{
|
|
s32 data = ((*blockbytes)<<28) & 0xF0000000;
|
|
pcm = data>>shift;
|
|
pcm+=((pred1*prev1)+(pred2*prev2))>>6;
|
|
if(pcm> 32767) pcm= 32767;
|
|
else if(pcm<-32768) pcm=-32768;
|
|
*(buffer++) = pcm;
|
|
}
|
|
|
|
//prev2=prev1;
|
|
//prev1=pcm;
|
|
|
|
{
|
|
s32 data = ((*blockbytes)<<24) & 0xF0000000;
|
|
pcm2 = data>>shift;
|
|
pcm2+=((pred1*pcm)+(pred2*prev1))>>6;
|
|
if(pcm2> 32767) pcm2= 32767;
|
|
else if(pcm2<-32768) pcm2=-32768;
|
|
*(buffer++) = pcm2;
|
|
}
|
|
|
|
prev2=pcm;
|
|
prev1=pcm2;
|
|
}
|
|
}
|
|
|
|
static void __forceinline XA_decode_block_unsaturated(s16* buffer, const s16* block, s32& prev1, s32& prev2)
|
|
{
|
|
const u8 header = *(u8*)block;
|
|
s32 shift = (header&0xF) + 16;
|
|
s32 pred1 = tbl_XA_Factor[header>>4][0];
|
|
s32 pred2 = tbl_XA_Factor[header>>4][1];
|
|
|
|
const s8* blockbytes = (s8*)&block[1];
|
|
|
|
for(int i=0; i<14; i++, blockbytes++)
|
|
{
|
|
s32 pcm, pcm2;
|
|
{
|
|
s32 data = ((*blockbytes)<<28) & 0xF0000000;
|
|
pcm = data>>shift;
|
|
pcm+=((pred1*prev1)+(pred2*prev2))>>6;
|
|
*(buffer++) = pcm;
|
|
}
|
|
|
|
{
|
|
s32 data = ((*blockbytes)<<24) & 0xF0000000;
|
|
pcm2 = data>>shift;
|
|
pcm2+=((pred1*pcm)+(pred2*prev1))>>6;
|
|
*(buffer++) = pcm2;
|
|
}
|
|
|
|
prev2 = pcm;
|
|
prev1 = pcm2;
|
|
}
|
|
}
|
|
|
|
static void __forceinline IncrementNextA( const V_Core& thiscore, V_Voice& vc )
|
|
{
|
|
// Important! Both cores signal IRQ when an address is read, regardless of
|
|
// which core actually reads the address.
|
|
|
|
for( int i=0; i<2; i++ )
|
|
{
|
|
if( Cores[i].IRQEnable && (vc.NextA==Cores[i].IRQA ) )
|
|
{
|
|
if( IsDevBuild )
|
|
ConLog(" * SPU2 Core %d: IRQ Called (IRQ passed).\n", i);
|
|
|
|
Spdif.Info = 4 << i;
|
|
SetIrqCall();
|
|
}
|
|
}
|
|
|
|
vc.NextA++;
|
|
vc.NextA&=0xFFFFF;
|
|
}
|
|
|
|
// decoded pcm data, used to cache the decoded data so that it needn't be decoded
|
|
// multiple times. Cache chunks are decoded when the mixer requests the blocks, and
|
|
// invalided when DMA transfers and memory writes are performed.
|
|
PcmCacheEntry *pcm_cache_data = NULL;
|
|
|
|
int g_counter_cache_hits = 0;
|
|
int g_counter_cache_misses = 0;
|
|
int g_counter_cache_ignores = 0;
|
|
|
|
#define XAFLAG_LOOP_END (1ul<<0)
|
|
#define XAFLAG_LOOP (1ul<<1)
|
|
#define XAFLAG_LOOP_START (1ul<<2)
|
|
|
|
static __forceinline s32 __fastcall GetNextDataBuffered( V_Core& thiscore, uint voiceidx )
|
|
{
|
|
V_Voice& vc( thiscore.Voices[voiceidx] );
|
|
|
|
if( vc.SCurrent == 28 )
|
|
{
|
|
if(vc.LoopFlags & XAFLAG_LOOP_END)
|
|
{
|
|
thiscore.Regs.ENDX |= (1 << voiceidx);
|
|
|
|
if( vc.LoopFlags & XAFLAG_LOOP )
|
|
{
|
|
vc.NextA = vc.LoopStartA;
|
|
}
|
|
else
|
|
{
|
|
vc.Stop();
|
|
if( IsDevBuild )
|
|
{
|
|
if(MsgVoiceOff()) ConLog(" * SPU2: Voice Off by EndPoint: %d \n", voiceidx);
|
|
}
|
|
}
|
|
}
|
|
|
|
// We'll need the loop flags and buffer pointers regardless of cache status:
|
|
// Note to Self : NextA addresses WORDS (not bytes).
|
|
|
|
s16* memptr = GetMemPtr(vc.NextA&0xFFFFF);
|
|
vc.LoopFlags = *memptr >> 8; // grab loop flags from the upper byte.
|
|
|
|
const int cacheIdx = vc.NextA / pcm_WordsPerBlock;
|
|
PcmCacheEntry& cacheLine = pcm_cache_data[cacheIdx];
|
|
vc.SBuffer = cacheLine.Sampledata;
|
|
|
|
if( cacheLine.Validated )
|
|
{
|
|
// Cached block! Read from the cache directly.
|
|
// Make sure to propagate the prev1/prev2 ADPCM:
|
|
|
|
vc.Prev1 = vc.SBuffer[27];
|
|
vc.Prev2 = vc.SBuffer[26];
|
|
|
|
//ConLog( " * SPU2 : Cache Hit! NextA=0x%x, cacheIdx=0x%x\n", vc.NextA, cacheIdx );
|
|
|
|
if( IsDevBuild )
|
|
g_counter_cache_hits++;
|
|
}
|
|
else
|
|
{
|
|
// Only flag the cache if it's a non-dynamic memory range.
|
|
if( vc.NextA >= SPU2_DYN_MEMLINE )
|
|
cacheLine.Validated = true;
|
|
|
|
if( IsDevBuild )
|
|
{
|
|
if( vc.NextA < SPU2_DYN_MEMLINE )
|
|
g_counter_cache_ignores++;
|
|
else
|
|
g_counter_cache_misses++;
|
|
}
|
|
|
|
s16* sbuffer = cacheLine.Sampledata;
|
|
|
|
//if( vc.LoopFlags & XAFLAG_LOOP )
|
|
// vc.Prev1 = vc.Prev2 = 0;
|
|
|
|
// saturated decoder
|
|
//XA_decode_block( sbuffer, memptr, vc.Prev1, vc.Prev2 );
|
|
|
|
// [Air]: Testing use of a new unsaturated decoder. (benchmark needed)
|
|
// Chances are the saturation isn't needed, but for a very few exception games.
|
|
// This is definitely faster than the above version, but is it by enough to
|
|
// merit possible lower compatibility? Especially now that games that make
|
|
// heavy use of the SPU2 via music or sfx will mostly use the cache anyway.
|
|
|
|
XA_decode_block_unsaturated( vc.SBuffer, memptr, vc.Prev1, vc.Prev2 );
|
|
}
|
|
|
|
vc.SCurrent = 0;
|
|
if( (vc.LoopFlags & XAFLAG_LOOP_START) && !vc.LoopMode )
|
|
vc.LoopStartA = vc.NextA;
|
|
|
|
goto _Increment;
|
|
}
|
|
|
|
if( (vc.SCurrent&3) == 3 )
|
|
{
|
|
_Increment:
|
|
IncrementNextA( thiscore, vc );
|
|
}
|
|
|
|
return vc.SBuffer[vc.SCurrent++];
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////
|
|
/////////////////////////////////////////////////////////////////////////////////////////
|
|
// //
|
|
static s32 __forceinline GetNoiseValues()
|
|
{
|
|
static s32 Seed = 0x41595321;
|
|
s32 retval = 0x8000;
|
|
|
|
if( Seed&0x100 )
|
|
retval = (Seed&0xff) << 8;
|
|
else if( Seed&0xffff )
|
|
retval = 0x7fff;
|
|
|
|
#ifdef _WIN32
|
|
__asm {
|
|
MOV eax,Seed
|
|
ROR eax,5
|
|
XOR eax,0x9a
|
|
MOV ebx,eax
|
|
ROL eax,2
|
|
ADD eax,ebx
|
|
XOR eax,ebx
|
|
ROR eax,3
|
|
MOV Seed,eax
|
|
}
|
|
#else
|
|
__asm__ (
|
|
".intel_syntax\n"
|
|
"MOV %%eax,%0\n"
|
|
"ROR %%eax,5\n"
|
|
"XOR %%eax,0x9a\n"
|
|
"MOV %%ebx,%%eax\n"
|
|
"ROL %%eax,2\n"
|
|
"ADD %%eax,%%ebx\n"
|
|
"XOR %%eax,%%ebx\n"
|
|
"ROR %%eax,3\n"
|
|
"MOV %0,%%eax\n"
|
|
".att_syntax\n" : :"r"(Seed));
|
|
#endif
|
|
return retval;
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////
|
|
/////////////////////////////////////////////////////////////////////////////////////////
|
|
// //
|
|
|
|
// Data is expected to be 16 bit signed (typical stuff!).
|
|
// volume is expected to be 32 bit signed (31 bits with reverse phase)
|
|
// Output is effectively 15 bit range, thanks to the signed volume.
|
|
static __forceinline s32 ApplyVolume(s32 data, s32 volume)
|
|
{
|
|
//return (volume * data) >> 15;
|
|
return MulShr32( data<<1, volume );
|
|
}
|
|
|
|
static __forceinline StereoOut32 ApplyVolume( const StereoOut32& data, const V_VolumeLR& volume )
|
|
{
|
|
return StereoOut32(
|
|
ApplyVolume( data.Left, volume.Left ),
|
|
ApplyVolume( data.Right, volume.Right )
|
|
);
|
|
}
|
|
|
|
static __forceinline StereoOut32 ApplyVolume( const StereoOut32& data, const V_VolumeSlideLR& volume )
|
|
{
|
|
return StereoOut32(
|
|
ApplyVolume( data.Left, volume.Left.Value ),
|
|
ApplyVolume( data.Right, volume.Right.Value )
|
|
);
|
|
}
|
|
|
|
static void __forceinline UpdatePitch( uint coreidx, uint voiceidx )
|
|
{
|
|
V_Voice& vc( Cores[coreidx].Voices[voiceidx] );
|
|
s32 pitch;
|
|
|
|
// [Air] : re-ordered comparisons: Modulated is much more likely to be zero than voice,
|
|
// and so the way it was before it's have to check both voice and modulated values
|
|
// most of the time. Now it'll just check Modulated and short-circuit past the voice
|
|
// check (not that it amounts to much, but eh every little bit helps).
|
|
if( (vc.Modulated==0) || (voiceidx==0) )
|
|
pitch = vc.Pitch;
|
|
else
|
|
pitch = (vc.Pitch*(32768 + abs(Cores[coreidx].Voices[voiceidx-1].OutX)))>>15;
|
|
|
|
vc.SP+=pitch;
|
|
}
|
|
|
|
|
|
static __forceinline void CalculateADSR( V_Core& thiscore, uint voiceidx )
|
|
{
|
|
V_Voice& vc( thiscore.Voices[voiceidx] );
|
|
|
|
if( vc.ADSR.Phase==0 )
|
|
{
|
|
vc.ADSR.Value = 0;
|
|
return;
|
|
}
|
|
|
|
if( !vc.ADSR.Calculate() )
|
|
{
|
|
if( IsDevBuild )
|
|
{
|
|
if(MsgVoiceOff()) ConLog(" * SPU2: Voice Off by ADSR: %d \n", voiceidx);
|
|
}
|
|
vc.Stop();
|
|
thiscore.Regs.ENDX |= (1 << voiceidx);
|
|
}
|
|
|
|
jASSUME( vc.ADSR.Value >= 0 ); // ADSR should never be negative...
|
|
}
|
|
|
|
// Returns a 16 bit result in Value.
|
|
static s32 __forceinline GetVoiceValues_Linear( V_Core& thiscore, uint voiceidx )
|
|
{
|
|
V_Voice& vc( thiscore.Voices[voiceidx] );
|
|
|
|
while( vc.SP > 0 )
|
|
{
|
|
vc.PV2 = vc.PV1;
|
|
vc.PV1 = GetNextDataBuffered( thiscore, voiceidx );
|
|
vc.SP -= 4096;
|
|
}
|
|
|
|
CalculateADSR( thiscore, voiceidx );
|
|
|
|
// Note! It's very important that ADSR stay as accurate as possible. By the way
|
|
// it is used, various sound effects can end prematurely if we truncate more than
|
|
// one or two bits.
|
|
|
|
if(Interpolation==0)
|
|
{
|
|
return ApplyVolume( vc.PV1, vc.ADSR.Value );
|
|
}
|
|
else //if(Interpolation==1) //must be linear
|
|
{
|
|
s32 t0 = vc.PV2 - vc.PV1;
|
|
return MulShr32( (vc.PV1<<1) - ((t0*vc.SP)>>11), vc.ADSR.Value );
|
|
}
|
|
}
|
|
|
|
// Returns a 16 bit result in Value.
|
|
static s32 __forceinline GetVoiceValues_Cubic( V_Core& thiscore, uint voiceidx )
|
|
{
|
|
V_Voice& vc( thiscore.Voices[voiceidx] );
|
|
|
|
while( vc.SP > 0 )
|
|
{
|
|
vc.PV4 = vc.PV3;
|
|
vc.PV3 = vc.PV2;
|
|
vc.PV2 = vc.PV1;
|
|
|
|
vc.PV1 = GetNextDataBuffered( thiscore, voiceidx );
|
|
vc.PV1 <<= 2;
|
|
vc.SPc = vc.SP&4095; // just the fractional part, please!
|
|
vc.SP -= 4096;
|
|
}
|
|
|
|
CalculateADSR( thiscore, voiceidx );
|
|
|
|
s32 z0 = vc.PV3 - vc.PV4 + vc.PV1 - vc.PV2;
|
|
s32 z1 = (vc.PV4 - vc.PV3 - z0);
|
|
s32 z2 = (vc.PV2 - vc.PV4);
|
|
|
|
s32 mu = vc.SPc;
|
|
|
|
s32 val = (z0 * mu) >> 12;
|
|
val = ((val + z1) * mu) >> 12;
|
|
val = ((val + z2) * mu) >> 12;
|
|
val += vc.PV3;
|
|
|
|
// Note! It's very important that ADSR stay as accurate as possible. By the way
|
|
// it is used, various sound effects can end prematurely if we truncate more than
|
|
// one or two bits. (or maybe it's better with no truncation at all?)
|
|
return MulShr32( val>>1, vc.ADSR.Value );
|
|
}
|
|
|
|
// Noise values need to be mixed without going through interpolation, since it
|
|
// can wreak havoc on the noise (causing muffling or popping). Not that this noise
|
|
// generator is accurate in its own right.. but eh, ah well :)
|
|
static s32 __forceinline __fastcall GetNoiseValues( V_Core& thiscore, uint voiceidx )
|
|
{
|
|
V_Voice& vc( thiscore.Voices[voiceidx] );
|
|
|
|
s32 retval = GetNoiseValues();
|
|
|
|
/*while(vc.SP>=4096)
|
|
{
|
|
retval = GetNoiseValues();
|
|
vc.SP-=4096;
|
|
}*/
|
|
|
|
// GetNoiseValues can't set the phase zero on us unexpectedly
|
|
// like GetVoiceValues can. Better assert just in case though..
|
|
jASSUME( vc.ADSR.Phase != 0 );
|
|
|
|
CalculateADSR( thiscore, voiceidx );
|
|
|
|
// Yup, ADSR applies even to noise sources...
|
|
return ApplyVolume( retval, vc.ADSR.Value );
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////
|
|
/////////////////////////////////////////////////////////////////////////////////////////
|
|
// //
|
|
|
|
static __forceinline StereoOut32 ReadInputPV( uint core )
|
|
{
|
|
V_Core& thiscore( Cores[core] );
|
|
u32 pitch = AutoDMAPlayRate[core];
|
|
|
|
if(pitch==0) pitch=48000;
|
|
|
|
thiscore.ADMAPV += pitch;
|
|
while(thiscore.ADMAPV>=48000)
|
|
{
|
|
ReadInput( core, thiscore.ADMAP );
|
|
thiscore.ADMAPV -= 48000;
|
|
}
|
|
|
|
// Apply volumes:
|
|
return ApplyVolume( thiscore.ADMAP, thiscore.InpVol );
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////
|
|
/////////////////////////////////////////////////////////////////////////////////////////
|
|
// //
|
|
|
|
// writes a signed value to the SPU2 ram
|
|
// Performs no cache invalidation -- use only for dynamic memory ranges
|
|
// of the SPU2 (between 0x0000 and SPU2_DYN_MEMLINE)
|
|
static __forceinline void spu2M_WriteFast( u32 addr, s16 value )
|
|
{
|
|
// throw an assertion if the memory range is invalid:
|
|
#ifndef DEBUG_FAST
|
|
jASSUME( addr < SPU2_DYN_MEMLINE );
|
|
#endif
|
|
*GetMemPtr( addr ) = value;
|
|
}
|
|
|
|
|
|
static __forceinline StereoOut32 MixVoice( uint coreidx, uint voiceidx )
|
|
{
|
|
V_Core& thiscore( Cores[coreidx] );
|
|
V_Voice& vc( thiscore.Voices[voiceidx] );
|
|
|
|
// Most games don't use much volume slide effects. So only call the UpdateVolume
|
|
// methods when needed by checking the flag outside the method here...
|
|
|
|
vc.Volume.Update();
|
|
|
|
// SPU2 Note: The spu2 continues to process voices for eternity, always, so we
|
|
// have to run through all the motions of updating the voice regardless of it's
|
|
// audible status. Otherwise IRQs might not trigger and emulation might fail.
|
|
|
|
if( vc.ADSR.Phase > 0 )
|
|
{
|
|
UpdatePitch( coreidx, voiceidx );
|
|
|
|
s32 Value;
|
|
|
|
if( vc.Noise )
|
|
Value = GetNoiseValues( thiscore, voiceidx );
|
|
else
|
|
{
|
|
if( Interpolation == 2 )
|
|
Value = GetVoiceValues_Cubic( thiscore, voiceidx );
|
|
else
|
|
Value = GetVoiceValues_Linear( thiscore, voiceidx );
|
|
}
|
|
|
|
// Note: All values recorded into OutX (may be used for modulation later)
|
|
vc.OutX = Value;
|
|
|
|
if( IsDevBuild )
|
|
DebugCores[coreidx].Voices[voiceidx].displayPeak = max(DebugCores[coreidx].Voices[voiceidx].displayPeak,abs(vc.OutX));
|
|
|
|
// Write-back of raw voice data (post ADSR applied)
|
|
|
|
if (voiceidx==1) spu2M_WriteFast( 0x400 + (coreidx<<12) + OutPos, Value );
|
|
else if (voiceidx==3) spu2M_WriteFast( 0x600 + (coreidx<<12) + OutPos, Value );
|
|
|
|
return ApplyVolume( StereoOut32( Value, Value ), vc.Volume );
|
|
}
|
|
else
|
|
{
|
|
// Write-back of raw voice data (some zeros since the voice is "dead")
|
|
|
|
if (voiceidx==1) spu2M_WriteFast( 0x400 + (coreidx<<12) + OutPos, 0 );
|
|
else if (voiceidx==3) spu2M_WriteFast( 0x600 + (coreidx<<12) + OutPos, 0 );
|
|
|
|
return StereoOut32( 0, 0 );
|
|
}
|
|
}
|
|
|
|
struct VoiceMixSet
|
|
{
|
|
static const VoiceMixSet Empty;
|
|
StereoOut32 Dry, Wet;
|
|
|
|
VoiceMixSet() {}
|
|
VoiceMixSet( const StereoOut32& dry, const StereoOut32& wet ) :
|
|
Dry( dry ),
|
|
Wet( wet )
|
|
{
|
|
}
|
|
};
|
|
|
|
const VoiceMixSet VoiceMixSet::Empty( StereoOut32::Empty, StereoOut32::Empty );
|
|
|
|
static __forceinline void MixCoreVoices( VoiceMixSet& dest, const uint coreidx )
|
|
{
|
|
V_Core& thiscore( Cores[coreidx] );
|
|
|
|
for( uint voiceidx=0; voiceidx<V_Core::NumVoices; ++voiceidx )
|
|
{
|
|
StereoOut32 VVal( MixVoice( coreidx, voiceidx ) );
|
|
|
|
// Note: Results from MixVoice are ranged at 16 bits.
|
|
|
|
dest.Dry.Left += VVal.Left & thiscore.VoiceGates[voiceidx].DryL;
|
|
dest.Dry.Right += VVal.Right & thiscore.VoiceGates[voiceidx].DryR;
|
|
dest.Wet.Left += VVal.Left & thiscore.VoiceGates[voiceidx].WetL;
|
|
dest.Wet.Right += VVal.Right & thiscore.VoiceGates[voiceidx].WetR;
|
|
}
|
|
}
|
|
|
|
static StereoOut32 __fastcall MixCore( const uint coreidx, const VoiceMixSet& inVoices, const StereoOut32& Input, const StereoOut32& Ext )
|
|
{
|
|
V_Core& thiscore( Cores[coreidx] );
|
|
thiscore.MasterVol.Update();
|
|
|
|
// Saturate final result to standard 16 bit range.
|
|
const VoiceMixSet Voices( clamp_mix( inVoices.Dry ), clamp_mix( inVoices.Wet ) );
|
|
|
|
// Write Mixed results To Output Area
|
|
spu2M_WriteFast( 0x1000 + (coreidx<<12) + OutPos, Voices.Dry.Left );
|
|
spu2M_WriteFast( 0x1200 + (coreidx<<12) + OutPos, Voices.Dry.Right );
|
|
spu2M_WriteFast( 0x1400 + (coreidx<<12) + OutPos, Voices.Wet.Left );
|
|
spu2M_WriteFast( 0x1600 + (coreidx<<12) + OutPos, Voices.Wet.Right );
|
|
|
|
// Write mixed results to logfile (if enabled)
|
|
|
|
WaveDump::WriteCore( coreidx, CoreSrc_DryVoiceMix, Voices.Dry );
|
|
WaveDump::WriteCore( coreidx, CoreSrc_WetVoiceMix, Voices.Wet );
|
|
|
|
// Mix in the Input data
|
|
|
|
StereoOut32 TD(
|
|
Input.Left & thiscore.DryGate.InpL,
|
|
Input.Right & thiscore.DryGate.InpR
|
|
);
|
|
|
|
// Mix in the Voice data
|
|
TD.Left += Voices.Dry.Left & thiscore.DryGate.SndL;
|
|
TD.Right += Voices.Dry.Right & thiscore.DryGate.SndR;
|
|
|
|
// Mix in the External (nothing/core0) data
|
|
TD.Left += Ext.Left & thiscore.DryGate.ExtL;
|
|
TD.Right += Ext.Right & thiscore.DryGate.ExtR;
|
|
|
|
if( !EffectsDisabled )
|
|
{
|
|
//Reverb pointer advances regardless of the FxEnable bit...
|
|
Reverb_AdvanceBuffer( thiscore );
|
|
|
|
if( thiscore.FxEnable )
|
|
{
|
|
// Mix Input, Voice, and External data:
|
|
StereoOut32 TW(
|
|
Input.Left & thiscore.WetGate.InpL,
|
|
Input.Right & thiscore.WetGate.InpR
|
|
);
|
|
|
|
TW.Left += Voices.Wet.Left & thiscore.WetGate.SndL;
|
|
TW.Right += Voices.Wet.Right & thiscore.WetGate.SndR;
|
|
TW.Left += Ext.Left & thiscore.WetGate.ExtL;
|
|
TW.Right += Ext.Right & thiscore.WetGate.ExtR;
|
|
|
|
WaveDump::WriteCore( coreidx, CoreSrc_PreReverb, TW );
|
|
|
|
StereoOut32 RV( DoReverb( thiscore, TW ) );
|
|
|
|
// Volume boost after effects application. Boosting volume prior to effects
|
|
// causes slight overflows in some games, and the volume boost is required.
|
|
// (like all over volumes on SPU2, reverb coefficients and stuff are signed,
|
|
// range -50% to 50%, thus *2 is needed)
|
|
|
|
RV.Left *= 2;
|
|
RV.Right *= 2;
|
|
|
|
WaveDump::WriteCore( coreidx, CoreSrc_PostReverb, RV );
|
|
|
|
// Mix Dry+Wet
|
|
return TD + ApplyVolume( RV, thiscore.FxVol );
|
|
}
|
|
else
|
|
{
|
|
WaveDump::WriteCore( coreidx, CoreSrc_PreReverb, 0, 0 );
|
|
WaveDump::WriteCore( coreidx, CoreSrc_PostReverb, 0, 0 );
|
|
}
|
|
}
|
|
return TD;
|
|
}
|
|
|
|
// used to throttle the output rate of cache stat reports
|
|
static int p_cachestat_counter=0;
|
|
|
|
__forceinline void Mix()
|
|
{
|
|
// Note: Playmode 4 is SPDIF, which overrides other inputs.
|
|
StereoOut32 InputData[2] =
|
|
{
|
|
(PlayMode&4) ? StereoOut32::Empty : ReadInputPV( 0 ),
|
|
(PlayMode&8) ? StereoOut32::Empty : ReadInputPV( 1 )
|
|
};
|
|
|
|
WaveDump::WriteCore( 0, CoreSrc_Input, InputData[0] );
|
|
WaveDump::WriteCore( 1, CoreSrc_Input, InputData[1] );
|
|
|
|
// Todo: Replace me with memzero initializer!
|
|
VoiceMixSet VoiceData[2] = { VoiceMixSet::Empty, VoiceMixSet::Empty }; // mixed voice data for each core.
|
|
MixCoreVoices( VoiceData[0], 0 );
|
|
MixCoreVoices( VoiceData[1], 1 );
|
|
|
|
StereoOut32 Ext( MixCore( 0, VoiceData[0], InputData[0], StereoOut32::Empty ) );
|
|
|
|
if( (PlayMode & 4) || (Cores[0].Mute!=0) )
|
|
Ext = StereoOut32::Empty;
|
|
else
|
|
{
|
|
Ext = ApplyVolume( Ext, Cores[0].MasterVol );
|
|
clamp_mix( Ext );
|
|
}
|
|
|
|
// Commit Core 0 output to ram before mixing Core 1:
|
|
|
|
spu2M_WriteFast( 0x800 + OutPos, Ext.Left );
|
|
spu2M_WriteFast( 0xA00 + OutPos, Ext.Right );
|
|
WaveDump::WriteCore( 0, CoreSrc_External, Ext );
|
|
|
|
ApplyVolume( Ext, Cores[1].ExtVol );
|
|
StereoOut32 Out( MixCore( 1, VoiceData[1], InputData[1], Ext ) );
|
|
|
|
if( PlayMode & 8 )
|
|
{
|
|
// Experimental CDDA support
|
|
// The CDDA overrides all other mixer output. It's a direct feed!
|
|
|
|
ReadInput( 1, Out );
|
|
//WaveLog::WriteCore( 1, "CDDA-32", OutL, OutR );
|
|
}
|
|
else
|
|
{
|
|
Out.Left = MulShr32( Out.Left<<SndOutVolumeShift, Cores[1].MasterVol.Left.Value );
|
|
Out.Right = MulShr32( Out.Right<<SndOutVolumeShift, Cores[1].MasterVol.Right.Value );
|
|
|
|
// Final Clamp!
|
|
// This could be circumvented by using 1/2th total output volume, although
|
|
// I suspect this approach (clamping at the higher volume) is more true to the
|
|
// PS2's real implementation.
|
|
|
|
clamp_mix( Out, SndOutVolumeShift );
|
|
}
|
|
|
|
// Update spdif (called each sample)
|
|
if(PlayMode&4)
|
|
spdif_update();
|
|
|
|
SndBuffer::Write( Out );
|
|
|
|
// Update AutoDMA output positioning
|
|
OutPos++;
|
|
if (OutPos>=0x200) OutPos=0;
|
|
|
|
if( IsDevBuild )
|
|
{
|
|
p_cachestat_counter++;
|
|
if(p_cachestat_counter > (48000*10) )
|
|
{
|
|
p_cachestat_counter = 0;
|
|
if( MsgCache() ) ConLog( " * SPU2 > CacheStats > Hits: %d Misses: %d Ignores: %d\n",
|
|
g_counter_cache_hits,
|
|
g_counter_cache_misses,
|
|
g_counter_cache_ignores );
|
|
|
|
g_counter_cache_hits =
|
|
g_counter_cache_misses =
|
|
g_counter_cache_ignores = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////
|
|
/////////////////////////////////////////////////////////////////////////////////////////
|
|
// //
|
|
|
|
/*
|
|
-----------------------------------------------------------------------------
|
|
PSX reverb hardware notes
|
|
by Neill Corlett
|
|
-----------------------------------------------------------------------------
|
|
|
|
Yadda yadda disclaimer yadda probably not perfect yadda well it's okay anyway
|
|
yadda yadda.
|
|
|
|
-----------------------------------------------------------------------------
|
|
|
|
Basics
|
|
------
|
|
|
|
- The reverb buffer is 22khz 16-bit mono PCM.
|
|
- It starts at the reverb address given by 1DA2, extends to
|
|
the end of sound RAM, and wraps back to the 1DA2 address.
|
|
|
|
Setting the address at 1DA2 resets the current reverb work address.
|
|
|
|
This work address ALWAYS increments every 1/22050 sec., regardless of
|
|
whether reverb is enabled (bit 7 of 1DAA set).
|
|
|
|
And the contents of the reverb buffer ALWAYS play, scaled by the
|
|
"reverberation depth left/right" volumes (1D84/1D86).
|
|
(which, by the way, appear to be scaled so 3FFF=approx. 1.0, 4000=-1.0)
|
|
|
|
-----------------------------------------------------------------------------
|
|
|
|
Register names
|
|
--------------
|
|
|
|
These are probably not their real names.
|
|
These are probably not even correct names.
|
|
We will use them anyway, because we can.
|
|
|
|
1DC0: FB_SRC_A (offset)
|
|
1DC2: FB_SRC_B (offset)
|
|
1DC4: IIR_ALPHA (coef.)
|
|
1DC6: ACC_COEF_A (coef.)
|
|
1DC8: ACC_COEF_B (coef.)
|
|
1DCA: ACC_COEF_C (coef.)
|
|
1DCC: ACC_COEF_D (coef.)
|
|
1DCE: IIR_COEF (coef.)
|
|
1DD0: FB_ALPHA (coef.)
|
|
1DD2: FB_X (coef.)
|
|
1DD4: IIR_DEST_A0 (offset)
|
|
1DD6: IIR_DEST_A1 (offset)
|
|
1DD8: ACC_SRC_A0 (offset)
|
|
1DDA: ACC_SRC_A1 (offset)
|
|
1DDC: ACC_SRC_B0 (offset)
|
|
1DDE: ACC_SRC_B1 (offset)
|
|
1DE0: IIR_SRC_A0 (offset)
|
|
1DE2: IIR_SRC_A1 (offset)
|
|
1DE4: IIR_DEST_B0 (offset)
|
|
1DE6: IIR_DEST_B1 (offset)
|
|
1DE8: ACC_SRC_C0 (offset)
|
|
1DEA: ACC_SRC_C1 (offset)
|
|
1DEC: ACC_SRC_D0 (offset)
|
|
1DEE: ACC_SRC_D1 (offset)
|
|
1DF0: IIR_SRC_B1 (offset)
|
|
1DF2: IIR_SRC_B0 (offset)
|
|
1DF4: MIX_DEST_A0 (offset)
|
|
1DF6: MIX_DEST_A1 (offset)
|
|
1DF8: MIX_DEST_B0 (offset)
|
|
1DFA: MIX_DEST_B1 (offset)
|
|
1DFC: IN_COEF_L (coef.)
|
|
1DFE: IN_COEF_R (coef.)
|
|
|
|
The coefficients are signed fractional values.
|
|
-32768 would be -1.0
|
|
32768 would be 1.0 (if it were possible... the highest is of course 32767)
|
|
|
|
The offsets are (byte/8) offsets into the reverb buffer.
|
|
i.e. you multiply them by 8, you get byte offsets.
|
|
You can also think of them as (samples/4) offsets.
|
|
They appear to be signed. They can be negative.
|
|
None of the documented presets make them negative, though.
|
|
|
|
Yes, 1DF0 and 1DF2 appear to be backwards. Not a typo.
|
|
|
|
-----------------------------------------------------------------------------
|
|
|
|
What it does
|
|
------------
|
|
|
|
We take all reverb sources:
|
|
- regular channels that have the reverb bit on
|
|
- cd and external sources, if their reverb bits are on
|
|
and mix them into one stereo 44100hz signal.
|
|
|
|
Lowpass/downsample that to 22050hz. The PSX uses a proper bandlimiting
|
|
algorithm here, but I haven't figured out the hysterically exact specifics.
|
|
I use an 8-tap filter with these coefficients, which are nice but probably
|
|
not the real ones:
|
|
|
|
0.037828187894
|
|
0.157538631280
|
|
0.321159685278
|
|
0.449322115345
|
|
0.449322115345
|
|
0.321159685278
|
|
0.157538631280
|
|
0.037828187894
|
|
|
|
So we have two input samples (INPUT_SAMPLE_L, INPUT_SAMPLE_R) every 22050hz.
|
|
|
|
* IN MY EMULATION, I divide these by 2 to make it clip less.
|
|
(and of course the L/R output coefficients are adjusted to compensate)
|
|
The real thing appears to not do this.
|
|
|
|
At every 22050hz tick:
|
|
- If the reverb bit is enabled (bit 7 of 1DAA), execute the reverb
|
|
steady-state algorithm described below
|
|
- AFTERWARDS, retrieve the "wet out" L and R samples from the reverb buffer
|
|
(This part may not be exactly right and I guessed at the coefs. TODO: check later.)
|
|
L is: 0.333 * (buffer[MIX_DEST_A0] + buffer[MIX_DEST_B0])
|
|
R is: 0.333 * (buffer[MIX_DEST_A1] + buffer[MIX_DEST_B1])
|
|
- Advance the current buffer position by 1 sample
|
|
|
|
The wet out L and R are then upsampled to 44100hz and played at the
|
|
"reverberation depth left/right" (1D84/1D86) volume, independent of the main
|
|
volume.
|
|
|
|
-----------------------------------------------------------------------------
|
|
|
|
Reverb steady-state
|
|
-------------------
|
|
|
|
The reverb steady-state algorithm is fairly clever, and of course by
|
|
"clever" I mean "batshit insane".
|
|
|
|
buffer[x] is relative to the current buffer position, not the beginning of
|
|
the buffer. Note that all buffer offsets must wrap around so they're
|
|
contained within the reverb work area.
|
|
|
|
Clipping is performed at the end... maybe also sooner, but definitely at
|
|
the end.
|
|
|
|
IIR_INPUT_A0 = buffer[IIR_SRC_A0] * IIR_COEF + INPUT_SAMPLE_L * IN_COEF_L;
|
|
IIR_INPUT_A1 = buffer[IIR_SRC_A1] * IIR_COEF + INPUT_SAMPLE_R * IN_COEF_R;
|
|
IIR_INPUT_B0 = buffer[IIR_SRC_B0] * IIR_COEF + INPUT_SAMPLE_L * IN_COEF_L;
|
|
IIR_INPUT_B1 = buffer[IIR_SRC_B1] * IIR_COEF + INPUT_SAMPLE_R * IN_COEF_R;
|
|
|
|
IIR_A0 = IIR_INPUT_A0 * IIR_ALPHA + buffer[IIR_DEST_A0] * (1.0 - IIR_ALPHA);
|
|
IIR_A1 = IIR_INPUT_A1 * IIR_ALPHA + buffer[IIR_DEST_A1] * (1.0 - IIR_ALPHA);
|
|
IIR_B0 = IIR_INPUT_B0 * IIR_ALPHA + buffer[IIR_DEST_B0] * (1.0 - IIR_ALPHA);
|
|
IIR_B1 = IIR_INPUT_B1 * IIR_ALPHA + buffer[IIR_DEST_B1] * (1.0 - IIR_ALPHA);
|
|
|
|
buffer[IIR_DEST_A0 + 1sample] = IIR_A0;
|
|
buffer[IIR_DEST_A1 + 1sample] = IIR_A1;
|
|
buffer[IIR_DEST_B0 + 1sample] = IIR_B0;
|
|
buffer[IIR_DEST_B1 + 1sample] = IIR_B1;
|
|
|
|
ACC0 = buffer[ACC_SRC_A0] * ACC_COEF_A +
|
|
buffer[ACC_SRC_B0] * ACC_COEF_B +
|
|
buffer[ACC_SRC_C0] * ACC_COEF_C +
|
|
buffer[ACC_SRC_D0] * ACC_COEF_D;
|
|
ACC1 = buffer[ACC_SRC_A1] * ACC_COEF_A +
|
|
buffer[ACC_SRC_B1] * ACC_COEF_B +
|
|
buffer[ACC_SRC_C1] * ACC_COEF_C +
|
|
buffer[ACC_SRC_D1] * ACC_COEF_D;
|
|
|
|
FB_A0 = buffer[MIX_DEST_A0 - FB_SRC_A];
|
|
FB_A1 = buffer[MIX_DEST_A1 - FB_SRC_A];
|
|
FB_B0 = buffer[MIX_DEST_B0 - FB_SRC_B];
|
|
FB_B1 = buffer[MIX_DEST_B1 - FB_SRC_B];
|
|
|
|
buffer[MIX_DEST_A0] = ACC0 - FB_A0 * FB_ALPHA;
|
|
buffer[MIX_DEST_A1] = ACC1 - FB_A1 * FB_ALPHA;
|
|
buffer[MIX_DEST_B0] = (FB_ALPHA * ACC0) - FB_A0 * (FB_ALPHA^0x8000) - FB_B0 * FB_X;
|
|
buffer[MIX_DEST_B1] = (FB_ALPHA * ACC1) - FB_A1 * (FB_ALPHA^0x8000) - FB_B1 * FB_X;
|
|
|
|
-----------------------------------------------------------------------------
|
|
*/
|