SPU2-X: Experimental reverb based on Dr. Hell's (http://drhell.web.fc2.com/ps1/) and/or Martin Korth's (http://problemkaputt.de/psx-spx.htm) reverse engineering.

Negate the final reverb output (the reason is unknown, but it is an improvement when comparing the waveform).
2016-10-13 15:28:01 +01:00 · 2016-10-13 15:28:01 +01:00 · 9a51820dec
parent 01f0f436ac
commit 9a51820dec
9 changed files with 108 additions and 442 deletions
--- a/plugins/spu2-x/src/Debug.cpp
+++ b/plugins/spu2-x/src/Debug.cpp
@ -218,15 +218,15 @@ void DoFullDump()

            fprintf(dump, "  - FB_ALPHA:    %x\n", Cores[c].Revb.FB_ALPHA);
            fprintf(dump, "  - FB_X:        %x\n", Cores[c].Revb.FB_X);
-            fprintf(dump, "  - FB_SRC_A:    %x\n", Cores[c].Revb.FB_SRC_A);
-            fprintf(dump, "  - FB_SRC_B:    %x\n", Cores[c].Revb.FB_SRC_B);
+            fprintf(dump, "  - FB_SIZE_A:    %x\n", Cores[c].Revb.FB_SIZE_A);
+            fprintf(dump, "  - FB_SIZE_B:    %x\n", Cores[c].Revb.FB_SIZE_B);

            fprintf(dump, "  - IIR_ALPHA:   %x\n", Cores[c].Revb.IIR_ALPHA);
            fprintf(dump, "  - IIR_COEF:    %x\n", Cores[c].Revb.IIR_COEF);
            fprintf(dump, "  - IIR_SRC_A0:  %x\n", Cores[c].Revb.IIR_SRC_A0);
            fprintf(dump, "  - IIR_SRC_A1:  %x\n", Cores[c].Revb.IIR_SRC_A1);
-            fprintf(dump, "  - IIR_SRC_B1:  %x\n", Cores[c].Revb.IIR_SRC_B0);
-            fprintf(dump, "  - IIR_SRC_B0:  %x\n", Cores[c].Revb.IIR_SRC_B1);
+            fprintf(dump, "  - IIR_SRC_B0:  %x\n", Cores[c].Revb.IIR_SRC_B0);
+            fprintf(dump, "  - IIR_SRC_B1:  %x\n", Cores[c].Revb.IIR_SRC_B1);
            fprintf(dump, "  - IIR_DEST_A0: %x\n", Cores[c].Revb.IIR_DEST_A0);
            fprintf(dump, "  - IIR_DEST_A1: %x\n", Cores[c].Revb.IIR_DEST_A1);
            fprintf(dump, "  - IIR_DEST_B0: %x\n", Cores[c].Revb.IIR_DEST_B0);
--- a/plugins/spu2-x/src/Mixer.cpp
+++ b/plugins/spu2-x/src/Mixer.cpp
@ -675,6 +675,7 @@ StereoOut32 V_Core::Mix(const VoiceMixSet &inVoices, const StereoOut32 &Input, c

    // ToDo:
    // Bad EndA causes memory corruption. Bad for us, unknown on PS2!
+    // According to no$psx, effects always run but don't always write back, so the FxEnable check may be wrong
    if (!FxEnable || EffectsEndA >= 0x100000)
        return TD;

@ -886,194 +887,3 @@ __forceinline
        }
    }
 }
-
-/////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////////////////
-//                                                                                     //
-
-/*
-----------------------------------------------------------------------------
-PSX reverb hardware notes
-by Neill Corlett
-----------------------------------------------------------------------------
-
-Yadda yadda disclaimer yadda probably not perfect yadda well it's okay anyway
-yadda yadda.
-
-----------------------------------------------------------------------------
-
-Basics
------
-
- The reverb buffer is 22khz 16-bit mono PCM.
- It starts at the reverb address given by 1DA2, extends to
-  the end of sound RAM, and wraps back to the 1DA2 address.
-
-Setting the address at 1DA2 resets the current reverb work address.
-
-This work address ALWAYS increments every 1/22050 sec., regardless of
-whether reverb is enabled (bit 7 of 1DAA set).
-
-And the contents of the reverb buffer ALWAYS play, scaled by the
-"reverberation depth left/right" volumes (1D84/1D86).
-(which, by the way, appear to be scaled so 3FFF=approx. 1.0, 4000=-1.0)
-
-----------------------------------------------------------------------------
-
-Register names
--------------
-
-These are probably not their real names.
-These are probably not even correct names.
-We will use them anyway, because we can.
-
-1DC0: FB_SRC_A       (offset)
-1DC2: FB_SRC_B       (offset)
-1DC4: IIR_ALPHA      (coef.)
-1DC6: ACC_COEF_A     (coef.)
-1DC8: ACC_COEF_B     (coef.)
-1DCA: ACC_COEF_C     (coef.)
-1DCC: ACC_COEF_D     (coef.)
-1DCE: IIR_COEF       (coef.)
-1DD0: FB_ALPHA       (coef.)
-1DD2: FB_X           (coef.)
-1DD4: IIR_DEST_A0    (offset)
-1DD6: IIR_DEST_A1    (offset)
-1DD8: ACC_SRC_A0     (offset)
-1DDA: ACC_SRC_A1     (offset)
-1DDC: ACC_SRC_B0     (offset)
-1DDE: ACC_SRC_B1     (offset)
-1DE0: IIR_SRC_A0     (offset)
-1DE2: IIR_SRC_A1     (offset)
-1DE4: IIR_DEST_B0    (offset)
-1DE6: IIR_DEST_B1    (offset)
-1DE8: ACC_SRC_C0     (offset)
-1DEA: ACC_SRC_C1     (offset)
-1DEC: ACC_SRC_D0     (offset)
-1DEE: ACC_SRC_D1     (offset)
-1DF0: IIR_SRC_B1     (offset)
-1DF2: IIR_SRC_B0     (offset)
-1DF4: MIX_DEST_A0    (offset)
-1DF6: MIX_DEST_A1    (offset)
-1DF8: MIX_DEST_B0    (offset)
-1DFA: MIX_DEST_B1    (offset)
-1DFC: IN_COEF_L      (coef.)
-1DFE: IN_COEF_R      (coef.)
-
-The coefficients are signed fractional values.
--32768 would be -1.0
- 32768 would be  1.0 (if it were possible... the highest is of course 32767)
-
-The offsets are (byte/8) offsets into the reverb buffer.
-i.e. you multiply them by 8, you get byte offsets.
-You can also think of them as (samples/4) offsets.
-They appear to be signed.  They can be negative.
-None of the documented presets make them negative, though.
-
-Yes, 1DF0 and 1DF2 appear to be backwards.  Not a typo.
-
-----------------------------------------------------------------------------
-
-What it does
------------
-
-We take all reverb sources:
- regular channels that have the reverb bit on
- cd and external sources, if their reverb bits are on
-and mix them into one stereo 44100hz signal.
-
-Lowpass/downsample that to 22050hz.  The PSX uses a proper bandlimiting
-algorithm here, but I haven't figured out the hysterically exact specifics.
-I use an 8-tap filter with these coefficients, which are nice but probably
-not the real ones:
-
-0.037828187894
-0.157538631280
-0.321159685278
-0.449322115345
-0.449322115345
-0.321159685278
-0.157538631280
-0.037828187894
-
-So we have two input samples (INPUT_SAMPLE_L, INPUT_SAMPLE_R) every 22050hz.
-
-* IN MY EMULATION, I divide these by 2 to make it clip less.
-  (and of course the L/R output coefficients are adjusted to compensate)
-  The real thing appears to not do this.
-
-At every 22050hz tick:
- If the reverb bit is enabled (bit 7 of 1DAA), execute the reverb
-  steady-state algorithm described below
- AFTERWARDS, retrieve the "wet out" L and R samples from the reverb buffer
-  (This part may not be exactly right and I guessed at the coefs. TODO: check later.)
-  L is: 0.333 * (buffer[MIX_DEST_A0] + buffer[MIX_DEST_B0])
-  R is: 0.333 * (buffer[MIX_DEST_A1] + buffer[MIX_DEST_B1])
- Advance the current buffer position by 1 sample
-
-The wet out L and R are then upsampled to 44100hz and played at the
-"reverberation depth left/right" (1D84/1D86) volume, independent of the main
-volume.
-
-----------------------------------------------------------------------------
-
-Reverb steady-state
-------------------
-
-The reverb steady-state algorithm is fairly clever, and of course by
-"clever" I mean "batshit insane".
-
-buffer[x] is relative to the current buffer position, not the beginning of
-the buffer.  Note that all buffer offsets must wrap around so they're
-contained within the reverb work area.
-
-Clipping is performed at the end... maybe also sooner, but definitely at
-the end.
-
-IIR_INPUT_A0 = buffer[IIR_SRC_A0] * IIR_COEF + INPUT_SAMPLE_L * IN_COEF_L;
-IIR_INPUT_A1 = buffer[IIR_SRC_A1] * IIR_COEF + INPUT_SAMPLE_R * IN_COEF_R;
-IIR_INPUT_B0 = buffer[IIR_SRC_B0] * IIR_COEF + INPUT_SAMPLE_L * IN_COEF_L;
-IIR_INPUT_B1 = buffer[IIR_SRC_B1] * IIR_COEF + INPUT_SAMPLE_R * IN_COEF_R;
-
-IIR_A0 = IIR_INPUT_A0 * IIR_ALPHA + buffer[IIR_DEST_A0] * (1.0 - IIR_ALPHA);
-IIR_A1 = IIR_INPUT_A1 * IIR_ALPHA + buffer[IIR_DEST_A1] * (1.0 - IIR_ALPHA);
-IIR_B0 = IIR_INPUT_B0 * IIR_ALPHA + buffer[IIR_DEST_B0] * (1.0 - IIR_ALPHA);
-IIR_B1 = IIR_INPUT_B1 * IIR_ALPHA + buffer[IIR_DEST_B1] * (1.0 - IIR_ALPHA);
-
-buffer[IIR_DEST_A0 + 1sample] = IIR_A0;
-buffer[IIR_DEST_A1 + 1sample] = IIR_A1;
-buffer[IIR_DEST_B0 + 1sample] = IIR_B0;
-buffer[IIR_DEST_B1 + 1sample] = IIR_B1;
-
-ACC0 = buffer[ACC_SRC_A0] * ACC_COEF_A +
-       buffer[ACC_SRC_B0] * ACC_COEF_B +
-       buffer[ACC_SRC_C0] * ACC_COEF_C +
-       buffer[ACC_SRC_D0] * ACC_COEF_D;
-ACC1 = buffer[ACC_SRC_A1] * ACC_COEF_A +
-       buffer[ACC_SRC_B1] * ACC_COEF_B +
-       buffer[ACC_SRC_C1] * ACC_COEF_C +
-       buffer[ACC_SRC_D1] * ACC_COEF_D;
-
-FB_A0 = buffer[MIX_DEST_A0 - FB_SRC_A];
-FB_A1 = buffer[MIX_DEST_A1 - FB_SRC_A];
-FB_B0 = buffer[MIX_DEST_B0 - FB_SRC_B];
-FB_B1 = buffer[MIX_DEST_B1 - FB_SRC_B];
-
-buffer[MIX_DEST_A0] = ACC0 - FB_A0 * FB_ALPHA;
-buffer[MIX_DEST_A1] = ACC1 - FB_A1 * FB_ALPHA;
-buffer[MIX_DEST_B0] = (FB_ALPHA * ACC0) - FB_A0 * (FB_ALPHA^0x8000) - FB_B0 * FB_X;
-buffer[MIX_DEST_B1] = (FB_ALPHA * ACC1) - FB_A1 * (FB_ALPHA^0x8000) - FB_B1 * FB_X;
-
-Air notes:
-  The above is effectivly the same as:
-    buffer[MIX_DEST_B0] = (ACC0 * FB_ALPHA) + (FB_A0 * (1.0-FB_ALPHA)) - FB_B0 * FB_X;
-    buffer[MIX_DEST_B1] = (ACC1 * FB_ALPHA) + (FB_A1 * (1.0-FB_ALPHA)) - FB_B1 * FB_X;
-
-  Which reduces to:
-    buffer[MIX_DEST_B0] = FB_A0 + ((ACC0-FB_A0) * FB_ALPHA) - FB_B0 * FB_X;
-    buffer[MIX_DEST_B1] = FB_A1 + ((ACC1-FB_A1) * FB_ALPHA) - FB_B1 * FB_X;
-
-
-
-----------------------------------------------------------------------------
-*/
--- a/plugins/spu2-x/src/RegLog.cpp
+++ b/plugins/spu2-x/src/RegLog.cpp
@ -265,8 +265,8 @@ void SPU2writeLog(const char *action, u32 rmem, u16 value)
        RegLog(2, t "L", mem, core, value); \
        break;

-                LOG_REVB_REG(FB_SRC_A, "FB_SRC_A")
-                LOG_REVB_REG(FB_SRC_B, "FB_SRC_B")
+                LOG_REVB_REG(FB_SIZE_A, "FB_SIZE_A")
+                LOG_REVB_REG(FB_SIZE_B, "FB_SIZE_B")
                LOG_REVB_REG(IIR_SRC_A0, "IIR_SRC_A0")
                LOG_REVB_REG(IIR_SRC_A1, "IIR_SRC_A1")
                LOG_REVB_REG(IIR_SRC_B1, "IIR_SRC_B1")
--- a/plugins/spu2-x/src/RegTable.cpp
+++ b/plugins/spu2-x/src/RegTable.cpp
@ -99,8 +99,8 @@ u16 const *const regtable_original[0x401] =
        PCORE(0, ExtEffectsStartA) + 1,
        PCORE(0, ExtEffectsStartA),

-        PREVB_REG(0, FB_SRC_A),
-        PREVB_REG(0, FB_SRC_B),
+        PREVB_REG(0, FB_SIZE_A),
+        PREVB_REG(0, FB_SIZE_B),
        PREVB_REG(0, IIR_DEST_A0),
        PREVB_REG(0, IIR_DEST_A1),
        PREVB_REG(0, ACC_SRC_A0),
@ -202,8 +202,8 @@ u16 const *const regtable_original[0x401] =
        PCORE(1, ExtEffectsStartA) + 1,
        PCORE(1, ExtEffectsStartA),

-        PREVB_REG(1, FB_SRC_A),
-        PREVB_REG(1, FB_SRC_B),
+        PREVB_REG(1, FB_SIZE_A),
+        PREVB_REG(1, FB_SIZE_B),
        PREVB_REG(1, IIR_DEST_A0),
        PREVB_REG(1, IIR_DEST_A1),
        PREVB_REG(1, ACC_SRC_A0),
--- a/plugins/spu2-x/src/Reverb.cpp
+++ b/plugins/spu2-x/src/Reverb.cpp
@ -16,13 +16,6 @@
 */

 #include "Global.h"
-#include "Lowpass.h"
-
-// Low pass filters: Change these to 32 for a speedup (benchmarks needed to see if
-// the speed gain is worth the quality drop)
-
-//static LowPassFilter64 lowpass_left( 11000, SampleRate );
-//static LowPassFilter64 lowpass_right( 11000, SampleRate );

 __forceinline s32 V_Core::RevbGetIndexer(s32 offset)
 {
@ -56,215 +49,85 @@ void V_Core::Reverb_AdvanceBuffer()

 StereoOut32 V_Core::DoReverb(const StereoOut32 &Input)
 {
-#if 0
-	static const s32 downcoeffs[8] =
-	{
-		1283,  5344,  10895, 15243,
-		15243, 10895,  5344,  1283
-	};
-#else
-    // 2/3 of the above
-    static const s32 downcoeffs[8] =
-        {
-            855, 3562, 7263, 10163,
-            10163, 7263, 3562, 855};
-#endif
+    if (EffectsBufferSize <= 0) {
+        return StereoOut32::Empty;
+    }

-    downbuf[dbpos] = Input;
-    dbpos = (dbpos + 1) & 7;
+    bool R = Cycles & 1;

-    // Reverb processing occurs at 24khz, so we skip processing every other sample,
-    // and use the previous calculation for this core instead.
+    // Calculate the read/write addresses we'll be needing for this session of reverb.

-    if ((Cycles & 1) == 0) {
-        // Important: Factor silence into the upsampler here, otherwise the reverb engine
-        // develops a nasty feedback loop.
+    const u32 same_src = RevbGetIndexer(R ? RevBuffers.IIR_SRC_A1 : RevBuffers.IIR_SRC_A0);
+    const u32 same_dst = RevbGetIndexer(R ? RevBuffers.IIR_DEST_A1 : RevBuffers.IIR_DEST_A0);
+    const u32 same_prv = RevbGetIndexer(R ? RevBuffers.SAME_R_PRV : RevBuffers.SAME_L_PRV);

-        upbuf[ubpos] = StereoOut32::Empty;
-    } else {
-        if (EffectsBufferSize <= 0) {
-            ubpos = (ubpos + 1) & 7;
-            return StereoOut32::Empty;
-        }
+    const u32 diff_src = RevbGetIndexer(R ? RevBuffers.IIR_SRC_B0 : RevBuffers.IIR_SRC_B1);
+    const u32 diff_dst = RevbGetIndexer(R ? RevBuffers.IIR_DEST_B1 : RevBuffers.IIR_DEST_B0);
+    const u32 diff_prv = RevbGetIndexer(R ? RevBuffers.DIFF_R_PRV : RevBuffers.DIFF_L_PRV);

-        // Advance the current reverb buffer pointer, and cache the read/write addresses we'll be
-        // needing for this session of reverb.
+    const u32 comb1_src = RevbGetIndexer(R ? RevBuffers.ACC_SRC_A1 : RevBuffers.ACC_SRC_A0);
+    const u32 comb2_src = RevbGetIndexer(R ? RevBuffers.ACC_SRC_B1 : RevBuffers.ACC_SRC_B0);
+    const u32 comb3_src = RevbGetIndexer(R ? RevBuffers.ACC_SRC_C1 : RevBuffers.ACC_SRC_C0);
+    const u32 comb4_src = RevbGetIndexer(R ? RevBuffers.ACC_SRC_D1 : RevBuffers.ACC_SRC_D0);

-        const u32 src_a0 = RevbGetIndexer(RevBuffers.IIR_SRC_A0);
-        const u32 src_a1 = RevbGetIndexer(RevBuffers.IIR_SRC_A1);
-        const u32 src_b0 = RevbGetIndexer(RevBuffers.IIR_SRC_B0);
-        const u32 src_b1 = RevbGetIndexer(RevBuffers.IIR_SRC_B1);
+    const u32 apf1_src = RevbGetIndexer(R ? RevBuffers.APF1_R_SRC : RevBuffers.APF1_L_SRC);
+    const u32 apf1_dst = RevbGetIndexer(R ? RevBuffers.MIX_DEST_A1 : RevBuffers.MIX_DEST_A0);
+    const u32 apf2_src = RevbGetIndexer(R ? RevBuffers.APF2_R_SRC : RevBuffers.APF2_L_SRC);
+    const u32 apf2_dst = RevbGetIndexer(R ? RevBuffers.MIX_DEST_B1 : RevBuffers.MIX_DEST_B0);

-        const u32 dest_a0 = RevbGetIndexer(RevBuffers.IIR_DEST_A0);
-        const u32 dest_a1 = RevbGetIndexer(RevBuffers.IIR_DEST_A1);
-        const u32 dest_b0 = RevbGetIndexer(RevBuffers.IIR_DEST_B0);
-        const u32 dest_b1 = RevbGetIndexer(RevBuffers.IIR_DEST_B1);
+    // -----------------------------------------
+    //          Optimized IRQ Testing !
+    // -----------------------------------------

-        const u32 dest2_a0 = RevbGetIndexer(RevBuffers.IIR_DEST_A0 + 1);
-        const u32 dest2_a1 = RevbGetIndexer(RevBuffers.IIR_DEST_A1 + 1);
-        const u32 dest2_b0 = RevbGetIndexer(RevBuffers.IIR_DEST_B0 + 1);
-        const u32 dest2_b1 = RevbGetIndexer(RevBuffers.IIR_DEST_B1 + 1);
+    // This test is enhanced by using the reverb effects area begin/end test as a
+    // shortcut, since all buffer addresses are within that area.  If the IRQA isn't
+    // within that zone then the "bulk" of the test is skipped, so this should only
+    // be a slowdown on a few evil games.

-        const u32 acc_src_a0 = RevbGetIndexer(RevBuffers.ACC_SRC_A0);
-        const u32 acc_src_b0 = RevbGetIndexer(RevBuffers.ACC_SRC_B0);
-        const u32 acc_src_c0 = RevbGetIndexer(RevBuffers.ACC_SRC_C0);
-        const u32 acc_src_d0 = RevbGetIndexer(RevBuffers.ACC_SRC_D0);
+    for (int i = 0; i < 2; i++) {
+        if (Cores[i].IRQEnable && ((Cores[i].IRQA >= EffectsStartA) && (Cores[i].IRQA <= EffectsEndA))) {
+            if ((Cores[i].IRQA == same_src) || (Cores[i].IRQA == diff_src) ||
+                (Cores[i].IRQA == same_dst) || (Cores[i].IRQA == diff_dst) ||
+                (Cores[i].IRQA == same_prv) || (Cores[i].IRQA == diff_prv) ||

-        const u32 acc_src_a1 = RevbGetIndexer(RevBuffers.ACC_SRC_A1);
-        const u32 acc_src_b1 = RevbGetIndexer(RevBuffers.ACC_SRC_B1);
-        const u32 acc_src_c1 = RevbGetIndexer(RevBuffers.ACC_SRC_C1);
-        const u32 acc_src_d1 = RevbGetIndexer(RevBuffers.ACC_SRC_D1);
+                (Cores[i].IRQA == comb1_src) || (Cores[i].IRQA == comb2_src) ||
+                (Cores[i].IRQA == comb3_src) || (Cores[i].IRQA == comb4_src) ||

-        const u32 fb_src_a0 = RevbGetIndexer(RevBuffers.FB_SRC_A0);
-        const u32 fb_src_a1 = RevbGetIndexer(RevBuffers.FB_SRC_A1);
-        const u32 fb_src_b0 = RevbGetIndexer(RevBuffers.FB_SRC_B0);
-        const u32 fb_src_b1 = RevbGetIndexer(RevBuffers.FB_SRC_B1);
-
-        const u32 mix_dest_a0 = RevbGetIndexer(RevBuffers.MIX_DEST_A0);
-        const u32 mix_dest_a1 = RevbGetIndexer(RevBuffers.MIX_DEST_A1);
-        const u32 mix_dest_b0 = RevbGetIndexer(RevBuffers.MIX_DEST_B0);
-        const u32 mix_dest_b1 = RevbGetIndexer(RevBuffers.MIX_DEST_B1);
-
-        // -----------------------------------------
-        //          Optimized IRQ Testing !
-        // -----------------------------------------
-
-        // This test is enhanced by using the reverb effects area begin/end test as a
-        // shortcut, since all buffer addresses are within that area.  If the IRQA isn't
-        // within that zone then the "bulk" of the test is skipped, so this should only
-        // be a slowdown on a few evil games.
-
-        for (int i = 0; i < 2; i++) {
-            if (Cores[i].IRQEnable && ((Cores[i].IRQA >= EffectsStartA) && (Cores[i].IRQA <= EffectsEndA))) {
-                if ((Cores[i].IRQA == src_a0) || (Cores[i].IRQA == src_a1) ||
-                    (Cores[i].IRQA == src_b0) || (Cores[i].IRQA == src_b1) ||
-
-                    (Cores[i].IRQA == dest_a0) || (Cores[i].IRQA == dest_a1) ||
-                    (Cores[i].IRQA == dest_b0) || (Cores[i].IRQA == dest_b1) ||
-
-                    (Cores[i].IRQA == dest2_a0) || (Cores[i].IRQA == dest2_a1) ||
-                    (Cores[i].IRQA == dest2_b0) || (Cores[i].IRQA == dest2_b1) ||
-
-                    (Cores[i].IRQA == acc_src_a0) || (Cores[i].IRQA == acc_src_a1) ||
-                    (Cores[i].IRQA == acc_src_b0) || (Cores[i].IRQA == acc_src_b1) ||
-                    (Cores[i].IRQA == acc_src_c0) || (Cores[i].IRQA == acc_src_c1) ||
-                    (Cores[i].IRQA == acc_src_d0) || (Cores[i].IRQA == acc_src_d1) ||
-
-                    (Cores[i].IRQA == fb_src_a0) || (Cores[i].IRQA == fb_src_a1) ||
-                    (Cores[i].IRQA == fb_src_b0) || (Cores[i].IRQA == fb_src_b1) ||
-
-                    (Cores[i].IRQA == mix_dest_a0) || (Cores[i].IRQA == mix_dest_a1) ||
-                    (Cores[i].IRQA == mix_dest_b0) || (Cores[i].IRQA == mix_dest_b1)) {
-                    //printf("Core %d IRQ Called (Reverb). IRQA = %x\n",i,addr);
-                    SetIrqCall(i);
-                }
+                (Cores[i].IRQA == apf1_dst) || (Cores[i].IRQA == apf1_src) ||
+                (Cores[i].IRQA == apf2_dst) || (Cores[i].IRQA == apf2_src)) {
+                //printf("Core %d IRQ Called (Reverb). IRQA = %x\n",i,addr);
+                SetIrqCall(i);
            }
        }
-
-        // -----------------------------------------
-        //         Begin Reverb Processing !
-        // -----------------------------------------
-
-        StereoOut32 INPUT_SAMPLE;
-
-        for (int x = 0; x < 8; ++x) {
-            INPUT_SAMPLE.Left += (downbuf[(dbpos + x) & 7].Left * downcoeffs[x]);
-            INPUT_SAMPLE.Right += (downbuf[(dbpos + x) & 7].Right * downcoeffs[x]);
-        }
-
-        INPUT_SAMPLE.Left >>= 16;
-        INPUT_SAMPLE.Right >>= 16;
-
-        s32 input_L = INPUT_SAMPLE.Left * Revb.IN_COEF_L;
-        s32 input_R = INPUT_SAMPLE.Right * Revb.IN_COEF_R;
-
-        const s32 IIR_INPUT_A0 = clamp_mix((((s32)_spu2mem[src_a0] * Revb.IIR_COEF) + input_L) >> 15);
-        const s32 IIR_INPUT_A1 = clamp_mix((((s32)_spu2mem[src_a1] * Revb.IIR_COEF) + input_L) >> 15);
-        const s32 IIR_INPUT_B0 = clamp_mix((((s32)_spu2mem[src_b0] * Revb.IIR_COEF) + input_R) >> 15);
-        const s32 IIR_INPUT_B1 = clamp_mix((((s32)_spu2mem[src_b1] * Revb.IIR_COEF) + input_R) >> 15);
-
-        const s32 src_dest_a0 = _spu2mem[dest_a0];
-        const s32 src_dest_a1 = _spu2mem[dest_a1];
-        const s32 src_dest_b0 = _spu2mem[dest_b0];
-        const s32 src_dest_b1 = _spu2mem[dest_b1];
-
-        // This section differs from Neill's doc as it uses single-mul interpolation instead
-        // of 0x8000-val inversion.  (same result, faster)
-        const s32 IIR_A0 = src_dest_a0 + (((IIR_INPUT_A0 - src_dest_a0) * Revb.IIR_ALPHA) >> 15);
-        const s32 IIR_A1 = src_dest_a1 + (((IIR_INPUT_A1 - src_dest_a1) * Revb.IIR_ALPHA) >> 15);
-        const s32 IIR_B0 = src_dest_b0 + (((IIR_INPUT_B0 - src_dest_b0) * Revb.IIR_ALPHA) >> 15);
-        const s32 IIR_B1 = src_dest_b1 + (((IIR_INPUT_B1 - src_dest_b1) * Revb.IIR_ALPHA) >> 15);
-        _spu2mem[dest2_a0] = clamp_mix(IIR_A0);
-        _spu2mem[dest2_a1] = clamp_mix(IIR_A1);
-        _spu2mem[dest2_b0] = clamp_mix(IIR_B0);
-        _spu2mem[dest2_b1] = clamp_mix(IIR_B1);
-
-        const s32 ACC0 = clamp_mix(
-            ((_spu2mem[acc_src_a0] * Revb.ACC_COEF_A) >> 15) +
-            ((_spu2mem[acc_src_b0] * Revb.ACC_COEF_B) >> 15) +
-            ((_spu2mem[acc_src_c0] * Revb.ACC_COEF_C) >> 15) +
-            ((_spu2mem[acc_src_d0] * Revb.ACC_COEF_D) >> 15));
-
-        const s32 ACC1 = clamp_mix(
-            ((_spu2mem[acc_src_a1] * Revb.ACC_COEF_A) >> 15) +
-            ((_spu2mem[acc_src_b1] * Revb.ACC_COEF_B) >> 15) +
-            ((_spu2mem[acc_src_c1] * Revb.ACC_COEF_C) >> 15) +
-            ((_spu2mem[acc_src_d1] * Revb.ACC_COEF_D) >> 15));
-
-        // The following code differs from Neill's doc as it uses the more natural single-mul
-        // interpolative, instead of the funky ^0x8000 stuff.  (better result, faster)
-
-        const s32 FB_A0 = _spu2mem[fb_src_a0];
-        const s32 FB_A1 = _spu2mem[fb_src_a1];
-        const s32 FB_B0 = _spu2mem[fb_src_b0];
-        const s32 FB_B1 = _spu2mem[fb_src_b1];
-
-        const s32 mix_a0 = clamp_mix(ACC0 - ((FB_A0 * Revb.FB_ALPHA) >> 15));
-        const s32 mix_a1 = clamp_mix(ACC1 - ((FB_A1 * Revb.FB_ALPHA) >> 15));
-        const s32 mix_b0 = clamp_mix(FB_A0 + (((ACC0 - FB_A0) * Revb.FB_ALPHA - FB_B0 * Revb.FB_X) >> 15));
-        const s32 mix_b1 = clamp_mix(FB_A1 + (((ACC1 - FB_A1) * Revb.FB_ALPHA - FB_B1 * Revb.FB_X) >> 15));
-
-        _spu2mem[mix_dest_a0] = mix_a0;
-        _spu2mem[mix_dest_a1] = mix_a1;
-        _spu2mem[mix_dest_b0] = mix_b0;
-        _spu2mem[mix_dest_b1] = mix_b1;
-
-        upbuf[ubpos] = clamp_mix(StereoOut32(
-            mix_a0 + mix_b0, // left
-            mix_a1 + mix_b1  // right
-            ));
    }

-    StereoOut32 retval;
+    // Reverb algorithm pretty much directly ripped from http://drhell.web.fc2.com/ps1/
+    // minus the 35 step FIR which just seems to break things.

-    //for( int x=0; x<8; ++x )
-    //{
-    //	retval.Left  += (upbuf[(ubpos+x)&7].Left*downcoeffs[x]);
-    //	retval.Right += (upbuf[(ubpos+x)&7].Right*downcoeffs[x]);
-    //}
+    s32 in, same, diff, apf1, apf2, out;

-    if ((Cycles & 1) == 0) {
-        retval.Left = (upbuf[(ubpos + 5) & 7].Left + upbuf[(ubpos + 7) & 7].Left) >> 1;
-        retval.Right = (upbuf[(ubpos + 5) & 7].Right + upbuf[(ubpos + 7) & 7].Right) >> 1;
-    } else {
-        retval.Left = upbuf[(ubpos + 6) & 7].Left;
-        retval.Right = upbuf[(ubpos + 6) & 7].Right;
+#define MUL(x, y) ((x) * (y) >> 15)
+    in = MUL(R ? Revb.IN_COEF_R : Revb.IN_COEF_L, R ? Input.Right : Input.Left);
+
+    same = MUL(Revb.IIR_ALPHA, in + MUL(Revb.IIR_COEF, _spu2mem[same_src]) - _spu2mem[same_prv]) + _spu2mem[same_prv];
+    diff = MUL(Revb.IIR_ALPHA, in + MUL(Revb.IIR_COEF, _spu2mem[diff_src]) - _spu2mem[diff_prv]) + _spu2mem[diff_prv];
+
+    out = MUL(Revb.ACC_COEF_A, _spu2mem[comb1_src]) + MUL(Revb.ACC_COEF_B, _spu2mem[comb2_src]) + MUL(Revb.ACC_COEF_C, _spu2mem[comb3_src]) + MUL(Revb.ACC_COEF_D, _spu2mem[comb4_src]);
+
+    apf1 = out - MUL(Revb.FB_ALPHA, _spu2mem[apf1_src]);
+    out = _spu2mem[apf1_src] + MUL(Revb.FB_ALPHA, apf1);
+    apf2 = out - MUL(Revb.FB_X, _spu2mem[apf2_src]);
+    out = _spu2mem[apf2_src] + MUL(Revb.FB_X, apf2);
+
+    // According to no$psx the effects always run but don't always write back, see check in V_Core::Mix
+    if (FxEnable) {
+        _spu2mem[same_dst] = clamp_mix(same);
+        _spu2mem[diff_dst] = clamp_mix(diff);
+        _spu2mem[apf1_dst] = clamp_mix(apf1);
+        _spu2mem[apf2_dst] = clamp_mix(apf2);
    }

-    // Notes:
-    //  the first -1 is to adjust for the null padding in every other upbuf sample (which
-    //  halves the overall volume).
-    //  The second +1 divides by two, which is part of Neill's suggestion to divide by 3.
-    //
-    // According Neill the final result should be divided by 3, but currently the output
-    // is way too quiet for that to fly.  In fact no division at all might be better.
-    // In any case the problem always seems to be that the reverb isn't resonating enough
-    // (indicating short buffers or bad coefficient math?), not that it isn't loud enough.
+    (R ? LastEffect.Right : LastEffect.Left) = -clamp_mix(out);

-    //retval.Left  >>= (16-1 + 1);
-    //retval.Right >>= (16-1 + 1);
-
-    ubpos = (ubpos + 1) & 7;
-
-    return retval;
-}
+    return LastEffect;
+}
--- a/plugins/spu2-x/src/defs.h
+++ b/plugins/spu2-x/src/defs.h
@ -18,6 +18,7 @@
 #pragma once

 #include "Mixer.h"
+#include "SndOut.h"

 // --------------------------------------------------------------------------------------
 //  SPU2 Memory Indexers
@ -229,8 +230,8 @@ struct V_Reverb
    s16 IN_COEF_L;
    s16 IN_COEF_R;

-    u32 FB_SRC_A;
-    u32 FB_SRC_B;
+    u32 FB_SIZE_A;
+    u32 FB_SIZE_B;

    s16 FB_ALPHA;
    s16 FB_X;
@ -269,11 +270,6 @@ struct V_Reverb

 struct V_ReverbBuffers
 {
-    s32 FB_SRC_A0;
-    s32 FB_SRC_B0;
-    s32 FB_SRC_A1;
-    s32 FB_SRC_B1;
-
    s32 IIR_SRC_A0;
    s32 IIR_SRC_A1;
    s32 IIR_SRC_B0;
@ -297,6 +293,16 @@ struct V_ReverbBuffers
    s32 MIX_DEST_B0;
    s32 MIX_DEST_B1;

+    s32 SAME_L_PRV;
+    s32 SAME_R_PRV;
+    s32 DIFF_L_PRV;
+    s32 DIFF_R_PRV;
+
+    s32 APF1_L_SRC;
+    s32 APF1_R_SRC;
+    s32 APF2_L_SRC;
+    s32 APF2_R_SRC;
+
    bool NeedsUpdated;
 };

@ -419,9 +425,7 @@ struct V_Core

    V_CoreRegs Regs; // Registers

-    // Last samples to pass through the effects processor.
-    // Used because the effects processor works at 24khz and just pulls
-    // from this for the odd Ts.
+    // Preserves the channel processed last cycle
    StereoOut32 LastEffect;

    u8 CoreEnabled;
@ -444,10 +448,6 @@ struct V_Core
    u16 psxSoundDataTransferControl;
    u16 psxSPUSTAT;

-    StereoOut32 downbuf[8];
-    StereoOut32 upbuf[8];
-    int dbpos, ubpos;
-
    // HACK -- This is a temp buffer which is (or isn't?) used to circumvent some memory
    // corruption that originates elsewhere in the plugin. >_<  The actual ADMA buffer
    // is an area mapped to SPU2 main memory.
@ -471,8 +471,6 @@ struct V_Core
    void AnalyzeReverbPreset();

    s32 EffectsBufferIndexer(s32 offset) const;
-    void UpdateFeedbackBuffersA();
-    void UpdateFeedbackBuffersB();

    void WriteRegPS1(u32 mem, u16 value);
    u16 ReadRegPS1(u32 mem);
--- a/plugins/spu2-x/src/regs.h
+++ b/plugins/spu2-x/src/regs.h
@ -61,9 +61,9 @@

 // .. repeated for each voice ..

-#define REG_A_ESA 0x02E0  //Address: Top address of working area for effects processing
-#define R_FB_SRC_A 0x02E4 // Feedback Source A
-#define R_FB_SRC_B 0x02E8 // Feedback Source B
+#define REG_A_ESA 0x02E0   //Address: Top address of working area for effects processing
+#define R_FB_SIZE_A 0x02E4 // Feedback Source A
+#define R_FB_SIZE_B 0x02E8 // Feedback Source B
 #define R_IIR_DEST_A0 0x02EC
 #define R_IIR_DEST_A1 0x02F0
 #define R_ACC_SRC_A0 0x02F4
--- a/plugins/spu2-x/src/spu2freeze.cpp
+++ b/plugins/spu2-x/src/spu2freeze.cpp
@ -25,7 +25,7 @@ static const u32 SAVE_ID = 0x1227521;

 // versioning for saves.
 // Increment this when changes to the savestate system are made.
-static const u32 SAVE_VERSION = 0x000d;
+static const u32 SAVE_VERSION = 0x000e;

 static void wipe_the_cache()
 {
--- a/plugins/spu2-x/src/spu2sys.cpp
+++ b/plugins/spu2-x/src/spu2sys.cpp
@ -195,7 +195,7 @@ void V_Core::AnalyzeReverbPreset()
    ConLog("----------------------------------------------------------\n");

    ConLog("    IN_COEF_L, IN_COEF_R        0x%08x, 0x%08x\n", Revb.IN_COEF_L, Revb.IN_COEF_R);
-    ConLog("    FB_SRC_A, FB_SRC_B          0x%08x, 0x%08x\n", Revb.FB_SRC_A, Revb.FB_SRC_B);
+    ConLog("    FB_SIZE_A, FB_SIZE_B          0x%08x, 0x%08x\n", Revb.FB_SIZE_A, Revb.FB_SIZE_B);
    ConLog("    FB_ALPHA, FB_X              0x%08x, 0x%08x\n", Revb.FB_ALPHA, Revb.FB_X);

    ConLog("    ACC_COEF_A                  0x%08x\n", Revb.ACC_COEF_A);
@ -242,18 +242,6 @@ s32 V_Core::EffectsBufferIndexer(s32 offset) const
    return pos;
 }

-void V_Core::UpdateFeedbackBuffersA()
-{
-    RevBuffers.FB_SRC_A0 = EffectsBufferIndexer(Revb.MIX_DEST_A0 - Revb.FB_SRC_A);
-    RevBuffers.FB_SRC_A1 = EffectsBufferIndexer(Revb.MIX_DEST_A1 - Revb.FB_SRC_A);
-}
-
-void V_Core::UpdateFeedbackBuffersB()
-{
-    RevBuffers.FB_SRC_B0 = EffectsBufferIndexer(Revb.MIX_DEST_B0 - Revb.FB_SRC_B);
-    RevBuffers.FB_SRC_B1 = EffectsBufferIndexer(Revb.MIX_DEST_B1 - Revb.FB_SRC_B);
-}
-
 void V_Core::UpdateEffectsBufferSize()
 {
    const s32 newbufsize = EffectsEndA - EffectsStartA + 1;
@ -290,9 +278,6 @@ void V_Core::UpdateEffectsBufferSize()
    RevBuffers.ACC_SRC_D0 = EffectsBufferIndexer(Revb.ACC_SRC_D0);
    RevBuffers.ACC_SRC_D1 = EffectsBufferIndexer(Revb.ACC_SRC_D1);

-    UpdateFeedbackBuffersA();
-    UpdateFeedbackBuffersB();
-
    RevBuffers.IIR_DEST_A0 = EffectsBufferIndexer(Revb.IIR_DEST_A0);
    RevBuffers.IIR_DEST_A1 = EffectsBufferIndexer(Revb.IIR_DEST_A1);
    RevBuffers.IIR_DEST_B0 = EffectsBufferIndexer(Revb.IIR_DEST_B0);
@ -307,6 +292,16 @@ void V_Core::UpdateEffectsBufferSize()
    RevBuffers.MIX_DEST_A1 = EffectsBufferIndexer(Revb.MIX_DEST_A1);
    RevBuffers.MIX_DEST_B0 = EffectsBufferIndexer(Revb.MIX_DEST_B0);
    RevBuffers.MIX_DEST_B1 = EffectsBufferIndexer(Revb.MIX_DEST_B1);
+
+    RevBuffers.SAME_L_PRV = EffectsBufferIndexer(Revb.IIR_DEST_A0 - 1);
+    RevBuffers.SAME_R_PRV = EffectsBufferIndexer(Revb.IIR_DEST_A1 - 1);
+    RevBuffers.DIFF_L_PRV = EffectsBufferIndexer(Revb.IIR_DEST_B0 - 1);
+    RevBuffers.DIFF_R_PRV = EffectsBufferIndexer(Revb.IIR_DEST_B1 - 1);
+
+    RevBuffers.APF1_L_SRC = EffectsBufferIndexer(Revb.MIX_DEST_A0 - Revb.FB_SIZE_A);
+    RevBuffers.APF1_R_SRC = EffectsBufferIndexer(Revb.MIX_DEST_A1 - Revb.FB_SIZE_A);
+    RevBuffers.APF2_L_SRC = EffectsBufferIndexer(Revb.MIX_DEST_B0 - Revb.FB_SIZE_B);
+    RevBuffers.APF2_R_SRC = EffectsBufferIndexer(Revb.MIX_DEST_B1 - Revb.FB_SIZE_B);
 }

 void V_Voice::QueueStart()
@ -719,10 +714,10 @@ void V_Core::WriteRegPS1(u32 mem, u16 value)
                break;

            case 0x1DC0:
-                Revb.FB_SRC_A = value * 4;
+                Revb.FB_SIZE_A = value * 4;
                break;
            case 0x1DC2:
-                Revb.FB_SRC_B = value * 4;
+                Revb.FB_SIZE_B = value * 4;
                break;
            case 0x1DC4:
                Revb.IIR_ALPHA = value;
@ -1550,8 +1545,8 @@ static RegWriteHandler *const tbl_reg_writes[0x401] =

        CoreParamsPair(0, REG_A_ESA),

-        ReverbPair(0, R_FB_SRC_A),    //       0x02E4		// Feedback Source A
-        ReverbPair(0, R_FB_SRC_B),    //       0x02E8		// Feedback Source B
+        ReverbPair(0, R_FB_SIZE_A),   //       0x02E4		// Feedback Source A
+        ReverbPair(0, R_FB_SIZE_B),   //       0x02E8		// Feedback Source B
        ReverbPair(0, R_IIR_DEST_A0), //    0x02EC
        ReverbPair(0, R_IIR_DEST_A1), //    0x02F0
        ReverbPair(0, R_ACC_SRC_A0),  //     0x02F4
@ -1640,8 +1635,8 @@ static RegWriteHandler *const tbl_reg_writes[0x401] =

        CoreParamsPair(1, REG_A_ESA),

-        ReverbPair(1, R_FB_SRC_A),    //       0x02E4		// Feedback Source A
-        ReverbPair(1, R_FB_SRC_B),    //       0x02E8		// Feedback Source B
+        ReverbPair(1, R_FB_SIZE_A),   //       0x02E4		// Feedback Source A
+        ReverbPair(1, R_FB_SIZE_B),   //       0x02E8		// Feedback Source B
        ReverbPair(1, R_IIR_DEST_A0), //    0x02EC
        ReverbPair(1, R_IIR_DEST_A1), //    0x02F0
        ReverbPair(1, R_ACC_SRC_A0),  //     0x02F4