diff --git a/bsnes/cpu/cpu.cpp b/bsnes/cpu/cpu.cpp
index 481b5dca..cb836768 100755
--- a/bsnes/cpu/cpu.cpp
+++ b/bsnes/cpu/cpu.cpp
@@ -112,6 +112,7 @@ void CPU::reset() {
   status.irq_transition = false;
   status.irq_pending = false;
 
+  status.irq_lock = false;
   status.hdma_pending = false;
 
   status.wram_addr = 0x000000;
diff --git a/bsnes/cpu/cpu.hpp b/bsnes/cpu/cpu.hpp
index 9a995cc0..d478484b 100755
--- a/bsnes/cpu/cpu.hpp
+++ b/bsnes/cpu/cpu.hpp
@@ -34,11 +34,16 @@ private:
   void op_irq(uint16 vector);
 
   //timing
+  struct QueueEvent {  //scope for the event ids that queue_event() switches on
+    enum : unsigned {
+      DramRefresh,  //queue_event() answers this id with add_clocks(40)
+      HdmaRun,  //handler is outside this view; presumably starts hdma_run() - TODO confirm
+    };
+  };
   nall::priority_queue<unsigned> queue;
   void queue_event(unsigned id);
   void last_cycle();
   void add_clocks(unsigned clocks);
-  void add_time(unsigned clocks);
   void scanline();
   void run_auto_joypad_poll();
 
@@ -56,6 +61,7 @@ private:
   unsigned hdma_addr(unsigned i);
   unsigned hdma_iaddr(unsigned i);
   void dma_run();
+  bool hdma_active_after(unsigned i);
   void hdma_update(unsigned i);
   void hdma_run();
   void hdma_init();
@@ -104,6 +110,7 @@ private:
     bool irq_transition;
     bool irq_pending;
 
+    bool irq_lock;
     bool hdma_pending;
 
     unsigned wram_addr;
diff --git a/bsnes/cpu/dma.cpp b/bsnes/cpu/dma.cpp
index 404f880b..d9d89e05 100755
--- a/bsnes/cpu/dma.cpp
+++ b/bsnes/cpu/dma.cpp
@@ -83,6 +83,15 @@ void CPU::dma_run() {
       dma_transfer(channel[i].direction, dma_bbus(i, index++), dma_addr(i));
     } while(channel[i].dma_enabled && --channel[i].transfer_size);
   }
+
+  status.irq_lock = true;
+}
+
+bool CPU::hdma_active_after(unsigned i) {  //true if any channel above i has HDMA enabled and not yet completed
+  for(unsigned n = i + 1; n < 8; n++) {  //scan channels i+1..7 only; channel i itself is excluded
+    if(channel[n].hdma_enabled && !channel[n].hdma_completed) return true;
+  }
+  return false;  //no later channel is still active (always the case for i == 7)
+}
 
 void CPU::hdma_update(unsigned i) {
@@ -93,10 +102,15 @@ void CPU::hdma_update(unsigned i) {
     add_clocks(8);
 
     if(channel[i].indirect) {
-      channel[i].indirect_addr  = dma_read(hdma_addr(i)) << 0;
-      add_clocks(8);
-      channel[i].indirect_addr |= dma_read(hdma_addr(i)) << 8;
+      channel[i].indirect_addr = dma_read(hdma_addr(i)) << 8;
       add_clocks(8);
+
+    //emulating this glitch causes a slight slowdown; only enable if needed
+    //if(!channel[i].hdma_completed || hdma_active_after(i)) {
+        channel[i].indirect_addr >>= 8;
+        channel[i].indirect_addr |= dma_read(hdma_addr(i)) << 8;
+        add_clocks(8);
+    //}
     }
   }
 }
@@ -108,7 +122,7 @@ void CPU::hdma_run() {
   }
   if(channels == 0) return;
 
-  add_clocks(16);
+  add_clocks(24);
   for(unsigned i = 0; i < 8; i++) {
     if(channel[i].hdma_enabled == false || channel[i].hdma_completed == true) continue;
     channel[i].dma_enabled = false;
@@ -130,6 +144,8 @@ void CPU::hdma_run() {
     channel[i].hdma_do_transfer = channel[i].line_counter & 0x80;
     hdma_update(i);
   }
+
+  status.irq_lock = true;
 }
 
 void CPU::hdma_init() {
@@ -150,6 +166,8 @@ void CPU::hdma_init() {
     channel[i].line_counter = 0;
     hdma_update(i);
   }
+
+  status.irq_lock = true;
 }
 
 void CPU::dma_reset() {
diff --git a/bsnes/cpu/timing.cpp b/bsnes/cpu/timing.cpp
index 95ed6d8e..7c333db7 100755
--- a/bsnes/cpu/timing.cpp
+++ b/bsnes/cpu/timing.cpp
@@ -1,12 +1,5 @@
 #ifdef CPU_CPP
 
-struct QueueEvent {
-  enum : unsigned {
-    DramRefresh,
-    HdmaRun,
-  };
-};
-
 void CPU::queue_event(unsigned id) {
   switch(id) {
     case QueueEvent::DramRefresh: return add_clocks(40);
@@ -15,13 +8,18 @@ void CPU::queue_event(unsigned id) {
 }
 
 void CPU::last_cycle() {
+  if(status.irq_lock) {
+    status.irq_lock = false;
+    return;
+  }
+
   if(status.nmi_transition) {
     regs.wai = false;
     status.nmi_transition = false;
     status.nmi_pending = true;
   }
 
-  if(status.irq_transition) {
+  if(status.irq_transition || regs.irq) {
     regs.wai = false;
     status.irq_transition = false;
     status.irq_pending = !regs.p.i;
@@ -29,40 +27,32 @@ void CPU::last_cycle() {
 }
 
 void CPU::add_clocks(unsigned clocks) {
-  step(clocks);
-  queue.tick(clocks);
-  unsigned clocksleft = lineclocks() - hcounter();
-  if(clocks > clocksleft) {
-    add_time(clocksleft);
-    add_time(clocks - clocksleft);
-  } else {
-    add_time(clocks);
-  }
-}
-
-void CPU::add_time(unsigned clocks) {
-  if(status.irq_line && (status.virq_enabled || status.hirq_enabled)) {
-    status.irq_transition = true;
-  }
-
-  if(status.virq_enabled && !status.hirq_enabled) {
+  if(status.hirq_enabled) {  //H-IRQ enabled, with or without V-IRQ; the test runs before time is advanced below
+    if(status.virq_enabled) {  //both enabled: trigger at one (vtime, htime) position
+      unsigned cpu_time = vcounter() * 1364 + hcounter();  //current position as one linear count; 1364 is presumably clocks per scanline - TODO confirm
+      unsigned irq_time = status.vtime * 1364 + status.htime * 4;  //trigger position in the same units; htime is scaled by 4 before comparing with hcounter()
+      if(cpu_time > irq_time) irq_time += 262 * 1364;  //trigger already passed: move it out by 262 lines (presumably one frame; NOTE(review): confirm for other line counts)
+      bool irq_valid = status.irq_valid;  //previous state, kept for edge detection
+      status.irq_valid = cpu_time <= irq_time && cpu_time + clocks > irq_time;  //valid when the trigger lies within the span covered by this call
+      if(!irq_valid && status.irq_valid) status.irq_line = true;  //raise the line only on a false-to-true change
+    } else {  //H-IRQ only: same test, relative to hcounter() alone
+      unsigned irq_time = status.htime * 4;
+      if(hcounter() > irq_time) irq_time += 1364;  //trigger already passed: move it out by 1364 clocks
+      bool irq_valid = status.irq_valid;  //previous state, kept for edge detection
+      status.irq_valid = hcounter() <= irq_time && hcounter() + clocks > irq_time;
+      if(!irq_valid && status.irq_valid) status.irq_line = true;
+    }
+    if(status.irq_line) status.irq_transition = true;  //re-asserted on every call while irq_line stays set; last_cycle() consumes it
+  } else if(status.virq_enabled) {  //V-IRQ only: valid while vcounter() equals vtime
     bool irq_valid = status.irq_valid;
     status.irq_valid = vcounter() == status.vtime;
-    if(!irq_valid && status.irq_valid) {
-      status.irq_line = true;
-      status.irq_transition = true;
-    }
-  } else if(status.hirq_enabled) {
-    bool irq_valid = status.irq_valid;
-    status.irq_valid = hcounter() <= status.htime * 4 && hcounter() + clocks > status.htime * 4;
-    if(status.virq_enabled && vcounter() != status.vtime) status.irq_valid = false;
-    if(!irq_valid && status.irq_valid) {
-      status.irq_line = true;
-      status.irq_transition = true;
-    }
+    if(!irq_valid && status.irq_valid) status.irq_line = true;  //raise the line only on a false-to-true change
+    if(status.irq_line) status.irq_transition = true;  //same re-assertion as in the H-IRQ branch
   }
 
   tick(clocks);
+  queue.tick(clocks);  //the event queue advances after the IRQ test and after tick()
+  step(clocks);  //called last, after tick() and queue.tick()
 }
 
 void CPU::scanline() {
diff --git a/bsnes/info.hpp b/bsnes/info.hpp
index bed8783b..add71c68 100755
--- a/bsnes/info.hpp
+++ b/bsnes/info.hpp
@@ -1,7 +1,7 @@
 namespace SNES {
   namespace Info {
     static const char Name[] = "bsnes";
-    static const char Version[] = "067.10";
+    static const char Version[] = "067.11";  //NOTE(review): this change adds CPU status.irq_lock while SerializerVersion stays 12 - confirm save states are unaffected
     static const unsigned SerializerVersion = 12;
   }
 }
diff --git a/bsnes/smp/snes_spc/Spc_Dsp.cpp b/bsnes/smp/snes_spc/Spc_Dsp.cpp
index 46412254..106980ac 100755
--- a/bsnes/smp/snes_spc/Spc_Dsp.cpp
+++ b/bsnes/smp/snes_spc/Spc_Dsp.cpp
@@ -21,9 +21,6 @@ Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */
 	#include BLARGG_ENABLE_OPTIMIZER
 #endif
 
-// New SNES DSP behaves slightly differently (not all differences handled yet)
-bool const new_snes = false;
-
 // if ( io < -32768 ) io = -32768;
 // if ( io >  32767 ) io =  32767;
 #define CLAMP16( io )\
@@ -93,736 +90,551 @@ inline void Spc_Dsp::write_sample( int l, int r )
 // Volume registers and efb are signed! Easy to forget int8_t cast.
 // Prefixes are to avoid accidental use of locals with same names.
 
-// Gaussian interpolation
-
-static short const gauss [512] =
+// Interleaved gauss table (to improve cache locality: the two coefficients read together are adjacent)
+// interleaved_gauss [i] = gauss [(i & 1) * 256 + 255 - (i >> 1 & 0xFF)], gauss being the former non-interleaved table
+static short const interleaved_gauss [512] =
 {
-   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   2,   2,   2,   2,   2,
-   2,   2,   3,   3,   3,   3,   3,   4,   4,   4,   4,   4,   5,   5,   5,   5,
-   6,   6,   6,   6,   7,   7,   7,   8,   8,   8,   9,   9,   9,  10,  10,  10,
-  11,  11,  11,  12,  12,  13,  13,  14,  14,  15,  15,  15,  16,  16,  17,  17,
-  18,  19,  19,  20,  20,  21,  21,  22,  23,  23,  24,  24,  25,  26,  27,  27,
-  28,  29,  29,  30,  31,  32,  32,  33,  34,  35,  36,  36,  37,  38,  39,  40,
-  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,
-  58,  59,  60,  61,  62,  64,  65,  66,  67,  69,  70,  71,  73,  74,  76,  77,
-  78,  80,  81,  83,  84,  86,  87,  89,  90,  92,  94,  95,  97,  99, 100, 102,
- 104, 106, 107, 109, 111, 113, 115, 117, 118, 120, 122, 124, 126, 128, 130, 132,
- 134, 137, 139, 141, 143, 145, 147, 150, 152, 154, 156, 159, 161, 163, 166, 168,
- 171, 173, 175, 178, 180, 183, 186, 188, 191, 193, 196, 199, 201, 204, 207, 210,
- 212, 215, 218, 221, 224, 227, 230, 233, 236, 239, 242, 245, 248, 251, 254, 257,
- 260, 263, 267, 270, 273, 276, 280, 283, 286, 290, 293, 297, 300, 304, 307, 311,
- 314, 318, 321, 325, 328, 332, 336, 339, 343, 347, 351, 354, 358, 362, 366, 370,
- 374, 378, 381, 385, 389, 393, 397, 401, 405, 410, 414, 418, 422, 426, 430, 434,
- 439, 443, 447, 451, 456, 460, 464, 469, 473, 477, 482, 486, 491, 495, 499, 504,
- 508, 513, 517, 522, 527, 531, 536, 540, 545, 550, 554, 559, 563, 568, 573, 577,
- 582, 587, 592, 596, 601, 606, 611, 615, 620, 625, 630, 635, 640, 644, 649, 654,
- 659, 664, 669, 674, 678, 683, 688, 693, 698, 703, 708, 713, 718, 723, 728, 732,
- 737, 742, 747, 752, 757, 762, 767, 772, 777, 782, 787, 792, 797, 802, 806, 811,
- 816, 821, 826, 831, 836, 841, 846, 851, 855, 860, 865, 870, 875, 880, 884, 889,
- 894, 899, 904, 908, 913, 918, 923, 927, 932, 937, 941, 946, 951, 955, 960, 965,
- 969, 974, 978, 983, 988, 992, 997,1001,1005,1010,1014,1019,1023,1027,1032,1036,
-1040,1045,1049,1053,1057,1061,1066,1070,1074,1078,1082,1086,1090,1094,1098,1102,
-1106,1109,1113,1117,1121,1125,1128,1132,1136,1139,1143,1146,1150,1153,1157,1160,
-1164,1167,1170,1174,1177,1180,1183,1186,1190,1193,1196,1199,1202,1205,1207,1210,
-1213,1216,1219,1221,1224,1227,1229,1232,1234,1237,1239,1241,1244,1246,1248,1251,
-1253,1255,1257,1259,1261,1263,1265,1267,1269,1270,1272,1274,1275,1277,1279,1280,
-1282,1283,1284,1286,1287,1288,1290,1291,1292,1293,1294,1295,1296,1297,1297,1298,
-1299,1300,1300,1301,1302,1302,1303,1303,1303,1304,1304,1304,1304,1304,1305,1305,
+ 370,1305, 366,1305, 362,1304, 358,1304, 354,1304, 351,1304, 347,1304, 343,1303,
+ 339,1303, 336,1303, 332,1302, 328,1302, 325,1301, 321,1300, 318,1300, 314,1299,
+ 311,1298, 307,1297, 304,1297, 300,1296, 297,1295, 293,1294, 290,1293, 286,1292,
+ 283,1291, 280,1290, 276,1288, 273,1287, 270,1286, 267,1284, 263,1283, 260,1282,
+ 257,1280, 254,1279, 251,1277, 248,1275, 245,1274, 242,1272, 239,1270, 236,1269,
+ 233,1267, 230,1265, 227,1263, 224,1261, 221,1259, 218,1257, 215,1255, 212,1253,
+ 210,1251, 207,1248, 204,1246, 201,1244, 199,1241, 196,1239, 193,1237, 191,1234,
+ 188,1232, 186,1229, 183,1227, 180,1224, 178,1221, 175,1219, 173,1216, 171,1213,
+ 168,1210, 166,1207, 163,1205, 161,1202, 159,1199, 156,1196, 154,1193, 152,1190,
+ 150,1186, 147,1183, 145,1180, 143,1177, 141,1174, 139,1170, 137,1167, 134,1164,
+ 132,1160, 130,1157, 128,1153, 126,1150, 124,1146, 122,1143, 120,1139, 118,1136,
+ 117,1132, 115,1128, 113,1125, 111,1121, 109,1117, 107,1113, 106,1109, 104,1106,
+ 102,1102, 100,1098,  99,1094,  97,1090,  95,1086,  94,1082,  92,1078,  90,1074,
+  89,1070,  87,1066,  86,1061,  84,1057,  83,1053,  81,1049,  80,1045,  78,1040,
+  77,1036,  76,1032,  74,1027,  73,1023,  71,1019,  70,1014,  69,1010,  67,1005,
+  66,1001,  65, 997,  64, 992,  62, 988,  61, 983,  60, 978,  59, 974,  58, 969,
+  56, 965,  55, 960,  54, 955,  53, 951,  52, 946,  51, 941,  50, 937,  49, 932,
+  48, 927,  47, 923,  46, 918,  45, 913,  44, 908,  43, 904,  42, 899,  41, 894,
+  40, 889,  39, 884,  38, 880,  37, 875,  36, 870,  36, 865,  35, 860,  34, 855,
+  33, 851,  32, 846,  32, 841,  31, 836,  30, 831,  29, 826,  29, 821,  28, 816,
+  27, 811,  27, 806,  26, 802,  25, 797,  24, 792,  24, 787,  23, 782,  23, 777,
+  22, 772,  21, 767,  21, 762,  20, 757,  20, 752,  19, 747,  19, 742,  18, 737,
+  17, 732,  17, 728,  16, 723,  16, 718,  15, 713,  15, 708,  15, 703,  14, 698,
+  14, 693,  13, 688,  13, 683,  12, 678,  12, 674,  11, 669,  11, 664,  11, 659,
+  10, 654,  10, 649,  10, 644,   9, 640,   9, 635,   9, 630,   8, 625,   8, 620,
+   8, 615,   7, 611,   7, 606,   7, 601,   6, 596,   6, 592,   6, 587,   6, 582,
+   5, 577,   5, 573,   5, 568,   5, 563,   4, 559,   4, 554,   4, 550,   4, 545,
+   4, 540,   3, 536,   3, 531,   3, 527,   3, 522,   3, 517,   2, 513,   2, 508,
+   2, 504,   2, 499,   2, 495,   2, 491,   2, 486,   1, 482,   1, 477,   1, 473,
+   1, 469,   1, 464,   1, 460,   1, 456,   1, 451,   1, 447,   1, 443,   1, 439,
+   0, 434,   0, 430,   0, 426,   0, 422,   0, 418,   0, 414,   0, 410,   0, 405,
+   0, 401,   0, 397,   0, 393,   0, 389,   0, 385,   0, 381,   0, 378,   0, 374,
 };
 
-inline int Spc_Dsp::interpolate( voice_t const* v )
-{
-	// Make pointers into gaussian based on fractional position between samples
-	int offset = v->interp_pos >> 4 & 0xFF;
-	short const* fwd = gauss + 255 - offset;
-	short const* rev = gauss       + offset; // mirror left half of gaussian
-	
-	int const* in = &v->buf [(v->interp_pos >> 12) + v->buf_pos];
-	int out;
-	out  = (fwd [  0] * in [0]) >> 11;
-	out += (fwd [256] * in [1]) >> 11;
-	out += (rev [256] * in [2]) >> 11;
-	out = (int16_t) out;
-	out += (rev [  0] * in [3]) >> 11;
-	
-	CLAMP16( out );
-	out &= ~1;
-	return out;
-}
-
 
 //// Counters
 
-int const simple_counter_range = 2048 * 5 * 3; // 30720
+#define RATE( rate, div )\
+	(rate >= div ? rate / div * 8 - 1 : rate - 1) // e.g. RATE(2048,4) = 4095, RATE(5,5) = 7, RATE(2,4) = 1, RATE(1,4) = 0
 
-static unsigned const counter_rates [32] =
+static unsigned const counter_mask [32] = // per-rate bit mask that READ_COUNTER ANDs with the counter selected for that rate
 {
-   simple_counter_range + 1, // never fires
-          2048, 1536,
-	1280, 1024,  768,
-	 640,  512,  384,
-	 320,  256,  192,
-	 160,  128,   96,
-	  80,   64,   48,
-	  40,   32,   24,
-	  20,   16,   12,
-	  10,    8,    6,
-	   5,    4,    3,
-	         2,
-	         1
-};
-static unsigned const counter_offsets [32] =
-{
-	  1, 0, 1040,
-	536, 0, 1040,
-	536, 0, 1040,
-	536, 0, 1040,
-	536, 0, 1040,
-	536, 0, 1040,
-	536, 0, 1040,
-	536, 0, 1040,
-	536, 0, 1040,
-	536, 0, 1040,
-	     0,
-	     0
+	RATE(   2,2), RATE(2048,4), RATE(1536,3), // entry 0 is mask 7; init_counter() pairs it with a counter holding 1
+	RATE(1280,5), RATE(1024,4), RATE( 768,3),
+	RATE( 640,5), RATE( 512,4), RATE( 384,3),
+	RATE( 320,5), RATE( 256,4), RATE( 192,3),
+	RATE( 160,5), RATE( 128,4), RATE(  96,3),
+	RATE(  80,5), RATE(  64,4), RATE(  48,3),
+	RATE(  40,5), RATE(  32,4), RATE(  24,3),
+	RATE(  20,5), RATE(  16,4), RATE(  12,3),
+	RATE(  10,5), RATE(   8,4), RATE(   6,3),
+	RATE(   5,5), RATE(   4,4), RATE(   3,3),
+	              RATE(   2,4), // rate 30: mask 1
+	              RATE(   1,4) // rate 31: mask 0, so READ_COUNTER( 31 ) is always zero
 };
+#undef RATE
 
 inline void Spc_Dsp::init_counter()
 {
-	m.counter = 0;
-}
-
-inline void Spc_Dsp::run_counters()
-{
-	if ( --m.counter < 0 )
-		m.counter = simple_counter_range - 1;
-}
-
-inline unsigned Spc_Dsp::read_counter( int rate )
-{
-	return ((unsigned) m.counter + counter_offsets [rate]) % counter_rates [rate];
-}
-
-
-//// Envelope
-
-inline void Spc_Dsp::run_envelope( voice_t* const v )
-{
-	int env = v->env;
-	if ( v->env_mode == env_release ) // 60%
+	// counters start out with this synchronization
+	m.counters [0] =     1; // read through mask 7 for rate 0; the run_counter() calls visible in run() only step counters 1..3
+	m.counters [1] =     0;
+	m.counters [2] = -0x20u;
+	m.counters [3] =  0x0B;
+	
+	int n = 2; // rates 1..31 cycle through counters 2, 1, 3, 2, 1, 3, ...
+	for ( int i = 1; i < 32; i++ )
 	{
-		if ( (env -= 0x8) < 0 )
-			env = 0;
-		v->env = env;
+		m.counter_select [i] = &m.counters [n];
+		if ( !--n ) // after counter 1, wrap back to counter 3
+			n = 3;
 	}
-	else
+	m.counter_select [ 0] = &m.counters [0]; // rate 0 reads the dedicated counter 0
+	m.counter_select [30] = &m.counters [2]; // overrides the loop, which assigned counter 3 to rate 30
+}
+
+inline void Spc_Dsp::run_counter( int i )
+{
+	int const before = m.counters [i]; // value prior to this step
+	int after = before - 1; // every step decrements by one
+	if ( (before & 7) == 0 ) // low three bits were clear before the decrement:
+		after -= 6 - i; // subtract a further (6 - i)
+	m.counters [i] = after;
+}
+
+#define READ_COUNTER( rate )\
+	(*m.counter_select [rate] & counter_mask [rate]) // masked value of the counter selected for 'rate'; callers in run() act when it is zero
+
+
+//// Emulation
+
+void Spc_Dsp::run( int clock_count )
+{
+	int new_phase = m.phase + clock_count;
+	int count = new_phase >> 5;
+	m.phase = new_phase & 31;
+	if ( !count )
+		return;
+	
+	byte* const ram = this->ram;
+	byte const* const dir = &ram [REG(dir) * 0x100];
+	int const slow_gaussian = (REG(pmon) >> 1) | REG(non);
+	int const noise_rate = REG(flg) & 0x1F;
+	
+	// Global volume
+	int mvoll = (int8_t) REG(mvoll);
+	int mvolr = (int8_t) REG(mvolr);
+	if ( mvoll * mvolr < surround_threshold )
+		mvoll = -mvoll; // eliminate surround
+	
+	do
 	{
-		int rate;
-		int env_data = VREG(v->regs,adsr1);
-		if ( m.t_adsr0 & 0x80 ) // 99% ADSR
+		// KON/KOFF reading
+		if ( (m.every_other_sample ^= 1) != 0 )
 		{
-			if ( v->env_mode >= env_decay ) // 99%
-			{
-				env--;
-				env -= env >> 8;
-				rate = env_data & 0x1F;
-				if ( v->env_mode == env_decay ) // 1%
-					rate = (m.t_adsr0 >> 3 & 0x0E) + 0x10;
-			}
-			else // env_attack
-			{
-				rate = (m.t_adsr0 & 0x0F) * 2 + 1;
-				env += rate < 31 ? 0x20 : 0x400;
-			}
-		}
-		else // GAIN
-		{
-			int mode;
-			env_data = VREG(v->regs,gain);
-			mode = env_data >> 5;
-			if ( mode < 4 ) // direct
-			{
-				env = env_data * 0x10;
-				rate = 31;
-			}
-			else
-			{
-				rate = env_data & 0x1F;
-				if ( mode == 4 ) // 4: linear decrease
-				{
-					env -= 0x20;
-				}
-				else if ( mode < 6 ) // 5: exponential decrease
-				{
-					env--;
-					env -= env >> 8;
-				}
-				else // 6,7: linear increase
-				{
-					env += 0x20;
-					if ( mode > 6 && (unsigned) v->hidden_env >= 0x600 )
-						env += 0x8 - 0x20; // 7: two-slope linear increase
-				}
-			}
+			m.new_kon &= ~m.kon;
+			m.kon    = m.new_kon;
+			m.t_koff = REG(koff); 
 		}
 		
-		// Sustain level
-		if ( (env >> 8) == (env_data >> 5) && v->env_mode == env_decay )
-			v->env_mode = env_sustain;
-		
-		v->hidden_env = env;
-		
-		// unsigned cast because linear decrease going negative also triggers this
-		if ( (unsigned) env > 0x7FF )
-		{
-			env = (env < 0 ? 0 : 0x7FF);
-			if ( v->env_mode == env_attack )
-				v->env_mode = env_decay;
-		}
-		
-		if ( !read_counter( rate ) )
-			v->env = env; // nothing else is controlled by the counter
-	}
-}
-
-
-//// BRR Decoding
-
-inline void Spc_Dsp::decode_brr( voice_t* v )
-{
-	// Arrange the four input nybbles in 0xABCD order for easy decoding
-	int nybbles = m.t_brr_byte * 0x100 + ram [(v->brr_addr + v->brr_offset + 1) & 0xFFFF];
-	
-	int const header = m.t_brr_header;
-	
-	// Write to next four samples in circular buffer
-	int* pos = &v->buf [v->buf_pos];
-	if ( (v->buf_pos += 4) >= brr_buf_size )
-		v->buf_pos = 0;
-	
-	// Decode four samples
-	for ( int* end = pos + 4; pos < end; pos++ )
-	{
-		// Extract nybble and sign-extend
-		int s = (int16_t) nybbles >> 12;
-		nybbles <<= 4;
-		
-		// Shift sample based on header
-		int const shift = header >> 4;
-		s = (s << shift) >> 1;
-		if ( shift >= 0xD ) // handle invalid range
-			s = (s >> 25) << 11; // same as: s = (s < 0 ? -0x800 : 0)
-		
-		// Apply (unstable) IIR filter (8 is the most commonly used)
-		int const filter = header & 0x0C;
-		int const p1 = pos [brr_buf_size - 1];
-		int const p2 = pos [brr_buf_size - 2] >> 1;
-		if ( filter >= 8 ) // most common one
-		{
-			s += p1;
-			s -= p2;
-			if ( filter == 8 ) // pos[0] = s*2 + pos[-1] * 1.09625 - pos[-2] * 0.9375
-			{
-				s += p2 >> 4;
-				s += (p1 * -3) >> 6;
-			}
-			else // pos[0] = s*2 + pos[-1] * 1.796875 - pos[-2] * 0.8125
-			{
-				s += (p1 * -13) >> 7;
-				s += (p2 * 3) >> 4;
-			}
-		}
-		else if ( filter ) // pos[0] = s*2 + pos[-1] * 0.9375
-		{
-			s += p1 >> 1;
-			s += (-p1) >> 5;
-		}
-		
-		// Adjust and write sample
-		CLAMP16( s );
-		s = (int16_t) (s * 2);
-		pos [brr_buf_size] = pos [0] = s; // second copy simplifies wrap-around
-	}
-}
-
-
-//// Misc
-
-#define MISC_CLOCK( n ) inline void Spc_Dsp::misc_##n()
-
-MISC_CLOCK( 27 )
-{
-	m.t_pmon = REG(pmon) & 0xFE; // voice 0 doesn't support PMON
-}
-MISC_CLOCK( 28 )
-{
-	m.t_non = REG(non);
-	m.t_eon = REG(eon);
-	m.t_dir = REG(dir);
-}
-MISC_CLOCK( 29 )
-{
-	if ( (m.every_other_sample ^= 1) != 0 )
-		m.new_kon &= ~m.kon; // clears KON 63 clocks after it was last read
-}
-MISC_CLOCK( 30 )
-{
-	if ( m.every_other_sample )
-	{
-		m.kon    = m.new_kon;
-		m.t_koff = REG(koff) | mute_mask; 
-	}
-	
-	run_counters();
-	
-	// Noise
-	if ( !read_counter( REG(flg) & 0x1F ) )
-	{
-		int feedback = (m.noise << 13) ^ (m.noise << 14);
-		m.noise = (feedback & 0x4000) ^ (m.noise >> 1);
-	}
-}
-
-
-//// Voices
-
-#define VOICE_CLOCK( n ) void Spc_Dsp::voice_##n( voice_t* const v )
-
-inline VOICE_CLOCK( V1 )
-{
-	m.t_dir_addr = m.t_dir * 0x100 + m.t_srcn * 4;
-	m.t_srcn = VREG(v->regs,srcn);
-}
-inline VOICE_CLOCK( V2 )
-{
-	// Read sample pointer (ignored if not needed)
-	byte const* entry = &ram [m.t_dir_addr];
-	if ( !v->kon_delay )
-		entry += 2;
-	m.t_brr_next_addr = GET_LE16A( entry );
-	
-	m.t_adsr0 = VREG(v->regs,adsr0);
-	
-	// Read pitch, spread over two clocks
-	m.t_pitch = VREG(v->regs,pitchl);
-}
-inline VOICE_CLOCK( V3a )
-{
-	m.t_pitch += (VREG(v->regs,pitchh) & 0x3F) << 8;
-}
-inline VOICE_CLOCK( V3b )
-{
-	// Read BRR header and byte
-	m.t_brr_byte   = ram [(v->brr_addr + v->brr_offset) & 0xFFFF];
-	m.t_brr_header = ram [v->brr_addr]; // brr_addr doesn't need masking
-}
-VOICE_CLOCK( V3c )
-{
-	// Pitch modulation using previous voice's output
-	if ( m.t_pmon & v->vbit )
-		m.t_pitch += ((m.t_output >> 5) * m.t_pitch) >> 10;
-	
-	if ( v->kon_delay )
-	{
-		// Get ready to start BRR decoding on next sample
-		if ( v->kon_delay == 5 )
-		{
-			v->brr_addr    = m.t_brr_next_addr;
-			v->brr_offset  = 1;
-			v->buf_pos     = 0;
-			m.t_brr_header = 0; // header is ignored on this sample
-			kon_check      = true;
-		}
-		
-		// Envelope is never run during KON
-		v->env        = 0;
-		v->hidden_env = 0;
-		
-		// Disable BRR decoding until last three samples
-		v->interp_pos = 0;
-		if ( --v->kon_delay & 3 )
-			v->interp_pos = 0x4000;
-		
-		// Pitch is never added during KON
-		m.t_pitch = 0;
-	}
-	
-	// Gaussian interpolation
-	{
-		int output = interpolate( v );
+		run_counter( 1 );
+		run_counter( 2 );
+		run_counter( 3 );
 		
 		// Noise
-		if ( m.t_non & v->vbit )
-			output = (int16_t) (m.noise * 2);
-		
-		// Apply envelope
-		m.t_output = (output * v->env) >> 11 & ~1;
-		v->t_envx_out = (byte) (v->env >> 4);
-	}
-	
-	// Immediate silence due to end of sample or soft reset
-	if ( REG(flg) & 0x80 || (m.t_brr_header & 3) == 1 )
-	{
-		v->env_mode = env_release;
-		v->env      = 0;
-	}
-	
-	if ( m.every_other_sample )
-	{
-		// KOFF
-		if ( m.t_koff & v->vbit && (!new_snes || v->kon_delay < 3) )
-			v->env_mode = env_release;
-		
-		// KON
-		if ( m.kon & v->vbit )
+		if ( !READ_COUNTER( noise_rate ) )
 		{
-			v->kon_delay = 5;
-			v->env_mode  = env_attack;
+			int feedback = (m.noise << 13) ^ (m.noise << 14);
+			m.noise = (feedback & 0x4000) ^ (m.noise >> 1);
 		}
-	}
-	
-	// Run envelope for next sample
-	if ( !v->kon_delay )
-		run_envelope( v );
-}
-inline void Spc_Dsp::voice_output( voice_t const* v, int ch )
-{
-	// Apply left/right volume
-	int amp = (m.t_output * (int8_t) VREG(v->regs,voll + ch)) >> 7;
-	
-	// Avoid negative volume if surround is disabled
-	// (emulator feature; not part of actual DSP)
-	if ( (int8_t) VREG(v->regs,voll + ch) < surround_threshold )
-		amp = -amp;
-	
-	// Add to output total
-	m.t_main_out [ch] += amp;
-	CLAMP16( m.t_main_out [ch] );
-	
-	// Optionally add to echo total
-	if ( m.t_eon & v->vbit )
-	{
-		m.t_echo_out [ch] += amp;
-		CLAMP16( m.t_echo_out [ch] );
-	}
-}
-VOICE_CLOCK( V4 )
-{
-	// Decode BRR
-	m.t_looped = 0;
-	if ( v->interp_pos >= 0x4000 )
-	{
-		decode_brr( v );
 		
-		if ( (v->brr_offset += 2) >= brr_block_size )
+		// Voices
+		int pmon_input = 0;
+		int main_out_l = 0;
+		int main_out_r = 0;
+		int echo_out_l = 0;
+		int echo_out_r = 0;
+		voice_t* v = m.voices;
+		byte* v_regs = regs;
+		int vbit = 1;
+		do
 		{
-			// Start decoding next BRR block
-			assert( v->brr_offset == brr_block_size );
-			v->brr_addr = (v->brr_addr + brr_block_size) & 0xFFFF;
-			if ( m.t_brr_header & 1 )
+			#define SAMPLE_PTR(i) GET_LE16A( &dir [VREG(v_regs,srcn) * 4 + i * 2] )
+			
+			int brr_header = ram [v->brr_addr];
+			int kon_delay = v->kon_delay;
+			
+			// Pitch
+			int pitch = GET_LE16A( &VREG(v_regs,pitchl) ) & 0x3FFF;
+			if ( REG(pmon) & vbit )
+				pitch += ((pmon_input >> 5) * pitch) >> 10;
+			
+			// KON phases
+			if ( --kon_delay >= 0 )
 			{
-				v->brr_addr = m.t_brr_next_addr;
-				m.t_looped = v->vbit;
+				v->kon_delay = kon_delay;
+				
+				// Get ready to start BRR decoding on next sample
+				if ( kon_delay == 4 )
+				{
+					v->brr_addr   = SAMPLE_PTR( 0 );
+					v->brr_offset = 1;
+					v->buf_pos    = v->buf;
+					brr_header    = 0; // header is ignored on this sample
+				}
+				
+				// Envelope is never run during KON
+				v->env        = 0;
+				v->hidden_env = 0;
+				
+				// Disable BRR decoding until last three samples
+				v->interp_pos = (kon_delay & 3 ? 0x4000 : 0);
+				
+				// Pitch is never added during KON
+				pitch = 0;
 			}
-			v->brr_offset = 1;
+			
+			int env = v->env;
+			
+			// Gaussian interpolation
+			{
+				int output = 0;
+				VREG(v_regs,envx) = (byte) (env >> 4);
+				if ( env )
+				{
+					// Make pointers into gaussian based on fractional position between samples
+					int offset = (unsigned) v->interp_pos >> 3 & 0x1FE;
+					short const* fwd = interleaved_gauss       + offset;
+					short const* rev = interleaved_gauss + 510 - offset; // mirror left half of gaussian
+					
+					int const* in = &v->buf_pos [(unsigned) v->interp_pos >> 12];
+					
+					if ( !(slow_gaussian & vbit) ) // 99%
+					{
+						// Faster approximation when exact sample value isn't necessary for pitch mod
+						output = (fwd [0] * in [0] +
+						          fwd [1] * in [1] +
+						          rev [1] * in [2] +
+						          rev [0] * in [3]) >> 11;
+						output = (output * env) >> 11;
+					}
+					else
+					{
+						output = (int16_t) (m.noise * 2);
+						if ( !(REG(non) & vbit) )
+						{
+							output  = (fwd [0] * in [0]) >> 11;
+							output += (fwd [1] * in [1]) >> 11;
+							output += (rev [1] * in [2]) >> 11;
+							output = (int16_t) output;
+							output += (rev [0] * in [3]) >> 11;
+							
+							CLAMP16( output );
+							output &= ~1;
+						}
+						output = (output * env) >> 11 & ~1;
+					}
+					
+					// Output
+					int l = output * v->volume [0];
+					int r = output * v->volume [1];
+					
+					main_out_l += l;
+					main_out_r += r;
+					
+					if ( REG(eon) & vbit )
+					{
+						echo_out_l += l;
+						echo_out_r += r;
+					}
+				}
+				
+				pmon_input = output;
+				VREG(v_regs,outx) = (byte) (output >> 8);
+			}
+			
+			// Soft reset or end of sample
+			if ( REG(flg) & 0x80 || (brr_header & 3) == 1 )
+			{
+				v->env_mode = env_release;
+				env         = 0;
+			}
+			
+			if ( m.every_other_sample )
+			{
+				// KOFF
+				if ( m.t_koff & vbit )
+					v->env_mode = env_release;
+				
+				// KON
+				if ( m.kon & vbit )
+				{
+					v->kon_delay = 5;
+					v->env_mode  = env_attack;
+					REG(endx) &= ~vbit;
+				}
+			}
+			
+			// Envelope
+			if ( !v->kon_delay )
+			{
+				if ( v->env_mode == env_release ) // 97%
+				{
+					env -= 0x8;
+					v->env = env;
+					if ( env <= 0 )
+					{
+						v->env = 0;
+						goto skip_brr; // no BRR decoding for you!
+					}
+				}
+				else // 3%
+				{
+					int rate;
+					int const adsr0 = VREG(v_regs,adsr0);
+					int env_data = VREG(v_regs,adsr1);
+					if ( adsr0 >= 0x80 ) // 97% ADSR
+					{
+						if ( v->env_mode > env_decay ) // 89%
+						{
+							env--;
+							env -= env >> 8;
+							rate = env_data & 0x1F;
+							
+							// optimized handling
+							v->hidden_env = env;
+							if ( READ_COUNTER( rate ) )
+								goto exit_env;
+							v->env = env;
+							goto exit_env;
+						}
+						else if ( v->env_mode == env_decay )
+						{
+							env--;
+							env -= env >> 8;
+							rate = (adsr0 >> 3 & 0x0E) + 0x10;
+						}
+						else // env_attack
+						{
+							rate = (adsr0 & 0x0F) * 2 + 1;
+							env += rate < 31 ? 0x20 : 0x400;
+						}
+					}
+					else // GAIN
+					{
+						int mode;
+						env_data = VREG(v_regs,gain);
+						mode = env_data >> 5;
+						if ( mode < 4 ) // direct
+						{
+							env = env_data * 0x10;
+							rate = 31;
+						}
+						else
+						{
+							rate = env_data & 0x1F;
+							if ( mode == 4 ) // 4: linear decrease
+							{
+								env -= 0x20;
+							}
+							else if ( mode < 6 ) // 5: exponential decrease
+							{
+								env--;
+								env -= env >> 8;
+							}
+							else // 6,7: linear increase
+							{
+								env += 0x20;
+								if ( mode > 6 && (unsigned) v->hidden_env >= 0x600 )
+									env += 0x8 - 0x20; // 7: two-slope linear increase
+							}
+						}
+					}
+					
+					// Sustain level
+					if ( (env >> 8) == (env_data >> 5) && v->env_mode == env_decay )
+						v->env_mode = env_sustain;
+					
+					v->hidden_env = env;
+					
+					// unsigned cast because linear decrease going negative also triggers this
+					if ( (unsigned) env > 0x7FF )
+					{
+						env = (env < 0 ? 0 : 0x7FF);
+						if ( v->env_mode == env_attack )
+							v->env_mode = env_decay;
+					}
+					
+					if ( !READ_COUNTER( rate ) )
+						v->env = env; // nothing else is controlled by the counter
+				}
+			}
+		exit_env:
+			
+			{
+				// Apply pitch
+				int old_pos = v->interp_pos;
+				int interp_pos = (old_pos & 0x3FFF) + pitch;
+				if ( interp_pos > 0x7FFF )
+					interp_pos = 0x7FFF;
+				v->interp_pos = interp_pos;
+				
+				// BRR decode if necessary
+				if ( old_pos >= 0x4000 )
+				{
+					// Arrange the four input nybbles in 0xABCD order for easy decoding
+					int nybbles = ram [(v->brr_addr + v->brr_offset) & 0xFFFF] * 0x100 +
+							ram [(v->brr_addr + v->brr_offset + 1) & 0xFFFF];
+					
+					// Advance read position
+					int const brr_block_size = 9;
+					int brr_offset = v->brr_offset;
+					if ( (brr_offset += 2) >= brr_block_size )
+					{
+						// Next BRR block
+						int brr_addr = (v->brr_addr + brr_block_size) & 0xFFFF;
+						assert( brr_offset == brr_block_size );
+						if ( brr_header & 1 )
+						{
+							brr_addr = SAMPLE_PTR( 1 );
+							if ( !v->kon_delay )
+								REG(endx) |= vbit;
+						}
+						v->brr_addr = brr_addr;
+						brr_offset  = 1;
+					}
+					v->brr_offset = brr_offset;
+					
+					// Decode
+					
+					// 0: >>1  1: <<0  2: <<1 ... 12: <<11  13-15: >>4 <<11
+					static unsigned char const shifts [16 * 2] = {
+						13,12,12,12,12,12,12,12,12,12,12, 12, 12, 16, 16, 16,
+						 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11
+					};
+					int const scale = brr_header >> 4;
+					int const right_shift = shifts [scale];
+					int const left_shift  = shifts [scale + 16];
+					
+					// Decode and write to next four samples in circular buffer
+					int* pos = v->buf_pos;
+					for ( int* end = pos + 4; pos < end; pos++ )
+					{
+						// Extract upper nybble and scale appropriately
+						int s = ((int16_t) nybbles >> right_shift) << left_shift;
+						nybbles <<= 4;
+						
+						// Apply IIR filter (8 is the most commonly used)
+						int const filter = brr_header & 0x0C;
+						int const p1 = pos [brr_buf_size - 1];
+						int const p2 = pos [brr_buf_size - 2] >> 1;
+						if ( filter >= 8 )
+						{
+							s += p1;
+							s -= p2;
+							if ( filter == 8 ) // s += p1 * 0.953125 - p2 * 0.46875
+							{
+								s += p2 >> 4;
+								s += (p1 * -3) >> 6;
+							}
+							else // s += p1 * 0.8984375 - p2 * 0.40625
+							{
+								s += (p1 * -13) >> 7;
+								s += (p2 * 3) >> 4;
+							}
+						}
+						else if ( filter ) // s += p1 * 0.46875
+						{
+							s += p1 >> 1;
+							s += (-p1) >> 5;
+						}
+						
+						// Adjust and write sample
+						CLAMP16( s );
+						s = (int16_t) (s * 2);
+						pos [brr_buf_size] = pos [0] = s; // second copy simplifies wrap-around
+					}
+					
+					if ( pos >= &v->buf [brr_buf_size] )
+						pos = v->buf;
+					v->buf_pos = pos;
+				}
+			}
+skip_brr:
+			// Next voice
+			vbit <<= 1;
+			v_regs += 0x10;
+			v++;
 		}
-	}
-	
-	// Apply pitch
-	v->interp_pos = (v->interp_pos & 0x3FFF) + m.t_pitch;
-	
-	// Keep from getting too far ahead (when using pitch modulation)
-	if ( v->interp_pos > 0x7FFF )
-		v->interp_pos = 0x7FFF;
-	
-	// Output left
-	voice_output( v, 0 );
-}
-inline VOICE_CLOCK( V5 )
-{
-	// Output right
-	voice_output( v, 1 );
-	
-	// ENDX, OUTX, and ENVX won't update if you wrote to them 1-2 clocks earlier
-	int endx_buf = REG(endx) | m.t_looped;
-	
-	// Clear bit in ENDX if KON just began
-	if ( v->kon_delay == 5 )
-		endx_buf &= ~v->vbit;
-	m.endx_buf = (byte) endx_buf;
-}
-inline VOICE_CLOCK( V6 )
-{
-	(void) v; // avoid compiler warning about unused v
-	m.outx_buf = (byte) (m.t_output >> 8);
-}
-inline VOICE_CLOCK( V7 )
-{
-	// Update ENDX
-	REG(endx) = m.endx_buf;
-	
-	m.envx_buf = v->t_envx_out;
-}
-inline VOICE_CLOCK( V8 )
-{
-	// Update OUTX
-	VREG(v->regs,outx) = m.outx_buf;
-}
-inline VOICE_CLOCK( V9 )
-{
-	// Update ENVX
-	VREG(v->regs,envx) = m.envx_buf;
-}
-
-// Most voices do all these in one clock, so make a handy composite
-inline VOICE_CLOCK( V3 )
-{
-	voice_V3a( v );
-	voice_V3b( v );
-	voice_V3c( v );
-}
-
-// Common combinations of voice steps on different voices. This greatly reduces
-// code size and allows everything to be inlined in these functions.
-VOICE_CLOCK(V7_V4_V1) { voice_V7(v); voice_V1(v+3); voice_V4(v+1); }
-VOICE_CLOCK(V8_V5_V2) { voice_V8(v); voice_V5(v+1); voice_V2(v+2); }
-VOICE_CLOCK(V9_V6_V3) { voice_V9(v); voice_V6(v+1); voice_V3(v+2); }
-
-
-//// Echo
-
-// Current echo buffer pointer for left/right channel
-#define ECHO_PTR( ch )      (&ram [m.t_echo_ptr + ch * 2])
-
-// Sample in echo history buffer, where 0 is the oldest
-#define ECHO_FIR( i )       (m.echo_hist_pos [i])
-
-// Calculate FIR point for left/right channel
-#define CALC_FIR( i, ch )   ((ECHO_FIR( i + 1 ) [ch] * (int8_t) REG(fir + i * 0x10)) >> 6)
-
-#define ECHO_CLOCK( n ) inline void Spc_Dsp::echo_##n()
-
-inline void Spc_Dsp::echo_read( int ch )
-{
-	int s = GET_LE16SA( ECHO_PTR( ch ) );
-	// second copy simplifies wrap-around handling
-	ECHO_FIR( 0 ) [ch] = ECHO_FIR( 8 ) [ch] = s >> 1;
-}
-
-ECHO_CLOCK( 22 )
-{
-	// History
-	if ( ++m.echo_hist_pos >= &m.echo_hist [echo_hist_size] )
-		m.echo_hist_pos = m.echo_hist;
-	
-	m.t_echo_ptr = (m.t_esa * 0x100 + m.echo_offset) & 0xFFFF;
-	echo_read( 0 );
-	
-	// FIR (using l and r temporaries below helps compiler optimize)
-	int l = CALC_FIR( 0, 0 );
-	int r = CALC_FIR( 0, 1 );
-	
-	m.t_echo_in [0] = l;
-	m.t_echo_in [1] = r;
-}
-ECHO_CLOCK( 23 )
-{
-	int l = CALC_FIR( 1, 0 ) + CALC_FIR( 2, 0 );
-	int r = CALC_FIR( 1, 1 ) + CALC_FIR( 2, 1 );
-	
-	m.t_echo_in [0] += l;
-	m.t_echo_in [1] += r;
-	
-	echo_read( 1 );
-}
-ECHO_CLOCK( 24 )
-{
-	int l = CALC_FIR( 3, 0 ) + CALC_FIR( 4, 0 ) + CALC_FIR( 5, 0 );
-	int r = CALC_FIR( 3, 1 ) + CALC_FIR( 4, 1 ) + CALC_FIR( 5, 1 );
-	
-	m.t_echo_in [0] += l;
-	m.t_echo_in [1] += r;
-}
-ECHO_CLOCK( 25 )
-{
-	int l = m.t_echo_in [0] + CALC_FIR( 6, 0 );
-	int r = m.t_echo_in [1] + CALC_FIR( 6, 1 );
-	
-	l = (int16_t) l;
-	r = (int16_t) r;
-	
-	l += (int16_t) CALC_FIR( 7, 0 );
-	r += (int16_t) CALC_FIR( 7, 1 );
-	
-	CLAMP16( l );
-	CLAMP16( r );
-	
-	m.t_echo_in [0] = l & ~1;
-	m.t_echo_in [1] = r & ~1;
-}
-inline int Spc_Dsp::echo_output( int ch )
-{
-	int out = (int16_t) ((m.t_main_out [ch] * (int8_t) REG(mvoll + ch * 0x10)) >> 7) +
-			(int16_t) ((m.t_echo_in [ch] * (int8_t) REG(evoll + ch * 0x10)) >> 7);
-	CLAMP16( out );
-	return out;
-}
-ECHO_CLOCK( 26 )
-{
-	// Surround disabler (emulator feature; not part of actual DSP)
-	if ( (int8_t) REG(mvoll) * (int8_t) REG(mvolr) < surround_threshold )
-		m.t_main_out [0] = -m.t_main_out [0]; // eliminate surround
-	
-	// Left output volumes
-	// (save sample for next clock so we can output both together)
-	m.t_main_out [0] = echo_output( 0 );
-	
-	// Echo feedback
-	int l = m.t_echo_out [0] + (int16_t) ((m.t_echo_in [0] * (int8_t) REG(efb)) >> 7);
-	int r = m.t_echo_out [1] + (int16_t) ((m.t_echo_in [1] * (int8_t) REG(efb)) >> 7);
-	
-	CLAMP16( l );
-	CLAMP16( r );
-	
-	m.t_echo_out [0] = l & ~1;
-	m.t_echo_out [1] = r & ~1;
-}
-ECHO_CLOCK( 27 )
-{
-	// Output
-	int l = m.t_main_out [0];
-	int r = echo_output( 1 );
-	m.t_main_out [0] = 0;
-	m.t_main_out [1] = 0;
-	
-	// TODO: global muting isn't this simple (turns DAC on and off
-	// or something, causing small ~37-sample pulse when first muted)
-	if ( REG(flg) & 0x40 )
-	{
-		l = 0;
-		r = 0;
-	}
-	
-	// Output sample to DAC
-	SPC_DSP_OUT_HOOK( l, r );
-}
-ECHO_CLOCK( 28 )
-{
-	m.t_echo_enabled = REG(flg);
-}
-inline void Spc_Dsp::echo_write( int ch )
-{
-	if ( !(m.t_echo_enabled & 0x20) )
-	{
-		#ifdef SPC_DSP_ECHO_DEBUG
-			SPC_DSP_ECHO_DEBUG
+		while ( vbit < 0x100 );
+		
+		// Echo position
+		int echo_offset = m.echo_offset;
+		byte* const echo_ptr = &ram [(REG(esa) * 0x100 + echo_offset) & 0xFFFF];
+		if ( !echo_offset )
+			m.echo_length = (REG(edl) & 0x0F) * 0x800;
+		echo_offset += 4;
+		if ( echo_offset >= m.echo_length )
+			echo_offset = 0;
+		m.echo_offset = echo_offset;
+		
+		// FIR
+		int echo_in_l = GET_LE16SA( echo_ptr + 0 );
+		int echo_in_r = GET_LE16SA( echo_ptr + 2 );
+		
+		int (*echo_hist_pos) [2] = m.echo_hist_pos;
+		if ( ++echo_hist_pos >= &m.echo_hist [echo_hist_size] )
+			echo_hist_pos = m.echo_hist;
+		m.echo_hist_pos = echo_hist_pos;
+		
+		echo_hist_pos [0] [0] = echo_hist_pos [8] [0] = echo_in_l;
+		echo_hist_pos [0] [1] = echo_hist_pos [8] [1] = echo_in_r;
+		
+		#define CALC_FIR_( i, in )  ((in) * (int8_t) REG(fir + i * 0x10))
+		echo_in_l = CALC_FIR_( 7, echo_in_l );
+		echo_in_r = CALC_FIR_( 7, echo_in_r );
+		
+		#define CALC_FIR( i, ch )   CALC_FIR_( i, echo_hist_pos [i + 1] [ch] )
+		#define DO_FIR( i )\
+			echo_in_l += CALC_FIR( i, 0 );\
+			echo_in_r += CALC_FIR( i, 1 );
+		DO_FIR( 0 );
+		DO_FIR( 1 );
+		DO_FIR( 2 );
+		#if defined (__MWERKS__) && __MWERKS__ < 0x3200
+			__eieio(); // keeps compiler from stupidly "caching" things in memory
 		#endif
-		SET_LE16A( ECHO_PTR( ch ), m.t_echo_out [ch] );
+		DO_FIR( 3 );
+		DO_FIR( 4 );
+		DO_FIR( 5 );
+		DO_FIR( 6 );
+		
+		// Echo out
+		if ( !(REG(flg) & 0x20) )
+		{
+			int l = (echo_out_l >> 7) + ((echo_in_l * (int8_t) REG(efb)) >> 14);
+			int r = (echo_out_r >> 7) + ((echo_in_r * (int8_t) REG(efb)) >> 14);
+			
+			// just to help pass more validation tests
+			#if SPC_MORE_ACCURACY
+				l &= ~1;
+				r &= ~1;
+			#endif
+			
+			CLAMP16( l );
+			CLAMP16( r );
+			
+			SET_LE16A( echo_ptr + 0, l );
+			SET_LE16A( echo_ptr + 2, r );
+		}
+		
+		// Sound out
+		int l = (main_out_l * mvoll + echo_in_l * (int8_t) REG(evoll)) >> 14;
+		int r = (main_out_r * mvolr + echo_in_r * (int8_t) REG(evolr)) >> 14;
+		
+		CLAMP16( l );
+		CLAMP16( r );
+		
+		if ( (REG(flg) & 0x40) )
+		{
+			l = 0;
+			r = 0;
+		}
+		
+		SPC_DSP_OUT_HOOK( l, r );
 	}
-	m.t_echo_out [ch] = 0;
+	while ( --count );
 }
-ECHO_CLOCK( 29 )
-{
-	m.t_esa = REG(esa);
-	
-	if ( !m.echo_offset )
-		m.echo_length = (REG(edl) & 0x0F) * 0x800;
-	
-	m.echo_offset += 4;
-	if ( m.echo_offset >= m.echo_length )
-		m.echo_offset = 0;
-	
-	// Write left echo
-	echo_write( 0 );
-	
-	m.t_echo_enabled = REG(flg);
-}
-ECHO_CLOCK( 30 )
-{
-	// Write right echo
-	echo_write( 1 );
-}
-
-
-//// Timing
-
-// Execute clock for a particular voice
-#define V( clock, voice )   voice_##clock( &m.voices [voice] );
-
-/* The most common sequence of clocks uses composite operations
-for efficiency. For example, the following are equivalent to the
-individual steps on the right:
-
-V(V7_V4_V1,2) -> V(V7,2) V(V4,3) V(V1,5)
-V(V8_V5_V2,2) -> V(V8,2) V(V5,3) V(V2,4)
-V(V9_V6_V3,2) -> V(V9,2) V(V6,3) V(V3,4) */
-
-// Voice      0      1      2      3      4      5      6      7
-#define GEN_DSP_TIMING \
-PHASE( 0)  V(V5,0)V(V2,1)\
-PHASE( 1)  V(V6,0)V(V3,1)\
-PHASE( 2)  V(V7_V4_V1,0)\
-PHASE( 3)  V(V8_V5_V2,0)\
-PHASE( 4)  V(V9_V6_V3,0)\
-PHASE( 5)         V(V7_V4_V1,1)\
-PHASE( 6)         V(V8_V5_V2,1)\
-PHASE( 7)         V(V9_V6_V3,1)\
-PHASE( 8)                V(V7_V4_V1,2)\
-PHASE( 9)                V(V8_V5_V2,2)\
-PHASE(10)                V(V9_V6_V3,2)\
-PHASE(11)                       V(V7_V4_V1,3)\
-PHASE(12)                       V(V8_V5_V2,3)\
-PHASE(13)                       V(V9_V6_V3,3)\
-PHASE(14)                              V(V7_V4_V1,4)\
-PHASE(15)                              V(V8_V5_V2,4)\
-PHASE(16)                              V(V9_V6_V3,4)\
-PHASE(17)  V(V1,0)                            V(V7,5)V(V4,6)\
-PHASE(18)                                     V(V8_V5_V2,5)\
-PHASE(19)                                     V(V9_V6_V3,5)\
-PHASE(20)         V(V1,1)                            V(V7,6)V(V4,7)\
-PHASE(21)                                            V(V8,6)V(V5,7)  V(V2,0)  /* t_brr_next_addr order dependency */\
-PHASE(22)  V(V3a,0)                                  V(V9,6)V(V6,7)  echo_22();\
-PHASE(23)                                                   V(V7,7)  echo_23();\
-PHASE(24)                                                   V(V8,7)  echo_24();\
-PHASE(25)  V(V3b,0)                                         V(V9,7)  echo_25();\
-PHASE(26)                                                            echo_26();\
-PHASE(27) misc_27();                                                 echo_27();\
-PHASE(28) misc_28();                                                 echo_28();\
-PHASE(29) misc_29();                                                 echo_29();\
-PHASE(30) misc_30();V(V3c,0)                                         echo_30();\
-PHASE(31)  V(V4,0)       V(V1,2)\
-
-#if !SPC_DSP_CUSTOM_RUN
-
-void Spc_Dsp::run( int clocks_remain )
-{
-	require( clocks_remain > 0 );
-	
-	int const phase = m.phase;
-	m.phase = (phase + clocks_remain) & 31;
-	switch ( phase )
-	{
-	loop:
-	
-		#define PHASE( n ) if ( n && !--clocks_remain ) break; case n:
-		GEN_DSP_TIMING
-		#undef PHASE
-	
-		if ( --clocks_remain )
-			goto loop;
-	}
-}
-
-#endif
 
 
 //// Setup
 
+void Spc_Dsp::apply_output_enables() // recomputes the cached volume of every voice
+{
+	for ( int i = 0; i < voice_count; i++ )
+		update_voice_vol( i * 0x10 ); // voice i's registers start at address i * 0x10
+}
+
 void Spc_Dsp::init( void* ram_64k )
 {
 	ram = (byte*) ram_64k;
-	disable_surround( false );
+	disable_surround( false ); // must be before mute_voices
 	mute_voices( 0 );
 	set_output( NULL, 0 );
 	reset();
@@ -856,28 +668,25 @@ void Spc_Dsp::soft_reset()
 	m.phase              = 0;
 	
 	init_counter();
-	
-	kon_check = false;
 }
 
 void Spc_Dsp::load( byte const new_regs [register_count] )
 {
-	memcpy( regs, new_regs, register_count );
+	memcpy( regs, new_regs, sizeof regs );
 	BLARGG_CLEAR( &m );
 	
 	for ( int i = voice_count; --i >= 0; )
 	{
-		voice_t* v = &m.voices [i];
-		v->brr_offset = 1;
-		v->vbit       = 1 << i;
-		v->regs       = &regs [i * 0x10];
+		voice_t& v = m.voices [i];
+		v.brr_offset = 1;
+		v.buf_pos    = v.buf;
 	}
 	m.new_kon = REG(kon);
-	m.t_dir   = REG(dir);
-	m.t_esa   = REG(esa);
 	
 	soft_reset();
 	REG(flg) = new_regs [r_flg]; // soft_reset() overwrites this
+	
+	apply_output_enables();
 }
 
 void Spc_Dsp::reset()
@@ -892,6 +701,6 @@ void Spc_Dsp::reset()
 		0x75,0xF5,0x06,0x97,0x10,0xC3,0x24,0xBB,0x00,0x00,0x7B,0x7A,0xE0,0x60,0x12,0x0F,
 		0xF7,0x74,0x1C,0xE5,0x39,0x3D,0x73,0xC1,0x00,0x00,0x7A,0xB3,0xFF,0x4E,0x7B,0xFF
 	};
-	
+
 	load( initial_regs );
 }
diff --git a/bsnes/smp/snes_spc/Spc_Dsp.h b/bsnes/smp/snes_spc/Spc_Dsp.h
index 95f136d7..0da18456 100755
--- a/bsnes/smp/snes_spc/Spc_Dsp.h
+++ b/bsnes/smp/snes_spc/Spc_Dsp.h
@@ -1,4 +1,4 @@
-// Highly accurate SNES SPC-700 DSP emulator
+// Fast SNES SPC-700 DSP emulator (about 3x speed of accurate one)
 
 // snes_spc 0.9.5
 #ifndef BLARGG_SPC_DSP_H
@@ -8,16 +8,18 @@
 
 BLARGG_NAMESPACE_BEGIN
 
-extern "C" { typedef void (*dsp_copy_func_t)( unsigned char** io, void* state, size_t ); }
-
 struct Spc_Dsp {
 public:
 	typedef BOOST::uint8_t byte;
 	
 // Setup
-
+	
 	// Initializes DSP and has it use the 64K RAM provided
 	void init( void* ram_64k );
+
+	// Sets function that is called when output buffer is filled, or NULL for none
+	blargg_callback<void (*)( void* user_data )> set_output_callback;
+	//void set_output_callback( void (*func)( void* user_data ), void* user_data );
 	
 	// Sets destination for output samples. If begin is NULL, doesn't generate any.
 	typedef short sample_t;
@@ -26,52 +28,42 @@ public:
 	// Current position in output buffer, or NULL if no buffer set
 	sample_t* output_ptr() const;
 	
-	// Sets function that is called when output buffer is filled, or NULL for none
-	blargg_callback<void (*)( void* user_data )> set_output_callback;
-	//void set_output_callback( void (*func)( void* user_data ), void* user_data );
-	
-// Emulation
+	// Number of samples written to output buffer since last set, or 0 if no buffer set.
+	int sample_count() const;
 
+// Emulation
+	
 	// Resets DSP to power-on state
 	void reset();
 
 	// Emulates pressing reset switch on SNES
 	void soft_reset();
 	
-	// Reads/writes DSP registers. For accuracy, you must first call run()
+	// Reads/writes DSP registers. For accuracy, you must first call run()
 	// to catch the DSP up to present.
 	int  read ( int addr ) const;
 	void write( int addr, int data );
 
 	// Runs DSP for specified number of clocks (~1024000 per second). Every 32 clocks
-	// a pair of samples is be generated.
+	// a pair of samples is generated.
 	void run( int clock_count );
-	
+
 // Sound control
 
-	// Using these reduces emulation accuracy.
-	
-	// Mutes voices corresponding to non-zero bits in mask (issues repeated KOFF events).
+	// Mutes voices corresponding to non-zero bits in mask (overrides VxVOL with 0).
+	// Reduces emulation accuracy.
 	enum { voice_count = 8 };
-	void mute_voices( int mask )            { mute_mask = mask; }
+	void mute_voices( int mask )        { mute_mask = mask; }
 	
 	// If true, prevents channels and global volumes from being phase-negated
 	void disable_surround( bool disable = true );
-	
+
 // State
 	
 	// Resets DSP and uses supplied values to initialize registers
 	enum { register_count = 128 };
 	void load( byte const regs [register_count] );
 
-	// Saves/loads exact emulator state
-	enum { state_size = 640 }; // maximum space needed when saving
-	typedef dsp_copy_func_t copy_func_t;
-	void copy_state( unsigned char** io, copy_func_t );
-
-	// Returns non-zero if new key-on events occurred since last call
-	bool check_kon();
-	
 // DSP register addresses
 
 	// Global registers
@@ -109,21 +101,17 @@ public:
 	struct voice_t
 	{
 		int buf [brr_buf_size*2];// decoded samples (twice the size to simplify wrap handling)
-		int buf_pos;            // place in buffer where next samples will be decoded
+		int* buf_pos;           // place in buffer where next samples will be decoded
 		int interp_pos;         // relative fractional position in sample (0x1000 = 1.0)
 		int brr_addr;           // address of current BRR block
 		int brr_offset;         // current decoding offset in BRR block
-		byte* regs;             // pointer to voice's DSP registers
-		int vbit;               // bitmask for voice: 0x01 for voice 0, 0x02 for voice 1, etc.
 		int kon_delay;          // KON delay/current setup phase
 		env_mode_t env_mode;
 		int env;                // current envelope level
 		int hidden_env;         // used by GAIN mode 7, very obscure quirk
-		byte t_envx_out;
+		int volume [2];         // copy of volume from DSP registers, with surround disabled
 	};
 private:
-	enum { brr_block_size = 9 };
-	
 	// non-emulation state
 	byte* ram; // 64K shared RAM between DSP and SMP
 	int mute_mask;
@@ -133,145 +121,81 @@ private:
 	sample_t* output_end;
 	sample_t* user_output_end;
 	sample_t dummy_buf [2];
-	bool kon_check;         // set when a new KON occurs
 	
 	struct state_t
 	{
 		int every_other_sample; // toggles every sample
 		int kon;                // KON value when last checked
 		int noise;
-		int counter;
 		int echo_offset;        // offset from ESA in echo buffer
 		int echo_length;        // number of bytes that echo_offset will stop at
 		int phase;              // next clock cycle to run (0-31)
+		unsigned counters [4];
 		
-		// Hidden registers also written to when main register is written to
-		int  new_kon;
-		byte endx_buf;
-		byte envx_buf;
-		byte outx_buf;
-		
-		// Temporary state between clocks
-		
-		// read once per sample
-		int t_pmon;
-		int t_non;
-		int t_eon;
-		int t_dir;
+		int new_kon;
 		int t_koff;
 		
-		// read a few clocks ahead then used
-		int t_brr_next_addr;
-		int t_adsr0;
-		int t_brr_header;
-		int t_brr_byte;
-		int t_srcn;
-		int t_esa;
-		int t_echo_enabled;
-		
-		// internal state that is recalculated every sample
-		int t_dir_addr;
-		int t_pitch;
-		int t_output;
-		int t_looped;
-		int t_echo_ptr;
-		
-		// left/right sums
-		int t_main_out [2];
-		int t_echo_out [2];
-		int t_echo_in  [2];
-		
-		voice_t voices [voice_count];
-		
 		// Echo history keeps most recent 8 samples (twice the size to simplify wrap handling)
 		int (*echo_hist_pos) [2]; // &echo_hist [0 to 7]
 		int echo_hist [echo_hist_size * 2] [2];
+		
+		unsigned* counter_select [32];
+		voice_t voices [voice_count];
 	};
 	state_t m;
 	
 	byte regs [register_count];
 	
 	void init_counter();
-	void run_counters();
-	unsigned read_counter( int rate );
-	
-	int  interpolate( voice_t const* v );
-	void run_envelope( voice_t* const v );
-	void decode_brr( voice_t* v );
-
-	void misc_27();
-	void misc_28();
-	void misc_29();
-	void misc_30();
-
-	void voice_output( voice_t const* v, int ch );
-	void voice_V1( voice_t* const );
-	void voice_V2( voice_t* const );
-	void voice_V3( voice_t* const );
-	void voice_V3a( voice_t* const );
-	void voice_V3b( voice_t* const );
-	void voice_V3c( voice_t* const );
-	void voice_V4( voice_t* const );
-	void voice_V5( voice_t* const );
-	void voice_V6( voice_t* const );
-	void voice_V7( voice_t* const );
-	void voice_V8( voice_t* const );
-	void voice_V9( voice_t* const );
-	void voice_V7_V4_V1( voice_t* const );
-	void voice_V8_V5_V2( voice_t* const );
-	void voice_V9_V6_V3( voice_t* const );
-
-	void echo_read( int ch );
-	int  echo_output( int ch );
-	void echo_write( int ch );
-	void echo_22();
-	void echo_23();
-	void echo_24();
-	void echo_25();
-	void echo_26();
-	void echo_27();
-	void echo_28();
-	void echo_29();
-	void echo_30();
-	
+	void run_counter( int );
+	void update_voice_vol( int addr );
 	void set_null_output();
 	void write_sample( int l, int r );
+	void apply_output_enables();
 };
 
-#include <assert.h>
-
 inline int Spc_Dsp::read( int addr ) const
 {
 	assert( (unsigned) addr < register_count );
-	
 	return regs [addr];
 }
 
+inline void Spc_Dsp::update_voice_vol( int addr ) // addr: first register address of one voice
+{
+	int l = (int8_t) regs [addr + v_voll]; // left volume register, read as signed 8-bit
+	int r = (int8_t) regs [addr + v_volr]; // right volume register, read as signed 8-bit
+	// Threshold is 0 or -0x4000 (see disable_surround); l * r >= -128 * 127, so only 0 can trigger
+	if ( l * r < surround_threshold )
+	{
+		// signs differ, so flip the negative one positive (x ^ (x >> 7) is -x - 1 for x < 0)
+		l ^= l >> 7;
+		r ^= r >> 7;
+	}
+	// Cache the result in the voice; a set bit in mute_mask forces both volumes to 0
+	int index = addr >> 4;
+	voice_t& v = m.voices [index];
+	int enabled = ~mute_mask >> index & 1;
+	v.volume [0] = l * enabled;
+	v.volume [1] = r * enabled;
+}
+
 inline void Spc_Dsp::write( int addr, int data )
 {
 	assert( (unsigned) addr < register_count );
 	
 	regs [addr] = (byte) data;
-	switch ( addr & 0x0F )
+	int low = addr & 0x0F;
+	if ( low < 0x2 ) // voice volumes
+	{
+		update_voice_vol( low ^ addr /* addr & 0xF0 */ );
+	}
+	else if ( low == 0xC )
 	{
-	case v_envx:
-		m.envx_buf = (byte) data;
-		break;
-		
-	case v_outx:
-		m.outx_buf = (byte) data;
-		break;
-	
-	case 0x0C:
 		if ( addr == r_kon )
 			m.new_kon = (byte) data;
 		
 		if ( addr == r_endx ) // always cleared, regardless of data written
-		{
-			m.endx_buf = 0;
 			regs [r_endx] = 0;
-		}
-		break;
 	}
 }
 
@@ -280,40 +204,22 @@ inline void Spc_Dsp::disable_surround( bool disable )
 	surround_threshold = disable ? 0 : -0x4000;
 }
 
-inline bool Spc_Dsp::check_kon()
-{
-	bool old = kon_check;
-	kon_check = 0;
-	return old;
-}
-
 inline Spc_Dsp::sample_t* Spc_Dsp::output_ptr() const
 {
 	// Don't return pointer into dummy_buf
 	return (output_ptr_ != dummy_buf ? output_ptr_ : user_output_end);
 }
 
-class SPC_State_Copier {
-	Spc_Dsp::copy_func_t func;
-	unsigned char** buf;
-public:
-	SPC_State_Copier( unsigned char** p, Spc_Dsp::copy_func_t f ) { func = f; buf = p; }
-	void copy( void* state, size_t size );
-	int copy_int( int state, int size );
-	void skip( int count );
-	
-	// Reads uint8_t and then skips that many bytes. If writing, writes
-	// uint8_t of 0. This allows future expansion at this point, by writing
-	// non-zero and additional data.
-	void extra();
-};
-
-#define SPC_COPY( type, state )\
-{\
-	state = (BOOST::type) copier.copy_int( state, sizeof (BOOST::type) );\
-	check( (BOOST::type) state == state );\
+inline int Spc_Dsp::sample_count() const // offset of output_ptr() from output_begin
+{
+	sample_t* p = output_ptr(); // yields user_output_end while output_ptr_ is dummy_buf
+	return (p ? p - output_begin : 0); // counted in sample_t elements; 0 for a null pointer
 }
 
+#define SPC_NO_COPY_STATE_FUNCS 1
+
+#define SPC_LESS_ACCURATE 1
+
 BLARGG_NAMESPACE_END
 
 #endif