diff --git a/core/build.h b/core/build.h
index 677aa0964..172259039 100755
--- a/core/build.h
+++ b/core/build.h
@@ -188,10 +188,10 @@
 //defaults
 
 #ifndef FEAT_SHREC
-	#if HOST_CPU == CPU_MIPS
-		#define FEAT_SHREC DYNAREC_NONE
-	#else
+	#if HOST_CPU == CPU_ARM || HOST_CPU == CPU_ARM64 || HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
 		#define FEAT_SHREC DYNAREC_JIT
+	#else
+		#define FEAT_SHREC DYNAREC_NONE
 	#endif
 #endif
 
@@ -204,7 +204,7 @@
 #endif
 
 #ifndef FEAT_DSPREC
-	#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_ARM64 || HOST_CPU == CPU_X64
+	#if HOST_CPU == CPU_ARM || HOST_CPU == CPU_ARM64 || HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
 		#define FEAT_DSPREC DYNAREC_JIT
 	#else
 		#define FEAT_DSPREC DYNAREC_NONE
diff --git a/core/hw/aica/dsp_arm32.cpp b/core/hw/aica/dsp_arm32.cpp
new file mode 100644
index 000000000..097ed5827
--- /dev/null
+++ b/core/hw/aica/dsp_arm32.cpp
@@ -0,0 +1,426 @@
+/*
+	Copyright 2021 flyinghead
+
+	This file is part of reicast.
+
+    reicast is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 2 of the License, or
+    (at your option) any later version.
+
+    reicast is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with reicast.  If not, see <https://www.gnu.org/licenses/>.
+ */
+#include "build.h"
+
+#if HOST_CPU == CPU_ARM && FEAT_DSPREC != DYNAREC_NONE
+
+#include "dsp.h"
+#include "aica.h"
+#include "aica_if.h"
+#include "hw/mem/_vmem.h"
+#include <aarch32/macro-assembler-aarch32.h>
+using namespace vixl::aarch32;
+
+constexpr size_t CodeSize = 4096 * 8;	//32 kb, 8 pages
+
+#if defined(__unix__)
+alignas(4096) static u8 DynCode[CodeSize] __attribute__((section(".text")));
+#else
+#error "Unsupported platform for arm32 DSP dynarec"
+#endif
+
+class DSPAssembler : public MacroAssembler
+{
+public:
+	DSPAssembler(u8 *code_buffer, size_t size) : MacroAssembler(code_buffer, size, A32) {}
+
+	void compile(dsp_t *DSP)
+	{
+		this->DSP = DSP;
+		DEBUG_LOG(AICA_ARM, "DSPAssembler::compile recompiling for arm32 at %p", GetBuffer()->GetStartAddress<void*>());
+
+		RegisterList regList = RegisterList::Union(
+				RegisterList(r4, r5, r6, r7),
+				RegisterList(r8, r9, r10, lr));
+		Push(regList);
+		Mov(r8, (uintptr_t)&DSP->TEMP[0]);	// r8 points to TEMP, right after the code
+		Mov(r7, (uintptr_t)DSPData);		// r7 points to DSPData
+		const Register& INPUTS = r4;		// 24 bits
+		const Register& ACC = r5;			// 26 bits - saved
+		const Register& B = r10;			// 26 bits - saved
+		const Register& X = r6;				// 24 bits
+		const Register& Y = r9;				// 13 bits
+
+		Mov(ACC, 0);
+		Mov(B, 0);
+		Str(B, dsp_operand(&DSP->FRC_REG));
+		Str(B, dsp_operand(&DSP->Y_REG));
+		Str(B, dsp_operand(&DSP->ADRS_REG));
+
+		for (int step = 0; step < 128; ++step)
+		{
+			u32 *mpro = &DSPData->MPRO[step * 4];
+			_INST op;
+			DecodeInst(mpro, &op);
+			const u32 COEF = step;
+
+			if (op.XSEL || op.YRL || (op.ADRL && op.SHIFT != 3))
+			{
+				if (op.IRA <= 0x1f)
+					//INPUTS = DSP->MEMS[op.IRA];
+					Ldr(INPUTS, dsp_operand(DSP->MEMS, op.IRA));
+				else if (op.IRA <= 0x2F)
+				{
+					//INPUTS = DSP->MIXS[op.IRA - 0x20] << 4;		// MIXS is 20 bit
+					Ldr(INPUTS, dsp_operand(DSP->MIXS, op.IRA - 0x20));
+					Lsl(INPUTS, INPUTS, 4);
+				}
+				else if (op.IRA <= 0x31)
+				{
+					//INPUTS = DSPData->EXTS[op.IRA - 0x30] << 8;	// EXTS is 16 bits
+					Ldr(INPUTS, dspdata_operand(DSPData->EXTS, op.IRA - 0x30));
+					Lsl(INPUTS, INPUTS, 8);
+				}
+				else
+				{
+					Mov(INPUTS, 0);
+				}
+			}
+
+			if (op.IWT)
+			{
+				//DSP->MEMS[op.IWA] = MEMVAL[step & 3];	// MEMVAL was selected in previous MRD
+				Ldr(r1, dsp_operand(DSP->MEMVAL, step & 3));
+				Str(r1, dsp_operand(DSP->MEMS, op.IWA));
+			}
+
+			// Operand sel
+			// B
+			if (!op.ZERO)
+			{
+				if (op.BSEL)
+					//B = ACC;
+					Mov(B, ACC);
+				else
+				{
+					//B = DSP->TEMP[(TRA + DSP->regs.MDEC_CT) & 0x7F];
+					Ldr(r1, dsp_operand(&DSP->regs.MDEC_CT));
+					if (op.TRA)
+						Add(r1, r1, op.TRA);
+					Bfc(r1, 7, 25);
+					Ldr(B, dsp_operand(DSP->TEMP, r1));
+				}
+				if (op.NEGB)
+					//B = 0 - B;
+					Rsb(B, B, 0);
+			}
+
+			// X
+			const Register* X_alias = &X;
+			if (op.XSEL)
+				//X = INPUTS;
+				X_alias = &INPUTS;
+			else
+			{
+				//X = DSP->TEMP[(TRA + DSP->regs.MDEC_CT) & 0x7F];
+				if (!op.ZERO && !op.BSEL && !op.NEGB)
+					X_alias = &B;
+				else
+				{
+					Ldr(r1, dsp_operand(&DSP->regs.MDEC_CT));
+					if (op.TRA)
+						Add(r1, r1, op.TRA);
+					Bfc(r1, 7, 25);
+					Ldr(X, dsp_operand(DSP->TEMP, r1));
+				}
+			}
+
+			// Y
+			if (op.YSEL == 0)
+			{
+				//Y = FRC_REG;
+				Ldr(Y, dsp_operand(&DSP->FRC_REG));
+			}
+			else if (op.YSEL == 1)
+			{
+				//Y = DSPData->COEF[COEF] >> 3;	//COEF is 16 bits
+				Ldr(Y, dspdata_operand(DSPData->COEF, COEF));
+				Sbfx(Y, Y, 3, 13);
+			}
+			else if (op.YSEL == 2)
+			{
+				//Y = Y_REG >> 11;
+				Ldr(r1, dsp_operand(&DSP->Y_REG));
+				Asr(Y, r1, 11);
+			}
+			else if (op.YSEL == 3)
+			{
+				//Y = (Y_REG >> 4) & 0x0FFF;
+				Ldr(r1, dsp_operand(&DSP->Y_REG));
+				Ubfx(Y, r1, 4, 12);
+			}
+
+			if (op.YRL)
+				//Y_REG = INPUTS;
+				Str(INPUTS, dsp_operand(&DSP->Y_REG));
+
+			if (op.TWT || op.FRCL || op.MWT || (op.ADRL && op.SHIFT == 3) || op.EWT)
+			{
+				// Shifter
+				// There's a 1-step delay at the output of the X*Y + B adder. So we use the ACC value from the previous step.
+				if (op.SHIFT == 0)
+				{
+					// SHIFTED = clamp(ACC, -0x800000, 0x7FFFFF)
+					Ssat(r2, 24, ACC);
+				}
+				else if (op.SHIFT == 1)
+				{
+					//SHIFTED = ACC << 1;	// x2 scale
+					// SHIFTED = clamp(SHIFTED, -0x800000, 0x7FFFFF)
+					Ssat(r2, 24, Operand(ACC, LSL, 1));
+				}
+				else if (op.SHIFT == 2)
+				{
+					//SHIFTED = ACC << 1;	// x2 scale
+					Lsl(r2, ACC, 1);
+				}
+				else if (op.SHIFT == 3)
+				{
+					//SHIFTED = ACC;
+					Mov(r2, ACC);
+				}
+				Str(r2, dsp_operand(&DSP->SHIFTED));
+			}
+
+			// ACCUM
+			//ACC = (((s64)X * (s64)Y) >> 12) + B;
+			Smull(r0, r1, *X_alias, Y);
+			Lsr(r0, r0, 12);
+			Orr(ACC, r0, Operand(r1, LSL, 20));
+			if (!op.ZERO)
+				Add(ACC, ACC, B);
+
+			if (op.TWT)
+			{
+				//DSP->TEMP[(op.TWA + DSP->regs.MDEC_CT) & 0x7F] = SHIFTED;
+				Ldr(r2, dsp_operand(&DSP->SHIFTED));
+				Ldr(r1, dsp_operand(&DSP->regs.MDEC_CT));
+				if (op.TWA)
+					Add(r1, r1, op.TWA);
+				Bfc(r1, 7, 25);
+				Str(r2, dsp_operand(DSP->TEMP, r1));
+			}
+
+			if (op.FRCL)
+			{
+				Ldr(r2, dsp_operand(&DSP->SHIFTED));
+				if (op.SHIFT == 3)
+					//FRC_REG = SHIFTED & 0x0FFF;
+					Ubfx(r1, r2, 0, 12);
+				else
+					//FRC_REG = SHIFTED >> 11;
+					Asr(r1, r2, 11);
+				Str(r1, dsp_operand(&DSP->FRC_REG));
+			}
+
+			if (step & 1)
+			{
+				const Register& ADDR = r3;
+				if (op.MRD)			// memory only allowed on odd. DoA inserts NOPs on even
+				{
+					//MEMVAL[(step + 2) & 3] = UNPACK(*(u16 *)&aica_ram[ADDR & ARAM_MASK]);
+					calculateADDR(ADDR, op);
+					Mov(r1, getAicaRam());
+					Ldrh(r0, MemOperand(r1, ADDR));
+					genCallRuntime(UNPACK);
+					Mov(r2, r0);
+					Str(r2, dsp_operand(DSP->MEMVAL, (step + 2) & 3));
+				}
+				if (op.MWT)
+				{
+					// *(u16 *)&aica_ram[ADDR & ARAM_MASK] = PACK(SHIFTED);
+					Ldr(r0, dsp_operand(&DSP->SHIFTED));
+					genCallRuntime(PACK);
+					Mov(r2, r0);
+
+					calculateADDR(ADDR, op);
+					Mov(r1, getAicaRam());
+					Strh(r2, MemOperand(r1, ADDR));
+				}
+			}
+
+			if (op.ADRL)
+			{
+				if (op.SHIFT == 3)
+				{
+					//ADRS_REG = SHIFTED >> 12;
+					Ldr(r2, dsp_operand(&DSP->SHIFTED));
+					Asr(r1, r2, 12);
+				}
+				else
+				{
+					//ADRS_REG = INPUTS >> 16;
+					Asr(r1, INPUTS, 16);
+				}
+				Str(r1, dsp_operand(&DSP->ADRS_REG));
+			}
+
+			if (op.EWT)
+			{
+				//DSPData->EFREG[op.EWA] = SHIFTED >> 8;
+				Ldr(r1, dsp_operand(&DSP->SHIFTED));
+				Asr(r1, r1, 8);
+				Str(r1, dspdata_operand(DSPData->EFREG, op.EWA));
+			}
+		}
+		Ldr(r1, dsp_operand(&DSP->regs.MDEC_CT));
+		// DSP->regs.MDEC_CT--
+		Subs(r1, r1, 1);
+		//if (dsp.regs.MDEC_CT == 0)
+		//	dsp.regs.MDEC_CT = dsp.RBL + 1;			// RBL is ring buffer length - 1
+		Mov(eq, r1, dsp.RBL + 1);
+		Str(r1, dsp_operand(&DSP->regs.MDEC_CT));
+
+		Pop(regList);
+		Mov(pc, lr);
+
+		FinalizeCode();
+
+		vmem_platform_flush_cache(
+			GetBuffer()->GetStartAddress<char*>(), GetBuffer()->GetEndAddress<char*>(),
+			GetBuffer()->GetStartAddress<void*>(), GetBuffer()->GetEndAddress<void*>());
+	}
+
+private:
+	MemOperand dsp_operand(void *data, int index = 0, u32 element_size = 4)
+	{
+		ptrdiff_t offset = ((u8*)data - (u8*)DSP) - offsetof(dsp_t, TEMP) + index  * element_size;
+		if (offset <= 4095)
+			return MemOperand(r8, offset);
+		Mov(r0, offset);
+		return MemOperand(r8, r0);
+	}
+
+	MemOperand dsp_operand(void *data, const Register& offset_reg, u32 element_size = 4)
+	{
+		ptrdiff_t offset = ((u8*)data - (u8*)DSP) - offsetof(dsp_t, TEMP);
+		if (offset == 0)
+			return MemOperand(r8, offset_reg, LSL, element_size == 4 ? 2 : element_size == 2 ? 1 : 0);
+
+		Mov(r0, offset);
+		Add(r0, r0, Operand(offset_reg, LSL, element_size == 4 ? 2 : element_size == 2 ? 1 : 0));
+		return MemOperand(r8, r0);
+	}
+
+	MemOperand dspdata_operand(void *data, int index = 0, u32 element_size = 4)
+	{
+		ptrdiff_t offset = ((u8*)data - (u8*)DSPData) + index  * element_size;
+		if (offset <= 4095)
+			return MemOperand(r7, offset);
+		Mov(r0, offset);
+		return MemOperand(r7, r0);
+	}
+
+	template <typename R, typename... P>
+	void genCallRuntime(R (*function)(P...))
+	{
+		ptrdiff_t offset = reinterpret_cast<uintptr_t>(function) - GetBuffer()->GetStartAddress<uintptr_t>();
+		verify((offset & 3) == 0);
+		if (offset >= -32 * 1024 * 1024 && offset <= 32 * 1024 * 1024)
+		{
+			Label function_label(offset);
+			Bl(&function_label);
+		}
+		else
+		{
+			Mov(r3, reinterpret_cast<uintptr_t>(function));
+			Blx(r3);
+		}
+	}
+
+	void calculateADDR(const Register& ADDR, const _INST& op)
+	{
+		//u32 ADDR = DSPData->MADRS[op.MASA];
+		Ldr(ADDR, dspdata_operand(DSPData->MADRS, op.MASA));
+		if (op.ADREB)
+		{
+			//ADDR += ADRS_REG & 0x0FFF;
+			Ldr(r1, dsp_operand(&DSP->ADRS_REG));
+			Ubfx(r0, r1, 0, 12);
+			Add(ADDR, ADDR, r0);
+		}
+		if (op.NXADR)
+			//ADDR++;
+			Add(ADDR, ADDR, 1);
+		if (!op.TABLE)
+		{
+			//ADDR += DSP->regs.MDEC_CT;
+			Ldr(r1, dsp_operand(&DSP->regs.MDEC_CT));
+			Add(ADDR, ADDR, r1);
+			//ADDR &= DSP->RBL;
+			// RBL is constant for this program
+			And(ADDR, ADDR, DSP->RBL);
+		}
+		else
+			//ADDR &= 0xFFFF;
+			Bfc(ADDR, 16, 16);
+
+		//ADDR <<= 1;				// Word -> byte address
+		Lsl(ADDR, ADDR, 1);
+		//ADDR += DSP->RBP;			// RBP is already a byte address
+		// RBP is constant for this program
+		Add(ADDR, ADDR, DSP->RBP);
+		// ADDR & ARAM_MASK
+		if (ARAM_SIZE == 2*1024*1024)
+			Bfc(ADDR, 21, 11);
+		else if (ARAM_SIZE == 8*1024*1024)
+			Bfc(ADDR, 23, 9);
+		else
+			die("Unsupported ARAM_SIZE");
+	}
+
+	uintptr_t getAicaRam()
+	{
+		return (uintptr_t)&aica_ram[0];
+	}
+
+	dsp_t *DSP = nullptr;
+};
+
+void dsp_recompile()
+{
+	dsp.Stopped = true;
+	for (int i = 127; i >= 0; --i)
+	{
+		u32 *IPtr = DSPData->MPRO + i * 4;
+
+		if (IPtr[0] != 0 || IPtr[1] != 0 || IPtr[2 ]!= 0 || IPtr[3] != 0)
+		{
+			dsp.Stopped = false;
+			break;
+		}
+	}
+	JITWriteProtect(false);
+	DSPAssembler assembler(DynCode, CodeSize);
+	assembler.compile(&dsp);
+	JITWriteProtect(true);
+}
+
+void dsp_rec_init()
+{
+	u8 *pCodeBuffer;
+	verify(vmem_platform_prepare_jit_block(DynCode, CodeSize, (void**)&pCodeBuffer));
+}
+
+void dsp_rec_step()
+{
+	((void (*)())(DynCode))();
+}
+
+#endif
diff --git a/core/hw/aica/sgc_if.cpp b/core/hw/aica/sgc_if.cpp
index 4dbc7c4b2..f9e10ce64 100755
--- a/core/hw/aica/sgc_if.cpp
+++ b/core/hw/aica/sgc_if.cpp
@@ -1363,7 +1363,6 @@ void AICA_Sample32()
 
 	//Generate 32 samples for each channel, before moving to next channel
 	//much more cache efficient !
-	u32 sg=0;
 	for (int ch = 0; ch < 64; ch++)
 	{
 		for (int i=0;i<32;i++)
@@ -1373,10 +1372,8 @@ void AICA_Sample32()
 			if (!Chans[ch].Step(oLeft, oRight, oDsp))
 				break;
 
-			sg++;
-
 			if (oLeft + oRight == 0)
-				oLeft = oRight = oDsp;
+				oLeft = oRight = oDsp >> 4;
 
 			mxlr[i*2+0] += oLeft;
 			mxlr[i*2+1] += oRight;