diff --git a/core/build.h b/core/build.h index 677aa0964..172259039 100755 --- a/core/build.h +++ b/core/build.h @@ -188,10 +188,10 @@ //defaults #ifndef FEAT_SHREC - #if HOST_CPU == CPU_MIPS - #define FEAT_SHREC DYNAREC_NONE - #else + #if HOST_CPU == CPU_ARM || HOST_CPU == CPU_ARM64 || HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64 #define FEAT_SHREC DYNAREC_JIT + #else + #define FEAT_SHREC DYNAREC_NONE #endif #endif @@ -204,7 +204,7 @@ #endif #ifndef FEAT_DSPREC - #if HOST_CPU == CPU_X86 || HOST_CPU == CPU_ARM64 || HOST_CPU == CPU_X64 + #if HOST_CPU == CPU_ARM || HOST_CPU == CPU_ARM64 || HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64 #define FEAT_DSPREC DYNAREC_JIT #else #define FEAT_DSPREC DYNAREC_NONE diff --git a/core/hw/aica/dsp_arm32.cpp b/core/hw/aica/dsp_arm32.cpp new file mode 100644 index 000000000..097ed5827 --- /dev/null +++ b/core/hw/aica/dsp_arm32.cpp @@ -0,0 +1,426 @@ +/* + Copyright 2021 flyinghead + + This file is part of reicast. + + reicast is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + reicast is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with reicast. If not, see . + */ +#include "build.h" + +#if HOST_CPU == CPU_ARM && FEAT_DSPREC != DYNAREC_NONE + +#include "dsp.h" +#include "aica.h" +#include "aica_if.h" +#include "hw/mem/_vmem.h" +#include +using namespace vixl::aarch32; + +constexpr size_t CodeSize = 4096 * 8; //32 kb, 8 pages + +#if defined(__unix__) +alignas(4096) static u8 DynCode[CodeSize] __attribute__((section(".text"))); +#else +#error "Unsupported platform for arm32 DSP dynarec" +#endif + +class DSPAssembler : public MacroAssembler +{ +public: + DSPAssembler(u8 *code_buffer, size_t size) : MacroAssembler(code_buffer, size, A32) {} + + void compile(dsp_t *DSP) + { + this->DSP = DSP; + DEBUG_LOG(AICA_ARM, "DSPAssembler::compile recompiling for arm32 at %p", GetBuffer()->GetStartAddress()); + + RegisterList regList = RegisterList::Union( + RegisterList(r4, r5, r6, r7), + RegisterList(r8, r9, r10, lr)); + Push(regList); + Mov(r8, (uintptr_t)&DSP->TEMP[0]); // r8 points to TEMP, right after the code + Mov(r7, (uintptr_t)DSPData); // r7 points to DSPData + const Register& INPUTS = r4; // 24 bits + const Register& ACC = r5; // 26 bits - saved + const Register& B = r10; // 26 bits - saved + const Register& X = r6; // 24 bits + const Register& Y = r9; // 13 bits + + Mov(ACC, 0); + Mov(B, 0); + Str(B, dsp_operand(&DSP->FRC_REG)); + Str(B, dsp_operand(&DSP->Y_REG)); + Str(B, dsp_operand(&DSP->ADRS_REG)); + + for (int step = 0; step < 128; ++step) + { + u32 *mpro = &DSPData->MPRO[step * 4]; + _INST op; + DecodeInst(mpro, &op); + const u32 COEF = step; + + if (op.XSEL || op.YRL || (op.ADRL && op.SHIFT != 3)) + { + if (op.IRA <= 0x1f) + //INPUTS = DSP->MEMS[op.IRA]; + Ldr(INPUTS, dsp_operand(DSP->MEMS, op.IRA)); + else if (op.IRA <= 0x2F) + { + //INPUTS = DSP->MIXS[op.IRA - 0x20] << 4; // MIXS is 20 bit + Ldr(INPUTS, dsp_operand(DSP->MIXS, op.IRA - 0x20)); + Lsl(INPUTS, INPUTS, 4); + } + else if (op.IRA <= 0x31) + { + //INPUTS = DSPData->EXTS[op.IRA - 0x30] << 8; // EXTS is 16 bits + Ldr(INPUTS, dspdata_operand(DSPData->EXTS, op.IRA - 0x30)); + Lsl(INPUTS, INPUTS, 8); + } + else + { + Mov(INPUTS, 0); + } + } + + if (op.IWT) + { + //DSP->MEMS[op.IWA] = MEMVAL[step & 3]; // MEMVAL was selected in previous MRD + Ldr(r1, dsp_operand(DSP->MEMVAL, step & 3)); + Str(r1, dsp_operand(DSP->MEMS, op.IWA)); + } + + // Operand sel + // B + if (!op.ZERO) + { + if (op.BSEL) + //B = ACC; + Mov(B, ACC); + else + { + //B = DSP->TEMP[(TRA + DSP->regs.MDEC_CT) & 0x7F]; + Ldr(r1, dsp_operand(&DSP->regs.MDEC_CT)); + if (op.TRA) + Add(r1, r1, op.TRA); + Bfc(r1, 7, 25); + Ldr(B, dsp_operand(DSP->TEMP, r1)); + } + if (op.NEGB) + //B = 0 - B; + Rsb(B, B, 0); + } + + // X + const Register* X_alias = &X; + if (op.XSEL) + //X = INPUTS; + X_alias = &INPUTS; + else + { + //X = DSP->TEMP[(TRA + DSP->regs.MDEC_CT) & 0x7F]; + if (!op.ZERO && !op.BSEL && !op.NEGB) + X_alias = &B; + else + { + Ldr(r1, dsp_operand(&DSP->regs.MDEC_CT)); + if (op.TRA) + Add(r1, r1, op.TRA); + Bfc(r1, 7, 25); + Ldr(X, dsp_operand(DSP->TEMP, r1)); + } + } + + // Y + if (op.YSEL == 0) + { + //Y = FRC_REG; + Ldr(Y, dsp_operand(&DSP->FRC_REG)); + } + else if (op.YSEL == 1) + { + //Y = DSPData->COEF[COEF] >> 3; //COEF is 16 bits + Ldr(Y, dspdata_operand(DSPData->COEF, COEF)); + Sbfx(Y, Y, 3, 13); + } + else if (op.YSEL == 2) + { + //Y = Y_REG >> 11; + Ldr(r1, dsp_operand(&DSP->Y_REG)); + Asr(Y, r1, 11); + } + else if (op.YSEL == 3) + { + //Y = (Y_REG >> 4) & 0x0FFF; + Ldr(r1, dsp_operand(&DSP->Y_REG)); + Ubfx(Y, r1, 4, 12); + } + + if (op.YRL) + //Y_REG = INPUTS; + Str(INPUTS, dsp_operand(&DSP->Y_REG)); + + if (op.TWT || op.FRCL || op.MWT || (op.ADRL && op.SHIFT == 3) || op.EWT) + { + // Shifter + // There's a 1-step delay at the output of the X*Y + B adder. So we use the ACC value from the previous step. + if (op.SHIFT == 0) + { + // SHIFTED = clamp(ACC, -0x800000, 0x7FFFFF) + Ssat(r2, 24, ACC); + } + else if (op.SHIFT == 1) + { + //SHIFTED = ACC << 1; // x2 scale + // SHIFTED = clamp(SHIFTED, -0x800000, 0x7FFFFF) + Ssat(r2, 24, Operand(ACC, LSL, 1)); + } + else if (op.SHIFT == 2) + { + //SHIFTED = ACC << 1; // x2 scale + Lsl(r2, ACC, 1); + } + else if (op.SHIFT == 3) + { + //SHIFTED = ACC; + Mov(r2, ACC); + } + Str(r2, dsp_operand(&DSP->SHIFTED)); + } + + // ACCUM + //ACC = (((s64)X * (s64)Y) >> 12) + B; + Smull(r0, r1, *X_alias, Y); + Lsr(r0, r0, 12); + Orr(ACC, r0, Operand(r1, LSL, 20)); + if (!op.ZERO) + Add(ACC, ACC, B); + + if (op.TWT) + { + //DSP->TEMP[(op.TWA + DSP->regs.MDEC_CT) & 0x7F] = SHIFTED; + Ldr(r2, dsp_operand(&DSP->SHIFTED)); + Ldr(r1, dsp_operand(&DSP->regs.MDEC_CT)); + if (op.TWA) + Add(r1, r1, op.TWA); + Bfc(r1, 7, 25); + Str(r2, dsp_operand(DSP->TEMP, r1)); + } + + if (op.FRCL) + { + Ldr(r2, dsp_operand(&DSP->SHIFTED)); + if (op.SHIFT == 3) + //FRC_REG = SHIFTED & 0x0FFF; + Ubfx(r1, r2, 0, 12); + else + //FRC_REG = SHIFTED >> 11; + Asr(r1, r2, 11); + Str(r1, dsp_operand(&DSP->FRC_REG)); + } + + if (step & 1) + { + const Register& ADDR = r3; + if (op.MRD) // memory only allowed on odd. DoA inserts NOPs on even + { + //MEMVAL[(step + 2) & 3] = UNPACK(*(u16 *)&aica_ram[ADDR & ARAM_MASK]); + calculateADDR(ADDR, op); + Mov(r1, getAicaRam()); + Ldrh(r0, MemOperand(r1, ADDR)); + genCallRuntime(UNPACK); + Mov(r2, r0); + Str(r2, dsp_operand(DSP->MEMVAL, (step + 2) & 3)); + } + if (op.MWT) + { + // *(u16 *)&aica_ram[ADDR & ARAM_MASK] = PACK(SHIFTED); + Ldr(r0, dsp_operand(&DSP->SHIFTED)); + genCallRuntime(PACK); + Mov(r2, r0); + + calculateADDR(ADDR, op); + Mov(r1, getAicaRam()); + Strh(r2, MemOperand(r1, ADDR)); + } + } + + if (op.ADRL) + { + if (op.SHIFT == 3) + { + //ADRS_REG = SHIFTED >> 12; + Ldr(r2, dsp_operand(&DSP->SHIFTED)); + Asr(r1, r2, 12); + } + else + { + //ADRS_REG = INPUTS >> 16; + Asr(r1, INPUTS, 16); + } + Str(r1, dsp_operand(&DSP->ADRS_REG)); + } + + if (op.EWT) + { + //DSPData->EFREG[op.EWA] = SHIFTED >> 8; + Ldr(r1, dsp_operand(&DSP->SHIFTED)); + Asr(r1, r1, 8); + Str(r1, dspdata_operand(DSPData->EFREG, op.EWA)); + } + } + Ldr(r1, dsp_operand(&DSP->regs.MDEC_CT)); + // DSP->regs.MDEC_CT-- + Subs(r1, r1, 1); + //if (dsp.regs.MDEC_CT == 0) + // dsp.regs.MDEC_CT = dsp.RBL + 1; // RBL is ring buffer length - 1 + Mov(eq, r1, dsp.RBL + 1); + Str(r1, dsp_operand(&DSP->regs.MDEC_CT)); + + Pop(regList); + Mov(pc, lr); + + FinalizeCode(); + + vmem_platform_flush_cache( + GetBuffer()->GetStartAddress(), GetBuffer()->GetEndAddress(), + GetBuffer()->GetStartAddress(), GetBuffer()->GetEndAddress()); + } + +private: + MemOperand dsp_operand(void *data, int index = 0, u32 element_size = 4) + { + ptrdiff_t offset = ((u8*)data - (u8*)DSP) - offsetof(dsp_t, TEMP) + index * element_size; + if (offset <= 4095) + return MemOperand(r8, offset); + Mov(r0, offset); + return MemOperand(r8, r0); + } + + MemOperand dsp_operand(void *data, const Register& offset_reg, u32 element_size = 4) + { + ptrdiff_t offset = ((u8*)data - (u8*)DSP) - offsetof(dsp_t, TEMP); + if (offset == 0) + return MemOperand(r8, offset_reg, LSL, element_size == 4 ? 2 : element_size == 2 ? 1 : 0); + + Mov(r0, offset); + Add(r0, r0, Operand(offset_reg, LSL, element_size == 4 ? 2 : element_size == 2 ? 1 : 0)); + return MemOperand(r8, r0); + } + + MemOperand dspdata_operand(void *data, int index = 0, u32 element_size = 4) + { + ptrdiff_t offset = ((u8*)data - (u8*)DSPData) + index * element_size; + if (offset <= 4095) + return MemOperand(r7, offset); + Mov(r0, offset); + return MemOperand(r7, r0); + } + + template + void genCallRuntime(R (*function)(P...)) + { + ptrdiff_t offset = reinterpret_cast(function) - GetBuffer()->GetStartAddress(); + verify((offset & 3) == 0); + if (offset >= -32 * 1024 * 1024 && offset <= 32 * 1024 * 1024) + { + Label function_label(offset); + Bl(&function_label); + } + else + { + Mov(r3, reinterpret_cast(function)); + Blx(r3); + } + } + + void calculateADDR(const Register& ADDR, const _INST& op) + { + //u32 ADDR = DSPData->MADRS[op.MASA]; + Ldr(ADDR, dspdata_operand(DSPData->MADRS, op.MASA)); + if (op.ADREB) + { + //ADDR += ADRS_REG & 0x0FFF; + Ldr(r1, dsp_operand(&DSP->ADRS_REG)); + Ubfx(r0, r1, 0, 12); + Add(ADDR, ADDR, r0); + } + if (op.NXADR) + //ADDR++; + Add(ADDR, ADDR, 1); + if (!op.TABLE) + { + //ADDR += DSP->regs.MDEC_CT; + Ldr(r1, dsp_operand(&DSP->regs.MDEC_CT)); + Add(ADDR, ADDR, r1); + //ADDR &= DSP->RBL; + // RBL is constant for this program + And(ADDR, ADDR, DSP->RBL); + } + else + //ADDR &= 0xFFFF; + Bfc(ADDR, 16, 16); + + //ADDR <<= 1; // Word -> byte address + Lsl(ADDR, ADDR, 1); + //ADDR += DSP->RBP; // RBP is already a byte address + // RBP is constant for this program + Add(ADDR, ADDR, DSP->RBP); + // ADDR & ARAM_MASK + if (ARAM_SIZE == 2*1024*1024) + Bfc(ADDR, 21, 11); + else if (ARAM_SIZE == 8*1024*1024) + Bfc(ADDR, 23, 9); + else + die("Unsupported ARAM_SIZE"); + } + + uintptr_t getAicaRam() + { + return (uintptr_t)&aica_ram[0]; + } + + dsp_t *DSP = nullptr; +}; + +void dsp_recompile() +{ + dsp.Stopped = true; + for (int i = 127; i >= 0; --i) + { + u32 *IPtr = DSPData->MPRO + i * 4; + + if (IPtr[0] != 0 || IPtr[1] != 0 || IPtr[2 ]!= 0 || IPtr[3] != 0) + { + dsp.Stopped = false; + break; + } + } + JITWriteProtect(false); + DSPAssembler assembler(DynCode, CodeSize); + assembler.compile(&dsp); + JITWriteProtect(true); +} + +void dsp_rec_init() +{ + u8 *pCodeBuffer; + verify(vmem_platform_prepare_jit_block(DynCode, CodeSize, (void**)&pCodeBuffer)); +} + +void dsp_rec_step() +{ + ((void (*)())(DynCode))(); +} + +#endif diff --git a/core/hw/aica/sgc_if.cpp b/core/hw/aica/sgc_if.cpp index 4dbc7c4b2..f9e10ce64 100755 --- a/core/hw/aica/sgc_if.cpp +++ b/core/hw/aica/sgc_if.cpp @@ -1363,7 +1363,6 @@ void AICA_Sample32() //Generate 32 samples for each channel, before moving to next channel //much more cache efficient ! - u32 sg=0; for (int ch = 0; ch < 64; ch++) { for (int i=0;i<32;i++) @@ -1373,10 +1372,8 @@ void AICA_Sample32() if (!Chans[ch].Step(oLeft, oRight, oDsp)) break; - sg++; - if (oLeft + oRight == 0) - oLeft = oRight = oDsp; + oLeft = oRight = oDsp >> 4; mxlr[i*2+0] += oLeft; mxlr[i*2+1] += oRight;