#include "arm7.h" #include "arm_mem.h" #include "virt_arm.h" #include #define C_CORE #if 0 #define arm_printf printf #else void arm_printf(...) { } #endif //#define CPUReadHalfWordQuick(addr) arm_ReadMem16(addr & 0x7FFFFF) #define CPUReadMemoryQuick(addr) (*(u32*)&aica_ram[addr&ARAM_MASK]) #define CPUReadByte arm_ReadMem8 #define CPUReadMemory arm_ReadMem32 #define CPUReadHalfWord arm_ReadMem16 #define CPUReadHalfWordSigned(addr) ((s16)arm_ReadMem16(addr)) #define CPUWriteMemory arm_WriteMem32 #define CPUWriteHalfWord arm_WriteMem16 #define CPUWriteByte arm_WriteMem8 #define reg arm_Reg #define armNextPC reg[R15_ARM_NEXT].I #define CPUUpdateTicksAccesint(a) 1 #define CPUUpdateTicksAccessSeq32(a) 1 #define CPUUpdateTicksAccesshort(a) 1 #define CPUUpdateTicksAccess32(a) 1 #define CPUUpdateTicksAccess16(a) 1 //bool arm_FiqPending; -- not used , i use the input directly :) //bool arm_IrqPending; DECL_ALIGN(8) reg_pair arm_Reg[RN_ARM_REG_COUNT]; void CPUSwap(u32 *a, u32 *b) { u32 c = *b; *b = *a; *a = c; } /* bool N_FLAG; bool Z_FLAG; bool C_FLAG; bool V_FLAG; */ #define N_FLAG (reg[RN_PSR_FLAGS].FLG.N) #define Z_FLAG (reg[RN_PSR_FLAGS].FLG.Z) #define C_FLAG (reg[RN_PSR_FLAGS].FLG.C) #define V_FLAG (reg[RN_PSR_FLAGS].FLG.V) bool armIrqEnable; bool armFiqEnable; //bool armState; int armMode; bool Arm7Enabled=false; u8 cpuBitsSet[256]; bool intState = false; bool stopState = false; bool holdState = false; void CPUSwitchMode(int mode, bool saveState, bool breakLoop=true); extern "C" void CPUFiq(); void CPUUpdateCPSR(); void CPUUpdateFlags(); void CPUSoftwareInterrupt(int comment); void CPUUndefinedException(); #if FEAT_AREC == DYNAREC_NONE void arm_Run_(u32 CycleCount) { if (!Arm7Enabled) return; u32 clockTicks=0; while (clockTicks #endif extern "C" void CompileCode(); //Emulate a single arm op, passed in opcode //DYNACALL for ECX passing u32 DYNACALL arm_single_op(u32 opcode) { u32 clockTicks=0; #define NO_OPCODE_READ //u32 static_opcode=((opcd_hash&0xFFF0)<<16) | ((opcd_hash&0x000F)<<4); //u32 static_opcode=((opcd_hash)<<28); #include "arm-new.h" return clockTicks; } /* ARM ALU opcodes (more or less) (flags,rv)=opcode(flags,in regs ..) rd=rv; if (set_flags) PSR=(rd==pc?CPSR:flags); (mem ops) Writes of R15: R15+12 R15 as base: R15+8 LDR rd=mem[addr(in regs)] LDM ... 
/*
	ARM
		ALU opcodes (more or less)

		(flags,rv)=opcode(flags,in regs ..)
		rd=rv;
		if (set_flags)
			PSR=(rd==pc?CPSR:flags);

		(mem ops)
		Writes of R15: R15+12
		R15 as base: R15+8

		LDR
			rd=mem[addr(in regs)]
		LDM
			...
		STR/STM: pc+12

	/// "cached" interpreter:
		Set PC+12 to PC reg
		mov opcode
		call function
		if (pc setting opcode)
			lookup again using armNextPC

	PC setting opcodes
		ALU with write to PC
		LDR with write to PC (SDT)
		LDM with write to PC (BDT)
		B/BL
		SWI
		Indirect, via write to PSR/Mode
			MSR
*/

#if FEAT_AREC != DYNAREC_NONE

struct ArmDPOP
{
	u32 key;
	u32 mask;
	u32 flags;
};

vector<ArmDPOP> ops;

enum OpFlags
{
	OP_SETS_PC         = 1,
	OP_READS_PC        = 32768,
	OP_IS_COND         = 65536,
	OP_MFB             = 0x80000000,

	OP_HAS_RD_12       = 2,
	OP_HAS_RD_16       = 4,
	OP_HAS_RS_0        = 8,
	OP_HAS_RS_8        = 16,
	OP_HAS_RS_16       = 32,
	OP_HAS_FLAGS_READ  = 4096,
	OP_HAS_FLAGS_WRITE = 8192,
	OP_HAS_RD_READ     = 16384, //For conditionals

	OP_WRITE_FLAGS     = 64,
	OP_WRITE_FLAGS_S   = 128,
	OP_READ_FLAGS      = 256,
	OP_READ_FLAGS_S    = 512,
	OP_WRITE_REG       = 1024,
	OP_READ_REG_1      = 2048,
};

#define DP_R_ROFC (OP_READ_FLAGS_S|OP_READ_REG_1) //Reads reg1, op2, flags if S
#define DP_R_ROF  (OP_READ_FLAGS|OP_READ_REG_1)   //Reads reg1, op2, flags (ADC & co)
#define DP_R_OFC  (OP_READ_FLAGS_S)               //Reads op2, flags if S

#define DP_W_RFC  (OP_WRITE_FLAGS_S|OP_WRITE_REG) //Writes reg, and flags if S
#define DP_W_F    (OP_WRITE_FLAGS)                //Writes only flags, always (S=1)
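/*
	Example of how these compose (illustrative): "ADDS r0,r1,r2" matches the
	ADD entry (DP_R_ROFC, DP_W_RFC). With S=1, DecodeOpcode expands that into
	OP_HAS_RS_16 (r1), OP_HAS_RS_0 (r2), OP_HAS_RD_12 (r0), OP_HAS_FLAGS_READ
	and OP_HAS_FLAGS_WRITE.
*/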
/*
	COND | 00 0 OP1 S Rn Rd SA ST 0 Rm         -- Data opcode, PSR xfer (imm shifted reg)
	     | 00 0 OP1 S Rn Rd Rs 0 ST 1 Rm       -- Data opcode, PSR xfer (reg shifted reg)
	     | 00 0 0 00A S Rd Rn Rs 1001 Rm       -- Mult
	     | 00 0 1 0B0 0 Rn Rd 0000 1001 Rm     -- SWP
	     | 00 1 OP1 S Rn Rd imm8r4             -- Data opcode, PSR xfer (imm8r4)
	     | 01 0 P UBW L Rn Rd Offset           -- LDR/STR (I=0)
	     | 01 1 P UBW L Rn Rd SHAM SHTP 0 Rs   -- LDR/STR (I=1)
	     | 10 0 P USW L Rn {RList}             -- LDM/STM
	     | 10 1 L {offset}                     -- B/BL
	     | 11 1 1 X*                           -- SWI

	(undef cases)
	     | 01 1 XXXX X X* X* X* 1 XXXX         -- Undefined (LDR/STR w/ encodings that would be reg. based shift)
	     | 11 0 PUNW L Rn {undef}              -- Copr. Data xfer (undef)
	     | 11 1 0 CPOP Crn Crd Cpn CP3 0 Crm   -- Copr. Data Op (undef)
	     | 11 1 0 CPO3 L Crn Crd Cpn CP3 1 Crm -- Copr. Reg xf (undef)

	Phase #1:
		-Non branches that don't touch memory (pretty much: Data processing, Not MSR, Mult)
		-Everything else is ifb

	Phase #2:
		Move LDR/STR to templates

	Phase #3:
		Move LDM/STM to templates
*/

void AddDPOP(u32 subcd, u32 rflags, u32 wflags)
{
	ArmDPOP op;

	u32 key=subcd<<21;
	u32 mask=(15<<21) | (7<<25);

	op.flags=rflags|wflags;

	if (wflags==DP_W_F)
	{
		//also match the S bit for opcodes that must write to flags (CMP & co)
		mask|=1<<20;
		key|=1<<20;
	}

	//ISR form (bit 25=0, bit 4=0)
	op.key=key;
	op.mask=mask | (1<<4);
	ops.push_back(op);

	//RSR form (bit 25=0, bit 4=1, bit 7=0)
	op.key = key  | (1<<4);
	op.mask= mask | (1<<4) | (1<<7);
	ops.push_back(op);

	//imm8r4 form (bit 25=1)
	op.key = key  | (1<<25);
	op.mask= mask;
	ops.push_back(op);
}

void InitHash()
{
	/*
		COND | 00 I OP1 S Rn Rd OPER2 -- Data opcode, PSR xfer

		Data processing opcodes
	*/

	//AND 0000 Rn, OPER2, {Flags}   Rd, {Flags}
	//EOR 0001 Rn, OPER2, {Flags}   Rd, {Flags}
	//SUB 0010 Rn, OPER2, {Flags}   Rd, {Flags}
	//RSB 0011 Rn, OPER2, {Flags}   Rd, {Flags}
	//ADD 0100 Rn, OPER2, {Flags}   Rd, {Flags}
	//ORR 1100 Rn, OPER2, {Flags}   Rd, {Flags}
	//BIC 1110 Rn, OPER2, {Flags}   Rd, {Flags}
	AddDPOP(0,DP_R_ROFC,DP_W_RFC);
	AddDPOP(1,DP_R_ROFC,DP_W_RFC);
	AddDPOP(2,DP_R_ROFC,DP_W_RFC);
	AddDPOP(3,DP_R_ROFC,DP_W_RFC);
	AddDPOP(4,DP_R_ROFC,DP_W_RFC);
	AddDPOP(12,DP_R_ROFC,DP_W_RFC);
	AddDPOP(14,DP_R_ROFC,DP_W_RFC);

	//ADC 0101 Rn, OPER2, Flags     Rd, {Flags}
	//SBC 0110 Rn, OPER2, Flags     Rd, {Flags}
	//RSC 0111 Rn, OPER2, Flags     Rd, {Flags}
	AddDPOP(5,DP_R_ROF,DP_W_RFC);
	AddDPOP(6,DP_R_ROF,DP_W_RFC);
	AddDPOP(7,DP_R_ROF,DP_W_RFC);

	//TST 1000 S=1 Rn, OPER2, Flags  Flags
	//TEQ 1001 S=1 Rn, OPER2, Flags  Flags
	AddDPOP(8,DP_R_ROF,DP_W_F);
	AddDPOP(9,DP_R_ROF,DP_W_F);

	//CMP 1010 S=1 Rn, OPER2        Flags
	//CMN 1011 S=1 Rn, OPER2        Flags
	AddDPOP(10,DP_R_ROF,DP_W_F);
	AddDPOP(11,DP_R_ROF,DP_W_F);

	//MOV 1101 OPER2, {Flags}       Rd, {Flags}
	//MVN 1111 OPER2, {Flags}       Rd, {Flags}
	AddDPOP(13,DP_R_OFC,DP_W_RFC);
	AddDPOP(15,DP_R_OFC,DP_W_RFC);
}
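/*
	Worked example (derived from AddDPOP above): AddDPOP(2,...) for SUB pushes
	three matchers. Base key=2<<21=0x00400000, base mask=(15<<21)|(7<<25)=0x0FE00000:
		ISR:    key 0x00400000, mask 0x0FE00010
		RSR:    key 0x00400010, mask 0x0FE00090
		imm8r4: key 0x02400000, mask 0x0FE00000
	So "SUB r0,r1,r2" == 0xE0410002: (0xE0410002 & 0x0FE00010) == 0x00400000,
	which matches the ISR entry regardless of the condition field.
*/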
/*
 *
 *	X86 Compiler
 *
 */

void armEmit32(u32 emit32);
void *armGetEmitPtr();

#define _DEVEL          (1)
#define EMIT_I          armEmit32((I))
#define EMIT_GET_PTR()  armGetEmitPtr()

u8* icPtr;
u8* ICache;
extern const u32 ICacheSize=1024*1024;

#if HOST_OS == OS_WINDOWS
u8 ARM7_TCB[ICacheSize+4096];
#elif HOST_OS == OS_LINUX
u8 ARM7_TCB[ICacheSize+4096] __attribute__((section(".text")));
#elif HOST_OS==OS_DARWIN
u8 ARM7_TCB[ICacheSize+4096] __attribute__((section("__TEXT, .text")));
#else
#error ARM7_TCB ALLOC
#endif

#include "arm_emitter/arm_emitter.h"
#undef I

using namespace ARM;

void* EntryPoints[ARAM_SIZE_MAX/4];

enum OpType
{
	VOT_Fallback,
	VOT_DataOp,
	VOT_B,
	VOT_BL,
	VOT_BR,     //Branch (to register)
	VOT_Read,   //Actually, this handles LDR and STR
	//VOT_LDM,  //This isn't used anymore
	VOT_MRS,
	VOT_MSR,
};

void armv_call(void* target);
void armv_setup();
void armv_intpr(u32 opcd);
void armv_end(void* codestart, u32 cycles);
void armv_check_pc(u32 pc);
void armv_check_cache(u32 opcd, u32 pc);
void armv_imm_to_reg(u32 regn, u32 imm);
void armv_MOV32(eReg regn, u32 imm);
void armv_prof(OpType opt,u32 op,u32 flg);

extern "C" void arm_dispatch();
extern "C" void arm_exit();

extern "C" void DYNACALL
#if BUILD_COMPILER == COMPILER_GCC
// Avoid inlining / duplicating / whatever
__attribute__ ((optimize(0)))
#endif
arm_mainloop(u32 cycl, void* regs, void* entrypoints);

extern "C" void DYNACALL arm_compilecode();

template<bool L, bool B>
u32 DYNACALL DoMemOp(u32 addr,u32 data)
{
	u32 rv=0;

#if HOST_CPU==CPU_X86
	addr=virt_arm_reg(0);
	data=virt_arm_reg(1);
#endif

	if (L)
	{
		if (B)
			rv=arm_ReadMem8(addr);
		else
			rv=arm_ReadMem32(addr);
	}
	else
	{
		if (B)
			arm_WriteMem8(addr,data);
		else
			arm_WriteMem32(addr,data);
	}

#if HOST_CPU==CPU_X86
	virt_arm_reg(0)=rv;
#endif

	return rv;
}

//findfirstset -- used in LDM/STM handling
#if HOST_CPU==CPU_X86 && BUILD_COMPILER != COMPILER_GCC
#include <intrin.h>

u32 findfirstset(u32 v)
{
	unsigned long rv;
	_BitScanForward(&rv,v);
	return rv+1;
}
#else
#define findfirstset __builtin_ffs
#endif

#if 0
//LDM isn't perf. critical, and as a result, not implemented fully.
//So this code is disabled
//mask is *2
template<u32 I>
void DYNACALL DoLDM(u32 addr, u32 mask)
{
#if HOST_CPU==CPU_X86
	addr=virt_arm_reg(0);
	mask=virt_arm_reg(1);
#endif

	//addr=(addr); //force align ?
	u32 idx=-1;
	do
	{
		u32 tz=findfirstset(mask);
		mask>>=tz;
		idx+=tz;
		arm_Reg[idx].I=arm_ReadMem32(addr);
		addr+=4;
	} while(mask);
}
#endif

void* GetMemOp(bool L, bool B)
{
	if (L)
	{
		if (B)
			return (void*)(u32(DYNACALL*)(u32,u32))&DoMemOp<true,true>;
		else
			return (void*)(u32(DYNACALL*)(u32,u32))&DoMemOp<true,false>;
	}
	else
	{
		if (B)
			return (void*)(u32(DYNACALL*)(u32,u32))&DoMemOp<false,true>;
		else
			return (void*)(u32(DYNACALL*)(u32,u32))&DoMemOp<false,false>;
	}
}
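/*
	Note (illustrative): GetMemOp(true,false) returns DoMemOp<true,false>, a
	u32 DYNACALL(u32,u32) handler that performs a 32-bit load; the recompiler
	emits a call to it with the address in the first argument and the store
	data, if any, in the second. findfirstset is 1-based, e.g.
	findfirstset(0x28)==4 (lowest set bit is bit 3).
*/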
//Decodes an opcode, returns type.
//opcd might be changed (currently for LDM/STM -> LDR/STR transforms)
OpType DecodeOpcode(u32& opcd,u32& flags)
{
	//by default, PC has to be updated
	flags=OP_READS_PC;

	u32 CC=(opcd >> 28);

	if (CC!=CC_AL)
		flags|=OP_IS_COND;

	//helpers ...
#define CHK_BTS(M,S,V)   ( (M & (opcd>>S)) == (V) ) //Check bits value in opcode
#define IS_LOAD          (opcd & (1<<20))           //Is L bit set ? (LDM/STM LDR/STR)
#define READ_PC_CHECK(S) if (CHK_BTS(15,S,15)) flags|=OP_READS_PC;

	//Opcode sets pc ?
	bool _set_pc=
		(CHK_BTS(3,26,0) && CHK_BTS(15,12,15)) ||            //Data processing w/ Rd=PC
		(CHK_BTS(3,26,1) && CHK_BTS(15,12,15) && IS_LOAD) || //LDR/STR w/ Rd=PC
		(CHK_BTS(7,25,4) && (opcd & 32768) && IS_LOAD) ||    //LDM/STM w/ PC in list
		CHK_BTS(7,25,5) ||                                   //B or BL
		CHK_BTS(15,24,15);                                   //SWI

	//NV condition means VFP on newer cores, let the interpreter handle it...
	if (CC==15)
		return VOT_Fallback;

	if (_set_pc)
		flags|=OP_SETS_PC;

	//B / BL ?
	if (CHK_BTS(7,25,5))
	{
		verify(_set_pc);

		if (!(flags&OP_IS_COND))
			flags&=~OP_READS_PC; //not COND: doesn't read from pc

		flags|=OP_SETS_PC; //Branches set pc ..

		//branch !
		return (opcd&(1<<24))?VOT_BL:VOT_B;
	}

	//Common case: MOVcc PC,REG
	if (CHK_BTS(0xFFFFFF,4,0x1A0F00))
	{
		verify(_set_pc);
		if (CC==CC_AL)
			flags&=~OP_READS_PC;

		return VOT_BR;
	}

	//No support for COND branching opcodes apart from the forms above ..
	if (CC!=CC_AL && _set_pc)
	{
		return VOT_Fallback;
	}

	u32 RList=opcd&0xFFFF;
	u32 Rn=(opcd>>16)&15;

#define LDM_REGCNT() (cpuBitsSet[RList & 255] + cpuBitsSet[(RList >> 8) & 255])

	//Data Processing opcodes -- find using mask/key
	//This will eventually be virtualised w/ register renaming
	for (u32 i=0;i<ops.size();i++)
	{
		if ((opcd & ops[i].mask)==ops[i].key)
		{
			//Conditionals need the flags (and old Rd) to be read
			if ((opcd >> 28)!=0xE)
			{
				flags |= OP_HAS_FLAGS_READ;
				//if (flags & OP_WRITE_REG)
				flags |= OP_HAS_RD_READ;
			}

			//DPOP !
			if ((ops[i].flags & OP_READ_FLAGS) ||
				((ops[i].flags & OP_READ_FLAGS_S) && (opcd & (1<<20))))
			{
				flags |= OP_HAS_FLAGS_READ;
			}

			if ((ops[i].flags & OP_WRITE_FLAGS) ||
				((ops[i].flags & OP_WRITE_FLAGS_S) && (opcd & (1<<20))))
			{
				flags |= OP_HAS_FLAGS_WRITE;
			}

			if (ops[i].flags & OP_WRITE_REG)
			{
				//All dpops that write, write to RD_12
				flags |= OP_HAS_RD_12;
				verify(!(CHK_BTS(15,12,15) && CC!=CC_AL));
			}

			if (ops[i].flags & OP_READ_REG_1)
			{
				//Reg 1 is RS_16
				flags |= OP_HAS_RS_16;
				//reads from pc ?
				READ_PC_CHECK(16);
			}

			//op2 is imm or reg ?
			if (!(opcd & (1<<25)))
			{
				//its reg (register or imm shifted)
				flags |= OP_HAS_RS_0;
				//reads from pc ?
				READ_PC_CHECK(0);

				//is it register shifted reg ?
				if (opcd & (1<<4))
				{
					verify(!(opcd & (1<<7))); //must be zero
					flags |= OP_HAS_RS_8;
					//can't be pc ...
					verify(!CHK_BTS(15,8,15));
				}
				else
				{
					//is it RRX ?
					if (((opcd>>4)&7)==6)
					{
						//RRX needs flags to be read (even if the opcode doesn't)
						flags |= OP_HAS_FLAGS_READ;
					}
				}
			}

			return VOT_DataOp;
		}
	}

	//Lets try mem opcodes since its not data processing

	/*
		Lets check LDR/STR !

		CCCC 01 0 P UBW L Rn Rd Offset -- LDR/STR (I=0)
	*/
	if ((opcd>>25)==(0xE4/2))
	{
		/*
			I=0

			Everything else handled
		*/
		arm_printf("ARM: MEM %08X L/S:%d, AWB:%d!\n",opcd,(opcd>>20)&1,(opcd>>21)&1);

		return VOT_Read;
	}
	else if ((opcd>>25)==(0xE6/2) && CHK_BTS(0x7,4,0))
	{
		arm_printf("ARM: MEM REG to Reg %08X\n",opcd);

		/*
			I=1

			Logical Left shift, only
		*/
		return VOT_Read;
	}
	//LDM common case
	else if ((opcd>>25)==(0xE8/2) /*&& CHK_BTS(32768,0,0)*/ && CHK_BTS(1,22,0) && CHK_BTS(1,20,1) && LDM_REGCNT()==1)
	{
		//P=0
		//U=1
		//L=1
		//W=1
		//S=0

		u32 old_opcd=opcd;

		//One register xfered
		//Can be rewritten as a normal mem opcode ..
		opcd=0xE4000000;

		//Imm offset
		opcd |= 0<<25;
		//Post incr
		opcd |= old_opcd & (1<<24);
		//Up/Dn
		opcd |= old_opcd & (1<<23);
		//Word/Byte
		opcd |= 0<<22;
		//Write back (must be 0 for PI)
		opcd |= old_opcd & (1<<21);
		//Load
		opcd |= old_opcd & (1<<20);

		//Rn
		opcd |= Rn<<16;

		//Rd
		u32 Rd=findfirstset(RList)-1;
		opcd |= Rd<<12;

		//Offset
		opcd |= 4;

		arm_printf("ARM: MEM TFX R %08X\n",opcd);

		return VOT_Read;
	}
	//STM common case
	else if ((opcd>>25)==(0xE8/2) && CHK_BTS(1,22,0) && CHK_BTS(1,20,0) && LDM_REGCNT()==1)
	{
		//P=1
		//U=0
		//L=0
		//W=1
		//S=0

		u32 old_opcd=opcd;

		//One register xfered
		//Can be rewritten as a normal mem opcode ..
		opcd=0xE4000000;

		//Imm offset
		opcd |= 0<<25;
		//Pre/Post incr
		opcd |= old_opcd & (1<<24);
		//Up/Dn
		opcd |= old_opcd & (1<<23);
		//Word/Byte
		opcd |= 0<<22;
		//Write back
		opcd |= old_opcd & (1<<21);
		//Store/Load
		opcd |= old_opcd & (1<<20);

		//Rn
		opcd |= Rn<<16;

		//Rd
		u32 Rd=findfirstset(RList)-1;
		opcd |= Rd<<12;

		//Offset
		opcd |= 4;

		arm_printf("ARM: MEM TFX W %08X\n",opcd);

		return VOT_Read;
	}
	else if (CHK_BTS(0xE10F0FFF,0,0xE10F0000))
	{
		return VOT_MRS;
	}
	else if (CHK_BTS(0xEFBFFFF0,0,0xE129F000))
	{
		return VOT_MSR;
	}
	else if ((opcd>>25)==(0xE8/2) && CHK_BTS(32768,0,0))
	{
		arm_printf("ARM: MEM FB %08X\n",opcd);
		flags|=OP_MFB; //(flag just for the fallback counters)
	}
	else
	{
		arm_printf("ARM: FB %08X\n",opcd);
	}

	//by default fallback to interpr
	return VOT_Fallback;
}
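/*
	Decode examples (illustrative):
		0xE3A00001  MOV r0,#1   -> VOT_DataOp (OP_HAS_RD_12)
		0xE1A0F00E  MOV pc,lr   -> VOT_BR     (OP_SETS_PC)
		0xEB000000  BL  pc+8    -> VOT_BL     (OP_SETS_PC)
*/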
//helpers ...
#if HOST_CPU == CPU_ARM64
extern void LoadReg(eReg rd,u32 regn,ConditionCode cc=CC_AL);
extern void StoreReg(eReg rd,u32 regn,ConditionCode cc=CC_AL);
extern void armv_mov(ARM::eReg regd, ARM::eReg regn);
extern void armv_add(ARM::eReg regd, ARM::eReg regn, ARM::eReg regm);
extern void armv_sub(ARM::eReg regd, ARM::eReg regn, ARM::eReg regm);
extern void armv_add(ARM::eReg regd, ARM::eReg regn, s32 imm);
extern void armv_lsl(ARM::eReg regd, ARM::eReg regn, u32 imm);
extern void armv_bic(ARM::eReg regd, ARM::eReg regn, u32 imm);
extern void *armv_start_conditional(ARM::ConditionCode cc);
extern void armv_end_conditional(void *ref);

// Use w25 for temp mem save because w9 is not callee-saved
#define r9 ((ARM::eReg)25)
#else
void LoadReg(eReg rd,u32 regn,ConditionCode cc=CC_AL)
{
	LDR(rd,r8,(u8*)&reg[regn].I-(u8*)&reg[0].I,Offset,cc);
}

void StoreReg(eReg rd,u32 regn,ConditionCode cc=CC_AL)
{
	STR(rd,r8,(u8*)&reg[regn].I-(u8*)&reg[0].I,Offset,cc);
}

void armv_mov(ARM::eReg regd, ARM::eReg regn)
{
	MOV(regd, regn);
}

void armv_add(ARM::eReg regd, ARM::eReg regn, ARM::eReg regm)
{
	ADD(regd, regn, regm);
}

void armv_sub(ARM::eReg regd, ARM::eReg regn, ARM::eReg regm)
{
	SUB(regd, regn, regm);
}

void armv_add(ARM::eReg regd, ARM::eReg regn, s32 imm)
{
	if (imm >= 0)
		ADD(regd, regn, imm);
	else
		SUB(regd, regn, -imm);
}

void armv_lsl(ARM::eReg regd, ARM::eReg regn, u32 imm)
{
	LSL(regd, regn, imm);
}

void armv_bic(ARM::eReg regd, ARM::eReg regn, u32 imm)
{
	BIC(regd, regn, imm);
}

void *armv_start_conditional(ARM::ConditionCode cc)
{
	return NULL;
}

void armv_end_conditional(void *ref)
{
}
#endif

//very quick-and-dirty register rename based virtualisation
u32 renamed_regs[16];
u32 rename_reg_base;

void RenameRegReset()
{
	rename_reg_base=r1;
	memset(renamed_regs, 0, sizeof(renamed_regs));
}

//returns new reg #. didrn is true if a rename mapping was added
u32 RenameReg(u32 reg, bool& didrn)
{
	if (renamed_regs[reg] == 0)
	{
		renamed_regs[reg]=rename_reg_base;
		rename_reg_base++;
		didrn=true;
	}
	else
	{
		didrn=false;
	}

	return renamed_regs[reg];
}
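/*
	Example (illustrative): after RenameRegReset(), virtualising "ADD r3,r3,r7"
	maps guest r3 -> native r1 and guest r7 -> native r2 (rename_reg_base starts
	at r1), so the opcode is re-encoded as "ADD r1,r1,r2"; LoadAndRename and
	StoreAndRename below wrap it with the matching LoadReg/StoreReg accesses
	to arm_Reg[].
*/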
arm_printf("Arm Virtual: %08X -> %08X\n",orig,opcd); armEmit32(opcd); //Store arm flags, rd12/rd16 (as indicated by the decoder flags) if (flag & OP_HAS_RD_12) StoreAndRename(orig,12); if (flag & OP_HAS_RD_16) StoreAndRename(orig,16); //Sanity check .. if (renamed_regs[15] != 0) { verify(flag&OP_READS_PC || (flag&OP_SETS_PC && !(flag&OP_IS_COND))); } if (flag & OP_HAS_FLAGS_WRITE) StoreFlags(); } u32 nfb,ffb,bfb,mfb; void *armGetEmitPtr() { if (icPtr < (ICache+ICacheSize-1024)) //ifdebug return static_cast(icPtr); return NULL; } #if HOST_CPU == CPU_X86 && FEAT_AREC != DYNAREC_NONE /* X86 backend * Uses a mix of * x86 code * Virtualised arm code (using the varm interpreter) * Emulated arm fallbacks (using the aica arm interpreter) * * The goal is to run as much code possible under the varm interpreter * so it will run on arm w/o changes. A few opcodes are missing from varm * (MOV32 is a notable case) and as such i've added a few varm_* hooks * * This code also performs a LOT of compiletime and runtime state/value sanity checks. * We don't care for speed here ... */ #include "emitter/x86_emitter.h" static x86_block* x86e; void DumpRegs(const char* output) { static FILE* f=fopen(output, "w"); static int id=0; #if 0 if (490710==id) { __asm int 3; } #endif verify(id!=137250); #if 1 fprintf(f,"%d\n",id); //for(int i=0;i<14;i++) { int i=R15_ARM_NEXT; fprintf(f,"r%d=%08X\n",i,reg[i].I); } #endif id++; } void DYNACALL PrintOp(u32 opcd) { printf("%08X\n",opcd); } void armv_imm_to_reg(u32 regn, u32 imm) { x86e->Emit(op_mov32,®[regn].I,imm); } void armv_MOV32(eReg regn, u32 imm) { x86e->Emit(op_mov32,&virt_arm_reg(regn),imm); } void armv_call(void* loc) { x86e->Emit(op_call,x86_ptr_imm(loc)); } x86_Label* end_lbl; void armv_setup() { //Setup emitter x86e = new x86_block(); x86e->Init(0,0); x86e->x86_buff=(u8*)EMIT_GET_PTR(); x86e->x86_size=1024*64; x86e->do_realloc=false; //load base reg .. x86e->Emit(op_mov32,&virt_arm_reg(8),(u32)&arm_Reg[0]); //the "end" label is used to exit from the block, if a code modification (expected opcode // actual opcode in ram) is detected end_lbl=x86e->CreateLabel(false,0); } void armv_intpr(u32 opcd) { //Call interpreter x86e->Emit(op_mov32,ECX,opcd); x86e->Emit(op_call,x86_ptr_imm(&arm_single_op)); } void armv_end(void* codestart, u32 cycles) { //Normal block end //Move counter to EAX for return, pop ESI, ret x86e->Emit(op_sub32,ESI,cycles); x86e->Emit(op_jns,x86_ptr_imm(arm_dispatch)); x86e->Emit(op_jmp,x86_ptr_imm(arm_exit)); //Fluch cache, move counter to EAX, pop, ret //this should never happen (triggers a breakpoint on x86) x86e->MarkLabel(end_lbl); x86e->Emit(op_int3); x86e->Emit(op_call,x86_ptr_imm(FlushCache)); x86e->Emit(op_sub32,ESI,cycles); x86e->Emit(op_jmp,x86_ptr_imm(arm_dispatch)); //Generate the code & apply fixups/relocations as needed x86e->Generate(); //Use space from the dynarec buffer icPtr+=x86e->x86_indx; //Delete the x86 emitter ... 
//sanity check: non branch doesn't set pc
void armv_check_pc(u32 pc)
{
	x86e->Emit(op_cmp32,&armNextPC,pc);
	x86_Label* nof=x86e->CreateLabel(false,0);
	x86e->Emit(op_je,nof);
	x86e->Emit(op_int3);
	x86e->MarkLabel(nof);
}

//sanity check: stale cache
void armv_check_cache(u32 opcd, u32 pc)
{
	x86e->Emit(op_cmp32,&CPUReadMemoryQuick(pc),opcd);
	x86_Label* nof=x86e->CreateLabel(false,0);
	x86e->Emit(op_je,nof);
	x86e->Emit(op_int3);
	x86e->MarkLabel(nof);
}

//profiler hook
void armv_prof(OpType opt,u32 op,u32 flags)
{
	if (VOT_Fallback!=opt)
		x86e->Emit(op_add32,&nfb,1);
	else
	{
		if (flags & OP_SETS_PC)
			x86e->Emit(op_add32,&bfb,1);
		else if (flags & OP_MFB)
			x86e->Emit(op_add32,&mfb,1);
		else
			x86e->Emit(op_add32,&ffb,1);
	}
}

naked void DYNACALL arm_compilecode()
{
#if HOST_OS == OS_LINUX
	__asm (
		"call CompileCode	\n\t"
		"mov $0, %%eax		\n\t"
		"jmp arm_dispatch	\n"
		:
	);
#else
	__asm
	{
		call CompileCode;
		mov eax,0;
		jmp arm_dispatch;
	}
#endif
}

naked void DYNACALL arm_mainloop(u32 cycl, void* regs, void* entrypoints)
{
#if HOST_OS == OS_LINUX
	__asm (
		"push %%esi			\n\t"
		"mov %%ecx, %%esi	\n\t"
		"add %0, %%esi		\n\t"
		"mov $0, %%eax		\n\t"
		"jmp arm_dispatch	\n\t"

		"arm_exit_linux:	\n\t"
		"mov %%esi, %0		\n\t"
		"pop %%esi			\n"
		:
		: "m" (reg[CYCL_CNT].I)
	);
#else
	__asm
	{
		push esi
		mov esi,ecx
		add esi,reg[CYCL_CNT*4].I
		mov eax,0;
		jmp arm_dispatch
	}
#endif
}

naked void arm_dispatch()
{
#if HOST_OS == OS_LINUX
	__asm (
		"arm_dispatch:			\n\t"
		"mov %0, %%eax			\n\t"
		"and $0x7FFFFC, %%eax	\n\t"
		"cmp $0, %1				\n\t"
		"jne arm_dofiq			\n\t"
		"jmp *%2(%%eax)			\n"
		:
		: "m" (reg[R15_ARM_NEXT].I), "m" (reg[INTR_PEND].I), "m" (EntryPoints)
	);

	__asm (
		"arm_dofiq:				\n\t"
		"call CPUFiq			\n\t"
		"jmp arm_dispatch		\n"
		:
	);
#else
	__asm
	{
	arm_disp:
		mov eax,reg[R15_ARM_NEXT*4].I
		and eax,0x7FFFFC
		cmp reg[INTR_PEND*4].I,0
		jne arm_dofiq
		jmp [EntryPoints+eax]

	arm_dofiq:
		call CPUFiq
		jmp arm_disp
	}
#endif
}

naked void arm_exit()
{
#if HOST_OS == OS_LINUX
	__asm (
		"jmp arm_exit_linux" :
	);
#else
	__asm
	{
	arm_exit:
		mov reg[CYCL_CNT*4].I,esi
		pop esi
		ret
	}
#endif
}

#elif (HOST_CPU == CPU_ARM)

/*
 *
 *	ARMv7 Compiler
 *
 */

//mprotect and stuff ..
#include <sys/mman.h>

void armEmit32(u32 emit32)
{
	if (icPtr >= (ICache+ICacheSize-1024))
		die("ICache is full, invalidate old entries ..."); //ifdebug

	*(u32*)icPtr = emit32;
	icPtr+=4;
}

#if HOST_OS==OS_DARWIN
#include <libkern/OSCacheControl.h>
extern "C" void armFlushICache(void *code, void *pEnd)
{
	sys_dcache_flush(code, (u8*)pEnd - (u8*)code + 1);
	sys_icache_invalidate(code, (u8*)pEnd - (u8*)code + 1);
}
#else
extern "C" void armFlushICache(void *bgn, void *end)
{
	__clear_cache(bgn, end);
}
#endif

void armv_imm_to_reg(u32 regn, u32 imm)
{
	MOV32(r0,imm);
	StoreReg(r0,regn);
}

void armv_call(void* loc)
{
	CALL((u32)loc);
}

void armv_setup()
{
	//Setup emitter
	//r9: temp for mem ops (PI WB)
	//r8: base
	//Stored on arm_mainloop so no need for push/pop
}

void armv_intpr(u32 opcd)
{
	//Call interpreter
	MOV32(r0,opcd);
	CALL((u32)arm_single_op);
}

void armv_end(void* codestart, u32 cycl)
{
	//Normal block end
	//cycle counter rv

	//pop registers & return
	if (is_i8r4(cycl))
		SUB(r5,r5,cycl,true);
	else
	{
		u32 togo = cycl;
		while(ARMImmid8r4_enc(togo) == -1)
		{
			SUB(r5,r5,256);
			togo -= 256;
		}
		SUB(r5,r5,togo,true);
	}
	JUMP((u32)&arm_exit,CC_MI);	//statically predicted as not taken
	JUMP((u32)&arm_dispatch);

	armFlushICache(codestart,(void*)EMIT_GET_PTR());
}

//Hook cus varm misses this, so x86 needs special code
void armv_MOV32(eReg regn, u32 imm)
{
	MOV32(regn,imm);
}

/*
	No sanity checks on arm ..
*/

#endif // HOST_CPU == CPU_ARM
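/*
	Note on dispatch (illustrative): EntryPoints has one slot per 4-byte guest
	word, and the x86 dispatcher above masks the PC with 0x7FFFFC and uses it
	directly as a byte offset into the table; that works because each 4-byte
	guest word maps to exactly one 4-byte pointer slot.
*/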
//Run a timeslice for ARMREC
//CycleCount is pretty much fixed to (512*32) for now (might change to a diff constant, but will be constant)
void arm_Run(u32 CycleCount)
{
	if (!Arm7Enabled)
		return;

	for (int i=0;i<32;i++)
	{
		arm_mainloop(CycleCount/32, arm_Reg, EntryPoints);
		libAICA_TimeStep();
	}

	/*
	s32 clktks=reg[CYCL_CNT].I+CycleCount;

	//While we have time to spend
	do
	{
		//Check for interrupts
		if (reg[INTR_PEND].I)
		{
			CPUFiq();
		}

		//lookup code at armNextPC, run a block & remove its cycles from the timeslice
		clktks-=EntryPoints[(armNextPC & ARAM_MASK)/4]();

#if HOST_CPU==CPU_X86
		verify(armNextPC<=ARAM_MASK);
#endif
	} while(clktks>0);

	reg[CYCL_CNT].I=clktks;
	*/
}

#undef r

/*
	TODO: R15 read/writing is kind of .. weird
	Gotta investigate why ..
*/

//Mem operand 2 calculation, if Reg or large imm
void MemOperand2(eReg dst,bool I, bool U,u32 offs, u32 opcd)
{
	if (I==true)
	{
		u32 Rm=(opcd>>0)&15;
		verify(CHK_BTS(7,4,0)); //only SHL mode
		LoadReg(r1,Rm);
		u32 SA=31&(opcd>>7);
		//can't do shifted add for now -- EMITTER LIMIT --
		if (SA)
			armv_lsl(r1, r1, SA);
	}
	else
	{
		armv_MOV32(r1,offs);
	}

	if (U)
		armv_add(dst, r0, r1);
	else
		armv_sub(dst, r0, r1);
}

template<u32 Pd>
void DYNACALL MSR_do(u32 v)
{
#if HOST_CPU==CPU_X86
	v=virt_arm_reg(r0);
#endif
	if (Pd)
	{
		if (armMode > 0x10 && armMode < 0x1f) /* !=0x10 ?*/
		{
			reg[17].I = (reg[17].I & 0x00FFFF00) | (v & 0xFF0000FF);
		}
	}
	else
	{
		CPUUpdateCPSR();

		u32 newValue = reg[16].I;
		if (armMode > 0x10)
		{
			newValue = (newValue & 0xFFFFFF00) | (v & 0x000000FF);
		}

		newValue = (newValue & 0x00FFFFFF) | (v & 0xFF000000);
		newValue |= 0x10;

		if (armMode > 0x10)
		{
			CPUSwitchMode(newValue & 0x1f, false);
		}
		reg[16].I = newValue;
		CPUUpdateFlags();
	}
}
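/*
	Semantics sketch: MSR_do<1> is "MSR SPSR,r" (writes reg[17]); MSR_do<0> is
	"MSR CPSR,r": in user mode (armMode==0x10) only the flag byte
	(v & 0xFF000000) is applied, while privileged modes also take the control
	byte and may switch modes via CPUSwitchMode.
*/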
//Compile & run block of code, starting armNextPC
extern "C" void CompileCode()
{
	//Get the code ptr
	void* rv=EMIT_GET_PTR();

	//update the block table
	EntryPoints[(armNextPC&ARAM_MASK)/4]=rv;

	//setup local pc counter
	u32 pc=armNextPC;

	//emitter/block setup
	armv_setup();

	//the ops counter is used to terminate the block (max op count for a single block is 32 currently)
	//We don't want too long blocks for timing accuracy
	u32 ops=0;

	u32 Cycles=0;

	for(;;)
	{
		ops++;

		//Read opcode ...
		u32 opcd=CPUReadMemoryQuick(pc);

#if HOST_CPU==CPU_X86
		//Sanity check: Stale cache
		armv_check_cache(opcd,pc);
#endif

		u32 op_flags;

		//Decode & handle opcode
		OpType opt=DecodeOpcode(opcd,op_flags);

		switch(opt)
		{
		case VOT_DataOp:
			{
				//data processing opcode that can be virtualised
				RenameRegReset();

				/*
				if (op_flags & OP_READS_PC)
					armv_imm_to_reg(15,pc+8);
				else
				*/
#if HOST_CPU==CPU_X86
				armv_imm_to_reg(15,rand());
#endif

				VirtualizeOpcode(opcd,op_flags,pc);

#if HOST_CPU==CPU_X86
				armv_imm_to_reg(15,rand());
#endif
			}
			break;

		case VOT_BR:
			{
				//Branch to reg
				ConditionCode cc=(ConditionCode)(opcd>>28);

				verify(op_flags&OP_SETS_PC);

				if (cc!=CC_AL)
				{
					LoadFlags();
					armv_imm_to_reg(R15_ARM_NEXT,pc+4);
				}

				LoadReg(r0,opcd&0xF);
#if HOST_CPU==CPU_X86
				x86e->Emit(op_and32, &virt_arm_reg(0), 0xfffffffc);
#else
				armv_bic(r0, r0, 3);
#endif

				void *ref = armv_start_conditional(cc);
				StoreReg(r0,R15_ARM_NEXT,cc);
				armv_end_conditional(ref);
			}
			break;

		case VOT_B:
		case VOT_BL:
			{
				//Branch to imm

				//<<2, sign extend !
				s32 offs=((s32)opcd<<8)>>6;

				if (op_flags & OP_IS_COND)
				{
					armv_imm_to_reg(R15_ARM_NEXT,pc+4);
					LoadFlags();
					ConditionCode cc=(ConditionCode)(opcd>>28);
					void *ref = armv_start_conditional(cc);
					if (opt==VOT_BL)
					{
						armv_MOV32(r0,pc+4);
						StoreReg(r0,14,cc);
					}

					armv_MOV32(r0,pc+8+offs);
					StoreReg(r0,R15_ARM_NEXT,cc);
					armv_end_conditional(ref);
				}
				else
				{
					if (opt==VOT_BL)
						armv_imm_to_reg(14,pc+4);

					armv_imm_to_reg(R15_ARM_NEXT,pc+8+offs);
				}
			}
			break;

		case VOT_Read:
			{
				//LDR/STR
				u32 offs=opcd&4095;
				bool U=opcd&(1<<23);
				bool Pre=opcd&(1<<24);
				bool W=opcd&(1<<21);
				bool I=opcd&(1<<25);

				u32 Rn=(opcd>>16)&15;
				u32 Rd=(opcd>>12)&15;

				bool DoWB=W || (!Pre && Rn!=Rd); //Write back if: W, Post update w/ Rn!=Rd
				bool DoAdd=DoWB || Pre;

				//Register not updated anyway
				if (I==false && offs==0)
				{
					DoWB=false;
					DoAdd=false;
				}

				//verify(Rd!=15);
				verify(!((Rn==15) && DoWB));

				//AGU
				if (Rn!=15)
				{
					LoadReg(r0,Rn);

					if (DoAdd)
					{
						eReg dst=Pre?r0:r9;

						if (I==false && is_i8r4(offs))
						{
							if (U)
								armv_add(dst, r0, offs);
							else
								armv_add(dst, r0, -offs);
						}
						else
						{
							MemOperand2(dst,I,U,offs,opcd);
						}

						if (DoWB && dst==r0)
							armv_mov(r9, r0);
					}
				}
				else
				{
					u32 addr=pc+8;

					if (Pre && offs && I==false)
					{
						addr+=U?offs:-offs;
					}

					armv_MOV32(r0,addr);

					if (Pre && I==true)
					{
						MemOperand2(r1,I,U,offs,opcd);
						armv_add(r0, r0, r1);
					}
				}

				//store: load the write data (Rd) into r1
				if (CHK_BTS(1,20,0))
				{
					if (Rd==15)
					{
						armv_MOV32(r1,pc+12);
					}
					else
					{
						LoadReg(r1,Rd);
					}
				}

				//Call handler
				armv_call(GetMemOp(CHK_BTS(1,20,1),CHK_BTS(1,22,1)));

				if (CHK_BTS(1,20,1))
				{
					if (Rd==15)
					{
						verify(op_flags & OP_SETS_PC);
						StoreReg(r0,R15_ARM_NEXT);
					}
					else
					{
						StoreReg(r0,Rd);
					}
				}

				//Write back from AGU, if any
				if (DoWB)
				{
					StoreReg(r9,Rn);
				}
			}
			break;

		case VOT_MRS:
			{
				u32 Rd=(opcd>>12)&15;

				armv_call((void*)&CPUUpdateCPSR);

				if (opcd & (1<<22))
				{
					LoadReg(r0,17);
				}
				else
				{
					LoadReg(r0,16);
				}

				StoreReg(r0,Rd);
			}
			break;

		case VOT_MSR:
			{
				u32 Rm=(opcd>>0)&15;

				LoadReg(r0,Rm);
				if (opcd & (1<<22))
					armv_call((void*)(void (DYNACALL*)(u32))&MSR_do<1>);
				else
					armv_call((void*)(void (DYNACALL*)(u32))&MSR_do<0>);

				if (op_flags & OP_SETS_PC)
					armv_imm_to_reg(R15_ARM_NEXT,pc+4);
			}
			break;

		/*
		//LDM is disabled for now
		//Common cases of LDM/STM are converted to STR/LDR (tsz==1)
		//Other cases are very uncommon and not worth implementing
		case VOT_LDM:
			{
				//P=0, U=1, S=0, L=1, W=1
				u32 Rn=(opcd>>16)&15;
				u32 RList=opcd&0xFFFF;
				u32 tsz=(cpuBitsSet[RList & 255] + cpuBitsSet[(RList >> 8) & 255]);

				verify(CHK_BTS(1,24,0)); //P=0
				verify(CHK_BTS(1,23,1)); //U=1
				verify(CHK_BTS(1,22,0)); //S=0
				verify(CHK_BTS(1,21,1)); //W=1
				verify(CHK_BTS(1,20,1)); //L=1

				//if (tsz!=1)
				//	goto FALLBACK;

				bool _W=true; //w=1
				if (RList & (1<<Rn))
					_W=false; //base within the list: skip writeback

				LoadReg(r0,Rn);
				armv_MOV32(r1,RList*2); //mask is *2
				armv_call((void*)&DoLDM<0>);

				if (_W)
				{
					StoreReg(r9,Rn);
				}
			}
			break;
		*/

		case VOT_Fallback:
			{
				//interpreter fallback

				//arm_single_op needs r15 set up (pc+8)
				//TODO: only write it if needed -> Probably not worth the code, very few fallbacks now...
				armv_imm_to_reg(15,pc+8);

				//For cond branch, MSR
				if (op_flags & OP_SETS_PC)
					armv_imm_to_reg(R15_ARM_NEXT,pc+4);

#if HOST_CPU==CPU_X86
				if (!(op_flags & OP_SETS_PC))
					armv_imm_to_reg(R15_ARM_NEXT,pc+4);
#endif

				armv_intpr(opcd);

#if HOST_CPU==CPU_X86
				if (!(op_flags & OP_SETS_PC))
				{
					//Sanity check: next pc
					armv_check_pc(pc+4);
#if 0
					x86e->Emit(op_mov32,ECX,opcd);
					x86e->Emit(op_call,x86_ptr_imm(PrintOp));
#endif
				}
#endif
			}
			break;

		default:
			die("can't happen\n");
		}

		//Lets say each opcode takes 9 cycles for now ..
		Cycles+=9;

#if HOST_CPU==CPU_X86
		armv_imm_to_reg(15,0xF87641FF);

		armv_prof(opt,opcd,op_flags);
#endif
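		//(the constant written to r15 above appears to be a poison value,
		// like the rand() writes in the VOT_DataOp case: any opcode that
		// wrongly depends on r15 past this point yields an obviously bogus PC)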
		//Branch ?
		if (op_flags & OP_SETS_PC)
		{
			//x86e->Emit(op_call,x86_ptr_imm(DumpRegs)); // great debugging tool
			arm_printf("ARM: %06X: Block End %d\n",pc,ops);

#if HOST_CPU==CPU_X86 && 0
			//Great fallback finder, also spams console
			if (opt==VOT_Fallback)
			{
				x86e->Emit(op_mov32,ECX,opcd);
				x86e->Emit(op_call,x86_ptr_imm(PrintOp));
			}
#endif
			break;
		}

		//block size limit ?
		if (ops>32)
		{
			arm_printf("ARM: %06X: Block split %d\n",pc,ops);

			armv_imm_to_reg(R15_ARM_NEXT,pc+4);
			break;
		}

		//Goto next opcode
		pc+=4;
	}

	armv_end((void*)rv,Cycles);
}

void FlushCache()
{
	icPtr=ICache;
	for (u32 i = 0; i < ARRAY_SIZE(EntryPoints); i++)
		EntryPoints[i] = (void*)&arm_compilecode;
}

#if HOST_CPU == CPU_X86
#if HOST_OS == OS_WINDOWS
#include <windows.h>
#endif

// These have to be declared somewhere or the linker dies
u8* ARM::emit_opt=0;
eReg ARM::reg_addr;
eReg ARM::reg_dst;
s32 ARM::imma;

void armEmit32(u32 emit32)
{
	if (icPtr >= (ICache + ICacheSize - 64*1024))
	{
		die("ICache is full, invalidate old entries ..."); //ifdebug
	}

	x86e->Emit(op_mov32,ECX,emit32);
	x86e->Emit(op_call,x86_ptr_imm(virt_arm_op));
}
#endif // X86

void armt_init()
{
	InitHash();

	//align to next page ..
	ICache = (u8*)(((unat)ARM7_TCB+4095)& ~4095);

#if HOST_OS==OS_DARWIN
	//Can't just mprotect on iOS
	munmap(ICache, ICacheSize);
	ICache = (u8*)mmap(ICache, ICacheSize, PROT_READ | PROT_WRITE | PROT_EXEC,
			MAP_FIXED | MAP_PRIVATE | MAP_ANON, 0, 0);
#endif

#if HOST_OS == OS_WINDOWS
	DWORD old;
	VirtualProtect(ICache,ICacheSize,PAGE_EXECUTE_READWRITE,&old);
#elif HOST_OS == OS_LINUX || HOST_OS == OS_DARWIN
	printf("\n\t ARM7_TCB addr: %p | from: %p | addr here: %p\n", ICache, ARM7_TCB, armt_init);

	if (mprotect(ICache, ICacheSize, PROT_EXEC|PROT_READ|PROT_WRITE))
	{
		perror("\n\tError - Couldn't mprotect ARM7_TCB!");
		verify(false);
	}

#if TARGET_IPHONE
	memset((u8*)mmap(ICache, ICacheSize, PROT_READ | PROT_WRITE | PROT_EXEC,
			MAP_FIXED | MAP_PRIVATE | MAP_ANON, 0, 0),0xFF,ICacheSize);
#else
	memset(ICache,0xFF,ICacheSize);
#endif
#endif

	icPtr=ICache;
}

#endif // FEAT_AREC != DYNAREC_NONE
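/*
	Typical lifecycle (sketch; the exact call sites live outside this file):
		armt_init();     //build the decode tables, make the code cache executable
		FlushCache();    //point every EntryPoints slot at arm_compilecode
		arm_Run(512*32); //per timeslice: dispatch blocks, compiling on demand
*/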