From 3eb27fc1099e5046ecea25754905d265b984873d Mon Sep 17 00:00:00 2001 From: Stefanos Kornilios Mitsis Poiitidis Date: Fri, 25 Apr 2014 12:52:08 +0300 Subject: [PATCH] Compiles and links for linux x86. No worky yet. Woohoo, a new gcc bug! The x86 linux dynarec just got a couple of months further away thanks to gcc being totally broken at some things (namely, fastcalls). And yes, reicast depends on fastcalls for a lot of things, and no, I don't care if you don't like fastcalls. --- core/core.mk | 2 +- core/emitter/x86_emitter.cpp | 18 +- core/emitter/x86_emitter.h | 42 +- core/emitter/x86_op_encoder.h | 5 + core/rec-x86/lin86_asm.S | 148 ++++ core/rec-x86/win86_driver.cpp | 781 +++++++++++++++++ core/rec-x86/win86_il.cpp | 1529 +++++++++++++++++++++++++++++++++ core/rec-x86/win86_ngen.cpp | 128 +++ core/rec-x86/win86_ngen.h | 59 ++ core/types.h | 4 +- shell/lin86/Makefile | 3 +- 11 files changed, 2689 insertions(+), 30 deletions(-) create mode 100644 core/rec-x86/lin86_asm.S create mode 100644 core/rec-x86/win86_driver.cpp create mode 100644 core/rec-x86/win86_il.cpp create mode 100644 core/rec-x86/win86_ngen.cpp create mode 100644 core/rec-x86/win86_ngen.h diff --git a/core/core.mk b/core/core.mk index 4b4e3880f..89a2a0f02 100755 --- a/core/core.mk +++ b/core/core.mk @@ -20,7 +20,7 @@ ifndef NOT_ARM endif ifdef X86_REC - RZDCY_MODULES += rec-x86/ + RZDCY_MODULES += rec-x86/ emitter/ endif ifndef NO_REND diff --git a/core/emitter/x86_emitter.cpp b/core/emitter/x86_emitter.cpp index 66eb9aaf6..5ef9d4650 100644 --- a/core/emitter/x86_emitter.cpp +++ b/core/emitter/x86_emitter.cpp @@ -3,6 +3,7 @@ #pragma warning(disable:4244) #pragma warning(disable:4245) +#include "../types.h" #include "x86_emitter.h" bool IsS8(u32 value) { @@ -183,6 +184,8 @@ x86_block_externs* x86_block::GetExterns() return rv; } + +#if 0 #include "windows.h" /*void x86_block::CopyTo(void* to) { @@ -194,6 +197,8 @@ x86_block_externs* x86_block::GetExterns() } */ +#endif + //wut ? 
void x86_block::ApplyPatches(u8* base) { @@ -409,7 +414,10 @@ void x86_block::Emit(x86_opcode_class op,x86_reg reg1,x86_reg reg2,u32 imm) //reg,mrm,imm, reg1 is written void x86_block::Emit(x86_opcode_class op,x86_reg reg,x86_ptr mem,u32 imm) { - ME_op_3_imm(op,reg,c_mrm(mem),imm); + //GCC bitches about using this directly. It doesn't complain for the other uses though + //go figure .... + x86_mrm_t mrm = c_mrm(mem); + ME_op_3_imm(op,reg,mrm,imm); } //reg,mrm,imm, reg1 is written @@ -449,19 +457,19 @@ u8 EncodeDisp(u32 disp,x86_mrm_t* to,u8 flags) verify(false); return 0; } -__declspec(dllexport) x86_mrm_t x86_mrm(x86_reg base) +/*__declspec(dllexport) */x86_mrm_t x86_mrm(x86_reg base) { return x86_mrm(base,NO_REG,sib_scale_1,0); } -__declspec(dllexport) x86_mrm_t x86_mrm(x86_reg base,x86_ptr disp) +/*__declspec(dllexport) */x86_mrm_t x86_mrm(x86_reg base,x86_ptr disp) { return x86_mrm(base,NO_REG,sib_scale_1,disp); } -__declspec(dllexport) x86_mrm_t x86_mrm(x86_reg index,x86_sib_scale scale,x86_ptr disp) +/*__declspec(dllexport) */x86_mrm_t x86_mrm(x86_reg index,x86_sib_scale scale,x86_ptr disp) { return x86_mrm(NO_REG,index,scale,disp); } -__declspec(dllexport) x86_mrm_t x86_mrm(x86_reg base,x86_reg index) +/*__declspec(dllexport) */x86_mrm_t x86_mrm(x86_reg base,x86_reg index) { return x86_mrm(base,index,sib_scale_1,0); } diff --git a/core/emitter/x86_emitter.h b/core/emitter/x86_emitter.h index ae709ab72..ecffcd327 100644 --- a/core/emitter/x86_emitter.h +++ b/core/emitter/x86_emitter.h @@ -303,9 +303,9 @@ public: ~x86_block(); void x86_buffer_ensure(u32 size); - void x86_block::write8(u32 value); - void x86_block::write16(u32 value); - void x86_block::write32(u32 value); + void write8(u32 value); + void write16(u32 value); + void write32(u32 value); //init things void Init(dyna_reallocFP* ral,dyna_finalizeFP* alf); @@ -332,45 +332,45 @@ public: //opcode Emitters //no param - void x86_block::Emit(x86_opcode_class op); + void Emit(x86_opcode_class op); //1 param 
//reg - void x86_block::Emit(x86_opcode_class op,x86_reg reg); + void Emit(x86_opcode_class op,x86_reg reg); //smrm - void x86_block::Emit(x86_opcode_class op,x86_ptr mem); + void Emit(x86_opcode_class op,x86_ptr mem); //mrm - void x86_block::Emit(x86_opcode_class op,x86_mrm_t mrm); + void Emit(x86_opcode_class op,x86_mrm_t mrm); //imm - void x86_block::Emit(x86_opcode_class op,u32 imm); + void Emit(x86_opcode_class op,u32 imm); //ptr_imm - void x86_block::Emit(x86_opcode_class op,x86_ptr_imm disp); + void Emit(x86_opcode_class op,x86_ptr_imm disp); //lbl - void x86_block::Emit(x86_opcode_class op,x86_Label* lbl); + void Emit(x86_opcode_class op,x86_Label* lbl); //2 param //reg,reg, reg1 is written - void x86_block::Emit(x86_opcode_class op,x86_reg reg1,x86_reg reg2); + void Emit(x86_opcode_class op,x86_reg reg1,x86_reg reg2); //reg,smrm, reg is written - void x86_block::Emit(x86_opcode_class op,x86_reg reg,x86_ptr mem); + void Emit(x86_opcode_class op,x86_reg reg,x86_ptr mem); //reg,mrm, reg is written - void x86_block::Emit(x86_opcode_class op,x86_reg reg1,x86_mrm_t mrm); + void Emit(x86_opcode_class op,x86_reg reg1,x86_mrm_t mrm); //reg,imm, reg is written - void x86_block::Emit(x86_opcode_class op,x86_reg reg,u32 imm); + void Emit(x86_opcode_class op,x86_reg reg,u32 imm); //smrm,reg, mem is written - void x86_block::Emit(x86_opcode_class op,x86_ptr mem,x86_reg reg); + void Emit(x86_opcode_class op,x86_ptr mem,x86_reg reg); //smrm,imm, mem is written - void x86_block::Emit(x86_opcode_class op,x86_ptr mem,u32 imm); + void Emit(x86_opcode_class op,x86_ptr mem,u32 imm); //mrm,reg, mrm is written - void x86_block::Emit(x86_opcode_class op,x86_mrm_t mrm,x86_reg reg); + void Emit(x86_opcode_class op,x86_mrm_t mrm,x86_reg reg); //mrm,imm, mrm is written - void x86_block::Emit(x86_opcode_class op,x86_mrm_t mrm,u32 imm); + void Emit(x86_opcode_class op,x86_mrm_t mrm,u32 imm); //3 param //reg,reg,imm, reg1 is written - void x86_block::Emit(x86_opcode_class op,x86_reg 
reg1,x86_reg reg2,u32 imm); + void Emit(x86_opcode_class op,x86_reg reg1,x86_reg reg2,u32 imm); //reg,mrm,imm, reg1 is written - void x86_block::Emit(x86_opcode_class op,x86_reg reg,x86_ptr mem,u32 imm); + void Emit(x86_opcode_class op,x86_reg reg,x86_ptr mem,u32 imm); //reg,mrm,imm, reg1 is written - void x86_block::Emit(x86_opcode_class op,x86_reg reg,x86_mrm_t mrm,u32 imm); + void Emit(x86_opcode_class op,x86_reg reg,x86_mrm_t mrm,u32 imm); }; diff --git a/core/emitter/x86_op_encoder.h b/core/emitter/x86_op_encoder.h index c77a1596b..fd6c03244 100644 --- a/core/emitter/x86_op_encoder.h +++ b/core/emitter/x86_op_encoder.h @@ -58,6 +58,11 @@ } */ +#include "build.h" + +#if BUILD_COMPILER == COMPILER_GCC + #define __fastcall +#endif enum enc_param diff --git a/core/rec-x86/lin86_asm.S b/core/rec-x86/lin86_asm.S new file mode 100644 index 000000000..4918f4313 --- /dev/null +++ b/core/rec-x86/lin86_asm.S @@ -0,0 +1,148 @@ +.section .text +.intel_syntax noprefix + + +.extern rdv_LinkBlock +.globl p_sh4rcb +.globl gas_offs +.globl rdv_FailedToFindBlock +.globl cycle_counter +.globl loop_no_update +.globl intc_sched +.globl bm_GetCode +.globl cycle_counter +.globl UpdateSystem +.globl rdv_DoInterrupts +.globl rdv_BlockCheckFail + +########################################### + +.globl ngen_LinkBlock_Shared_stub +.type ngen_LinkBlock_Shared_stub, @function + +ngen_LinkBlock_Shared_stub: + pop ecx + sub ecx,5 + call rdv_LinkBlock + jmp eax + + + +########################################### + +.globl ngen_LinkBlock_cond_Next_stub +.type ngen_LinkBlock_cond_Next_stub, @function + +ngen_LinkBlock_cond_Next_stub: + mov edx,0 + jmp ngen_LinkBlock_Shared_stub + + + +########################################### + +.globl ngen_LinkBlock_cond_Branch_stub +.type ngen_LinkBlock_cond_Branch_stub, @function + +ngen_LinkBlock_cond_Branch_stub: + mov edx,1 + jmp ngen_LinkBlock_Shared_stub + + +########################################### + +.globl ngen_LinkBlock_Generic_stub +.type 
ngen_LinkBlock_Generic_stub, @function + +# u32 gas_offs=offsetof(Sh4RCB,cntx.jdyn); +ngen_LinkBlock_Generic_stub: + mov edx,p_sh4rcb + add edx,gas_offs + mov edx,[edx] + jmp ngen_LinkBlock_Shared_stub + + + + +########################################### + +.globl ngen_FailedToFindBlock_ +.type ngen_FailedToFindBlock_, @function + +ngen_FailedToFindBlock_: + mov ecx,esi + call rdv_FailedToFindBlock + jmp eax + + + +########################################### +#define SH4_TIMESLICE 448 + +.globl ngen_mainloop +.type ngen_mainloop, @function + +ngen_mainloop: + push esi + push edi + push ebp + push ebx + + mov ecx,0xA0000000 + mov dword ptr cycle_counter, 448 #SH4_TIMESLICE + + mov dword ptr loop_no_update,offset no_update + mov dword ptr intc_sched,offset intc_sched_offs + + mov eax,0 + +# next_pc _MUST_ be on ecx +no_update: + mov esi,ecx + call _Z10bm_GetCodej #bm_GetCode + jmp eax + +intc_sched_offs: + add dword ptr cycle_counter, 448 #SH4_TIMESLICE + call UpdateSystem + cmp eax,0 + jnz do_iter + ret + +do_iter: + pop ecx + call rdv_DoInterrupts + mov ecx,eax +# cmp byte ptr [sh4_int_bCpuRun],0 +# jz cleanup + jmp no_update + +cleanup: + pop ebx + pop ebp + pop edi + pop esi + + ret + + + +########################################### + +.globl ngen_blockcheckfail +.type ngen_blockcheckfail, @function + +ngen_blockcheckfail: + call rdv_BlockCheckFail + jmp eax + + +########################################### + +.globl ngen_blockcheckfail2 +.type ngen_blockcheckfail2, @function + +ngen_blockcheckfail2: + int 3 + call rdv_BlockCheckFail + jmp eax diff --git a/core/rec-x86/win86_driver.cpp b/core/rec-x86/win86_driver.cpp new file mode 100644 index 000000000..4c85b6b24 --- /dev/null +++ b/core/rec-x86/win86_driver.cpp @@ -0,0 +1,781 @@ +#include "types.h" + +#ifndef HOST_NO_REC +#include "win86_ngen.h" + + + +struct DynaRBI: RuntimeBlockInfo +{ + x86_block_externs* reloc_info; + + virtual ~DynaRBI() { if (reloc_info) reloc_info->Free(); } + + virtual u32 Relink(); + 
virtual void Relocate(void* dst) + { + reloc_info->Apply(dst); + } +}; + +x86_block* x86e; + +u32 cycle_counter; + +void* loop_no_update; +void* intc_sched; + +bool sse_1=true; +bool sse_2=true; +bool sse_3=true; +bool ssse_3=true; +bool mmx=true; + +void DetectCpuFeatures() +{ + static bool detected=false; + if (detected) return; + detected=true; + +#if HOST_OS==OS_WINDOWS + __try + { + __asm addps xmm0,xmm0 + } + __except(1) + { + sse_1=false; + } + + __try + { + __asm addpd xmm0,xmm0 + } + __except(1) + { + sse_2=false; + } + + __try + { + __asm addsubpd xmm0,xmm0 + } + __except(1) + { + sse_3=false; + } + + __try + { + __asm phaddw xmm0,xmm0 + } + __except(1) + { + ssse_3=false; + } + + + __try + { + __asm paddd mm0,mm1 + __asm emms; + } + __except(1) + { + mmx=false; + } + #endif +} + + +#define CSC_SIZE 64 +struct csc_et +{ + u32 pc; + void* code; +}; +csc_et csc[CSC_SIZE<32?32:CSC_SIZE]; + + +#define CSC_SHIFT 1 +u32 csc_hash(u32 addr) +{ + return (addr>>CSC_SHIFT)&(CSC_SIZE-1); +} + +u32 csc_mode=0; + +u32 csc_sidx=1; + +x86_reg alloc_regs[]={EBX,EBP,ESI,EDI,NO_REG}; +x86_reg xmm_alloc_regs[]={XMM7,XMM6,XMM5,XMM4,NO_REG}; +f32 ALIGN(16) thaw_regs[4]; + + +void x86_reg_alloc::Preload(u32 reg,x86_reg nreg) +{ + x86e->Emit(op_mov32,nreg,GetRegPtr(reg)); +} +void x86_reg_alloc::Writeback(u32 reg,x86_reg nreg) +{ + x86e->Emit(op_mov32,GetRegPtr(reg),nreg); +} + +void x86_reg_alloc::Preload_FPU(u32 reg,x86_reg nreg) +{ + x86e->Emit(op_movss,nreg,GetRegPtr(reg)); +} +void x86_reg_alloc::Writeback_FPU(u32 reg,x86_reg nreg) +{ + x86e->Emit(op_movss,GetRegPtr(reg),nreg); +} +#ifdef PROF2 +extern u32 flsh; +#endif + +void x86_reg_alloc::FreezeXMM() +{ + x86_reg* fpreg=xmm_alloc_regs; + f32* slpc=thaw_regs; + while(*fpreg!=-1) + { + if (SpanNRegfIntr(current_opid,*fpreg)) + x86e->Emit(op_movss,slpc++,*fpreg); + fpreg++; + } +#ifdef PROF2 + x86e->Emit(op_add32,&flsh,1); +#endif +} + +void x86_reg_alloc::ThawXMM() +{ + x86_reg* fpreg=xmm_alloc_regs; + f32* 
slpc=thaw_regs; + while(*fpreg!=-1) + { + if (SpanNRegfIntr(current_opid,*fpreg)) + x86e->Emit(op_movss,*fpreg,slpc++); + fpreg++; + } +} + + +x86_reg_alloc reg; + +u32 ret_hit,ret_all,ret_stc; + +void csc_push(RuntimeBlockInfo* block) +{ + if (csc_mode==0) + { + x86e->Emit(op_mov32,&csc[csc_hash(block->NextBlock)].pc,block->NextBlock); + } + else if (csc_mode==1) + { + //x86e->Emit(op_int3); + x86e->Emit(op_ror32,&csc_sidx,1); + x86e->Emit(op_bsr32,EAX,&csc_sidx); + x86e->Emit(op_mov32,x86_mrm(EAX,sib_scale_8,x86_ptr(csc)),block->NextBlock); + } +} + +void DYNACALL csc_fail(u32 addr,u32 addy) +{ + if (csc_mode==0) + { + //too bad ? + } + else if (csc_mode==1) + { + u32 fail_idx=(csc_sidx>>1)|(csc_sidx<<31); + + printf("Ret Mismatch: %08X instead of %08X!\n",addr,addy); + } +} +void csc_pop(RuntimeBlockInfo* block) +{ + x86_Label* end=x86e->CreateLabel(false,8); + x86_Label* try_dyn=x86e->CreateLabel(false,8); + + //static guess + x86_Label* stc_hit=x86e->CreateLabel(false,8); + x86e->Emit(op_cmp32,ECX,&block->csc_RetCache); + x86e->Emit(op_je,stc_hit); + //if !eq + { + //if (cached) goto dyn + x86e->Emit(op_cmp32,&block->csc_RetCache,-1); + x86e->Emit(op_jne,try_dyn); + //else, do cache + x86e->Emit(op_mov32,&block->csc_RetCache,ECX); + } + + x86e->MarkLabel(stc_hit); + x86e->Emit(op_add32,&ret_stc,1); + if (csc_mode==1) + x86e->Emit(op_rol32,&csc_sidx,1); + x86e->Emit(op_jmp,end); + + x86e->MarkLabel(try_dyn); + + if (csc_mode==0) + { + //csc ! 
+ //x86e->Emit(op_int3); + x86e->Emit(op_mov32,ECX,GetRegPtr(reg_pc_dyn)); + x86e->Emit(op_mov32,EAX,ECX); + x86e->Emit(op_shr32,EAX,CSC_SHIFT); + x86e->Emit(op_and32,EAX,CSC_SIZE-1); + x86e->Emit(op_cmp32,x86_mrm(EAX,sib_scale_8,x86_ptr(csc)),ECX); + } + else if (csc_mode==1) + { + //x86e->Emit(op_int3); + x86e->Emit(op_mov32,ECX,GetRegPtr(reg_pc_dyn)); + x86e->Emit(op_bsr32,EAX,&csc_sidx); + x86e->Emit(op_rol32,&csc_sidx,1); + x86e->Emit(op_mov32,EDX,x86_mrm(EAX,sib_scale_8,x86_ptr(csc))); + x86e->Emit(op_cmp32,EDX,ECX); + } + + + x86e->Emit(op_jne,end); + x86e->Emit(op_add32,&ret_hit,1); + //x86e->Emit(op_jmp,end); + + x86e->MarkLabel(end); + x86e->Emit(op_add32,&ret_all,1); + +} + +void DYNACALL PrintBlock(u32 pc) +{ + printf("block: 0x%08X\n",pc); + for (int i=0;i<16;i++) + printf("%08X ",r[i]); + printf("\n"); +} + +u32* GetRegPtr(u32 reg) +{ + return Sh4_int_GetRegisterPtr((Sh4RegType)reg); +} + +u32 cvld; +u32 rdmt[6]; +extern u32 memops_t,memops_l; + +void CheckBlock(RuntimeBlockInfo* block,x86_ptr_imm place) +{ + s32 sz=block->sh4_code_size; + u32 sa=block->addr; + while(sz>0) + { + void* ptr=(void*)GetMemPtr(sa,4); + if (ptr) + { + if (sz==2) + x86e->Emit(op_cmp16,ptr,*(u16*)ptr); + else + x86e->Emit(op_cmp32,ptr,*(u32*)ptr); + x86e->Emit(op_jne,place); + } + sz-=4; + sa+=4; + } + +} +void ngen_Compile(RuntimeBlockInfo* block,bool force_checks, bool reset, bool staging,bool optimise) +{ + //initialise stuff + DetectCpuFeatures(); + + ((DynaRBI*)block)->reloc_info=0; + + + //Setup emitter + x86e = new x86_block(); + x86e->Init(0,0); + x86e->x86_buff=(u8*)emit_GetCCPtr(); + x86e->x86_size=emit_FreeSpace(); + x86e->do_realloc=false; + + block->code=(DynarecCodeEntry*)emit_GetCCPtr(); + + x86e->Emit(op_add32,&memops_t,block->memops); + x86e->Emit(op_add32,&memops_l,block->linkedmemops); + + //run register allocator + reg.DoAlloc(block,alloc_regs,xmm_alloc_regs); + + //block header// + + //block invl. 
checks + x86e->Emit(op_mov32,ECX,block->addr); + + CheckBlock(block,force_checks?x86_ptr_imm(ngen_blockcheckfail):x86_ptr_imm(ngen_blockcheckfail2)); + + //Scheduler + x86_Label* no_up=x86e->CreateLabel(false,8); + + x86e->Emit(op_sub32,&cycle_counter,block->guest_cycles); + + x86e->Emit(op_jns,no_up); + { + x86e->Emit(op_call,x86_ptr_imm(intc_sched)); + } + + x86e->MarkLabel(no_up); + + //stating counter + if (staging) x86e->Emit(op_sub32,&block->staging_runs,1); + + //profiler + if (prof.enable || 1) + x86e->Emit(op_add32,&block->runs,1); + + if (prof.enable) + { + if (force_checks) + x86e->Emit(op_add32,&prof.counters.blkrun.force_check,1); + + x86e->Emit(op_add32,&prof.counters.blkrun.cycles[block->guest_cycles],1); + } + + for (size_t i=0;ioplist.size();i++) + { + shil_opcode* op=&block->oplist[i]; + + u32 opcd_start=x86e->opcode_count; + if (prof.enable) + { + x86e->Emit(op_add32,&prof.counters.shil.executed[op->op],1); + } + + op->host_offs=x86e->x86_indx; + + if (prof.enable) + { + set reg_wt; + set reg_rd; + + for (int z=0;op->rd.is_reg() && zrd.count();z++) + reg_wt.insert(op->rd._reg+z); + + for (int z=0;op->rd2.is_reg() && zrd2.count();z++) + reg_wt.insert(op->rd2._reg+z); + + for (int z=0;op->rs1.is_reg() && zrs1.count();z++) + reg_rd.insert(op->rs1._reg+z); + + for (int z=0;op->rs2.is_reg() && zrs2.count();z++) + reg_rd.insert(op->rs2._reg+z); + + for (int z=0;op->rs3.is_reg() && zrs3.count();z++) + reg_rd.insert(op->rs3._reg+z); + + set::iterator iter=reg_wt.begin(); + while( iter != reg_wt.end() ) + { + if (reg_rd.count(*iter)) + { + reg_rd.erase(*iter); + x86e->Emit(op_add32, &prof.counters.ralloc.reg_rw[*iter], 1); + } + else + { + x86e->Emit(op_add32, &prof.counters.ralloc.reg_w[*iter], 1); + } + + ++iter; + } + + iter=reg_rd.begin(); + while( iter != reg_rd.end() ) + { + x86e->Emit(op_add32,&prof.counters.ralloc.reg_r[*iter],1); + ++iter; + } + } + + reg.OpBegin(op,i); + + ngen_opcode(block,op,x86e,staging,optimise); + + if (prof.enable) 
x86e->Emit(op_add32,&prof.counters.shil.host_ops[op->op],x86e->opcode_count-opcd_start); + + reg.OpEnd(op); + } + + block->relink_offset=x86e->x86_indx; + block->relink_data=0; + + x86e->x86_indx+=block->Relink(); + + x86e->Generate(); + block->host_code_size=x86e->x86_indx; + block->host_opcodes=x86e->opcode_count; + + emit_Skip(block->host_code_size); + + delete x86e; + x86e=0; +} + +u32 DynaRBI::Relink() +{ + x86_block* x86e=new x86_block(); + x86e->Init(0,0); + x86e->x86_buff=(u8*)code + relink_offset; + x86e->x86_size=512; + x86e->do_realloc=false; + + if (BlockType==BET_StaticCall || BlockType==BET_DynamicCall) + { + //csc_push(this); + } + + switch(BlockType) + { + case BET_Cond_0: + case BET_Cond_1: + { + x86e->Emit(op_cmp32,GetRegPtr(has_jcond?reg_pc_dyn:reg_sr_T),BlockType&1); + + x86_Label* noBranch=x86e->CreateLabel(0,8); + + x86e->Emit(op_jne,noBranch); + { + //branch block + if (pBranchBlock) + x86e->Emit(op_jmp,x86_ptr_imm(pBranchBlock->code)); + else + x86e->Emit(op_call,x86_ptr_imm(ngen_LinkBlock_cond_Branch_stub)); + } + x86e->MarkLabel(noBranch); + { + //no branch block + if (pNextBlock) + x86e->Emit(op_jmp,x86_ptr_imm(pNextBlock->code)); + else + x86e->Emit(op_call,x86_ptr_imm(ngen_LinkBlock_cond_Next_stub)); + } + } + break; + + + case BET_DynamicRet: + { + //csc_pop(this); + } + case BET_DynamicCall: + case BET_DynamicJump: + { + if (relink_data==0) + { + if (pBranchBlock) + { + x86e->Emit(op_cmp32,GetRegPtr(reg_pc_dyn),pBranchBlock->addr); + x86e->Emit(op_je,x86_ptr_imm(pBranchBlock->code)); + x86e->Emit(op_call,x86_ptr_imm(ngen_LinkBlock_Generic_stub)); + } + else + { + x86e->Emit(op_cmp32,GetRegPtr(reg_pc_dyn),0xFABCDECF); + x86e->Emit(op_call,x86_ptr_imm(ngen_LinkBlock_Generic_stub)); + x86e->Emit(op_je,x86_ptr_imm(ngen_LinkBlock_Generic_stub)); + } + } + else + { + verify(pBranchBlock==0); + x86e->Emit(op_mov32,ECX,GetRegPtr(reg_pc_dyn)); + x86e->Emit(op_jmp,x86_ptr_imm(loop_no_update)); + } + } + break; + + case BET_StaticCall: + case 
BET_StaticJump: + { + if (pBranchBlock) + x86e->Emit(op_jmp,x86_ptr_imm(pBranchBlock->code)); + else + x86e->Emit(op_call,x86_ptr_imm(ngen_LinkBlock_Generic_stub)); + break; + } + + case BET_StaticIntr: + case BET_DynamicIntr: + if (BlockType==BET_StaticIntr) + { + x86e->Emit(op_mov32,&next_pc,NextBlock); + } + else + { + x86e->Emit(op_mov32,EAX,GetRegPtr(reg_pc_dyn)); + x86e->Emit(op_mov32,&next_pc,EAX); + } + x86e->Emit(op_call,x86_ptr_imm(UpdateINTC)); + + x86e->Emit(op_mov32,ECX,&next_pc); + + x86e->Emit(op_jmp,x86_ptr_imm(loop_no_update)); + + break; + } + + + + x86e->Generate(); + return x86e->x86_indx; +} + + +/* + //10 + R S8 B,M + R S16 B,M + R I32 B,M + R F32 B,M + R F32v2 B{,M} + + //13 + W I8 B,M + W I16 B,M + W I32 B,S,M + W F32 B,S,M + W F32v2 B,S{,M} +*/ + +extern u8* virt_ram_base; +#include "hw/sh4/sh4_mmr.h" + +enum mem_op_type +{ + SZ_8, + SZ_16, + SZ_32I, + SZ_32F, + SZ_64F, +}; + +void gen_hande(u32 w, u32 sz, u32 mode) +{ + static const x86_ptr_imm rwm[2][5]= + { + {x86_ptr_imm(&_vmem_ReadMem8SX32),x86_ptr_imm(&_vmem_ReadMem16SX32),x86_ptr_imm(&ReadMem32),x86_ptr_imm(&ReadMem32),x86_ptr_imm(&ReadMem64),}, + {x86_ptr_imm(&WriteMem8),x86_ptr_imm(&WriteMem16),x86_ptr_imm(&WriteMem32),x86_ptr_imm(&WriteMem32),x86_ptr_imm(&WriteMem64),} + }; + + static const x86_opcode_class opcl_i[2][3]= + { + {op_movsx8to32,op_movsx16to32,op_mov32}, + {op_mov8,op_mov16,op_mov32} + }; + + u32 si=x86e->x86_indx; + + if (mode==0) + { + //Buffer + x86e->Emit(op_mov32,EAX,ECX); + x86e->Emit(op_and32,ECX,0x1FFFFFFF); + + x86_mrm_t buff=x86_mrm(ECX,virt_ram_base); + x86_mrm_t buff4=x86_mrm(ECX,virt_ram_base+4); + + if (sz==SZ_8 || sz==SZ_16 || sz==SZ_32I) + { + if (w==0) + x86e->Emit(opcl_i[w][sz],sz==SZ_8?AL:sz==SZ_16?AX:EAX,buff); + else + x86e->Emit(opcl_i[w][sz],buff,sz==SZ_8?DL:sz==SZ_16?DX:EDX); + } + else + { + if (w==0) + { + x86e->Emit(op_movss,XMM0,buff); + if (sz==SZ_64F) + x86e->Emit(op_movss,XMM1,buff4); + } + else + { + x86e->Emit(op_movss,buff,XMM0); + if 
(sz==SZ_64F) + x86e->Emit(op_movss,buff4,XMM1); + } + } + } + else if (mode==1) + { + //SQ + verify(w==1); + x86e->Emit(op_mov32,EAX,ECX); + x86e->Emit(op_and32,ECX,0x3f); + + x86e->Emit(op_shr32,EAX,26); + x86e->Emit(op_cmp32,EAX,0x38); + x86_Label* l=x86e->CreateLabel(false,8); + x86e->Emit(op_je,l); + x86e->Emit(op_int3); + x86e->MarkLabel(l); + + if (sz==SZ_32I) + x86e->Emit(op_mov32,x86_mrm(ECX,sq_both),EDX); + else if (sz==SZ_32F || sz==SZ_64F) + { + x86e->Emit(op_movss,x86_mrm(ECX,sq_both),XMM0); + if (sz==SZ_64F) + x86e->Emit(op_movss,x86_mrm(ECX,sq_both+4),XMM1); + } + else + { + die("Can't happen\n"); + } + } + else + { + //General + + if ((sz==SZ_32F || sz==SZ_64F) && w==1) + { + if (sz==SZ_32F) + { + x86e->Emit(op_movd_xmm_to_r32,EDX,XMM0); + } + else + { + x86e->Emit(op_sub32,ESP,8); + x86e->Emit(op_movss,x86_mrm(ESP,x86_ptr::create(+4)),XMM1); + x86e->Emit(op_movss,x86_mrm(ESP,x86_ptr::create(-0)),XMM0); + } + } + + x86e->Emit(op_call,rwm[w][sz]); + + if ((sz==SZ_32F || sz==SZ_64F) && w==0) + { + x86e->Emit(op_movd_xmm_from_r32,XMM0,EAX); + if (sz==SZ_64F) + { + x86e->Emit(op_movd_xmm_from_r32,XMM1,EDX); + } + } + } + + x86e->Emit(op_ret); + + emit_Skip(x86e->x86_indx-si); +} + +unat mem_code_base=0; +unat mem_code_end=0; +void* mem_code[3][2][5]; + +void ngen_init() +{ + //Setup emitter + x86e = new x86_block(); + x86e->Init(0,0); + x86e->x86_buff=(u8*)emit_GetCCPtr(); + x86e->x86_size=emit_FreeSpace(); + x86e->do_realloc=false; + + + mem_code_base=(unat)emit_GetCCPtr(); + + for (int sz=0;sz<5;sz++) + { + for (int w=0;w<2;w++) + { + for (int m=0;m<3;m++) + { + if (m==1 && (sz<=SZ_16 || w==0)) + continue; + + mem_code[m][w][sz]=emit_GetCCPtr(); + gen_hande(w,sz,m); + } + } + } + + mem_code_end=(unat)emit_GetCCPtr(); + + x86e->Generate(); + + delete x86e; + + emit_SetBaseAddr(); +} + +void ngen_ResetBlocks() +{ +} + +void ngen_GetFeatures(ngen_features* dst) +{ + dst->InterpreterFallback=false; + dst->OnlyDynamicEnds=false; +} + + +RuntimeBlockInfo* 
ngen_AllocateBlock() +{ + return new DynaRBI(); +} + + +bool ngen_Rewrite(unat& addr,unat retadr,unat acc) +{ + if (addr>=mem_code_base && addrInit(0,0); + x86e->x86_buff=(u8*)retadr-5; + x86e->x86_size=emit_FreeSpace(); + x86e->do_realloc=false; + + for (int i=0;i<5;i++) + { + for (int w=0;w<2;w++) + { + if ((u32)mem_code[0][w][i]==ca) + { + //found ! + + if ((acc >> 26) == 0x38) //sq ? + { + verify(w == 1); + x86e->Emit(op_call, x86_ptr_imm(mem_code[1][w][i])); + } + else + { + x86e->Emit(op_call, x86_ptr_imm(mem_code[2][w][i])); + } + + x86e->Generate(); + delete x86e; + + addr=retadr-5; + + //printf("Patched: %08X for access @ %08X\n",addr,acc); + return true; + } + } + } + + die("Failed to match the code :(\n"); + + return false; + } + else + { + return false; + } +} +#endif diff --git a/core/rec-x86/win86_il.cpp b/core/rec-x86/win86_il.cpp new file mode 100644 index 000000000..ec6322d48 --- /dev/null +++ b/core/rec-x86/win86_il.cpp @@ -0,0 +1,1529 @@ +#include "win86_ngen.h" +#include "hw/sh4/sh4_mmr.h" +#include "hw/sh4/sh4_rom.h" + +void ngen_Bin(shil_opcode* op,x86_opcode_class natop,bool has_imm=true,bool has_wb=true) +{ + //x86e->Emit(op_mov32,EAX,op->rs1.reg_ptr()); + + verify(reg.IsAllocg(op->rs1._reg)); + verify(reg.IsAllocg(op->rd._reg)); + + if (has_wb && reg.mapg(op->rs1)!=reg.mapg(op->rd)) + { + x86e->Emit(op_mov32,reg.mapg(op->rd),reg.mapg(op->rs1)); + } + + if (has_imm && op->rs2.is_imm()) + { + x86e->Emit(natop,has_wb?reg.mapg(op->rd):reg.mapg(op->rs1),op->rs2._imm); + } + else if (op->rs2.is_r32i()) + { + verify(reg.IsAllocg(op->rs2._reg)); + + x86e->Emit(natop,has_wb?reg.mapg(op->rd):reg.mapg(op->rs1),reg.mapg(op->rs2)); + } + else + { + printf("%d \n",op->rs1.type); + verify(false); + } +} + +void ngen_fp_bin(shil_opcode* op,x86_opcode_class natop) +{ + verify(reg.IsAllocf(op->rs1)); + verify(reg.IsAllocf(op->rs2)); + verify(reg.IsAllocf(op->rd)); + + if (op->rd._reg!=op->rs1._reg) + x86e->Emit(op_movss,reg.mapf(op->rd),reg.mapf(op->rs1)); + 
+ if (op->rs2.is_r32f()) + { + x86e->Emit(natop,reg.mapf(op->rd),reg.mapf(op->rs2)); + } + else + { + printf("%d \n",op->rs2.type); + verify(false); + } +// verify(has_wb); + //x86e->Emit(op_movss,op->rd.reg_ptr(),XMM0); +} +void ngen_Unary(shil_opcode* op,x86_opcode_class natop) +{ + verify(reg.IsAllocg(op->rs1)); + verify(reg.IsAllocg(op->rd)); + + if (reg.mapg(op->rs1)!=reg.mapg(op->rd)) + x86e->Emit(op_mov32,reg.mapg(op->rd),reg.mapg(op->rs1)); + + x86e->Emit(natop,reg.mapg(op->rd)); +} + +void* _vmem_read_const(u32 addr,bool& ismem,u32 sz); + +u32 ngen_CC_BytesPushed; +void ngen_CC_Start(shil_opcode* op) +{ + ngen_CC_BytesPushed=0; +} +void ngen_CC_Param(shil_opcode* op,shil_param* par,CanonicalParamType tp) +{ + switch(tp) + { + //push the contents + case CPT_u32: + case CPT_f32: + if (par->is_reg()) + { + if (reg.IsAllocg(*par)) + x86e->Emit(op_push32,reg.mapg(*par)); + else if (reg.IsAllocf(*par)) + { + x86e->Emit(op_sub32,ESP,4); + x86e->Emit(op_movss,x86_mrm(ESP), reg.mapf(*par)); + } + else + { + die("Must not happen !\n"); + x86e->Emit(op_push32,x86_ptr(par->reg_ptr())); + } + } + else if (par->is_imm()) + x86e->Emit(op_push,par->_imm); + else + die("invalid combination"); + ngen_CC_BytesPushed+=4; + break; + //push the ptr itself + case CPT_ptr: + verify(par->is_reg()); + + die("FAIL"); + x86e->Emit(op_push,(unat)par->reg_ptr()); + + for (u32 ri=0; ri<(*par).count(); ri++) + { + if (reg.IsAllocf(*par,ri)) + { + x86e->Emit(op_sub32,ESP,4); + x86e->Emit(op_movss,x86_mrm(ESP),reg.mapfv(*par,ri)); + } + else + { + verify(!reg.IsAllocAny((Sh4RegType)(par->_reg+ri))); + } + } + + + ngen_CC_BytesPushed+=4; + break; + + //store from EAX + case CPT_u64rvL: + case CPT_u32rv: + if (reg.IsAllocg(*par)) + x86e->Emit(op_mov32,reg.mapg(*par),EAX); + /*else if (reg.IsAllocf(*par)) + x86e->Emit(op_movd_xmm_from_r32,reg.mapf(*par),EAX);*/ + else + die("Must not happen!\n"); + break; + + case CPT_u64rvH: + if (reg.IsAllocg(*par)) + 
x86e->Emit(op_mov32,reg.mapg(*par),EDX); + else + die("Must not happen!\n"); + break; + + //Store from ST(0) + case CPT_f32rv: + verify(reg.IsAllocf(*par)); + x86e->Emit(op_fstp32f,x86_ptr(par->reg_ptr())); + x86e->Emit(op_movss,reg.mapf(*par),x86_ptr(par->reg_ptr())); + break; + + } +} + +void ngen_CC_Call(shil_opcode*op,void* function) +{ + reg.FreezeXMM(); + x86e->Emit(op_call,x86_ptr_imm(function)); + reg.ThawXMM(); +} +void ngen_CC_Finish(shil_opcode* op) +{ + x86e->Emit(op_add32,ESP,ngen_CC_BytesPushed); +} + +extern u32 vrml_431; +#ifdef PROF2 + +extern u32 srmls,srmlu,srmlc; +extern u32 rmls,rmlu; +extern u32 wmls,wmlu; +extern u32 vrd; +#endif + + +void DYNACALL VERIFYME(u32 addr) +{ + verify((addr>>26)==0x38); +} + +extern u8* virt_ram_base; + +/* + + ReadM + I8 GAI1 [m] + I16 GAI2 [m] + I32 GAI4 [m] + F32 GA4 [m] + F32v2 RA4 [m,m] + F32v4 RA4 [m,m,m,m] + F32v4r3i1 RA4 [m,m,m,1.0] + F32v4r3i0 RA4 [m,m,m,0.0] + + WriteM + I8 GA1 + I16 GA2 + I32 GA4 + F32 GA4 + F32v2 SA + F32v4 + F32v4s3 + F32v4s4 + + + //10 + R S8 B,M + R S16 B,M + R I32 B,M + R F32 B,M + R F32v2 B{,M} + + //13 + W I8 B,M + W I16 B,M + W I32 B,S,M + W F32 B,S,M + W F32v2 B,S{,M} +*/ + +extern void* mem_code[3][2][5]; + +void ngen_opcode(RuntimeBlockInfo* block, shil_opcode* op,x86_block* x86e, bool staging, bool optimise) +{ + switch(op->op) + { + case shop_readm: + { + void* fuct=0; + bool isram=false; + verify(op->rs1.is_imm() || op->rs1.is_r32i()); + + verify(op->rs1.is_imm() || reg.IsAllocg(op->rs1)); + verify(op->rs3.is_null() || op->rs3.is_imm() || reg.IsAllocg(op->rs3)); + + for (u32 i=0;ird.count();i++) + { + verify(reg.IsAllocAny((Sh4RegType)(op->rd._reg+i))); + } + + u32 size=op->flags&0x7f; + + if (op->rs1.is_imm()) + { + if (prof.enable) x86e->Emit(op_add32,&prof.counters.shil.readm_const,1); + void* ptr=_vmem_read_const(op->rs1._imm,isram,size); + if (isram) + { +#ifdef PROF2 + x86e->Emit(op_add32,&srmlu,1); +#endif + if (size==1) + x86e->Emit(op_movsx8to32,EAX,ptr); + else if 
(size==2) + x86e->Emit(op_movsx16to32,EAX,ptr); + else if (size==4) + { + x86e->Emit(op_mov32,EAX,ptr); + //this is a pretty good sieve, but its not perfect. + //whitelisting is much better, but requires side channel data + //Page locking w/ invalidation is another strategy we can try (leads to 'excessive' + //compiling. Maybe a mix of both ?), its what the mainline nulldc uses + if (optimise) + { + if (staging && !is_s8(*(u32*)ptr) && abs((int)op->rs1._imm-(int)block->addr)<=1024) + { + x86_Label* _same=x86e->CreateLabel(false,8); + x86e->Emit(op_cmp32,EAX,*(u32*)ptr); + x86e->Emit(op_je,_same); + x86e->Emit(op_and32,&op->flags,~0x40000000); + x86e->MarkLabel(_same); + + op->flags|=0x40000000; + } + else if (!staging && op->flags & 0x40000000) + { + x86_Label* _same=x86e->CreateLabel(false,8); + x86e->Emit(op_cmp32,EAX,*(u32*)ptr); + x86e->Emit(op_je,_same); + x86e->Emit(op_int3); + x86e->MarkLabel(_same); +#ifdef PROF2 + x86e->Emit(op_add32,&srmlc,1); +#endif + } + } + } + else if (size==8) + { + x86e->Emit(op_mov32,EAX,ptr); + x86e->Emit(op_mov32,EDX,(u8*)ptr+4); + } + else + { + die("Invalid mem read size"); + } + } + else + { +#ifdef PROF2 + x86e->Emit(op_add32,&srmls,1); +#endif + x86e->Emit(op_mov32,ECX,op->rs1._imm); + fuct=ptr; + } + } + else + { + x86e->Emit(op_mov32,ECX,reg.mapg(op->rs1)); + if (op->rs3.is_imm()) + { + x86e->Emit(op_add32,ECX,op->rs3._imm); + if (prof.enable) x86e->Emit(op_add32,&prof.counters.shil.readm_reg_imm,1); + } + else if (op->rs3.is_r32i()) + { + x86e->Emit(op_add32,ECX,reg.mapg(op->rs3)); + if (prof.enable) x86e->Emit(op_add32,&prof.counters.shil.readm_reg_reg,1); + } + else if (!op->rs3.is_null()) + { + die("invalid rs3"); + } + else + if (prof.enable) x86e->Emit(op_add32,&prof.counters.shil.readm_reg,1); +#if 0 + if (op->flags==0x431 || op->flags==0x440) + { + verify(!reg.IsAllocAny(op->rd)); + verify(!reg.IsAllocAny((Sh4RegType)(op->rd._reg+1))); + verify(!reg.IsAllocAny((Sh4RegType)(op->rd._reg+2))); + 
verify(!reg.IsAllocAny((Sh4RegType)(op->rd._reg+3))); + + x86e->Emit(op_add32,&vrml_431,1); + x86e->Emit(op_mov32,EDX,ECX); + x86e->Emit(op_and32,EDX,0x1FFFFFFF); + x86e->Emit(op_movups,XMM0,x86_mrm(EDX,x86_ptr(virt_ram_base))); + x86e->Emit(op_movaps,op->rd.reg_ptr(),XMM0); + + if (op->flags==0x431) + x86e->Emit(op_mov32,op->rd.reg_ptr()+3,0x3f800000); + else if (op->flags==0x430) + x86e->Emit(op_mov32,op->rd.reg_ptr()+3,0); + + break; + } + + bool vect=op->flags&0x80; + + if (vect) + { + u32 sz=size; + //x86e->Emit(op_add32,&cvld,sz/(op->flags&0x100?8:4)); + x86e->Emit(op_add32,&vrml_431,sz/(op->flags&0x100?8:4)*2); + verify(sz==8 || sz==12 || sz==16 || sz==32 || sz==64); + + void** vmap,** funct; + _vmem_get_ptrs(4,false,&vmap,&funct); + x86e->Emit(op_mov32,EAX,ECX); + x86e->Emit(op_shr32,EAX,24); + x86e->Emit(op_mov32,EAX,x86_mrm(EAX,sib_scale_4,vmap)); + + x86e->Emit(op_test32,EAX,~0x7F); + x86e->Emit(op_jz,x86_ptr_imm::create(op->flags)); + x86e->Emit(op_xchg32,ECX,EAX); + x86e->Emit(op_shl32,EAX,ECX); + x86e->Emit(op_shr32,EAX,ECX); + x86e->Emit(op_and32,ECX,~0x7F); + + int i=0; + for (i=0;(i+16)<=sz;i+=16) + { + x86e->Emit(op_movups,XMM0,x86_mrm(EAX,ECX,sib_scale_1,x86_ptr::create(i))); + if (op->rd._reg&3) + x86e->Emit(op_movups,op->rd.reg_ptr()+i/4,XMM0); + else + x86e->Emit(op_movaps,op->rd.reg_ptr()+i/4,XMM0); + } + for (;(i+8)<=sz;i+=8) + { + x86e->Emit(op_movlps,XMM0,x86_mrm(EAX,ECX,sib_scale_1,x86_ptr::create(i))); + x86e->Emit(op_movlps,op->rd.reg_ptr()+i/4,XMM0); + } + for (;(i+4)<=sz;i+=4) + { + x86e->Emit(op_movss,XMM0,x86_mrm(EAX,ECX,sib_scale_1,x86_ptr::create(i))); + x86e->Emit(op_movss,op->rd.reg_ptr()+i/4,XMM0); + } + + verify(i==sz); + + break; + + } + + if (optimise) + { + if (staging || op->flags&0x80000000) + { + + //opt disabled for now + op->flags|=0x80000000; + + x86_Label* _ram=x86e->CreateLabel(false,8); + void** vmap,** funct; + _vmem_get_ptrs(4,false,&vmap,&funct); + x86e->Emit(op_mov32,EAX,ECX); + x86e->Emit(op_shr32,EAX,24); + 
x86e->Emit(op_mov32,EAX,x86_mrm(EAX,sib_scale_4,vmap)); + + x86e->Emit(op_test32,EAX,~0x7F); + x86e->Emit(op_jnz,_ram); + + if (staging) + { + x86e->Emit(op_and32,&op->flags,~0x80000000); + } + else + { + //x86e->Emit(op_int3); + } + + x86e->MarkLabel(_ram); + } + + if ( !staging) + { + if (op->flags & 0x80000000) + { +#ifdef PROF2 + x86e->Emit(op_add32,&rmlu,1); +#endif + if (true) + { + u32 sz=op->flags&0x7f; + if (sz!=8) + { + x86e->Emit(op_mov32,EDX,ECX); + x86e->Emit(op_and32,EDX,0x1FFFFFFF); + if (sz==1) + { + x86e->Emit(op_movsx8to32,EAX,x86_mrm(EDX,x86_ptr(virt_ram_base))); + } + else if (sz==2) + { + x86e->Emit(op_movsx16to32,EAX,x86_mrm(EDX,x86_ptr(virt_ram_base))); + } + else if (sz==4) + { + x86e->Emit(op_mov32,EAX,x86_mrm(EDX,x86_ptr(virt_ram_base))); + } + isram=true; + } + } + + } +#ifdef PROF2 + else + { + x86e->Emit(op_add32,&rmls,1); + } +#endif + } + } +#endif +#if 1 + //new code ... + //yay ... + + int Lsz=0; + int sz=size; + if (sz==2) Lsz=1; + if (sz==4 && op->rd.is_r32i()) Lsz=2; + if (sz==4 && op->rd.is_r32f()) Lsz=3; + if (sz==8) Lsz=4; + + //x86e->Emit(op_int3); + + reg.FreezeXMM(); + x86e->Emit(op_call,x86_ptr_imm(mem_code[0][0][Lsz])); + reg.ThawXMM(); + + if (Lsz <= 2) + { + x86e->Emit(op_mov32, reg.mapg(op->rd), EAX); + } + else + { + x86e->Emit(op_movss, reg.mapfv(op->rd, 0), XMM0); + if (Lsz == 4) + x86e->Emit(op_movss, reg.mapfv(op->rd, 1), XMM1); + } + break; +#endif + } + + if (size<=8) + { + + if (size==8 && optimise) + { + verify(op->rd.count()==2 && reg.IsAllocf(op->rd,0) && reg.IsAllocf(op->rd,1)); + + x86e->Emit(op_mov32,EDX,ECX); + x86e->Emit(op_and32,EDX,0x1FFFFFFF); + x86e->Emit(op_movss,reg.mapfv(op->rd,0),x86_mrm(EDX,x86_ptr(virt_ram_base))); + x86e->Emit(op_movss,reg.mapfv(op->rd,1),x86_mrm(EDX,x86_ptr(4+virt_ram_base))); + break; + } + if (!isram) + { + reg.FreezeXMM(); + switch(size) + { + case 1: + if (!fuct) fuct=ReadMem8; + x86e->Emit(op_call,x86_ptr_imm(fuct)); + x86e->Emit(op_movsx8to32,EAX,EAX); + break; + case 
2: + if (!fuct) fuct=ReadMem16; + x86e->Emit(op_call,x86_ptr_imm(fuct)); + x86e->Emit(op_movsx16to32,EAX,EAX); + break; + case 4: + if (!fuct) fuct=ReadMem32; + x86e->Emit(op_call,x86_ptr_imm(fuct)); + break; + case 8: + if (!fuct) fuct=ReadMem64; + x86e->Emit(op_call,x86_ptr_imm(fuct)); + break; + default: + verify(false); + } + reg.ThawXMM(); + } + + if (size!=8) + { + if (reg.IsAllocg(op->rd)) + x86e->Emit(op_mov32,reg.mapg(op->rd),EAX); + else if (reg.IsAllocf(op->rd)) + x86e->Emit(op_movd_xmm_from_r32,reg.mapf(op->rd),EAX); + else + x86e->Emit(op_mov32,op->rd.reg_ptr(),EAX); + } + else + { + verify(op->rd.count()==2 && reg.IsAllocf(op->rd,0) && reg.IsAllocf(op->rd,1)); + + x86e->Emit(op_movd_xmm_from_r32,reg.mapfv(op->rd,0),EAX); + x86e->Emit(op_movd_xmm_from_r32,reg.mapfv(op->rd,1),EDX); + } + + } + } + break; + + case shop_writem: + { + u32 size=op->flags&0x7f; + verify(reg.IsAllocg(op->rs1) || op->rs1.is_imm()); + + verify(op->rs2.is_r32() || (op->rs2.count()==2 && reg.IsAllocf(op->rs2,0) && reg.IsAllocf(op->rs2,1))); + + if (op->rs1.is_imm() && size<=4) + { + if (prof.enable) x86e->Emit(op_add32,&prof.counters.shil.readm_const,1); + bool isram; + void* ptr=_vmem_read_const(op->rs1._imm,isram,size); + if (isram) + { + if (size<=2) + x86e->Emit(op_mov32,EAX,reg.mapg(op->rs2)); + if (size==1) + x86e->Emit(op_mov8,ptr,EAX); + else if (size==2) + x86e->Emit(op_mov16,ptr,EAX); + else if (size==4) + { + if (op->rs2.is_r32i()) + x86e->Emit(op_mov32,ptr,reg.mapg(op->rs2)); + else + x86e->Emit(op_movss,ptr,reg.mapf(op->rs2)); + } + + else if (size==8) + { + die("A"); + } + else + die("Invalid mem read size"); + + goto done_writem; + } + else + x86e->Emit(op_mov32,ECX,op->rs1._imm); + } + else + { + x86e->Emit(op_mov32,ECX,reg.mapg(op->rs1)); + } + + if (op->rs3.is_imm()) + { + x86e->Emit(op_add32,ECX,op->rs3._imm); + } + else if (op->rs3.is_r32i()) + { + verify(reg.IsAllocg(op->rs3)); + x86e->Emit(op_add32,ECX,reg.mapg(op->rs3)); + } + else if (!op->rs3.is_null()) + 
{ + printf("rs3: %08X\n",op->rs3.type); + die("invalid rs3"); + } + +#if 1 + //new code ... + //yay ... + + int Lsz=0; + int sz=size; + if (sz==2) Lsz=1; + if (sz==4 && op->rs2.is_r32i()) Lsz=2; + if (sz==4 && op->rs2.is_r32f()) Lsz=3; + if (sz==8) Lsz=4; + + //x86e->Emit(op_int3); + //if (Lsz==0) + { + + if (Lsz<=2) + x86e->Emit(op_mov32,EDX,reg.mapg(op->rs2)); + else + { + x86e->Emit(op_movss,XMM0,reg.mapfv(op->rs2,0)); + if (Lsz==4) + x86e->Emit(op_movss,XMM1,reg.mapfv(op->rs2,1)); + } + + reg.FreezeXMM(); + x86e->Emit(op_call,x86_ptr_imm(mem_code[2][1][Lsz])); + reg.ThawXMM(); + + break; + } +#endif + + die("woohoo"); + /* + if (size==8 && optimise) + { + verify(!reg.IsAllocAny(op->rd)); + verify(!reg.IsAllocAny((Sh4RegType)(op->rd._reg+1))); + + x86e->Emit(op_mov32,EDX,ECX); + x86e->Emit(op_and32,EDX,0x1FFFFFFF); + x86e->Emit(op_movlps,XMM0,op->rs2.reg_ptr()); + x86e->Emit(op_movlps,x86_mrm(EDX,x86_ptr(virt_ram_base)),XMM0); + break; + }*/ + + bool vect=op->flags&0x80; + + if (!vect && size<=8) + { + if (size!=8) + { + if (reg.IsAllocg(op->rs2)) + { + x86e->Emit(op_mov32,EDX,reg.mapg(op->rs2)); + } + else if (reg.IsAllocf(op->rs2)) + { + x86e->Emit(op_movd_xmm_to_r32,EDX,reg.mapf(op->rs2)); + } + else + { + die("Must not happen\n"); + } + } + else + { + verify(op->rs2.count()==2 && reg.IsAllocf(op->rs2,0) && reg.IsAllocf(op->rs2,1)); + + x86e->Emit(op_sub32,ESP,8); + //[ESP+4]=rs2[1]//-4 +8= +4 + //[ESP+0]=rs2[0]//-8 +8 = 0 + x86e->Emit(op_movss,x86_mrm(ESP,x86_ptr::create(+4)),reg.mapfv(op->rs2,1)); + x86e->Emit(op_movss,x86_mrm(ESP,x86_ptr::create(-0)),reg.mapfv(op->rs2,0)); + } + + + + if (optimise) + { + if (staging || op->flags&0x80000000) + { + + //opt disabled for now + op->flags|=0x80000000; + x86_Label* _ram=x86e->CreateLabel(false,8); + void** vmap,** funct; + _vmem_get_ptrs(4,false,&vmap,&funct); + x86e->Emit(op_mov32,EAX,ECX); + x86e->Emit(op_shr32,EAX,24); + x86e->Emit(op_mov32,EAX,x86_mrm(EAX,sib_scale_4,vmap)); + + 
x86e->Emit(op_test32,EAX,~0x7F); + x86e->Emit(op_jnz,_ram); + + if (staging) + { + x86e->Emit(op_and32,&op->flags,~0x80000000); + } + else + { + //x86e->Emit(op_int3); + } + + x86e->MarkLabel(_ram); + } + + + if (!staging) + { + if (op->flags & 0x80000000) + { +#ifdef PROF2 + x86e->Emit(op_add32,&wmlu,1); +#endif + if (false && size<4) + { + x86e->Emit(op_mov32,EAX,ECX); + x86e->Emit(op_and32,EAX,0x1FFFFFFF); + + if (size==1) + { + x86e->Emit(op_mov8,x86_mrm(EAX,x86_ptr(virt_ram_base)),EDX); + } + else if (size==2) + { + x86e->Emit(op_mov16,x86_mrm(EAX,x86_ptr(virt_ram_base)),EDX); + } + else if (size==4) + { + x86e->Emit(op_mov32,x86_mrm(EAX,x86_ptr(virt_ram_base)),EAX); + } + break; + } + + } +#ifdef PROF2 + else + x86e->Emit(op_add32,&wmls,1); +#endif + } + } + } + + if (vect) + { + u32 sz=op->flags&0x7f; + x86e->Emit(op_add32,&vrml_431,sz/(op->flags&0x100?8:4)*5); + verify(sz==8 || sz==12 || sz==16 || sz==32 || sz==64); + + void** vmap,** funct; + _vmem_get_ptrs(4,false,&vmap,&funct); + x86e->Emit(op_mov32,EAX,ECX); + x86e->Emit(op_shr32,EAX,24); + x86e->Emit(op_mov32,EAX,x86_mrm(EAX,sib_scale_4,vmap)); + + x86e->Emit(op_test32,EAX,~0x7F); + x86e->Emit(op_jz,x86_ptr_imm::create(op->flags)); + x86e->Emit(op_xchg32,ECX,EAX); + x86e->Emit(op_shl32,EAX,ECX); + x86e->Emit(op_shr32,EAX,ECX); + x86e->Emit(op_and32,ECX,~0x7F); + + u32 i=0; + for (; (i+16)<=sz; i+=16) + { + if (op->rs2._reg&3) + x86e->Emit(op_movups,XMM0,op->rs2.reg_ptr()+i/4); + else + x86e->Emit(op_movaps,XMM0,op->rs2.reg_ptr()+i/4); + + x86e->Emit(op_movups,x86_mrm(EAX,ECX,sib_scale_1,x86_ptr::create(i)),XMM0); + } + for (; (i+8)<=sz; i+=8) + { + x86e->Emit(op_movlps,XMM0,op->rs2.reg_ptr()+i/4); + x86e->Emit(op_movlps,x86_mrm(EAX,ECX,sib_scale_1,x86_ptr::create(i)),XMM0); + } + for (; (i+4)<=sz; i+=4) + { + x86e->Emit(op_movss,XMM0,op->rs2.reg_ptr()+i/4); + x86e->Emit(op_movss,x86_mrm(EAX,ECX,sib_scale_1,x86_ptr::create(i)),XMM0); + } + + verify(i==sz); + } + else + { + + reg.FreezeXMM(); + 
switch(size) + { + case 1: + x86e->Emit(op_call,x86_ptr_imm(&WriteMem8)); + break; + case 2: + x86e->Emit(op_call,x86_ptr_imm(&WriteMem16)); + break; + case 4: + x86e->Emit(op_call,x86_ptr_imm(&WriteMem32)); + break; + case 8: + x86e->Emit(op_call,x86_ptr_imm(&WriteMem64)); + break; + default: + verify(false); + } + reg.ThawXMM(); + } + } + done_writem: + break; + + case shop_ifb: + { + /* + //reg alloc should be flushed here. Add Check + for (int i=0;irs1._imm) + { + x86e->Emit(op_mov32,&next_pc,op->rs2._imm); + } + x86e->Emit(op_mov32,ECX,op->rs3._imm); +#ifdef PROF2 + x86e->Emit(op_add32,&OpDesc[op->rs3._imm]->fallbacks,1); + x86e->Emit(op_adc32,((u8*)&OpDesc[op->rs3._imm]->fallbacks)+4,0); +#endif + x86e->Emit(op_call,x86_ptr_imm(OpDesc[op->rs3._imm]->oph)); + } + break; + + case shop_jdyn: + { + + verify(reg.IsAllocg(op->rs1)); + verify(reg.IsAllocg(op->rd)); + + x86e->Emit(op_mov32,reg.mapg(op->rd),reg.mapg(op->rs1)); + if (op->rs2.is_imm()) + { + x86e->Emit(op_add32,reg.mapg(op->rd),op->rs2._imm); + } + //x86e->Emit(op_mov32,op->rd.reg_ptr(),EAX); + } + break; + + case shop_jcond: + { + verify(block->has_jcond); + verify(reg.IsAllocg(op->rs1)); + verify(reg.IsAllocg(op->rd)); + + x86e->Emit(op_mov32,reg.mapg(op->rd),reg.mapg(op->rs1)); + //x86e->Emit(op_mov32,op->rd.reg_ptr(),EAX); + } + break; + + case shop_mov64: + { + verify(op->rd.is_r64()); + verify(op->rs1.is_r64()); + + verify(reg.IsAllocf(op->rs1,0) && reg.IsAllocf(op->rs1,1)); + verify(reg.IsAllocf(op->rd,0) && reg.IsAllocf(op->rd,1)); + + + x86e->Emit(op_movaps,reg.mapfv(op->rd,0),reg.mapfv(op->rs1,0)); + x86e->Emit(op_movaps,reg.mapfv(op->rd,1),reg.mapfv(op->rs1,1)); + } + break; + + case shop_mov32: + { + verify(op->rd.is_r32()); + + if (op->rs1.is_imm()) + { + if (op->rd.is_r32i()) + { + x86e->Emit(op_mov32,reg.mapg(op->rd),op->rs1._imm); + // x86e->Emit(op_add32,&rdmt[4],1); + } + else + { + //verify(!reg.IsAllocAny(op->rd)); + x86e->Emit(op_mov32,EAX,op->rs1._imm); + 
x86e->Emit(op_movd_xmm_from_r32,reg.mapf(op->rd),EAX); + // x86e->Emit(op_add32,&rdmt[5],1); + } + } + else if (op->rs1.is_r32()) + { + u32 type=0; + + if (reg.IsAllocf(op->rd)) + type|=1; + + if (reg.IsAllocf(op->rs1)) + type|=2; + // x86e->Emit(op_add32,&rdmt[type],1); + switch(type) + { + case 0: //reg=reg + if (reg.mapg(op->rd) != reg.mapg(op->rs1)) + x86e->Emit(op_mov32,reg.mapg(op->rd),reg.mapg(op->rs1)); + + break; + + case 1: //xmm=reg + x86e->Emit(op_movd_xmm_from_r32,reg.mapf(op->rd),reg.mapg(op->rs1)); + break; + + case 2: //reg=xmm + x86e->Emit(op_movd_xmm_to_r32,reg.mapg(op->rd),reg.mapf(op->rs1)); + break; + + case 3: //xmm=xmm + if (reg.mapf(op->rd) != reg.mapf(op->rs1)) + x86e->Emit(op_movss,reg.mapf(op->rd),reg.mapf(op->rs1)); + else + printf("Renamed fmov !\n"); + break; + + } + } + else + { + die("Invalid mov32 size"); + } + + } + break; + +//if CANONICAL_TEST is defined all opcodes use the C-based canonical implementation ! +//#define CANONICAL_TEST 1 +#ifndef CANONICAL_TEST + case shop_and: ngen_Bin(op,op_and32); break; + case shop_or: ngen_Bin(op,op_or32); break; + case shop_xor: ngen_Bin(op,op_xor32); break; + case shop_add: ngen_Bin(op,op_add32); break; + case shop_sub: ngen_Bin(op,op_sub32); break; + case shop_ror: ngen_Bin(op,op_ror32); break; + + case shop_shl: + case shop_shr: + case shop_sar: + { + x86_opcode_class opcd[]={op_shl32,op_shr32,op_sar32}; + ngen_Bin(op,opcd[op->op-shop_shl]); + } + break; + + case shop_rocr: + case shop_rocl: + { + x86e->Emit(op_sar32,reg.mapg(op->rs2),1); + x86e->Emit(op->op==shop_rocr?op_rcr32:op_rcl32,reg.mapg(op->rd),1); + x86e->Emit(op_rcl32,reg.mapg(op->rd2),1); + } + break; + + case shop_test: + case shop_seteq: + case shop_setge: + case shop_setgt: + case shop_setae: + case shop_setab: + { + x86_opcode_class opcls1=op->op==shop_test?op_test32:op_cmp32; + x86_opcode_class opcls2[]={op_setz,op_sete,op_setge,op_setg,op_setae,op_seta }; + ngen_Bin(op,opcls1,true,false); + 
x86e->Emit(opcls2[op->op-shop_test],AL); + x86e->Emit(op_movzx8to32,reg.mapg(op->rd),AL); + } + break; + + case shop_adc: + { + x86e->Emit(op_sar32,reg.mapg(op->rs3),1); + if (reg.mapg(op->rd)!=reg.mapg(op->rs1)) + x86e->Emit(op_mov32,reg.mapg(op->rd),reg.mapg(op->rs1)); + x86e->Emit(op_adc32,reg.mapg(op->rd),reg.mapg(op->rs2)); + x86e->Emit(op_rcl32,reg.mapg(op->rd2),1); + } + break; + + //rd=rs1<rs1)); + verify(op->rs2.is_imm() || reg.IsAllocg(op->rs2)); + verify(reg.IsAllocg(op->rd)); + + x86_opcode_class sl32=op->op==shop_shad?op_sal32:op_shl32; + x86_opcode_class sr32=op->op==shop_shad?op_sar32:op_shr32; + + if (reg.mapg(op->rd)!=reg.mapg(op->rs1)) + x86e->Emit(op_mov32,reg.mapg(op->rd),reg.mapg(op->rs1)); + + if (op->rs2.is_imm()) + { + die("sh*d: no imms please\n"); + } + else + { + x86e->Emit(op_mov32,ECX,reg.mapg(op->rs2)); + + x86_Label* _exit=x86e->CreateLabel(false,8); + x86_Label* _neg=x86e->CreateLabel(false,8); + x86_Label* _nz=x86e->CreateLabel(false,8); + + x86e->Emit(op_cmp32,reg.mapg(op->rs2),0); + x86e->Emit(op_js,_neg); + { + //>=0 + //r[n]<<=sf; + x86e->Emit(sl32,reg.mapg(op->rd),ECX); + x86e->Emit(op_jmp,_exit); + } + x86e->MarkLabel(_neg); + x86e->Emit(op_test32,reg.mapg(op->rs2),0x1f); + x86e->Emit(op_jnz,_nz); + { + //1fh==0 + if (op->op!=shop_shad) + { + //r[n]=0; + x86e->Emit(op_mov32,reg.mapg(op->rd),0); + } + else + { + //r[n]>>=31; + x86e->Emit(op_sar32,reg.mapg(op->rd),31); + } + x86e->Emit(op_jmp,_exit); + } + x86e->MarkLabel(_nz); + { + //<0 + //r[n]>>=(-sf); + x86e->Emit(op_neg32,ECX); + x86e->Emit(sr32,reg.mapg(op->rd),ECX); + } + x86e->MarkLabel(_exit); + } + } + break; + + case shop_swaplb: + { + if (reg.mapg(op->rd)!=reg.mapg(op->rs1)) + x86e->Emit(op_mov32,reg.mapg(op->rd),reg.mapg(op->rs1)); + x86e->Emit(op_ror16,reg.mapg(op->rd),8); + } + break; + + + case shop_neg: ngen_Unary(op,op_neg32); break; + case shop_not: ngen_Unary(op,op_not32); break; + + + case shop_sync_sr: + { + //reg alloc should be flushed here. 
Add Check + for (int i=0;i<8;i++) + { + verify(!reg.IsAllocAny((Sh4RegType)(reg_r0+i))); + verify(!reg.IsAllocAny((Sh4RegType)(reg_r0_Bank+i))); + } + + verify(!reg.IsAllocAny(reg_old_sr_status)); + verify(!reg.IsAllocAny(reg_sr_status)); + + //reg alloc should be flushed here, add checks + x86e->Emit(op_call,x86_ptr_imm(UpdateSR)); + } + break; + + case shop_sync_fpscr: + { + //reg alloc should be flushed here. Add Check + for (int i=0;i<16;i++) + { + verify(!reg.IsAllocAny((Sh4RegType)(reg_fr_0+i))); + verify(!reg.IsAllocAny((Sh4RegType)(reg_xf_0+i))); + } + + verify(!reg.IsAllocAny(reg_old_fpscr)); + verify(!reg.IsAllocAny(reg_fpscr)); + + + //reg alloc should be flushed here, add checks + x86e->Emit(op_call,x86_ptr_imm(UpdateFPSCR)); + } + break; + + + case shop_mul_u16: + case shop_mul_s16: + case shop_mul_i32: + case shop_mul_u64: + case shop_mul_s64: + { + verify(reg.IsAllocg(op->rs1)); + verify(reg.IsAllocg(op->rs2)); + verify(reg.IsAllocg(op->rd)); + + x86_opcode_class opdt[]={op_movzx16to32,op_movsx16to32,op_mov32,op_mov32,op_mov32}; + x86_opcode_class opmt[]={op_mul32,op_mul32,op_mul32,op_mul32,op_imul32}; + //only the top 32 bits are different on signed vs unsigned + + u32 opofs=op->op-shop_mul_u16; + + x86e->Emit(opdt[opofs],EAX,reg.mapg(op->rs1)); + x86e->Emit(opdt[opofs],EDX,reg.mapg(op->rs2)); + + x86e->Emit(opmt[opofs],EDX); + x86e->Emit(op_mov32,reg.mapg(op->rd),EAX); + + if (op->op>=shop_mul_u64) + x86e->Emit(op_mov32,reg.mapg(op->rd2),EDX); + } + break; + + + //fpu + case shop_fadd: + case shop_fsub: + case shop_fmul: + case shop_fdiv: + { + verify(reg.IsAllocf(op->rs1)); + verify(reg.IsAllocf(op->rs2)); + verify(reg.IsAllocf(op->rd)); + + const x86_opcode_class opcds[]= { op_addss, op_subss, op_mulss, op_divss }; + ngen_fp_bin(op,opcds[op->op-shop_fadd]); + } + break; + + case shop_fabs: + { + verify(reg.IsAllocf(op->rs1)); + verify(reg.IsAllocf(op->rd)); + + static ALIGN(16) u32 AND_ABS_MASK[4]={0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF}; + 
+ verify(op->rd._reg==op->rs1._reg); + x86e->Emit(op_pand,reg.mapf(op->rd),AND_ABS_MASK); + } + break; + + case shop_fneg: + { + verify(reg.IsAllocf(op->rs1)); + verify(reg.IsAllocf(op->rd)); + + static ALIGN(16) u32 XOR_NEG_MASK[4]={0x80000000,0x80000000,0x80000000,0x80000000}; + + verify(op->rd._reg==op->rs1._reg); + x86e->Emit(op_pxor,reg.mapf(op->rd),XOR_NEG_MASK); + } + break; + + case shop_fsca: + { + verify(op->rs1.is_r32i()); + + //verify(op->rd.is_vector); //double ? vector(2) ? + + verify(reg.IsAllocg(op->rs1)); + verify(reg.IsAllocf(op->rd,0) && reg.IsAllocf(op->rd,1)); + + //sin/cos + x86e->Emit(op_movzx16to32,EAX,reg.mapg(op->rs1)); + x86e->Emit(op_movss,reg.mapfv(op->rd,0),x86_mrm(EAX,sib_scale_8,x86_ptr(&sin_table->u[0]))); + x86e->Emit(op_movss,reg.mapfv(op->rd,1),x86_mrm(EAX,sib_scale_8,x86_ptr(&sin_table->u[1]))); + } + break; + + case shop_fipr: + { + //rd=rs1*rs2 (vectors) +// verify(!reg.IsAllocAny(op->rs1)); +// verify(!reg.IsAllocAny(op->rs2)); + verify(reg.IsAllocf(op->rd)); + + verify(op->rs1.is_r32fv()==4); + verify(op->rs2.is_r32fv()==4); + verify(op->rd.is_r32()); + + if (sse_3) + { + x86_reg xmm=reg.mapf(op->rd); + + x86e->Emit(op_movaps ,xmm,op->rs1.reg_ptr()); + x86e->Emit(op_mulps ,xmm,op->rs2.reg_ptr()); + //xmm0={a0 ,a1 ,a2 ,a3} + x86e->Emit(op_haddps,xmm,xmm); //xmm0={a0+a1 ,a2+a3 ,a0+a1 ,a2+a3} + x86e->Emit(op_haddps,xmm,xmm); //xmm0={(a0+a1)+(a2+a3) ,(a0+a1)+(a2+a3),(a0+a1)+(a2+a3),(a0+a1)+(a2+a3)} + } + else + { + x86_reg xmm=reg.mapf(op->rd); + + x86e->Emit(op_movaps ,xmm,op->rs1.reg_ptr()); + x86e->Emit(op_mulps ,xmm,op->rs2.reg_ptr()); + x86e->Emit(op_movhlps ,XMM1,xmm); + x86e->Emit(op_addps ,xmm,XMM1); + x86e->Emit(op_movaps ,XMM1,xmm); + x86e->Emit(op_shufps ,XMM1,XMM1,1); + x86e->Emit(op_addss ,xmm,XMM1); + } + } + break; + + case shop_fsqrt: + { + verify(reg.IsAllocf(op->rs1)); + verify(reg.IsAllocf(op->rd)); + + //rd=sqrt(rs1) + x86e->Emit(op_sqrtss ,reg.mapf(op->rd),reg.mapf(op->rs1)); + //x86e->Emit(op_movss 
,op->rd.reg_ptr(),XMM0); + } + break; + + case shop_ftrv: + { +#ifdef PROF2 + x86e->Emit(op_add32,&vrd,16); +#endif + verify(!reg.IsAllocAny(op->rs1)); + verify(!reg.IsAllocAny(op->rs2)); + verify(!reg.IsAllocAny(op->rd)); + + //rd(vector)=rs1(vector)*rs2(matrix) + verify(op->rd.is_r32fv()==4); + verify(op->rs1.is_r32fv()==4); + verify(op->rs2.is_r32fv()==16); + +#if 1 + //load the vector .. + if (sse_2) + { + x86e->Emit(op_movaps ,XMM3,op->rs1.reg_ptr()); //xmm0=vector + x86e->Emit(op_pshufd ,XMM0,XMM3,0); //xmm0={v0} + x86e->Emit(op_pshufd ,XMM1,XMM3,0x55); //xmm1={v1} + x86e->Emit(op_pshufd ,XMM2,XMM3,0xaa); //xmm2={v2} + x86e->Emit(op_pshufd ,XMM3,XMM3,0xff); //xmm3={v3} + } + else + { + x86e->Emit(op_movaps ,XMM0,op->rs1.reg_ptr()); //xmm0=vector + + x86e->Emit(op_movaps ,XMM3,XMM0); //xmm3=vector + x86e->Emit(op_shufps ,XMM0,XMM0,0); //xmm0={v0} + x86e->Emit(op_movaps ,XMM1,XMM3); //xmm1=vector + x86e->Emit(op_movaps ,XMM2,XMM3); //xmm2=vector + x86e->Emit(op_shufps ,XMM3,XMM3,0xff); //xmm3={v3} + x86e->Emit(op_shufps ,XMM1,XMM1,0x55); //xmm1={v1} + x86e->Emit(op_shufps ,XMM2,XMM2,0xaa); //xmm2={v2} + } + + //do the matrix mult ! 
+ x86e->Emit(op_mulps ,XMM0,op->rs2.reg_ptr() + 0); //v0*=vm0 + x86e->Emit(op_mulps ,XMM1,op->rs2.reg_ptr() + 4); //v1*=vm1 + x86e->Emit(op_mulps ,XMM2,op->rs2.reg_ptr() + 8); //v2*=vm2 + x86e->Emit(op_mulps ,XMM3,op->rs2.reg_ptr() + 12); //v3*=vm3 + + x86e->Emit(op_addps ,XMM0,XMM1); //sum it all up + x86e->Emit(op_addps ,XMM2,XMM3); + x86e->Emit(op_addps ,XMM0,XMM2); + + x86e->Emit(op_movaps ,op->rd.reg_ptr(),XMM0); +#else + /* + AABB CCDD + + ABCD * 0 1 2 3 0 1 4 5 + 4 5 6 7 2 3 6 7 + 8 9 a b 8 9 c d + c d e f a b e f + */ + + x86e->Emit(op_movaps ,XMM1,op->rs1.reg_ptr()); //xmm1=vector + + x86e->Emit(op_pshufd ,XMM0,XMM1,0x05); //xmm0={v0,v0,v1,v1} + x86e->Emit(op_pshufd ,XMM1,XMM1,0xaf); //xmm1={v2,v2,v3,v3} + + x86e->Emit(op_movaps,XMM2,XMM0); //xmm2={v0,v0,v1,v1} + x86e->Emit(op_movaps,XMM3,XMM1); //xmm3={v2,v2,v3,v3} + + x86e->Emit(op_mulps ,XMM0,op->rs2.reg_ptr() + 0); //aabb * 0145 + x86e->Emit(op_mulps ,XMM2,op->rs2.reg_ptr() + 4); //aabb * 2367 + x86e->Emit(op_mulps ,XMM1,op->rs2.reg_ptr() + 8); //ccdd * 89cd + x86e->Emit(op_mulps ,XMM3,op->rs2.reg_ptr() + 12); //ccdd * abef + + + x86e->Emit(op_addps ,XMM0,XMM1); //sum it all up + x86e->Emit(op_addps ,XMM2,XMM3); + + //XMM0 -> A0C8 | A1C9 | B4DC | B5DD + verify(sse_3); + + x86e->Emit(op_shufps,XMM0,XMM0,0x27); //A0C8 B4DC A1C9 B5DC + x86e->Emit(op_shufps,XMM2,XMM2,0x27); + + x86e->Emit(op_haddps,XMM0,XMM2); //haddps ={a0+a1 ,a2+a3 ,b0+b1 ,b2+b3} + + + x86e->Emit(op_movaps ,op->rd.reg_ptr(),XMM0); +#endif + } + break; + + case shop_fmac: + { + verify(reg.IsAllocf(op->rs1)); + verify(reg.IsAllocf(op->rs2)); + verify(reg.IsAllocf(op->rs3)); + verify(reg.IsAllocf(op->rd)); + + //rd=rs1+rs2*rs3 + //rd might be rs1,rs2 or rs3, so can't prestore here (iirc, rd==rs1==fr0) + x86e->Emit(op_movss ,XMM0,reg.mapf(op->rs2)); + x86e->Emit(op_mulss ,XMM0,reg.mapf(op->rs3)); + x86e->Emit(op_addss ,XMM0,reg.mapf(op->rs1)); + x86e->Emit(op_movss ,reg.mapf(op->rd),XMM0); + } + break; + + case shop_fsrra: + { + 
verify(reg.IsAllocf(op->rs1)); + verify(reg.IsAllocf(op->rd)); + + //rd=1/sqrt(rs1) + static float one=1.0f; + x86e->Emit(op_sqrtss ,XMM0,reg.mapf(op->rs1)); + x86e->Emit(op_movss ,reg.mapf(op->rd),&one); + x86e->Emit(op_divss ,reg.mapf(op->rd),XMM0); + } + break; + + case shop_fseteq: + case shop_fsetgt: + { + verify(reg.IsAllocf(op->rs1)); + verify(reg.IsAllocf(op->rs2)); + verify(reg.IsAllocg(op->rd)); + + //x86e->Emit(op_movss,XMM0,op->rs1.reg_ptr()); + x86e->Emit(op_ucomiss,reg.mapf(op->rs1),reg.mapf(op->rs2)); + + if (op->op==shop_fseteq) + { + //special case + //We want to take in account the 'unordered' case on the fpu + x86e->Emit(op_lahf); + x86e->Emit(op_test8,AH,0x44); + x86e->Emit(op_setnp,AL); + } + else + { + x86e->Emit(op_seta,AL); + } + + x86e->Emit(op_movzx8to32,reg.mapg(op->rd),AL); + } + break; + + case shop_pref: + { + verify(op->rs1.is_r32i()); + verify(reg.IsAllocg(op->rs1)); + + if (op->flags==0x1337) + { + // + x86e->Emit(op_mov32 ,ECX,reg.mapg(op->rs1)); + x86e->Emit(op_call,x86_ptr_imm(&VERIFYME)); //call do_sqw_mmu + } + + x86e->Emit(op_mov32 ,EDX,reg.mapg(op->rs1)); + x86e->Emit(op_mov32 ,ECX,reg.mapg(op->rs1)); + x86e->Emit(op_shr32 ,EDX,26); + + x86_Label* nosq=x86e->CreateLabel(false,8); + + x86e->Emit(op_cmp32,EDX,0x38); + x86e->Emit(op_jne,nosq); + { + if (CCN_MMUCR.AT) + x86e->Emit(op_call,x86_ptr_imm(&do_sqw_mmu)); //call do_sqw_mmu + else + { + x86e->Emit(op_mov32 ,EDX,(u32)sq_both); + x86e->Emit(op_call32,x86_ptr(&do_sqw_nommu)); //call [do_sqw_nommu] + } + } + x86e->MarkLabel(nosq); + } + break; + + case shop_ext_s8: + case shop_ext_s16: + { + verify(op->rd.is_r32i()); + verify(op->rs1.is_r32i()); + + verify(reg.IsAllocg(op->rd)); + verify(reg.IsAllocg(op->rs1)); + + x86e->Emit(op_mov32,EAX,reg.mapg(op->rs1)); + + if (op->op==shop_ext_s8) + x86e->Emit(op_movsx8to32,reg.mapg(op->rd),EAX); + else + x86e->Emit(op_movsx16to32,reg.mapg(op->rd),EAX); + } + break; + + case shop_cvt_f2i_t: + verify(op->rd.is_r32i()); + 
verify(op->rs1.is_r32f()); + verify(reg.IsAllocg(op->rd)); + verify(reg.IsAllocf(op->rs1)); + + x86e->Emit(op_cvttss2si,reg.mapg(op->rd),reg.mapf(op->rs1)); + break; + + //i hope that the round mode bit is set properly here :p + case shop_cvt_i2f_n: + case shop_cvt_i2f_z: + verify(op->rd.is_r32f()); + verify(op->rs1.is_r32i()); + verify(reg.IsAllocf(op->rd)); + verify(reg.IsAllocg(op->rs1)); + + x86e->Emit(op_cvtsi2ss,reg.mapf(op->rd),reg.mapg(op->rs1)); + //x86e->Emit(op_movss,op->rd.reg_ptr(),XMM0); + break; + + case shop_frswap: + { + verify(op->rd._reg==op->rs2._reg); + verify(op->rd2._reg==op->rs1._reg); + + verify(op->rs1.count()==16 && op->rs2.count()==16); + verify(op->rd2.count()==16 && op->rd.count()==16); +#ifdef PROF2 + x86e->Emit(op_add32,&vrd,32); +#endif + for (int i=0;i<4;i++) + { + x86e->Emit(op_movaps,XMM0,op->rs1.reg_ptr()+i*4); + x86e->Emit(op_movaps,XMM1,op->rs2.reg_ptr()+i*4); + x86e->Emit(op_movaps,op->rd.reg_ptr()+i*4,XMM0); + x86e->Emit(op_movaps,op->rd2.reg_ptr()+i*4,XMM1); + } + } + break; + + case shop_div32s: + case shop_div32u: + { + x86e->Emit(op_mov32,EAX,reg.mapg(op->rs1)); + if (op->op==shop_div32s) + x86e->Emit(op_cdq); + else + x86e->Emit(op_xor32,EDX,EDX); + + x86e->Emit(op->op==shop_div32s?op_idiv32:op_div32,reg.mapg(op->rs2)); + + x86e->Emit(op_mov32,reg.mapg(op->rd),EAX); + x86e->Emit(op_mov32,reg.mapg(op->rd2),EDX); + } + break; + + case shop_div32p2: + { + x86e->Emit(op_xor32,EAX,EAX); + x86e->Emit(op_cmp32,reg.mapg(op->rs3),0); + x86e->Emit(op_cmove32,EAX,reg.mapg(op->rs2)); + if (reg.mapg(op->rd)!=reg.mapg(op->rs1)) + x86e->Emit(op_mov32,reg.mapg(op->rd),reg.mapg(op->rs1)); + + x86e->Emit(op_sub32,reg.mapg(op->rd),EAX); + } + break; + + +#endif + + default: +#if 1 || CANONICAL_TEST + shil_chf[op->op](op); + break; +#endif + + +defaulty: + printf("OH CRAP %d\n",op->op); + verify(false); + } +} \ No newline at end of file diff --git a/core/rec-x86/win86_ngen.cpp b/core/rec-x86/win86_ngen.cpp new file mode 100644 index 
000000000..64986ec51 --- /dev/null +++ b/core/rec-x86/win86_ngen.cpp @@ -0,0 +1,128 @@ +#include "win86_ngen.h" + +#if HOST_OS == OS_WINDOWS + +naked void ngen_LinkBlock_Shared_stub() +{ + __asm + { + pop ecx; + sub ecx,5; + call rdv_LinkBlock; + jmp eax; + } +} + +naked void ngen_LinkBlock_cond_Next_stub() +{ + __asm + { + mov edx,0 + jmp ngen_LinkBlock_Shared_stub; + } +} +naked void ngen_LinkBlock_cond_Branch_stub() +{ + __asm + { + mov edx,1 + jmp ngen_LinkBlock_Shared_stub; + } +} + +const u32 offs=offsetof(Sh4RCB,cntx.jdyn); +naked void ngen_LinkBlock_Generic_stub() +{ + __asm + { + mov edx,[p_sh4rcb]; + add edx,[offs]; + mov edx,[edx]; + jmp ngen_LinkBlock_Shared_stub; + } +} + + + + +naked void ngen_FailedToFindBlock_() +{ + __asm + { + mov ecx,esi; + call rdv_FailedToFindBlock; + jmp eax; + } +} + +void (*ngen_FailedToFindBlock)()=&ngen_FailedToFindBlock_; +naked void ngen_mainloop(void* cntx) +{ + __asm + { + push esi; + push edi; + push ebp; + push ebx; + + mov ecx,0xA0000000; + mov [cycle_counter],SH4_TIMESLICE; + + mov [loop_no_update],offset no_update; + mov [intc_sched],offset intc_sched_offs; + + mov eax,0; + //next_pc _MUST_ be on ecx +no_update: + mov esi,ecx; + call bm_GetCode + jmp eax; + +intc_sched_offs: + add [cycle_counter],SH4_TIMESLICE; + call UpdateSystem; + cmp eax,0; + jnz do_iter; + ret; + +do_iter: + pop ecx; + call rdv_DoInterrupts; + mov ecx,eax; +// cmp byte ptr [sh4_int_bCpuRun],0; + // jz cleanup; + jmp no_update; + +cleanup: + pop ebx; + pop ebp; + pop edi; + pop esi; + + ret; + } +} + + +naked void DYNACALL ngen_blockcheckfail(u32 addr) +{ + __asm + { + call rdv_BlockCheckFail; + jmp eax; + } +} + +naked void DYNACALL ngen_blockcheckfail2(u32 addr) +{ + __asm + { + int 3; + call rdv_BlockCheckFail; + jmp eax; + } +} +#else + u32 gas_offs=offsetof(Sh4RCB,cntx.jdyn); + void (*ngen_FailedToFindBlock)()=&ngen_FailedToFindBlock_; +#endif \ No newline at end of file diff --git a/core/rec-x86/win86_ngen.h b/core/rec-x86/win86_ngen.h 
new file mode 100644 index 000000000..680e9cc5c --- /dev/null +++ b/core/rec-x86/win86_ngen.h @@ -0,0 +1,59 @@ +#include "types.h" + +#include "hw/sh4/sh4_opcode_list.h" +#include "hw/sh4/modules/ccn.h" +#include "hw/sh4/sh4_interrupts.h" + +#include "hw/sh4/sh4_core.h" +#include "hw/sh4/dyna/ngen.h" +#include "hw/sh4/sh4_mem.h" +#include "hw/sh4/dyna/regalloc.h" +#include "emitter/x86_emitter.h" +#include "profiler/profiler.h" +#include "oslib/oslib.h" + +void ngen_opcode(RuntimeBlockInfo* block, shil_opcode* op,x86_block* x86e, bool staging, bool optimise); + +#if BUILD_COMPILER == COMPILER_GCC +extern "C" +{ +#endif + +void ngen_LinkBlock_Generic_stub(); +void ngen_LinkBlock_cond_Next_stub(); +void ngen_LinkBlock_cond_Branch_stub(); +void ngen_FailedToFindBlock_(); +void ngen_mainloop(void* p); + + +void DYNACALL ngen_blockcheckfail(u32 addr); +void DYNACALL ngen_blockcheckfail2(u32 addr); + +#if BUILD_COMPILER == COMPILER_GCC +} +#endif + +extern x86_block* x86e; + +extern u32 cycle_counter; + +extern void* loop_no_update; +extern void* intc_sched; + +extern bool sse_1; +extern bool sse_2; +extern bool sse_3; +extern bool ssse_3; +extern bool mmx; + +struct x86_reg_alloc: RegAlloc +{ + virtual void Preload(u32 reg,x86_reg nreg); + virtual void Writeback(u32 reg,x86_reg nreg); + virtual void Preload_FPU(u32 reg,x86_reg nreg); + virtual void Writeback_FPU(u32 reg,x86_reg nreg); + void FreezeXMM(); + void ThawXMM(); +}; + +extern x86_reg_alloc reg; diff --git a/core/types.h b/core/types.h index 2fe4c8172..2d4c54e0f 100644 --- a/core/types.h +++ b/core/types.h @@ -13,8 +13,10 @@ #if BUILD_COMPILER==COMPILER_VC #define DYNACALL __fastcall +#define DYNACALL_T #else -#define DYNACALL __attribute__((fastcall)) +#define DYNACALL +#define DYNACALL_T __attribute__((fastcall)) #endif #if BUILD_COMPILER==COMPILER_VC diff --git a/shell/lin86/Makefile b/shell/lin86/Makefile index 8e3bb792a..873a2cda8 100644 --- a/shell/lin86/Makefile +++ b/shell/lin86/Makefile @@ -20,7 +20,7 
@@ LD=${CC} MFLAGS := -m32 #-marm -march=armv7-a -mtune=cortex-a9 -mfpu=neon -mfloat-abi=softfp -funroll-loops -ASFLAGS := -m32 +ASFLAGS := -32 #-march=armv7-a -mfpu=neon -mfloat-abi=softfp LDFLAGS := -m32 -g -Wl,-Map,$(notdir $@).map,--gc-sections -Wl,-O3 -Wl,--sort-common @@ -81,7 +81,6 @@ PACKAGE_FILES=$(EXECUTABLE_STRIPPED) default.gcw0.desktop icon-32.png all: $(CPPFILES) $(EXECUTABLE) $(EXECUTABLE_STRIPPED) $(EXECUTABLE): $(OBJECTS) - echo $(RZDCY_FILES) $(CXX) $(MFLAGS) $(EXTRAFLAGS) $(LDFLAGS) $(OBJECTS) $(LIBS) -o $@ $(EXECUTABLE_STRIPPED): $(EXECUTABLE)