// flycast/core/rec-x86/rec_x86_il.cpp
// (C++ source; SH4 SHIL -> x86 instruction-lowering for the x86 dynarec)

#include "types.h"
#if FEAT_SHREC == DYNAREC_JIT && HOST_CPU == CPU_X86
#include "rec_x86_ngen.h"
#include "hw/sh4/sh4_mmr.h"
#include "hw/sh4/sh4_rom.h"
// Emit a two-operand integer binary op: rd = rs1 <natop> rs2.
//   has_imm: natop accepts an immediate second operand.
//   has_wb:  the result is written back to rd. When false (cmp/test-style
//            ops that only set flags) rs1 is used directly as the left operand.
// rs2 may be an allocated register or an immediate; anything else is fatal.
void ngen_Bin(shil_opcode* op,x86_opcode_class natop,bool has_imm=true,bool has_wb=true)
{
	verify(reg.IsAllocg(op->rs1._reg));
	verify(reg.IsAllocg(op->rd._reg));

	x86_reg rs2 = EAX;	// default keeps the compiler's uninitialized-use warning quiet
	if (op->rs2.is_r32i())
	{
		// If rs2 aliases rd, stash rs2 in EAX before rd is overwritten
		// with rs1 below.
		if (has_wb && reg.mapg(op->rs2) == reg.mapg(op->rd))
		{
			x86e->Emit(op_mov32, EAX, reg.mapg(op->rs2));
			rs2 = EAX;
		}
		else
		{
			rs2 = reg.mapg(op->rs2);
		}
	}

	if (has_wb && reg.mapg(op->rs1) != reg.mapg(op->rd))
	{
		x86e->Emit(op_mov32, reg.mapg(op->rd), reg.mapg(op->rs1));
	}

	if (has_imm && op->rs2.is_imm())
	{
		x86e->Emit(natop, has_wb ? reg.mapg(op->rd) : reg.mapg(op->rs1), op->rs2._imm);
	}
	else if (op->rs2.is_r32i())
	{
		x86e->Emit(natop, has_wb ? reg.mapg(op->rd) : reg.mapg(op->rs1), rs2);
	}
	else
	{
		// It is rs2 that failed to match any operand kind, so report its
		// type (previously printed rs1.type, which made the diagnostic
		// useless — cf. the matching path in ngen_fp_bin).
		printf("%d \n",op->rs2.type);
		verify(false);
	}
}
// Emit a two-operand scalar-float binary op: rd = rs1 <natop> rs2.
// rs2 may be an allocated float register or a raw 32-bit immediate
// (re-interpreted as a float); anything else is fatal.
void ngen_fp_bin(shil_opcode* op,x86_opcode_class natop)
{
	verify(reg.IsAllocf(op->rs1));
	verify(reg.IsAllocf(op->rd));

	x86_reg src;
	if (op->rs2.is_r32f())
	{
		// If rs2 aliases rd, copy it aside first so the rd = rs1 move
		// below cannot clobber the second operand.
		if (reg.mapf(op->rs2) == reg.mapf(op->rd))
		{
			x86e->Emit(op_movss, XMM0, reg.mapf(op->rs2));
			src = XMM0;
		}
		else
		{
			src = reg.mapf(op->rs2);
		}
	}
	else if (op->rs2.is_imm())
	{
		// Materialize the float immediate: load the raw bits into EAX,
		// then transfer them into XMM0.
		x86e->Emit(op_mov32, EAX, op->rs2._imm);
		x86e->Emit(op_movd_xmm_from_r32, XMM0, EAX);
		src = XMM0;
	}
	else
	{
		printf("%d \n",op->rs2.type);
		verify(false);
	}

	if (op->rd._reg != op->rs1._reg)
		x86e->Emit(op_movss, reg.mapf(op->rd), reg.mapf(op->rs1));
	x86e->Emit(natop, reg.mapf(op->rd), src);
}
// Emit a single-operand integer op (e.g. neg/not): rd = <natop>(rs1).
void ngen_Unary(shil_opcode* op,x86_opcode_class natop)
{
	verify(reg.IsAllocg(op->rs1));
	verify(reg.IsAllocg(op->rd));

	x86_reg dst = reg.mapg(op->rd);
	x86_reg src = reg.mapg(op->rs1);
	if (dst != src)
		x86e->Emit(op_mov32, dst, src);
	x86e->Emit(natop, dst);
}
void* _vmem_read_const(u32 addr,bool& ismem,u32 sz);
u32 ngen_CC_BytesPushed;
// Begin a canonical-call sequence: reset the counter of bytes pushed on the
// x86 stack, so ngen_CC_Finish can pop them all in one op_add32 to ESP.
void ngen_CC_Start(shil_opcode* op)
{
	ngen_CC_BytesPushed = 0;
}
// Marshal one parameter (or return value) of a canonical-implementation call.
// Input params are pushed on the x86 stack (cdecl-style); return values are
// moved out of EAX/EDX/ST(0) into the allocated register for *par.
// ngen_CC_BytesPushed is bumped by 4 per pushed dword so ngen_CC_Finish can
// restore ESP.
void ngen_CC_Param(shil_opcode* op,shil_param* par,CanonicalParamType tp)
{
switch(tp)
{
//push the contents
case CPT_u32:
case CPT_f32:
if (par->is_reg())
{
if (reg.IsAllocg(*par))
x86e->Emit(op_push32,reg.mapg(*par));
else if (reg.IsAllocf(*par))
{
// No push for XMM: make room on the stack and store the scalar there.
x86e->Emit(op_sub32,ESP,4);
x86e->Emit(op_movss,x86_mrm(ESP), reg.mapf(*par));
}
else
{
// Regalloc is expected to cover every reg param here; the push from
// memory below is dead code kept after the die().
die("Must not happen !\n");
x86e->Emit(op_push32,x86_ptr(par->reg_ptr()));
}
}
else if (par->is_imm())
x86e->Emit(op_push,par->_imm);
else
die("invalid combination");
ngen_CC_BytesPushed+=4;
break;
//push the ptr itself
case CPT_ptr:
verify(par->is_reg());
// Pass the address of the sh4 context field, not its value.
x86e->Emit(op_push,(unat)par->reg_ptr());
/* FIXME wtf is this?
for (u32 ri=0; ri<(*par).count(); ri++)
{
if (reg.IsAllocf(*par,ri))
{
x86e->Emit(op_sub32,ESP,4);
x86e->Emit(op_movss,x86_mrm(ESP),reg.mapfv(*par,ri));
}
else
{
verify(!reg.IsAllocAny((Sh4RegType)(par->_reg+ri)));
}
}
*/
ngen_CC_BytesPushed+=4;
break;
//store from EAX
case CPT_u64rvL:
case CPT_u32rv:
if (reg.IsAllocg(*par))
x86e->Emit(op_mov32,reg.mapg(*par),EAX);
/*else if (reg.IsAllocf(*par))
x86e->Emit(op_movd_xmm_from_r32,reg.mapf(*par),EAX);*/
else
die("Must not happen!\n");
break;
// store the high half of a 64-bit return value, from EDX
case CPT_u64rvH:
if (reg.IsAllocg(*par))
x86e->Emit(op_mov32,reg.mapg(*par),EDX);
else
die("Must not happen!\n");
break;
//Store from ST(0)
case CPT_f32rv:
verify(reg.IsAllocf(*par));
// x87 return value: spill ST(0) through the context slot, then reload
// it into the allocated XMM register.
x86e->Emit(op_fstp32f,x86_ptr(par->reg_ptr()));
x86e->Emit(op_movss,reg.mapf(*par),x86_ptr(par->reg_ptr()));
break;
}
}
// Call the canonical implementation. XMM regs are caller-saved on x86, so
// the register allocator spills/restores them around the call.
void ngen_CC_Call(shil_opcode*op,void* function)
{
reg.FreezeXMM();
x86e->Emit(op_call,x86_ptr_imm(function));
reg.ThawXMM();
}
// End of a canonical call: pop all the parameter bytes pushed by
// ngen_CC_Param in one go (cdecl: caller cleans the stack).
void ngen_CC_Finish(shil_opcode* op)
{
x86e->Emit(op_add32,ESP,ngen_CC_BytesPushed);
}
extern u32 vrml_431;
#ifdef PROF2
extern u32 srmls,srmlu,srmlc;
extern u32 rmls,rmlu;
extern u32 wmls,wmlu;
extern u32 vrd;
#endif
// Debug helper called from generated code (see shop_pref, flags==0x1337):
// asserts the address lies in the store-queue area (top 6 bits == 0x38,
// i.e. 0xE0000000-0xE3FFFFFF).
void DYNACALL VERIFYME(u32 addr)
{
verify((addr>>26)==0x38);
}
/*
ReadM
I8 GAI1 [m]
I16 GAI2 [m]
I32 GAI4 [m]
F32 GA4 [m]
F32v2 RA4 [m,m]
F32v4 RA4 [m,m,m,m]
F32v4r3i1 RA4 [m,m,m,1.0]
F32v4r3i0 RA4 [m,m,m,0.0]
WriteM
I8 GA1
I16 GA2
I32 GA4
F32 GA4
F32v2 SA
F32v4
F32v4s3
F32v4s4
//10
R S8 B,M
R S16 B,M
R I32 B,M
R F32 B,M
R F32v2 B{,M}
//13
W I8 B,M
W I16 B,M
W I32 B,S,M
W F32 B,S,M
W F32v2 B,S{,M}
*/
extern void* mem_code[3][2][5];
// Lower one SHIL opcode to x86 code in x86e.
//   block    - the runtime block being compiled (used for jcond checks etc.)
//   staging  - first-pass/staging compile (affects disabled optimization code)
//   optimise - allow optimized memory-access paths
// Opcodes without a native lowering here fall through to the canonical
// C implementation via shil_chf at the bottom.
void ngen_opcode(RuntimeBlockInfo* block, shil_opcode* op,x86_block* x86e, bool staging, bool optimise)
{
switch(op->op)
{
case shop_readm:
{
void* fuct = 0;
bool isram = false;
verify(op->rs1.is_imm() || op->rs1.is_r32i());
verify(op->rs1.is_imm() || reg.IsAllocg(op->rs1));
verify(op->rs3.is_null() || op->rs3.is_imm() || reg.IsAllocg(op->rs3));
for (u32 i = 0; i < op->rd.count(); i++)
{
verify(reg.IsAllocAny((Sh4RegType)(op->rd._reg + i)));
}
// low 7 bits of flags encode the access size in bytes (1/2/4/8)
u32 size = op->flags & 0x7f;
if (op->rs1.is_imm())
{
// Constant address: try to resolve it to a host pointer at compile time.
if (prof.enable) x86e->Emit(op_add32, &prof.counters.shil.readm_const, 1);
void* ptr = _vmem_read_const(op->rs1._imm, isram, size);
if (isram)
{
#ifdef PROF2
x86e->Emit(op_add32, &srmlu, 1);
#endif
// Direct load from host memory, sign-extending sub-word reads.
if (size == 1)
x86e->Emit(op_movsx8to32, EAX, ptr);
else if (size == 2)
x86e->Emit(op_movsx16to32, EAX, ptr);
else if (size == 4)
{
x86e->Emit(op_mov32, EAX, ptr);
#if 0
//this is a pretty good sieve, but its not perfect.
//whitelisting is much better, but requires side channel data
//Page locking w/ invalidation is another strategy we can try (leads to 'excessive'
//compiling. Maybe a mix of both ?), its what the mainline nulldc uses
if (optimise)
{
// NOTE(review): 0x40000000 appears to mark "value was constant at
// staging time" for this disabled constant-propagation sieve — confirm.
if (staging && !is_s8(*(u32*)ptr) && abs((int)op->rs1._imm-(int)block->addr)<=1024)
{
x86_Label* _same=x86e->CreateLabel(false,8);
x86e->Emit(op_cmp32,EAX,*(u32*)ptr);
x86e->Emit(op_je,_same);
x86e->Emit(op_and32,&op->flags,~0x40000000);
x86e->MarkLabel(_same);
op->flags|=0x40000000;
}
else if (!staging && op->flags & 0x40000000)
{
x86_Label* _same=x86e->CreateLabel(false,8);
x86e->Emit(op_cmp32,EAX,*(u32*)ptr);
x86e->Emit(op_je,_same);
x86e->Emit(op_int3);
x86e->MarkLabel(_same);
#ifdef PROF2
x86e->Emit(op_add32,&srmlc,1);
#endif
}
}
#endif
}
else if (size==8)
{
// 64-bit read from host memory: low half in EAX, high half in EDX.
x86e->Emit(op_mov32,EAX,ptr);
x86e->Emit(op_mov32,EDX,(u8*)ptr+4);
}
else
{
die("Invalid mem read size");
}
}
else
{
// Constant address but MMIO: call the resolved handler with the
// address in ECX (handler pointer returned by _vmem_read_const).
#ifdef PROF2
x86e->Emit(op_add32,&srmls,1);
#endif
x86e->Emit(op_mov32,ECX,op->rs1._imm);
fuct=ptr;
}
}
else
{
// Dynamic address: build it in ECX (rs1 + optional rs3 offset).
x86e->Emit(op_mov32,ECX,reg.mapg(op->rs1));
if (op->rs3.is_imm())
{
x86e->Emit(op_add32,ECX,op->rs3._imm);
if (prof.enable) x86e->Emit(op_add32,&prof.counters.shil.readm_reg_imm,1);
}
else if (op->rs3.is_r32i())
{
x86e->Emit(op_add32,ECX,reg.mapg(op->rs3));
if (prof.enable) x86e->Emit(op_add32,&prof.counters.shil.readm_reg_reg,1);
}
else if (!op->rs3.is_null())
{
die("invalid rs3");
}
else
if (prof.enable) x86e->Emit(op_add32,&prof.counters.shil.readm_reg,1);
#if 1
//new code ...
//yay ...
// Lsz selects one of the specialized mem_code thunks:
// 0:u8 1:u16 2:u32->gpr 3:f32->xmm 4:u64->xmm pair
int Lsz=0;
int sz=size;
if (sz==2) Lsz=1;
if (sz==4 && op->rd.is_r32i()) Lsz=2;
if (sz==4 && op->rd.is_r32f()) Lsz=3;
if (sz==8) Lsz=4;
//x86e->Emit(op_int3);
reg.FreezeXMM();
x86e->Emit(op_call,x86_ptr_imm(mem_code[0][0][Lsz]));
reg.ThawXMM();
// Thunk returns integer results in EAX, float results in XMM0(/XMM1).
if (Lsz <= 2)
{
x86e->Emit(op_mov32, reg.mapg(op->rd), EAX);
}
else
{
x86e->Emit(op_movss, reg.mapfv(op->rd, 0), XMM0);
if (Lsz == 4)
x86e->Emit(op_movss, reg.mapfv(op->rd, 1), XMM1);
}
break;
#endif
}
// Only the constant-address path reaches here (dynamic path breaks above).
if (size<=8)
{
if (size==8 && optimise)
{
die("unreachable");
#ifdef OPTIMIZATION_GRAVEYARD
verify(op->rd.count()==2 && reg.IsAllocf(op->rd,0) && reg.IsAllocf(op->rd,1));
x86e->Emit(op_mov32,EDX,ECX);
x86e->Emit(op_and32,EDX,0x1FFFFFFF);
x86e->Emit(op_movss,reg.mapfv(op->rd,0),x86_mrm(EDX,x86_ptr(virt_ram_base)));
x86e->Emit(op_movss,reg.mapfv(op->rd,1),x86_mrm(EDX,x86_ptr(4+virt_ram_base)));
break;
#endif
}
if (!isram)
{
// MMIO: call the generic (or pre-resolved) read handler; address in ECX.
reg.FreezeXMM();
switch(size)
{
case 1:
if (!fuct) fuct=reinterpret_cast<void*>(ReadMem8);
x86e->Emit(op_call,x86_ptr_imm(fuct));
x86e->Emit(op_movsx8to32,EAX,EAX);
break;
case 2:
if (!fuct) fuct=reinterpret_cast<void*>(ReadMem16);
x86e->Emit(op_call,x86_ptr_imm(fuct));
x86e->Emit(op_movsx16to32,EAX,EAX);
break;
case 4:
if (!fuct) fuct=reinterpret_cast<void*>(ReadMem32);
x86e->Emit(op_call,x86_ptr_imm(fuct));
break;
case 8:
if (!fuct) fuct=reinterpret_cast<void*>(ReadMem64);
x86e->Emit(op_call,x86_ptr_imm(fuct));
break;
default:
verify(false);
}
reg.ThawXMM();
}
// Store the result (EAX, or EAX:EDX for 64-bit) into rd.
if (size!=8)
{
if (reg.IsAllocg(op->rd))
x86e->Emit(op_mov32,reg.mapg(op->rd),EAX);
else if (reg.IsAllocf(op->rd))
x86e->Emit(op_movd_xmm_from_r32,reg.mapf(op->rd),EAX);
else
x86e->Emit(op_mov32,op->rd.reg_ptr(),EAX);
}
else
{
verify(op->rd.count()==2 && reg.IsAllocf(op->rd,0) && reg.IsAllocf(op->rd,1));
x86e->Emit(op_movd_xmm_from_r32,reg.mapfv(op->rd,0),EAX);
x86e->Emit(op_movd_xmm_from_r32,reg.mapfv(op->rd,1),EDX);
}
}
}
break;
case shop_writem:
{
u32 size=op->flags&0x7f;
verify(reg.IsAllocg(op->rs1) || op->rs1.is_imm());
verify(op->rs2.is_imm() || op->rs2.is_r32() || (op->rs2.count()==2 && reg.IsAllocf(op->rs2,0) && reg.IsAllocf(op->rs2,1)));
if (op->rs1.is_imm() && size<=4)
{
// NOTE(review): counter is readm_const on a write path — looks like a
// copy/paste from shop_readm; confirm a writem counter was intended.
if (prof.enable) x86e->Emit(op_add32,&prof.counters.shil.readm_const,1);
bool isram;
void* ptr=_vmem_read_const(op->rs1._imm,isram,size);
if (isram)
{
// Direct store to host memory.
if (op->rs2.is_imm())
x86e->Emit(op_mov32, EAX, op->rs2._imm);
else if (size <= 2)
x86e->Emit(op_mov32, EAX, reg.mapg(op->rs2));
if (size == 1)
x86e->Emit(op_mov8, ptr, EAX);
else if (size == 2)
x86e->Emit(op_mov16, ptr, EAX);
else if (op->rs2.is_imm())
x86e->Emit(op_mov32, ptr, EAX);
else if (size == 4)
{
if (op->rs2.is_r32i())
x86e->Emit(op_mov32, ptr, reg.mapg(op->rs2));
else
x86e->Emit(op_movss, ptr, reg.mapf(op->rs2));
}
else if (size == 8)
{
die("A");
}
else
die("Invalid mem read size");
goto done_writem;
}
else
x86e->Emit(op_mov32,ECX,op->rs1._imm);
}
else
{
x86e->Emit(op_mov32,ECX,reg.mapg(op->rs1));
}
// Optional rs3 address offset (immediate or register).
if (op->rs3.is_imm())
{
x86e->Emit(op_add32,ECX,op->rs3._imm);
}
else if (op->rs3.is_r32i())
{
verify(reg.IsAllocg(op->rs3));
x86e->Emit(op_add32,ECX,reg.mapg(op->rs3));
}
else if (!op->rs3.is_null())
{
printf("rs3: %08X\n",op->rs3.type);
die("invalid rs3");
}
#if 1
//new code ...
//yay ...
// Lsz thunk index, same encoding as the read path:
// 0:u8 1:u16 2:u32-from-gpr/imm 3:f32-from-xmm 4:u64-from-xmm pair
int Lsz = 0;
int sz = size;
if (sz == 2) Lsz = 1;
if (sz == 4 && (op->rs2.is_r32i() || op->rs2.is_imm())) Lsz = 2;
if (sz == 4 && op->rs2.is_r32f()) Lsz = 3;
if (sz == 8) Lsz = 4;
//x86e->Emit(op_int3);
//if (Lsz==0)
{
// Value goes in EDX (integers/imm) or XMM0/XMM1 (floats); address is in ECX.
if (op->rs2.is_imm())
x86e->Emit(op_mov32, EDX, op->rs2._imm);
else if (Lsz <= 2)
x86e->Emit(op_mov32, EDX, reg.mapg(op->rs2));
else
{
x86e->Emit(op_movss, XMM0, reg.mapfv(op->rs2, 0));
if (Lsz == 4)
x86e->Emit(op_movss, XMM1, reg.mapfv(op->rs2, 1));
}
reg.FreezeXMM();
x86e->Emit(op_call, x86_ptr_imm(mem_code[2][1][Lsz]));
reg.ThawXMM();
break;
}
#endif
}
done_writem:
break;
case shop_ifb:
{
// Interpreter fallback: optionally update next_pc, then call the
// opcode handler with the raw SH4 instruction in ECX.
/*
//reg alloc should be flushed here. Add Check
for (int i=0;i<sh4_reg_count;i++)
{
verify(!reg.IsAllocAny((Sh4RegType)i));
}*/
if (op->rs1._imm)
{
x86e->Emit(op_mov32,&next_pc,op->rs2._imm);
}
x86e->Emit(op_mov32,ECX,op->rs3._imm);
#ifdef PROF2
x86e->Emit(op_add32,&OpDesc[op->rs3._imm]->fallbacks,1);
x86e->Emit(op_adc32,((u8*)&OpDesc[op->rs3._imm]->fallbacks)+4,0);
#endif
x86e->Emit(op_call,x86_ptr_imm(OpDesc[op->rs3._imm]->oph));
}
break;
case shop_jdyn:
{
// Dynamic jump target: rd = rs1 (+ optional immediate offset).
verify(reg.IsAllocg(op->rs1));
verify(reg.IsAllocg(op->rd));
x86e->Emit(op_mov32,reg.mapg(op->rd),reg.mapg(op->rs1));
if (op->rs2.is_imm())
{
x86e->Emit(op_add32,reg.mapg(op->rd),op->rs2._imm);
}
//x86e->Emit(op_mov32,op->rd.reg_ptr(),EAX);
}
break;
case shop_jcond:
{
// Conditional-jump condition: just copy rs1 into rd.
verify(block->has_jcond);
verify(reg.IsAllocg(op->rs1));
verify(reg.IsAllocg(op->rd));
x86e->Emit(op_mov32,reg.mapg(op->rd),reg.mapg(op->rs1));
//x86e->Emit(op_mov32,op->rd.reg_ptr(),EAX);
}
break;
case shop_mov64:
{
// 64-bit move as two 32-bit float-register copies.
verify(op->rd.is_r64());
verify(op->rs1.is_r64());
verify(reg.IsAllocf(op->rs1,0) && reg.IsAllocf(op->rs1,1));
verify(reg.IsAllocf(op->rd,0) && reg.IsAllocf(op->rd,1));
x86e->Emit(op_movaps,reg.mapfv(op->rd,0),reg.mapfv(op->rs1,0));
x86e->Emit(op_movaps,reg.mapfv(op->rd,1),reg.mapfv(op->rs1,1));
}
break;
case shop_mov32:
{
verify(op->rd.is_r32());
if (op->rs1.is_imm())
{
if (op->rd.is_r32i())
{
x86e->Emit(op_mov32,reg.mapg(op->rd),op->rs1._imm);
}
else
{
// Float destination: route the immediate through EAX into the XMM reg.
//verify(!reg.IsAllocAny(op->rd));
x86e->Emit(op_mov32,EAX,op->rs1._imm);
x86e->Emit(op_movd_xmm_from_r32,reg.mapf(op->rd),EAX);
}
}
else if (op->rs1.is_r32())
{
// type bitmask: bit0 = rd is xmm, bit1 = rs1 is xmm.
u32 type=0;
if (reg.IsAllocf(op->rd))
type|=1;
if (reg.IsAllocf(op->rs1))
type|=2;
switch(type)
{
case 0: //reg=reg
if (reg.mapg(op->rd) != reg.mapg(op->rs1))
x86e->Emit(op_mov32,reg.mapg(op->rd),reg.mapg(op->rs1));
break;
case 1: //xmm=reg
x86e->Emit(op_movd_xmm_from_r32,reg.mapf(op->rd),reg.mapg(op->rs1));
break;
case 2: //reg=xmm
x86e->Emit(op_movd_xmm_to_r32,reg.mapg(op->rd),reg.mapf(op->rs1));
break;
case 3: //xmm=xmm
if (reg.mapf(op->rd) != reg.mapf(op->rs1))
x86e->Emit(op_movss,reg.mapf(op->rd),reg.mapf(op->rs1));
else
printf("Renamed fmov !\n");
break;
}
}
else
{
die("Invalid mov32 size");
}
}
break;
//if CANONICAL_TEST is defined all opcodes use the C-based canonical implementation !
//#define CANONICAL_TEST 1
#ifndef CANONICAL_TEST
case shop_and: ngen_Bin(op,op_and32); break;
case shop_or: ngen_Bin(op,op_or32); break;
case shop_xor: ngen_Bin(op,op_xor32); break;
case shop_add: ngen_Bin(op,op_add32); break;
case shop_sub: ngen_Bin(op,op_sub32); break;
case shop_ror: ngen_Bin(op,op_ror32); break;
case shop_shl:
case shop_shr:
case shop_sar:
{
x86_opcode_class opcd[]={op_shl32,op_shr32,op_sar32};
ngen_Bin(op,opcd[op->op-shop_shl]);
}
break;
case shop_rocr:
case shop_rocl:
{
// Rotate-through-carry: load the T bit (rs2) into CF via sar, rotate,
// then recover the new carry into rd2 via rcl of a zeroed reg.
x86e->Emit(op_mov32, EAX, reg.mapg(op->rs2));
x86e->Emit(op_sar32, EAX, 1);
if (reg.mapg(op->rd) != reg.mapg(op->rs1))
x86e->Emit(op_mov32, reg.mapg(op->rd), reg.mapg(op->rs1));
x86e->Emit(op->op == shop_rocr ? op_rcr32 : op_rcl32, reg.mapg(op->rd), 1);
x86e->Emit(op_mov32, reg.mapg(op->rd2), 0);
x86e->Emit(op_rcl32, reg.mapg(op->rd2), 1);
}
break;
case shop_test:
case shop_seteq:
case shop_setge:
case shop_setgt:
case shop_setae:
case shop_setab:
{
// Compare (no writeback) then materialize the condition into rd via setcc.
x86_opcode_class opcls1=op->op==shop_test?op_test32:op_cmp32;
x86_opcode_class opcls2[]={op_setz,op_sete,op_setge,op_setg,op_setae,op_seta };
ngen_Bin(op,opcls1,true,false);
x86e->Emit(opcls2[op->op-shop_test],AL);
x86e->Emit(op_movzx8to32,reg.mapg(op->rd),AL);
}
break;
case shop_adc:
{
// rd = rs1 + rs2 + carry(rs3); rd2 = carry out.
// sar of rs3 puts the T bit into CF before the adc.
x86e->Emit(op_mov32, EAX, reg.mapg(op->rs3));
x86e->Emit(op_sar32, EAX, 1);
x86_reg rs1;
if (op->rs1.is_imm())
{
rs1 = ECX;
x86e->Emit(op_mov32, ECX, op->rs1._imm);
}
else
{
rs1 = reg.mapg(op->rs1);
}
x86_reg rs2;
if (op->rs2.is_imm())
{
rs2 = EDX;
x86e->Emit(op_mov32, EDX, op->rs2._imm);
}
else
{
rs2 = reg.mapg(op->rs2);
}
if (reg.mapg(op->rd) != rs1)
{
if (reg.mapg(op->rd) == rs2)
{
// Invert the operands
rs2 = rs1;
}
else
{
x86e->Emit(op_mov32, reg.mapg(op->rd), rs1);
}
}
x86e->Emit(op_adc32, reg.mapg(op->rd), rs2);
x86e->Emit(op_mov32, reg.mapg(op->rd2), 0);
x86e->Emit(op_rcl32, reg.mapg(op->rd2), 1);
}
break;
//rd=rs1<<rs2
case shop_shad:
case shop_shld:
{
// SH4 dynamic shift: positive count shifts left, negative shifts right
// (arithmetic for shad, logical for shld); count==-32 special-cased.
verify(op->rs2.is_imm() || reg.IsAllocg(op->rs2));
verify(reg.IsAllocg(op->rd));
x86_opcode_class sl32 = op->op == shop_shad ? op_sal32 : op_shl32;
x86_opcode_class sr32 = op->op == shop_shad ? op_sar32 : op_shr32;
if (op->rs1.is_imm())
x86e->Emit(op_mov32, reg.mapg(op->rd), op->rs1._imm);
else if (reg.mapg(op->rd) != reg.mapg(op->rs1))
{
verify(reg.mapg(op->rs2) != reg.mapg(op->rd));
x86e->Emit(op_mov32, reg.mapg(op->rd), reg.mapg(op->rs1));
}
if (op->rs2.is_imm())
{
die("sh*d: no imms please\n");
}
else
{
x86e->Emit(op_mov32,ECX,reg.mapg(op->rs2));
x86_Label* _exit=x86e->CreateLabel(false,8);
x86_Label* _neg=x86e->CreateLabel(false,8);
x86_Label* _nz=x86e->CreateLabel(false,8);
x86e->Emit(op_cmp32,reg.mapg(op->rs2),0);
x86e->Emit(op_js,_neg);
{
//>=0
//r[n]<<=sf;
x86e->Emit(sl32,reg.mapg(op->rd),ECX);
x86e->Emit(op_jmp,_exit);
}
x86e->MarkLabel(_neg);
x86e->Emit(op_test32,reg.mapg(op->rs2),0x1f);
x86e->Emit(op_jnz,_nz);
{
//1fh==0
if (op->op!=shop_shad)
{
//r[n]=0;
x86e->Emit(op_mov32,reg.mapg(op->rd),0);
}
else
{
//r[n]>>=31;
x86e->Emit(op_sar32,reg.mapg(op->rd),31);
}
x86e->Emit(op_jmp,_exit);
}
x86e->MarkLabel(_nz);
{
//<0
//r[n]>>=(-sf);
x86e->Emit(op_neg32,ECX);
x86e->Emit(sr32,reg.mapg(op->rd),ECX);
}
x86e->MarkLabel(_exit);
}
}
break;
case shop_swaplb:
{
// Swap the low two bytes of rs1 (16-bit rotate by 8).
if (reg.mapg(op->rd)!=reg.mapg(op->rs1))
x86e->Emit(op_mov32,reg.mapg(op->rd),reg.mapg(op->rs1));
x86e->Emit(op_ror16,reg.mapg(op->rd),8);
}
break;
case shop_neg: ngen_Unary(op,op_neg32); break;
case shop_not: ngen_Unary(op,op_not32); break;
case shop_sync_sr:
{
//reg alloc should be flushed here. Add Check
for (int i=0;i<8;i++)
{
verify(!reg.IsAllocAny((Sh4RegType)(reg_r0+i)));
verify(!reg.IsAllocAny((Sh4RegType)(reg_r0_Bank+i)));
}
verify(!reg.IsAllocAny(reg_old_sr_status));
verify(!reg.IsAllocAny(reg_sr_status));
//reg alloc should be flushed here, add checks
x86e->Emit(op_call,x86_ptr_imm(UpdateSR));
}
break;
case shop_sync_fpscr:
{
//reg alloc should be flushed here. Add Check
for (int i=0;i<16;i++)
{
verify(!reg.IsAllocAny((Sh4RegType)(reg_fr_0+i)));
verify(!reg.IsAllocAny((Sh4RegType)(reg_xf_0+i)));
}
verify(!reg.IsAllocAny(reg_old_fpscr));
verify(!reg.IsAllocAny(reg_fpscr));
//reg alloc should be flushed here, add checks
x86e->Emit(op_call,x86_ptr_imm(UpdateFPSCR));
}
break;
case shop_mul_u16:
case shop_mul_s16:
case shop_mul_i32:
case shop_mul_u64:
case shop_mul_s64:
{
verify(reg.IsAllocg(op->rs1));
verify(reg.IsAllocg(op->rd));
// opdt: how to widen/load each operand; opmt: mul vs imul
// (indexed by opcode offset from shop_mul_u16).
x86_opcode_class opdt[]={op_movzx16to32,op_movsx16to32,op_mov32,op_mov32,op_mov32};
x86_opcode_class opmt[]={op_mul32,op_mul32,op_mul32,op_mul32,op_imul32};
//only the top 32 bits are different on signed vs unsigned
u32 opofs=op->op-shop_mul_u16;
x86e->Emit(opdt[opofs],EAX,reg.mapg(op->rs1));
if (op->rs2.is_reg())
x86e->Emit(opdt[opofs], EDX, reg.mapg(op->rs2));
else
{
verify(op->rs2.is_imm());
if (opofs <= 2)
{
// 16-bit variants must widen through a register; mov imm first.
x86e->Emit(op_mov32, EDX, op->rs2._imm);
x86e->Emit(opdt[opofs], EDX, EDX);
}
else
{
x86e->Emit(opdt[opofs], EDX, op->rs2._imm);
}
}
x86e->Emit(opmt[opofs],EDX);
x86e->Emit(op_mov32,reg.mapg(op->rd),EAX);
if (op->op>=shop_mul_u64)
x86e->Emit(op_mov32,reg.mapg(op->rd2),EDX);
}
break;
//fpu
case shop_fadd:
case shop_fsub:
case shop_fmul:
case shop_fdiv:
{
const x86_opcode_class opcds[]= { op_addss, op_subss, op_mulss, op_divss };
ngen_fp_bin(op,opcds[op->op-shop_fadd]);
}
break;
case shop_fabs:
{
// fabs = clear the sign bit with a pand mask.
verify(reg.IsAllocf(op->rs1));
verify(reg.IsAllocf(op->rd));
static DECL_ALIGN(16) u32 AND_ABS_MASK[4] = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF };
if (op->rd._reg != op->rs1._reg)
x86e->Emit(op_movss, reg.mapf(op->rd), reg.mapf(op->rs1));
x86e->Emit(op_pand, reg.mapf(op->rd), AND_ABS_MASK);
}
break;
case shop_fneg:
{
// fneg = flip the sign bit with a pxor mask.
verify(reg.IsAllocf(op->rs1));
verify(reg.IsAllocf(op->rd));
static DECL_ALIGN(16) u32 XOR_NEG_MASK[4] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
if (op->rd._reg != op->rs1._reg)
x86e->Emit(op_movss, reg.mapf(op->rd), reg.mapf(op->rs1));
x86e->Emit(op_pxor, reg.mapf(op->rd), XOR_NEG_MASK);
}
break;
case shop_fsca: // FIXME the canonical version doesn't work because components of the resulting vector are allocated (mapfv)
{
// Table-driven sin/cos: index the 16-bit angle into sin_table, which
// holds {sin,cos} pairs (hence the *8 scale).
verify(op->rs1.is_r32i());
//verify(op->rd.is_vector); //double ? vector(2) ?
verify(reg.IsAllocg(op->rs1));
verify(reg.IsAllocf(op->rd,0) && reg.IsAllocf(op->rd,1));
//sin/cos
x86e->Emit(op_movzx16to32,EAX,reg.mapg(op->rs1));
x86e->Emit(op_movss,reg.mapfv(op->rd,0),x86_mrm(EAX,sib_scale_8,x86_ptr(&sin_table->u[0])));
x86e->Emit(op_movss,reg.mapfv(op->rd,1),x86_mrm(EAX,sib_scale_8,x86_ptr(&sin_table->u[1])));
}
break;
case shop_fipr:
{
//rd=rs1*rs2 (vectors)
// verify(!reg.IsAllocAny(op->rs1));
// verify(!reg.IsAllocAny(op->rs2));
verify(reg.IsAllocf(op->rd));
verify(op->rs1.is_r32fv()==4);
verify(op->rs2.is_r32fv()==4);
verify(op->rd.is_r32());
// 4-wide dot product; SSE3 path uses haddps, fallback sums via shuffles.
if (sse_3)
{
x86_reg xmm=reg.mapf(op->rd);
x86e->Emit(op_movaps ,xmm,op->rs1.reg_ptr());
x86e->Emit(op_mulps ,xmm,op->rs2.reg_ptr());
//xmm0={a0 ,a1 ,a2 ,a3}
x86e->Emit(op_haddps,xmm,xmm); //xmm0={a0+a1 ,a2+a3 ,a0+a1 ,a2+a3}
x86e->Emit(op_haddps,xmm,xmm); //xmm0={(a0+a1)+(a2+a3) ,(a0+a1)+(a2+a3),(a0+a1)+(a2+a3),(a0+a1)+(a2+a3)}
}
else
{
x86_reg xmm=reg.mapf(op->rd);
x86e->Emit(op_movaps ,xmm,op->rs1.reg_ptr());
x86e->Emit(op_mulps ,xmm,op->rs2.reg_ptr());
x86e->Emit(op_movhlps ,XMM1,xmm);
x86e->Emit(op_addps ,xmm,XMM1);
x86e->Emit(op_movaps ,XMM1,xmm);
x86e->Emit(op_shufps ,XMM1,XMM1,1);
x86e->Emit(op_addss ,xmm,XMM1);
}
}
break;
case shop_fsqrt:
{
verify(reg.IsAllocf(op->rs1));
verify(reg.IsAllocf(op->rd));
//rd=sqrt(rs1)
x86e->Emit(op_sqrtss ,reg.mapf(op->rd),reg.mapf(op->rs1));
//x86e->Emit(op_movss ,op->rd.reg_ptr(),XMM0);
}
break;
case shop_ftrv:
{
#ifdef PROF2
x86e->Emit(op_add32,&vrd,16);
#endif
verify(!reg.IsAllocAny(op->rs1));
verify(!reg.IsAllocAny(op->rs2));
verify(!reg.IsAllocAny(op->rd));
//rd(vector)=rs1(vector)*rs2(matrix)
verify(op->rd.is_r32fv()==4);
verify(op->rs1.is_r32fv()==4);
verify(op->rs2.is_r32fv()==16);
#if 1
//load the vector ..
// Broadcast each vector component into its own XMM reg (pshufd on
// SSE2, shufps otherwise), then accumulate the 4 row products.
if (sse_2)
{
x86e->Emit(op_movaps ,XMM3,op->rs1.reg_ptr()); //xmm0=vector
x86e->Emit(op_pshufd ,XMM0,XMM3,0); //xmm0={v0}
x86e->Emit(op_pshufd ,XMM1,XMM3,0x55); //xmm1={v1}
x86e->Emit(op_pshufd ,XMM2,XMM3,0xaa); //xmm2={v2}
x86e->Emit(op_pshufd ,XMM3,XMM3,0xff); //xmm3={v3}
}
else
{
x86e->Emit(op_movaps ,XMM0,op->rs1.reg_ptr()); //xmm0=vector
x86e->Emit(op_movaps ,XMM3,XMM0); //xmm3=vector
x86e->Emit(op_shufps ,XMM0,XMM0,0); //xmm0={v0}
x86e->Emit(op_movaps ,XMM1,XMM3); //xmm1=vector
x86e->Emit(op_movaps ,XMM2,XMM3); //xmm2=vector
x86e->Emit(op_shufps ,XMM3,XMM3,0xff); //xmm3={v3}
x86e->Emit(op_shufps ,XMM1,XMM1,0x55); //xmm1={v1}
x86e->Emit(op_shufps ,XMM2,XMM2,0xaa); //xmm2={v2}
}
//do the matrix mult !
x86e->Emit(op_mulps ,XMM0,op->rs2.reg_ptr() + 0); //v0*=vm0
x86e->Emit(op_mulps ,XMM1,op->rs2.reg_ptr() + 4); //v1*=vm1
x86e->Emit(op_mulps ,XMM2,op->rs2.reg_ptr() + 8); //v2*=vm2
x86e->Emit(op_mulps ,XMM3,op->rs2.reg_ptr() + 12); //v3*=vm3
x86e->Emit(op_addps ,XMM0,XMM1); //sum it all up
x86e->Emit(op_addps ,XMM2,XMM3);
x86e->Emit(op_addps ,XMM0,XMM2);
x86e->Emit(op_movaps ,op->rd.reg_ptr(),XMM0);
#else
/*
AABB CCDD
ABCD * 0 1 2 3 0 1 4 5
4 5 6 7 2 3 6 7
8 9 a b 8 9 c d
c d e f a b e f
*/
x86e->Emit(op_movaps ,XMM1,op->rs1.reg_ptr()); //xmm1=vector
x86e->Emit(op_pshufd ,XMM0,XMM1,0x05); //xmm0={v0,v0,v1,v1}
x86e->Emit(op_pshufd ,XMM1,XMM1,0xaf); //xmm1={v2,v2,v3,v3}
x86e->Emit(op_movaps,XMM2,XMM0); //xmm2={v0,v0,v1,v1}
x86e->Emit(op_movaps,XMM3,XMM1); //xmm3={v2,v2,v3,v3}
x86e->Emit(op_mulps ,XMM0,op->rs2.reg_ptr() + 0); //aabb * 0145
x86e->Emit(op_mulps ,XMM2,op->rs2.reg_ptr() + 4); //aabb * 2367
x86e->Emit(op_mulps ,XMM1,op->rs2.reg_ptr() + 8); //ccdd * 89cd
x86e->Emit(op_mulps ,XMM3,op->rs2.reg_ptr() + 12); //ccdd * abef
x86e->Emit(op_addps ,XMM0,XMM1); //sum it all up
x86e->Emit(op_addps ,XMM2,XMM3);
//XMM0 -> A0C8 | A1C9 | B4DC | B5DD
verify(sse_3);
x86e->Emit(op_shufps,XMM0,XMM0,0x27); //A0C8 B4DC A1C9 B5DC
x86e->Emit(op_shufps,XMM2,XMM2,0x27);
x86e->Emit(op_haddps,XMM0,XMM2); //haddps ={a0+a1 ,a2+a3 ,b0+b1 ,b2+b3}
x86e->Emit(op_movaps ,op->rd.reg_ptr(),XMM0);
#endif
}
break;
case shop_fmac:
{
verify(reg.IsAllocf(op->rs1));
verify(reg.IsAllocf(op->rs2));
verify(reg.IsAllocf(op->rs3));
verify(reg.IsAllocf(op->rd));
//rd=rs1+rs2*rs3
//rd might be rs1,rs2 or rs3, so can't prestore here (iirc, rd==rs1==fr0)
x86e->Emit(op_movss ,XMM0,reg.mapf(op->rs2));
x86e->Emit(op_mulss ,XMM0,reg.mapf(op->rs3));
x86e->Emit(op_addss ,XMM0,reg.mapf(op->rs1));
x86e->Emit(op_movss ,reg.mapf(op->rd),XMM0);
}
break;
case shop_fsrra:
{
verify(reg.IsAllocf(op->rs1));
verify(reg.IsAllocf(op->rd));
//rd=1/sqrt(rs1)
// Computed as 1.0/sqrtss (full precision) rather than rsqrtss (approx).
static float one=1.0f;
x86e->Emit(op_sqrtss ,XMM0,reg.mapf(op->rs1));
x86e->Emit(op_movss ,reg.mapf(op->rd),&one);
x86e->Emit(op_divss ,reg.mapf(op->rd),XMM0);
}
break;
case shop_fseteq:
case shop_fsetgt:
{
verify(reg.IsAllocf(op->rs1));
verify(reg.IsAllocf(op->rs2));
verify(reg.IsAllocg(op->rd));
//x86e->Emit(op_movss,XMM0,op->rs1.reg_ptr());
x86e->Emit(op_ucomiss,reg.mapf(op->rs1),reg.mapf(op->rs2));
if (op->op==shop_fseteq)
{
//special case
//We want to take in account the 'unordered' case on the fpu
// (ZF alone would report NaN==NaN as true; test ZF&PF via lahf).
x86e->Emit(op_lahf);
x86e->Emit(op_test8,AH,0x44);
x86e->Emit(op_setnp,AL);
}
else
{
x86e->Emit(op_seta,AL);
}
x86e->Emit(op_movzx8to32,reg.mapg(op->rd),AL);
}
break;
case shop_pref:
{
// SH4 PREF: only store-queue addresses (top bits 0x38) trigger a
// store-queue write-back; anything else is a no-op here.
verify(op->rs1.is_r32i());
verify(reg.IsAllocg(op->rs1));
if (op->flags==0x1337)
{
// debug-only sanity check on the address (see VERIFYME)
x86e->Emit(op_mov32 ,ECX,reg.mapg(op->rs1));
x86e->Emit(op_call,x86_ptr_imm(&VERIFYME)); //call do_sqw_mmu
}
x86e->Emit(op_mov32 ,EDX,reg.mapg(op->rs1));
x86e->Emit(op_mov32 ,ECX,reg.mapg(op->rs1));
x86e->Emit(op_shr32 ,EDX,26);
x86_Label* nosq=x86e->CreateLabel(false,8);
x86e->Emit(op_cmp32,EDX,0x38);
x86e->Emit(op_jne,nosq);
{
if (CCN_MMUCR.AT)
x86e->Emit(op_call,x86_ptr_imm(&do_sqw_mmu)); //call do_sqw_mmu
else
{
x86e->Emit(op_mov32 ,EDX,(u32)sq_both);
x86e->Emit(op_call32,x86_ptr(&do_sqw_nommu)); //call [do_sqw_nommu]
}
}
x86e->MarkLabel(nosq);
}
break;
case shop_ext_s8:
case shop_ext_s16:
{
verify(op->rd.is_r32i());
verify(op->rs1.is_r32i());
verify(reg.IsAllocg(op->rd));
verify(reg.IsAllocg(op->rs1));
// Go through EAX: movsx8 needs a byte-addressable source register.
x86e->Emit(op_mov32,EAX,reg.mapg(op->rs1));
if (op->op==shop_ext_s8)
x86e->Emit(op_movsx8to32,reg.mapg(op->rd),EAX);
else
x86e->Emit(op_movsx16to32,reg.mapg(op->rd),EAX);
}
break;
case shop_cvt_f2i_t:
{
verify(op->rd.is_r32i());
verify(op->rs1.is_r32f());
verify(reg.IsAllocg(op->rd));
verify(reg.IsAllocf(op->rs1));
x86e->Emit(op_cvttss2si, reg.mapg(op->rd), reg.mapf(op->rs1));
// cvttss2si returns 0x80000000 on overflow; SH4 saturates to INT_MAX
// for large positive inputs, so clamp when rd >= 0x7fffff80.
x86e->Emit(op_mov32, EAX, 0x7fffffff);
x86e->Emit(op_cmp32, reg.mapg(op->rd), 0x7fffff80); // 2147483520.0f
x86e->Emit(op_cmovge32, reg.mapg(op->rd), EAX);
}
break;
//i hope that the round mode bit is set properly here :p
case shop_cvt_i2f_n:
case shop_cvt_i2f_z:
verify(op->rd.is_r32f());
verify(op->rs1.is_r32i());
verify(reg.IsAllocf(op->rd));
verify(reg.IsAllocg(op->rs1));
x86e->Emit(op_cvtsi2ss,reg.mapf(op->rd),reg.mapg(op->rs1));
//x86e->Emit(op_movss,op->rd.reg_ptr(),XMM0);
break;
case shop_frswap:
{
// Swap the two 16-float register banks (FR<->XF), 4 floats per movaps.
verify(op->rd._reg==op->rs2._reg);
verify(op->rd2._reg==op->rs1._reg);
verify(op->rs1.count()==16 && op->rs2.count()==16);
verify(op->rd2.count()==16 && op->rd.count()==16);
#ifdef PROF2
x86e->Emit(op_add32,&vrd,32);
#endif
for (int i=0;i<4;i++)
{
x86e->Emit(op_movaps,XMM0,op->rs1.reg_ptr()+i*4);
x86e->Emit(op_movaps,XMM1,op->rs2.reg_ptr()+i*4);
x86e->Emit(op_movaps,op->rd.reg_ptr()+i*4,XMM0);
x86e->Emit(op_movaps,op->rd2.reg_ptr()+i*4,XMM1);
}
}
break;
/* TODO Update this according to new canonical implementation
case shop_div32s:
case shop_div32u:
{
x86e->Emit(op_mov32,EAX,reg.mapg(op->rs1));
if (op->op==shop_div32s)
x86e->Emit(op_cdq);
else
x86e->Emit(op_xor32,EDX,EDX);
x86e->Emit(op->op==shop_div32s?op_idiv32:op_div32,reg.mapg(op->rs2));
x86e->Emit(op_mov32,reg.mapg(op->rd),EAX);
x86e->Emit(op_mov32,reg.mapg(op->rd2),EDX);
}
break;
case shop_div32p2:
{
x86e->Emit(op_xor32,EAX,EAX);
x86e->Emit(op_cmp32,reg.mapg(op->rs3),0);
x86e->Emit(op_cmove32,EAX,reg.mapg(op->rs2));
if (reg.mapg(op->rd)!=reg.mapg(op->rs1))
x86e->Emit(op_mov32,reg.mapg(op->rd),reg.mapg(op->rs1));
x86e->Emit(op_sub32,reg.mapg(op->rd),EAX);
}
break;
*/
#endif
default:
// No native lowering: dispatch to the canonical C implementation.
shil_chf[op->op](op);
break;
}
}
#endif