From 26ec76ef355dbde977b500fb911ff152ff147f23 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 29 Sep 2013 21:05:48 -0700 Subject: [PATCH] A lot of progress on altivec instructions. Some bad results, still, and some instructions are not decoding right. --- src/xenia/cpu/global_exports.cc | 44 ++ src/xenia/cpu/global_exports.h | 3 + src/xenia/cpu/ppc/state.h | 7 + src/xenia/cpu/x64/x64_emit_altivec.cc | 825 ++++++++++++++++---------- src/xenia/cpu/x64/x64_emitter.cc | 72 ++- src/xenia/cpu/x64/x64_emitter.h | 5 + src/xenia/cpu/x64/x64_jit.cc | 29 +- src/xenia/cpu/x64/x64_jit.h | 1 + 8 files changed, 655 insertions(+), 331 deletions(-) diff --git a/src/xenia/cpu/global_exports.cc b/src/xenia/cpu/global_exports.cc index dd408e45c..b5937f726 100644 --- a/src/xenia/cpu/global_exports.cc +++ b/src/xenia/cpu/global_exports.cc @@ -111,6 +111,49 @@ void _cdecl XeTraceBranch( (uint32_t)cia, (uint32_t)target_ia); } +void _cdecl XeTraceVR( + xe_ppc_state_t* state, uint64_t vr0, uint64_t vr1, uint64_t vr2, + uint64_t vr3, uint64_t vr4) { + char buffer[2048]; + buffer[0] = 0; + int offset = 0; + + offset += xesnprintfa(buffer + offset, XECOUNT(buffer) - offset, + "%.8X:", state->cia); + + offset += xesnprintfa(buffer + offset, XECOUNT(buffer) - offset, + "\nvr%.3d=[%.8X, %.8X, %.8X, %.8X] [%g, %g, %g, %g]", vr0, + state->v[vr0].ix, state->v[vr0].iy, state->v[vr0].iz, state->v[vr0].iw, + state->v[vr0].x, state->v[vr0].y, state->v[vr0].z, state->v[vr0].w); + if (vr1 != UINT_MAX) { + offset += xesnprintfa(buffer + offset, XECOUNT(buffer) - offset, + "\nvr%.3d=[%.8X, %.8X, %.8X, %.8X] [%g, %g, %g, %g]", vr1, + state->v[vr1].ix, state->v[vr1].iy, state->v[vr1].iz, state->v[vr1].iw, + state->v[vr1].x, state->v[vr1].y, state->v[vr1].z, state->v[vr1].w); + } + if (vr2 != UINT_MAX) { + offset += xesnprintfa(buffer + offset, XECOUNT(buffer) - offset, + "\nvr%.3d=[%.8X, %.8X, %.8X, %.8X] [%g, %g, %g, %g]", vr2, + state->v[vr2].ix, state->v[vr2].iy, state->v[vr2].iz, state->v[vr2].iw, + state->v[vr2].x, state->v[vr2].y, state->v[vr2].z, state->v[vr2].w); + } + if (vr3 != UINT_MAX) { + offset += xesnprintfa(buffer + offset, XECOUNT(buffer) - offset, + "\nvr%.3d=[%.8X, %.8X, %.8X, %.8X] [%g, %g, %g, %g]", vr3, + state->v[vr3].ix, state->v[vr3].iy, state->v[vr3].iz, state->v[vr3].iw, + state->v[vr3].x, state->v[vr3].y, state->v[vr3].z, state->v[vr3].w); + } + if (vr4 != UINT_MAX) { + offset += xesnprintfa(buffer + offset, XECOUNT(buffer) - offset, + "\nvr%.3d=[%.8X, %.8X, %.8X, %.8X] [%g, %g, %g, %g]", vr4, + state->v[vr4].ix, state->v[vr4].iy, state->v[vr4].iz, state->v[vr4].iw, + state->v[vr4].x, state->v[vr4].y, state->v[vr4].z, state->v[vr4].w); + } + + uint32_t thread_id = state->thread_state->thread_id(); + xe_log_line("", thread_id, "XeTraceVR", 't', buffer); +} + void _cdecl XeTraceInstruction( xe_ppc_state_t* state, uint64_t cia, uint64_t data) { char buffer[2048]; @@ -184,5 +227,6 @@ void xe::cpu::GetGlobalExports(GlobalExports* global_exports) { global_exports->XeTraceKernelCall = XeTraceKernelCall; global_exports->XeTraceUserCall = XeTraceUserCall; global_exports->XeTraceBranch = XeTraceBranch; + global_exports->XeTraceVR = XeTraceVR; global_exports->XeTraceInstruction = XeTraceInstruction; } diff --git a/src/xenia/cpu/global_exports.h b/src/xenia/cpu/global_exports.h index 5c787c3de..a0aaa65cd 100644 --- a/src/xenia/cpu/global_exports.h +++ b/src/xenia/cpu/global_exports.h @@ -39,6 +39,9 @@ typedef struct { sdb::FunctionSymbol* fn); void (_cdecl *XeTraceBranch)( xe_ppc_state_t* state, uint64_t cia, 
uint64_t target_ia); + void (_cdecl *XeTraceVR)( + xe_ppc_state_t* state, uint64_t vr0, uint64_t vr1, uint64_t vr2, + uint64_t vr3, uint64_t vr4); void (_cdecl *XeTraceInstruction)( xe_ppc_state_t* state, uint64_t cia, uint64_t data); } GlobalExports; diff --git a/src/xenia/cpu/ppc/state.h b/src/xenia/cpu/ppc/state.h index be31ddd99..f6a0619b2 100644 --- a/src/xenia/cpu/ppc/state.h +++ b/src/xenia/cpu/ppc/state.h @@ -48,7 +48,14 @@ typedef struct XECACHEALIGN xe_float4 { float z; float w; }; + struct { + uint32_t ix; + uint32_t iy; + uint32_t iz; + uint32_t iw; + }; float f4[4]; + uint32_t i4[4]; struct { uint64_t low; uint64_t high; diff --git a/src/xenia/cpu/x64/x64_emit_altivec.cc b/src/xenia/cpu/x64/x64_emit_altivec.cc index d58987183..abbccb2d6 100644 --- a/src/xenia/cpu/x64/x64_emit_altivec.cc +++ b/src/xenia/cpu/x64/x64_emit_altivec.cc @@ -37,6 +37,18 @@ namespace x64 { #define VX128_5(op, xop) (OP(op) | (((uint32_t)(xop)) & 0x10)) #define VX128_P(op, xop) (OP(op) | (((uint32_t)(xop)) & 0x630)) +#define VX128_VD128 (i.VX128.VD128l | (i.VX128.VD128h << 5)) +#define VX128_VA128 (i.VX128.VA128l | (i.VX128.VA128h << 5) | (i.VX128.VA128H << 6)) +#define VX128_VB128 (i.VX128.VB128l | (i.VX128.VB128h << 5)) +#define VX128_1_VD128 (i.VX128_1.VD128l | (i.VX128_1.VD128h << 5)) +#define VX128_2_VD128 (i.VX128_2.VD128l | (i.VX128_2.VD128h << 5)) +#define VX128_2_VA128 (i.VX128_2.VA128l | (i.VX128_2.VA128h << 5) | (i.VX128_2.VA128H << 6)) +#define VX128_2_VB128 (i.VX128_2.VB128l | (i.VX128_2.VD128h << 5)) +#define VX128_2_VC (i.VX128_2.VC) +#define VX128_3_VD128 (i.VX128_3.VD128l | (i.VX128_3.VD128h << 5)) +#define VX128_3_VB128 (i.VX128_3.VB128l | (i.VX128_3.VB128h << 5)) +#define VX128_3_IMM (i.VX128_3.IMM) + XEEMITTER(dst, 0x7C0002AC, XDSS)(X64Emitter& e, X86Compiler& c, InstrData& i) { XEINSTRNOTIMPLEMENTED(); @@ -63,66 +75,60 @@ XEEMITTER(lvehx, 0x7C00004E, X )(X64Emitter& e, X86Compiler& c, Instr return 1; } +int InstrEmit_lvewx_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) { + XEINSTRNOTIMPLEMENTED(); + return 1; +} XEEMITTER(lvewx, 0x7C00008E, X )(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + return InstrEmit_lvewx_(e, c, i, i.X.RT, i.X.RA, i.X.RB); } - XEEMITTER(lvewx128, VX128_1(4, 131), VX128_1)(X64Emitter& e, X86Compiler& c, InstrData& i) { + return InstrEmit_lvewx_(e, c, i, i.X.RT, i.X.RA, i.X.RB); +} + +int InstrEmit_lvsl_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) { XEINSTRNOTIMPLEMENTED(); return 1; } - XEEMITTER(lvsl, 0x7C00000C, X )(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + return InstrEmit_lvsl_(e, c, i, i.X.RT, i.X.RA, i.X.RB); } - XEEMITTER(lvsl128, VX128_1(4, 3), VX128_1)(X64Emitter& e, X86Compiler& c, InstrData& i) { + return InstrEmit_lvsl_(e, c, i, i.X.RT, i.X.RA, i.X.RB); +} + +int InstrEmit_lvsr_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) { XEINSTRNOTIMPLEMENTED(); return 1; } - XEEMITTER(lvsr, 0x7C00004C, X )(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + return InstrEmit_lvsr_(e, c, i, i.X.RT, i.X.RA, i.X.RB); } - XEEMITTER(lvsr128, VX128_1(4, 67), VX128_1)(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + return InstrEmit_lvsr_(e, c, i, i.X.RT, i.X.RA, i.X.RB); } -XEEMITTER(lvx, 0x7C0000CE, X )(X64Emitter& e, X86Compiler& c, InstrData& i) { +int InstrEmit_lvx_(X64Emitter& e, 
X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) { GpVar ea(c.newGpVar()); - c.mov(ea, e.gpr_value(i.X.RB)); - if (i.VX128_1.RA) { - c.add(ea, e.gpr_value(i.X.RA)); - } - XmmVar v = e.ReadMemoryXmm(i.address, ea, 4); - e.update_vr_value(i.X.RT, v); - - return 0; -} - -XEEMITTER(lvx128, VX128_1(4, 195), VX128_1)(X64Emitter& e, X86Compiler& c, InstrData& i) { - const uint32_t vd = i.VX128_1.VD128l | (i.VX128_1.VD128h << 5); - - GpVar ea(c.newGpVar()); - c.mov(ea, e.gpr_value(i.VX128_1.RB)); - if (i.VX128_1.RA) { - c.add(ea, e.gpr_value(i.VX128_1.RA)); + c.mov(ea, e.gpr_value(rb)); + if (ra) { + c.add(ea, e.gpr_value(ra)); } XmmVar v = e.ReadMemoryXmm(i.address, ea, 4); + c.shufps(v, v, imm(0x1B)); e.update_vr_value(vd, v); - + e.TraceVR(vd); return 0; } - +XEEMITTER(lvx, 0x7C0000CE, X )(X64Emitter& e, X86Compiler& c, InstrData& i) { + return InstrEmit_lvx_(e, c, i, i.X.RT, i.X.RA, i.X.RB); +} +XEEMITTER(lvx128, VX128_1(4, 195), VX128_1)(X64Emitter& e, X86Compiler& c, InstrData& i) { + return InstrEmit_lvx_(e, c, i, VX128_1_VD128, i.VX128_1.RA, i.VX128_1.RB); +} XEEMITTER(lvxl, 0x7C0002CE, X )(X64Emitter& e, X86Compiler& c, InstrData& i) { return InstrEmit_lvx(e, c, i); } - XEEMITTER(lvxl128, VX128_1(4, 707), VX128_1)(X64Emitter& e, X86Compiler& c, InstrData& i) { return InstrEmit_lvx128(e, c, i); } @@ -137,118 +143,107 @@ XEEMITTER(stvehx, 0x7C00014E, X )(X64Emitter& e, X86Compiler& c, Instr return 1; } -XEEMITTER(stvewx, 0x7C00018E, X )(X64Emitter& e, X86Compiler& c, InstrData& i) { +int InstrEmit_stvewx_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) { XEINSTRNOTIMPLEMENTED(); return 1; } - +XEEMITTER(stvewx, 0x7C00018E, X )(X64Emitter& e, X86Compiler& c, InstrData& i) { + return InstrEmit_stvewx_(e, c, i, i.X.RT, i.X.RA, i.X.RB); +} XEEMITTER(stvewx128, VX128_1(4, 387), VX128_1)(X64Emitter& e, X86Compiler& c, InstrData& i) { XEINSTRNOTIMPLEMENTED(); return 1; } -XEEMITTER(stvx, 0x7C0001CE, X )(X64Emitter& e, X86Compiler& c, InstrData& i) { +int InstrEmit_stvx_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) { GpVar ea(c.newGpVar()); - c.mov(ea, e.gpr_value(i.X.RB)); - if (i.X.RA) { - c.add(ea, e.gpr_value(i.X.RA)); - } - XmmVar v = e.vr_value(i.X.RT); - e.WriteMemoryXmm(i.address, ea, 4, v); - - return 0; -} - -XEEMITTER(stvx128, VX128_1(4, 451), VX128_1)(X64Emitter& e, X86Compiler& c, InstrData& i) { - const uint32_t vd = i.VX128_1.VD128l | (i.VX128_1.VD128h << 5); - - GpVar ea(c.newGpVar()); - c.mov(ea, e.gpr_value(i.VX128_1.RB)); - if (i.X.RA) { - c.add(ea, e.gpr_value(i.VX128_1.RA)); - } + c.mov(ea, e.gpr_value(rb)); + if (ra) { + c.add(ea, e.gpr_value(ra)); + } XmmVar v = e.vr_value(vd); + c.shufps(v, v, imm(0x1B)); e.WriteMemoryXmm(i.address, ea, 4, v); - + e.TraceVR(vd); return 0; } - +XEEMITTER(stvx, 0x7C0001CE, X )(X64Emitter& e, X86Compiler& c, InstrData& i) { + return InstrEmit_stvx_(e, c, i, i.X.RT, i.X.RA, i.X.RB); +} +XEEMITTER(stvx128, VX128_1(4, 451), VX128_1)(X64Emitter& e, X86Compiler& c, InstrData& i) { + return InstrEmit_stvx_(e, c, i, VX128_1_VD128, i.VX128_1.RA, i.VX128_1.RB); +} XEEMITTER(stvxl, 0x7C0003CE, X )(X64Emitter& e, X86Compiler& c, InstrData& i) { return InstrEmit_stvx(e, c, i); } - XEEMITTER(stvxl128, VX128_1(4, 963), VX128_1)(X64Emitter& e, X86Compiler& c, InstrData& i) { return InstrEmit_stvx128(e, c, i); } +int InstrEmit_lvlx_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) { + XEINSTRNOTIMPLEMENTED(); + return 1; +} 
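// [Editor's note -- illustrative aside, not part of this patch.]
// The shufps(v, v, 0x1B) added above in InstrEmit_lvx_/InstrEmit_stvx_
// reverses the four 32-bit lanes (0x1B encodes selectors 3,2,1,0 from the
// low field up), i.e.
//   dst = { src[3], src[2], src[1], src[0] }
// which appears intended to map the guest's big-endian word order onto the
// little-endian host lane order on load, and back again on store.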
XEEMITTER(lvlx, 0x7C00040E, X )(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + return InstrEmit_lvlx_(e, c, i, i.X.RT, i.X.RA, i.X.RB); } - XEEMITTER(lvlx128, VX128_1(4, 1027), VX128_1)(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + return InstrEmit_lvlx_(e, c, i, VX128_1_VD128, i.VX128_1.RA, i.VX128_1.RB); } - XEEMITTER(lvlxl, 0x7C00060E, X )(X64Emitter& e, X86Compiler& c, InstrData& i) { return InstrEmit_lvlx(e, c, i); } - XEEMITTER(lvlxl128, VX128_1(4, 1539), VX128_1)(X64Emitter& e, X86Compiler& c, InstrData& i) { return InstrEmit_lvlx128(e, c, i); } +int InstrEmit_lvrx_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) { + XEINSTRNOTIMPLEMENTED(); + return 1; +} XEEMITTER(lvrx, 0x7C00044E, X )(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + return InstrEmit_lvrx_(e, c, i, i.X.RT, i.X.RA, i.X.RB); } - XEEMITTER(lvrx128, VX128_1(4, 1091), VX128_1)(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + return InstrEmit_lvrx_(e, c, i, VX128_1_VD128, i.VX128_1.RA, i.VX128_1.RB); } - XEEMITTER(lvrxl, 0x7C00064E, X )(X64Emitter& e, X86Compiler& c, InstrData& i) { return InstrEmit_lvrx(e, c, i); } - XEEMITTER(lvrxl128, VX128_1(4, 1603), VX128_1)(X64Emitter& e, X86Compiler& c, InstrData& i) { return InstrEmit_lvrx128(e, c, i); } +int InstrEmit_stvlx_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) { + XEINSTRNOTIMPLEMENTED(); + return 1; +} XEEMITTER(stvlx, 0x7C00050E, X )(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + return InstrEmit_stvlx_(e, c, i, i.X.RT, i.X.RA, i.X.RB); } - XEEMITTER(stvlx128, VX128_1(4, 1283), VX128_1)(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + return InstrEmit_stvlx_(e, c, i, VX128_1_VD128, i.VX128_1.RA, i.VX128_1.RB); } - XEEMITTER(stvlxl, 0x7C00070E, X )(X64Emitter& e, X86Compiler& c, InstrData& i) { return InstrEmit_stvlx(e, c, i); } - XEEMITTER(stvlxl128, VX128_1(4, 1795), VX128_1)(X64Emitter& e, X86Compiler& c, InstrData& i) { return InstrEmit_stvlx128(e, c, i); } +int InstrEmit_stvrx_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) { + XEINSTRNOTIMPLEMENTED(); + return 1; +} XEEMITTER(stvrx, 0x7C00054E, X )(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + return InstrEmit_stvrx_(e, c, i, i.X.RT, i.X.RA, i.X.RB); } - XEEMITTER(stvrx128, VX128_1(4, 1347), VX128_1)(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + return InstrEmit_stvrx_(e, c, i, VX128_1_VD128, i.VX128_1.RA, i.VX128_1.RB); } - XEEMITTER(stvrxl, 0x7C00074E, X )(X64Emitter& e, X86Compiler& c, InstrData& i) { return InstrEmit_stvrx(e, c, i); } - XEEMITTER(stvrxl128, VX128_1(4, 1859), VX128_1)(X64Emitter& e, X86Compiler& c, InstrData& i) { return InstrEmit_stvrx128(e, c, i); } @@ -323,59 +318,37 @@ XEEMITTER(vadduws, 0x10000280, VX )(X64Emitter& e, X86Compiler& c, Instr return 1; } -XEEMITTER(vand, 0x10000404, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { +int InstrEmit_vand_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t va, uint32_t vb) { // VD <- (VA) & (VB) - XmmVar v(c.newXmmVar()); - c.movq(v, e.vr_value(i.VX.VB)); - c.pand(v, e.vr_value(i.VX.VA)); - e.update_vr_value(i.VX.VD, v); - - return 0; -} - -XEEMITTER(vand128, VX128(5, 528), VX128 )(X64Emitter& e, X86Compiler& 
c, InstrData& i) { - // VD <- (VA) & (VB) - - const uint32_t vd = i.VX128.VD128l | (i.VX128.VD128h << 5); - const uint32_t va = i.VX128.VA128l | (i.VX128.VA128h << 5) | - (i.VX128.VA128H << 6); - const uint32_t vb = i.VX128.VB128l | (i.VX128.VB128h << 5); - - XmmVar v(c.newXmmVar()); - c.movq(v, e.vr_value(vb)); + c.movaps(v, e.vr_value(vb)); c.pand(v, e.vr_value(va)); e.update_vr_value(vd, v); - + e.TraceVR(vd, va, vb); return 0; } - -XEEMITTER(vandc, 0x10000444, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { - // VD <- (VA) & ¬(VB) - - XmmVar v(c.newXmmVar()); - c.movq(v, e.vr_value(i.VX.VB)); - c.pandn(v, e.vr_value(i.VX.VA)); - e.update_vr_value(i.VX.VD, v); - - return 0; +XEEMITTER(vand, 0x10000404, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { + return InstrEmit_vand_(e, c, i.VX.VD, i.VX.VA, i.VX.VB); +} +XEEMITTER(vand128, VX128(5, 528), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) { + return InstrEmit_vand_(e, c, VX128_VD128, VX128_VA128, VX128_VB128); } -XEEMITTER(vandc128, VX128(5, 592), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) { +int InstrEmit_vandc_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t va, uint32_t vb) { // VD <- (VA) & ¬(VB) - - const uint32_t vd = i.VX128.VD128l | (i.VX128.VD128h << 5); - const uint32_t va = i.VX128.VA128l | (i.VX128.VA128h << 5) | - (i.VX128.VA128H << 6); - const uint32_t vb = i.VX128.VB128l | (i.VX128.VB128h << 5); - XmmVar v(c.newXmmVar()); - c.movq(v, e.vr_value(vb)); + c.movaps(v, e.vr_value(vb)); c.pandn(v, e.vr_value(va)); e.update_vr_value(vd, v); - + e.TraceVR(vd, va, vb); return 0; } +XEEMITTER(vandc, 0x10000444, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { + return InstrEmit_vandc_(e, c, i.VX.VD, i.VX.VA, i.VX.VB); +} +XEEMITTER(vandc128, VX128(5, 592), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) { + return InstrEmit_vandc_(e, c, VX128_VD128, VX128_VA128, VX128_VB128); +} XEEMITTER(vavgsb, 0x10000502, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { XEINSTRNOTIMPLEMENTED(); @@ -413,8 +386,23 @@ XEEMITTER(vcfsx, 0x1000034A, VX )(X64Emitter& e, X86Compiler& c, Instr } XEEMITTER(vcsxwfp128, VX128_3(6, 688), VX128_3)(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + // (VD) <- float(VB) / 2^uimm + XmmVar v(c.newXmmVar()); + // TODO(benvanik): verify this is right - values may be out of range. + c.cvtdq2ps(v, e.vr_value(VX128_3_VB128)); + uint32_t uimm = VX128_3_IMM; + uimm = uimm ? (2 << (uimm - 1)) : 1; + // TODO(benvanik): this could likely be made much faster. 
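// [Editor's note -- illustrative aside, not part of this patch.]
// `uimm ? (2 << (uimm - 1)) : 1` above is just 2^uimm as an integer, which the
// code below broadcasts into an XMM register and converts so it can divide:
//   uimm = 0  -> divisor 1.0
//   uimm = 4  -> divisor 16.0
//   uimm = 22 -> divisor 4194304.0
// One possible speed-up (untested assumption): load the reciprocal 2^-uimm as
// a float bit pattern and use mulps instead of cvtdq2ps + divps.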
+ GpVar vt(c.newGpVar()); + c.mov(vt, imm(uimm)); + XmmVar vt_xmm(c.newXmmVar()); + c.movd(vt_xmm, vt.r32()); + c.cvtdq2ps(vt_xmm, vt_xmm); + c.shufps(vt_xmm, vt_xmm, imm(0)); + c.divps(v, vt_xmm); + e.update_vr_value(VX128_3_VD128, v); + e.TraceVR(VX128_3_VD128, VX128_3_VB128); + return 0; } XEEMITTER(vcfpsxws128, VX128_3(6, 560), VX128_3)(X64Emitter& e, X86Compiler& c, InstrData& i) { @@ -647,50 +635,35 @@ XEEMITTER(vlogefp128, VX128_3(6, 1776), VX128_3)(X64Emitter& e, X86Compiler& return 1; } -XEEMITTER(vmaddfp, 0x1000002E, VXA )(X64Emitter& e, X86Compiler& c, InstrData& i) { +int InstrEmit_vmaddfp_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t va, uint32_t vb, uint32_t vc) { // (VD) <- ((VA) * (VC)) + (VB) - // TODO(benvanik): use AVX, which has a fused multiply-add XmmVar v(c.newXmmVar()); - c.movq(v, e.vr_value(i.VXA.VA)); - c.mulps(v, e.vr_value(i.VXA.VC)); - c.addps(v, e.vr_value(i.VXA.VB)); - e.update_vr_value(i.VXA.VD, v); - + c.movaps(v, e.vr_value(va)); + c.mulps(v, e.vr_value(vc)); + c.addps(v, e.vr_value(vb)); + e.update_vr_value(vd, v); + e.TraceVR(vd, va, vb, vc); return 0; } - +XEEMITTER(vmaddfp, 0x1000002E, VXA )(X64Emitter& e, X86Compiler& c, InstrData& i) { + // (VD) <- ((VA) * (VC)) + (VB) + return InstrEmit_vmaddfp_(e, c, i.VXA.VD, i.VXA.VA, i.VXA.VB, i.VXA.VC); +} XEEMITTER(vmaddfp128, VX128(5, 208), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) { // (VD) <- ((VA) * (VB)) + (VD) - - const uint32_t vd = i.VX128.VD128l | (i.VX128.VD128h << 5); - const uint32_t va = i.VX128.VA128l | (i.VX128.VA128h << 5) | - (i.VX128.VA128H << 6); - const uint32_t vb = i.VX128.VB128l | (i.VX128.VB128h << 5); - - XmmVar v(c.newXmmVar()); - c.movq(v, e.vr_value(va)); - c.mulps(v, e.vr_value(vb)); - c.addps(v, e.vr_value(vd)); - e.update_vr_value(vd, v); - - return 0; + // NOTE: this resuses VD and swaps the arg order! 
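// [Editor's note -- illustrative aside, not part of this patch.]
// vmaddfp128 computes (VD) <- (VA) * (VB) + (VD), while the shared helper is
// vd <- (va * vc) + vb, so the call below passes the destination register as
// the addend and VB as the multiplier:
//   InstrEmit_vmaddfp_(e, c, /*vd*/VD, /*va*/VA, /*vb (addend)*/VD, /*vc*/VB)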
+ return InstrEmit_vmaddfp_(e, c, VX128_VD128, VX128_VA128, VX128_VD128, VX128_VB128); } XEEMITTER(vmaddcfp128, VX128(5, 272), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) { // (VD) <- ((VA) * (VD)) + (VB) - - const uint32_t vd = i.VX128.VD128l | (i.VX128.VD128h << 5); - const uint32_t va = i.VX128.VA128l | (i.VX128.VA128h << 5) | - (i.VX128.VA128H << 6); - const uint32_t vb = i.VX128.VB128l | (i.VX128.VB128h << 5); - XmmVar v(c.newXmmVar()); - c.movq(v, e.vr_value(va)); - c.mulps(v, e.vr_value(vd)); - c.addps(v, e.vr_value(vb)); - e.update_vr_value(vd, v); - + c.movaps(v, e.vr_value(VX128_VA128)); + c.mulps(v, e.vr_value(VX128_VD128)); + c.addps(v, e.vr_value(VX128_VB128)); + e.update_vr_value(VX128_VD128, v); + e.TraceVR(VX128_VD128, VX128_VA128, VX128_VB128); return 0; } @@ -799,14 +772,36 @@ XEEMITTER(vmrghh, 0x1000004C, VX )(X64Emitter& e, X86Compiler& c, Instr return 1; } -XEEMITTER(vmrghw, 0x1000008C, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; +int InstrEmit_vmrghw_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t va, uint32_t vb) { + // (VD.x) = (VA.x) + // (VD.y) = (VB.x) + // (VD.z) = (VA.y) + // (VD.w) = (VB.y) + if (e.cpu_feature_mask() & kX86FeatureSse41) { + c.int3(); + // | VA.x | VA.x | VA.y | VA.y | + XmmVar v(c.newXmmVar()); + c.movaps(v, e.vr_value(va)); + c.shufps(v, v, imm(0x50)); + // | VB.x | VB.x | VB.y | VB.y | + XmmVar vt(c.newXmmVar()); + c.movaps(vt, e.vr_value(vb)); + c.shufps(vt, vt, imm(0x50)); + // | VA.x | VB.x | VA.y | VB.y | + c.blendps(v, vt, imm(0xA)); + e.update_vr_value(vd, v); + } else { + XEINSTRNOTIMPLEMENTED(); + return 1; + } + e.TraceVR(vd, va, vb); + return 0; +} +XEEMITTER(vmrghw, 0x1000008C, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { + return InstrEmit_vmrghw_(e, c, i.VX.VD, i.VX.VA, i.VX.VB); } - XEEMITTER(vmrghw128, VX128(6, 768), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + return InstrEmit_vmrghw_(e, c, VX128_VD128, VX128_VA128, VX128_VB128); } XEEMITTER(vmrglb, 0x1000010C, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { @@ -819,14 +814,36 @@ XEEMITTER(vmrglh, 0x1000014C, VX )(X64Emitter& e, X86Compiler& c, Instr return 1; } -XEEMITTER(vmrglw, 0x1000018C, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; +int InstrEmit_vmrglw_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t va, uint32_t vb) { + // (VD.x) = (VA.z) + // (VD.y) = (VB.z) + // (VD.z) = (VA.w) + // (VD.w) = (VB.w) + if (e.cpu_feature_mask() & kX86FeatureSse41) { + c.int3(); + // | VA.z | VA.z | VA.w | VA.w | + XmmVar v(c.newXmmVar()); + c.movaps(v, e.vr_value(va)); + c.shufps(v, v, imm(0xFA)); + // | VB.z | VB.z | VB.w | VB.w | + XmmVar vt(c.newXmmVar()); + c.movaps(vt, e.vr_value(vb)); + c.shufps(vt, vt, imm(0xFA)); + // | VA.z | VB.z | VA.w | VB.w | + c.blendps(v, vt, imm(0xA)); + e.update_vr_value(vd, v); + } else { + XEINSTRNOTIMPLEMENTED(); + return 1; + } + e.TraceVR(vd, va, vb); + return 0; +} +XEEMITTER(vmrglw, 0x1000018C, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { + return InstrEmit_vmrglw_(e, c, i.VX.VD, i.VX.VA, i.VX.VB); } - XEEMITTER(vmrglw128, VX128(6, 832), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + return InstrEmit_vmrglw_(e, c, VX128_VD128, VX128_VA128, VX128_VB128); } XEEMITTER(vmsummbm, 0x10000025, VXA )(X64Emitter& e, X86Compiler& c, InstrData& i) { @@ -860,13 +877,54 @@ XEEMITTER(vmsumuhs, 0x10000027, VXA )(X64Emitter& e, 
X86Compiler& c, Instr } XEEMITTER(vmsum3fp128, VX128(5, 400), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + // Dot product XYZ. + // (VD.xyzw) = (VA.x * VB.x) + (VA.y * VB.y) + (VA.z * VB.z) + if (e.cpu_feature_mask() & kX86FeatureSse41) { + // SSE4.1 required. + // Rumor is this is the same on older processors and way faster on new + // ones (post 2011ish). + XmmVar v(c.newXmmVar()); + c.movaps(v, e.vr_value(VX128_VA128)); + c.dpps(v, e.vr_value(VX128_VB128), imm(0x7F)); + e.update_vr_value(VX128_VD128, v); + } else { + //XmmVar v(c.newXmmVar()); + //c.movaps(v, e.vr_value(va)); + //c.mulps(v, e.vr_value(vb)); + //// TODO(benvanik): need to zero W + //c.haddps(v, v); + //c.haddps(v, v); + //c.pshufd(v, v, imm(0)); + //e.update_vr_value(vd, v); + XEINSTRNOTIMPLEMENTED(); + return 1; + } + e.TraceVR(VX128_VD128, VX128_VA128, VX128_VB128); + return 0; } XEEMITTER(vmsum4fp128, VX128(5, 464), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + // Dot product XYZW. + // (VD.xyzw) = (VA.x * VB.x) + (VA.y * VB.y) + (VA.z * VB.z) + (VA.w * VB.w) + if (e.cpu_feature_mask() & kX86FeatureSse41) { + // SSE4.1 required. + // Rumor is this is the same on older processors and way faster on new + // ones (post 2011ish). + XmmVar v(c.newXmmVar()); + c.movaps(v, e.vr_value(VX128_VA128)); + c.dpps(v, e.vr_value(VX128_VB128), imm(0xFF)); + e.update_vr_value(VX128_VD128, v); + } else { + XmmVar v(c.newXmmVar()); + c.movaps(v, e.vr_value(VX128_VA128)); + c.mulps(v, e.vr_value(VX128_VB128)); + c.haddps(v, v); + c.haddps(v, v); + c.pshufd(v, v, imm(0)); + e.update_vr_value(VX128_VD128, v); + } + e.TraceVR(VX128_VD128, VX128_VA128, VX128_VB128); + return 0; } XEEMITTER(vmulesb, 0x10000308, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { @@ -911,17 +969,11 @@ XEEMITTER(vmulouh, 0x10000048, VX )(X64Emitter& e, X86Compiler& c, Instr XEEMITTER(vmulfp128, VX128(5, 144), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) { // (VD) <- (VA) * (VB) (4 x fp) - - const uint32_t vd = i.VX128.VD128l | (i.VX128.VD128h << 5); - const uint32_t va = i.VX128.VA128l | (i.VX128.VA128h << 5) | - (i.VX128.VA128H << 6); - const uint32_t vb = i.VX128.VB128l | (i.VX128.VB128h << 5); - XmmVar v(c.newXmmVar()); - c.movq(v, e.vr_value(va)); - c.mulps(v, e.vr_value(vb)); - e.update_vr_value(vd, v); - + c.movaps(v, e.vr_value(VX128_VA128)); + c.mulps(v, e.vr_value(VX128_VB128)); + e.update_vr_value(VX128_VD128, v); + e.TraceVR(VX128_VD128, VX128_VA128, VX128_VB128); return 0; } @@ -935,79 +987,76 @@ XEEMITTER(vnmsubfp128, VX128(5, 336), VX128 )(X64Emitter& e, X86Compiler& return 1; } +int InstrEmit_vnor_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t va, uint32_t vb) { + // VD <- ¬((VA) | (VB)) + XmmVar v(c.newXmmVar()); + c.movaps(v, e.vr_value(vb)); + c.por(v, e.vr_value(va)); + XmmVar t(c.newXmmVar()); + c.pcmpeqd(t, t); // 0xFFFF.... + c.pxor(v, t); + e.update_vr_value(vd, v); + e.TraceVR(vd, va, vb); + return 0; +} XEEMITTER(vnor, 0x10000504, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { - // VD <- ¬((VA) | (VB)) - - XmmVar v(c.newXmmVar()); - c.movq(v, e.vr_value(i.VX.VB)); - c.por(v, e.vr_value(i.VX.VA)); - XmmVar t(c.newXmmVar()); - c.pcmpeqd(t, t); // 0xFFFF.... 
- c.pxor(v, t); - e.update_vr_value(i.VX.VD, v); - - return 0; + return InstrEmit_vnor_(e, c, i.VX.VD, i.VX.VA, i.VX.VB); } - XEEMITTER(vnor128, VX128(5, 656), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) { - // VD <- ¬((VA) | (VB)) - - const uint32_t vd = i.VX128.VD128l | (i.VX128.VD128h << 5); - const uint32_t va = i.VX128.VA128l | (i.VX128.VA128h << 5) | - (i.VX128.VA128H << 6); - const uint32_t vb = i.VX128.VB128l | (i.VX128.VB128h << 5); - - XmmVar v(c.newXmmVar()); - c.movq(v, e.vr_value(vb)); - c.por(v, e.vr_value(va)); - XmmVar t(c.newXmmVar()); - c.pcmpeqd(t, t); // 0xFFFF.... - c.pxor(v, t); - e.update_vr_value(vd, v); - - return 0; + return InstrEmit_vnor_(e, c, VX128_VD128, VX128_VA128, VX128_VB128); } +int InstrEmit_vor_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t va, uint32_t vb) { + // VD <- (VA) | (VB) + if (va == vb) { + // Copy VA==VB into VD. + e.update_vr_value(vd, e.vr_value(va)); + } else { + XmmVar v(c.newXmmVar()); + c.movaps(v, e.vr_value(vb)); + c.por(v, e.vr_value(va)); + e.update_vr_value(vd, v); + } + e.TraceVR(vd, va, vb); + return 0; +} XEEMITTER(vor, 0x10000484, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { - // VD <- (VA) | (VB) - - XmmVar v(c.newXmmVar()); - c.movq(v, e.vr_value(i.VX.VB)); - c.por(v, e.vr_value(i.VX.VA)); - e.update_vr_value(i.VX.VD, v); - - return 0; + return InstrEmit_vor_(e, c, i.VX.VD, i.VX.VA, i.VX.VB); } - XEEMITTER(vor128, VX128(5, 720), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) { - // VD <- (VA) | (VB) - - const uint32_t vd = i.VX128.VD128l | (i.VX128.VD128h << 5); - const uint32_t va = i.VX128.VA128l | (i.VX128.VA128h << 5) | - (i.VX128.VA128H << 6); - const uint32_t vb = i.VX128.VB128l | (i.VX128.VB128h << 5); - - XmmVar v(c.newXmmVar()); - c.movq(v, e.vr_value(vb)); - c.por(v, e.vr_value(va)); - e.update_vr_value(vd, v); - - return 0; + return InstrEmit_vor_(e, c, VX128_VD128, VX128_VA128, VX128_VB128); } +int InstrEmit_vperm_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t va, uint32_t vb, uint32_t vc) { + XEINSTRNOTIMPLEMENTED(); + return 1; +} XEEMITTER(vperm, 0x1000002B, VXA )(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + return InstrEmit_vperm_(e, c, i.VXA.VD, i.VXA.VA, i.VXA.VB, i.VXA.VC); } - XEEMITTER(vperm128, VX128_2(5, 0), VX128_2)(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + return InstrEmit_vperm_(e, c, VX128_2_VD128, VX128_2_VA128, VX128_2_VB128, VX128_2_VC); } XEEMITTER(vpermwi128, VX128_P(6, 528), VX128_P)(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + // (VD.x) = (VB.uimm[6-7]) + // (VD.y) = (VB.uimm[4-5]) + // (VD.z) = (VB.uimm[2-3]) + // (VD.w) = (VB.uimm[0-1]) + const uint32_t vd = i.VX128_P.VD128l | (i.VX128_P.VD128h << 5); + const uint32_t vb = i.VX128_P.VB128l | (i.VX128_P.VB128h << 5); + uint32_t uimm = i.VX128_P.PERMl | (i.VX128_P.PERMh << 5); + // SHUFPS is flipped -- 0-1 selects X, 2-3 selects Y, etc. 
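// [Editor's note -- illustrative aside, not part of this patch.]
// The remap below reverses the four 2-bit selector fields end-to-end so the
// instruction's immediate becomes a SHUFPS control byte. Worked example:
//   uimm = 0x1B (2-bit fields, low to high: 3,2,1,0)
//   remapped -> 0xE4 (fields 0,1,2,3), which is the SHUFPS identity shuffle.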
+ uimm = ((uimm & 0x03) << 6) | + ((uimm & 0x0C) << 2) | + ((uimm & 0x30) >> 2) | + ((uimm & 0xC0) >> 6); + XmmVar v(c.newXmmVar()); + c.movaps(v, e.vr_value(vb)); + c.shufps(v, v, imm(uimm)); + e.update_vr_value(vd, v); + e.TraceVR(vd, vb); + return 0; } XEEMITTER(vpkpx, 0x1000030E, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { @@ -1171,18 +1220,65 @@ XEEMITTER(vrlw128, VX128(6, 80), VX128 )(X64Emitter& e, X86Compiler& } XEEMITTER(vrlimi128, VX128_4(6, 1808), VX128_4)(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + const uint32_t vd = i.VX128_4.VD128l | (i.VX128_4.VD128h << 5); + const uint32_t vb = i.VX128_4.VB128l | (i.VX128_4.VB128h << 5); + uint32_t x = i.VX128_4.IMM; + uint32_t y = i.VX128_4.z; + XmmVar v(c.newXmmVar()); + c.movaps(v, e.vr_value(vb)); + // This is just a fancy permute. + // X Y Z W, rotated left by 2 = Z W X Y + // Then mask select the results into the dest. + // Sometimes rotation is zero, so fast path. + if (y) { + c.int3(); + switch (y) { + case 1: + // X Y Z W -> Y Z W X + c.shufps(v, v, imm(0x6C)); + break; + case 2: + // X Y Z W -> Z W X Y + c.shufps(v, v, imm(0xB1)); + break; + case 3: + // X Y Z W -> W X Y Z + c.shufps(v, v, imm(0xC6)); + break; + default: + XEASSERTALWAYS(); + return 1; + } + } + uint32_t blend_mask = + (((x & 0x08) ? 1 : 0) << 0) | + (((x & 0x04) ? 1 : 0) << 1) | + (((x & 0x02) ? 1 : 0) << 2) | + (((x & 0x01) ? 1 : 0) << 3); + // Blending src into dest, so invert. + blend_mask = (~blend_mask) & 0x3; + c.blendps(v, e.vr_value(vb), imm(blend_mask)); + e.update_vr_value(vd, v); + e.TraceVR(vd, vb); + return 0; } +int InstrEmit_vrsqrtefp_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t vb) { + // (VD) <- 1 / sqrt(VB) + // There are a lot of rules in the Altivec_PEM docs for handlings that + // result in nan/infinity/etc. They are ignored here. I hope games would + // never rely on them. + XmmVar v(c.newXmmVar()); + c.rsqrtps(v, e.vr_value(vb)); + e.update_vr_value(vd, v); + e.TraceVR(vd, vb); + return 0; +} XEEMITTER(vrsqrtefp, 0x1000014A, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + return InstrEmit_vrsqrtefp_(e, c, i.VX.VD, i.VX.VB); } - XEEMITTER(vrsqrtefp128, VX128_3(6, 1648), VX128_3)(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + return InstrEmit_vrsqrtefp_(e, c, VX128_3_VD128, VX128_3_VB128); } XEEMITTER(vsel, 0x1000002A, VXA )(X64Emitter& e, X86Compiler& c, InstrData& i) { @@ -1230,14 +1326,48 @@ XEEMITTER(vslo128, VX128(5, 912), VX128 )(X64Emitter& e, X86Compiler& return 1; } -XEEMITTER(vslw, 0x10000184, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; +int InstrEmit_vslw_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t va, uint32_t vb) { + // VA = |xxxxx|yyyyy|zzzzz|wwwww| + // VB = |...sh|...sh|...sh|...sh| + // VD = |x< 32 and load. + int32_t simm = (uimm & 0x10) ? (uimm | 0xFFFFFFF0) : uimm; + GpVar simm_v(c.newGpVar()); + c.mov(simm_v, imm(simm)); + c.movd(v, simm_v.r32()); + c.pshufd(v, v, imm(0)); + } else { + // Zero out the register. 
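// [Editor's note -- illustrative aside, not part of this patch.]
// This is the uimm == 0 case of vspltisw: all four lanes splat to zero, and
// xorps v, v is the usual idiom for zeroing an XMM register without a load.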
+ c.xorps(v, v); + } + e.update_vr_value(vd, v); + e.TraceVR(vd); + return 0; +} XEEMITTER(vspltisw, 0x1000038C, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + return InstrEmit_vspltisw_(e, c, i.VX.VD, i.VX.VA); } - XEEMITTER(vspltisw128, VX128_3(6, 1904), VX128_3)(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + return InstrEmit_vspltisw_(e, c, VX128_3_VD128, VX128_3_IMM); } +int InstrEmit_vspltw_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t vb, uint32_t uimm) { + // (VD.xyzw) <- (VB.uimm) + XmmVar v(c.newXmmVar()); + c.movaps(v, e.vr_value(vb)); + switch (uimm) { + case 0: // x + c.shufps(v, v, imm(0x00)); + break; + case 1: // y + c.shufps(v, v, imm(0x55)); + break; + case 2: // z + c.shufps(v, v, imm(0xAA)); + break; + case 3: // w + c.shufps(v, v, imm(0xFF)); + break; + } + e.update_vr_value(vd, v); + e.TraceVR(vd, vb); + return 0; +} XEEMITTER(vspltw, 0x1000028C, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + return InstrEmit_vspltw_(e, c, i.VX.VD, i.VX.VB, i.VX.VA); } - XEEMITTER(vspltw128, VX128_3(6, 1840), VX128_3)(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + return InstrEmit_vspltw_(e, c, VX128_3_VD128, VX128_3_VB128, VX128_3_IMM); } XEEMITTER(vsr, 0x100002C4, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { @@ -1340,32 +1504,21 @@ XEEMITTER(vsubcuw, 0x10000580, VX )(X64Emitter& e, X86Compiler& c, Instr return 1; } -XEEMITTER(vsubfp, 0x1000004A, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { +int InstrEmit_vsubfp_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t va, uint32_t vb) { // (VD) <- (VA) - (VB) (4 x fp) - XmmVar v(c.newXmmVar()); - c.movq(v, e.vr_value(i.VX.VA)); - c.subps(v, e.vr_value(i.VX.VB)); - e.update_vr_value(i.VX.VD, v); - - return 0; -} - -XEEMITTER(vsubfp128, VX128(5, 80), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) { - // (VD) <- (VA) - (VB) (4 x fp) - - const uint32_t vd = i.VX128.VD128l | (i.VX128.VD128h << 5); - const uint32_t va = i.VX128.VA128l | (i.VX128.VA128h << 5) | - (i.VX128.VA128H << 6); - const uint32_t vb = i.VX128.VB128l | (i.VX128.VB128h << 5); - - XmmVar v(c.newXmmVar()); - c.movq(v, e.vr_value(va)); + c.movaps(v, e.vr_value(va)); c.subps(v, e.vr_value(vb)); e.update_vr_value(vd, v); - + e.TraceVR(vd, va, vb); return 0; } +XEEMITTER(vsubfp, 0x1000004A, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { + return InstrEmit_vsubfp_(e, c, i.VX.VD, i.VX.VA, i.VX.VB); +} +XEEMITTER(vsubfp128, VX128(5, 80), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) { + return InstrEmit_vsubfp_(e, c, VX128_VD128, VX128_VA128, VX128_VB128); +} XEEMITTER(vsubsbs, 0x10000700, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { XEINSTRNOTIMPLEMENTED(); @@ -1478,37 +1631,87 @@ XEEMITTER(vupklsh, 0x100002CE, VX )(X64Emitter& e, X86Compiler& c, Instr } XEEMITTER(vupkd3d128, VX128_3(6, 2032), VX128_3)(X64Emitter& e, X86Compiler& c, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; -} - -XEEMITTER(vxor, 0x100004C4, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { - // VD <- (VA) ^ (VB) - + // Can't find many docs on this. Best reference is + // http://worldcraft.googlecode.com/svn/trunk/src/qylib/math/xmmatrix.inl, + // which shows how it's used in some cases. Since it's all intrinsics, + // finding it in code is pretty easy. 
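// [Editor's note -- illustrative aside, not part of this patch.]
// The VPACK_NORMSHORT2 case below leans on raw IEEE-754 bit patterns:
//   0x34800000 is 2^-22, 0x40400000 is 3.0f, 0x3F800000 is 1.0f,
// so the emitted sequence produces 3.0 + (VB.x)*2^-22 and 3.0 + (VB.y)*2^-22
// as described in the per-case comment. Whether that matches the console's
// exact unpack semantics is an assumption inherited from the linked reference.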
+ const uint32_t vd = i.VX128_3.VD128l | (i.VX128_3.VD128h << 5); + const uint32_t vb = i.VX128_3.VB128l | (i.VX128_3.VB128h << 5); + const uint32_t type = i.VX128_3.IMM >> 2; XmmVar v(c.newXmmVar()); - c.movq(v, e.vr_value(i.VX.VB)); - c.pxor(v, e.vr_value(i.VX.VA)); - e.update_vr_value(i.VX.VD, v); - - return 0; -} - -XEEMITTER(vxor128, VX128(5, 784), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) { - // VD <- (VA) ^ (VB) - - const uint32_t vd = i.VX128.VD128l | (i.VX128.VD128h << 5); - const uint32_t va = i.VX128.VA128l | (i.VX128.VA128h << 5) | - (i.VX128.VA128H << 6); - const uint32_t vb = i.VX128.VB128l | (i.VX128.VB128h << 5); - - XmmVar v(c.newXmmVar()); - c.movq(v, e.vr_value(vb)); - c.pxor(v, e.vr_value(va)); + GpVar gt(c.newGpVar()); + XmmVar vt(c.newXmmVar()); + switch (type) { + case 1: // VPACK_NORMSHORT2 + { + // (VD.x) = 3.0 + (VB.x)*2^-22 + // (VD.y) = 3.0 + (VB.y)*2^-22 + // (VD.z) = 0.0 + // (VD.w) = 3.0 + c.movaps(vt, e.vr_value(vb)); + c.xorps(v, v); + // VB.x|VB.y|0|0 + c.shufps(vt, v, imm(0x10)); + // *=2^-22 + c.mov(gt, imm(0x34800000)); + c.pinsrd(v, gt.r32(), imm(0)); + c.pinsrd(v, gt.r32(), imm(1)); + c.mulps(v, vt); + // {3.0, 3.0, 0, 1.0} + c.xorps(vt, vt); + c.mov(gt, imm(0x40400000)); + c.pinsrd(vt, gt.r32(), imm(0)); + c.pinsrd(vt, gt.r32(), imm(1)); + c.mov(gt, imm(0x3F800000)); + c.pinsrd(vt, gt.r32(), imm(3)); + c.addps(v, vt); + } + break; + case 3: // VPACK_... 2 FLOAT16s + { + // (VD.x) = fixed_16_to_32(VB.x) + // (VD.y) = fixed_16_to_32(VB.y) + // (VD.z) = 0.0 + // (VD.w) = 1.0 + // 1 bit sign, 5 bit exponent, 10 bit mantissa + // TODO(benvanik): fixed_16_to_32 in SSE? + // {0.0, 0.0, 0.0, 1.0} + c.mov(gt, imm(0x3F800000)); + c.pinsrd(vt, gt.r32(), imm(3)); + c.movaps(v, vt); + c.int3(); + } + break; + default: + XEASSERTALWAYS(); + return 1; + } e.update_vr_value(vd, v); - + e.TraceVR(vd, vb); return 0; } +int InstrEmit_vxor_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t va, uint32_t vb) { + // VD <- (VA) ^ (VB) + XmmVar v(c.newXmmVar()); + if (va == vb) { + // Fast clear. + c.xorps(v, v); + } else { + c.movaps(v, e.vr_value(vb)); + c.pxor(v, e.vr_value(va)); + } + e.update_vr_value(vd, v); + e.TraceVR(vd, va, vb); + return 0; +} +XEEMITTER(vxor, 0x100004C4, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) { + return InstrEmit_vxor_(e, c, i.VX.VD, i.VX.VA, i.VX.VB); +} +XEEMITTER(vxor128, VX128(5, 784), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) { + return InstrEmit_vxor_(e, c, VX128_VD128, VX128_VA128, VX128_VB128); +} + void X64RegisterEmitCategoryAltivec() { XEREGISTERINSTR(dst, 0x7C0002AC); diff --git a/src/xenia/cpu/x64/x64_emitter.cc b/src/xenia/cpu/x64/x64_emitter.cc index 1a2069735..a02b2116f 100644 --- a/src/xenia/cpu/x64/x64_emitter.cc +++ b/src/xenia/cpu/x64/x64_emitter.cc @@ -74,6 +74,11 @@ X64Emitter::X64Emitter(xe_memory_ref memory) : assembler_.setLogger(logger_); compiler_.setLogger(logger_); } + + // Grab CPU feature mask so we can quickly check it in emitter code. 
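// [Editor's note -- illustrative aside, not part of this patch.]
// Caching the feature bits here lets emitters branch on them without another
// CpuInfo query per instruction, e.g. as done in vmsum4fp128 above:
//   if (e.cpu_feature_mask() & kX86FeatureSse41) { /* dpps path */ }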
+ const CpuInfo* cpu = CpuInfo::getGlobal(); + const X86CpuInfo* x86Cpu = static_cast(cpu); + cpu_feature_mask_ = cpu->getFeatures(); } X64Emitter::~X64Emitter() { @@ -906,6 +911,47 @@ void X64Emitter::TraceBranch(uint32_t cia) { } } +void X64Emitter::TraceVR(uint32_t vr0, uint32_t vr1, uint32_t vr2, + uint32_t vr3, uint32_t vr4) { + X86Compiler& c = compiler_; + + for (int n = 0; n < 5; n++) { + c.nop(); + } + + if (FLAGS_annotate_disassembly) { + c.comment("XeTraceVR (+spill)"); + } + + SpillRegisters(); + + // TODO(benvanik): remove once fixed: https://code.google.com/p/asmjit/issues/detail?id=86 + GpVar arg1 = c.newGpVar(kX86VarTypeGpq); + c.mov(arg1, imm(vr0)); + GpVar arg2 = c.newGpVar(kX86VarTypeGpq); + c.mov(arg2, imm(vr1)); + GpVar arg3 = c.newGpVar(kX86VarTypeGpq); + c.mov(arg3, imm(vr2)); + GpVar arg4 = c.newGpVar(kX86VarTypeGpq); + c.mov(arg4, imm(vr3)); + GpVar arg5 = c.newGpVar(kX86VarTypeGpq); + c.mov(arg5, imm(vr4)); + X86CompilerFuncCall* call = c.call(global_exports_.XeTraceVR); + call->setPrototype(kX86FuncConvDefault, + FuncBuilder6()); + call->setArgument(0, c.getGpArg(0)); + call->setArgument(1, arg1); + call->setArgument(2, arg2); + call->setArgument(3, arg3); + call->setArgument(4, arg4); + call->setArgument(5, arg5); + + for (int n = 0; n < 2; n++) { + c.nop(); + } +} + int X64Emitter::GenerateIndirectionBranch(uint32_t cia, GpVar& target, bool lk, bool likely_local) { X86Compiler& c = compiler_; @@ -1221,9 +1267,9 @@ void X64Emitter::FillRegisters() { if (FLAGS_annotate_disassembly) { c.comment("Filling vr%d", n); } - c.movq(locals_.vr[n], - xmmword_ptr(c.getGpArg(0), - offsetof(xe_ppc_state_t, v) + 16 * n)); + c.movaps(locals_.vr[n], + xmmword_ptr(c.getGpArg(0), + offsetof(xe_ppc_state_t, v) + 16 * n)); } } } @@ -1324,9 +1370,9 @@ void X64Emitter::SpillRegisters() { if (FLAGS_annotate_disassembly) { c.comment("Spilling vr%d", n); } - c.movq(xmmword_ptr(c.getGpArg(0), - offsetof(xe_ppc_state_t, v) + 16 * n), - v); + c.movaps(xmmword_ptr(c.getGpArg(0), + offsetof(xe_ppc_state_t, v) + 16 * n), + v); } } } @@ -1669,8 +1715,8 @@ XmmVar X64Emitter::vr_value(uint32_t n) { return locals_.vr[n]; } else { XmmVar value(c.newXmmVar()); - c.movq(value, - xmmword_ptr(c.getGpArg(0), offsetof(xe_ppc_state_t, v) + 16 * n)); + c.movaps(value, + xmmword_ptr(c.getGpArg(0), offsetof(xe_ppc_state_t, v) + 16 * n)); return value; } } @@ -1680,10 +1726,10 @@ void X64Emitter::update_vr_value(uint32_t n, XmmVar& value) { XEASSERT(n >= 0 && n < 128); if (FLAGS_cache_registers) { XEASSERT(locals_.vr[n].getId() != kInvalidValue); - c.movq(locals_.vr[n], value); + c.movaps(locals_.vr[n], value); } else { - c.movq(xmmword_ptr(c.getGpArg(0), offsetof(xe_ppc_state_t, v) + 16 * n), - value); + c.movaps(xmmword_ptr(c.getGpArg(0), offsetof(xe_ppc_state_t, v) + 16 * n), + value); } } @@ -1793,7 +1839,7 @@ XmmVar X64Emitter::ReadMemoryXmm( GpVar real_address = TouchMemoryAddress(cia, addr); XmmVar value(c.newXmmVar()); - c.movq(value, xmmword_ptr(real_address)); + c.movaps(value, xmmword_ptr(real_address)); // Byte swap. 
// http://www.asmcommunity.net/forums/topic/?id=29743 @@ -1874,7 +1920,7 @@ void X64Emitter::WriteMemoryXmm( c.psllw(value, imm(8)); c.por(value, temp); - c.movq(xmmword_ptr(real_address), value); + c.movaps(xmmword_ptr(real_address), value); } GpVar X64Emitter::get_uint64(uint64_t value) { diff --git a/src/xenia/cpu/x64/x64_emitter.h b/src/xenia/cpu/x64/x64_emitter.h index c52094879..6daf9f49c 100644 --- a/src/xenia/cpu/x64/x64_emitter.h +++ b/src/xenia/cpu/x64/x64_emitter.h @@ -36,6 +36,8 @@ public: void Lock(); void Unlock(); + uint32_t cpu_feature_mask() const { return cpu_feature_mask_; } + int PrepareFunction(sdb::FunctionSymbol* symbol); int MakeFunction(sdb::FunctionSymbol* symbol); @@ -53,6 +55,8 @@ public: void TraceInstruction(ppc::InstrData& i); void TraceInvalidInstruction(ppc::InstrData& i); void TraceBranch(uint32_t cia); + void TraceVR(uint32_t vr0, uint32_t vr1 = UINT_MAX, uint32_t vr2 = UINT_MAX, + uint32_t vr3 = UINT_MAX, uint32_t vr4 = UINT_MAX); int GenerateIndirectionBranch(uint32_t cia, AsmJit::GpVar& target, bool lk, bool likely_local); @@ -128,6 +132,7 @@ private: xe_memory_ref memory_; GlobalExports global_exports_; xe_mutex_t* lock_; + uint32_t cpu_feature_mask_; void* gpu_this_; void* gpu_read_; diff --git a/src/xenia/cpu/x64/x64_jit.cc b/src/xenia/cpu/x64/x64_jit.cc index f748bb8fe..719ce04d9 100644 --- a/src/xenia/cpu/x64/x64_jit.cc +++ b/src/xenia/cpu/x64/x64_jit.cc @@ -100,8 +100,29 @@ static const BitDescription x86Features[] = { int X64JIT::CheckProcessor() { const CpuInfo* cpu = CpuInfo::getGlobal(); const X86CpuInfo* x86Cpu = static_cast(cpu); + const uint32_t mask = cpu->getFeatures(); + + // TODO(benvanik): ensure features we want are supported. + + // TODO(benvanik): check for SSE modes we use. + if (!(mask & kX86FeatureSse3)) { + XELOGE("CPU does not support SSE3+ instructions!"); + DumpCPUInfo(); + return 1; + } + if (!(mask & kX86FeatureSse41)) { + XELOGW("CPU does not support SSE4.1+ instructions, performance degraded!"); + DumpCPUInfo(); + } + + return 0; +} + +void X64JIT::DumpCPUInfo() { + const CpuInfo* cpu = CpuInfo::getGlobal(); + const X86CpuInfo* x86Cpu = static_cast(cpu); + const uint32_t mask = cpu->getFeatures(); -#if 0 XELOGCPU("Processor Info:"); XELOGCPU(" Vendor string : %s", cpu->getVendorString()); XELOGCPU(" Brand string : %s", cpu->getBrandString()); @@ -117,17 +138,11 @@ int X64JIT::CheckProcessor() { XELOGCPU(" Max logical Processors: %u", x86Cpu->getMaxLogicalProcessors()); XELOGCPU(" APIC Physical ID : %u", x86Cpu->getApicPhysicalId()); XELOGCPU(" Features:"); - uint32_t mask = cpu->getFeatures(); for (const BitDescription* d = x86Features; d->mask; d++) { if (mask & d->mask) { XELOGCPU(" %s", d->description); } } -#endif - - // TODO(benvanik): ensure features we want are supported. - - return 0; } int X64JIT::InitModule(ExecModule* module) { diff --git a/src/xenia/cpu/x64/x64_jit.h b/src/xenia/cpu/x64/x64_jit.h index d5204e54d..09bede44b 100644 --- a/src/xenia/cpu/x64/x64_jit.h +++ b/src/xenia/cpu/x64/x64_jit.h @@ -41,6 +41,7 @@ public: protected: int CheckProcessor(); + void DumpCPUInfo(); X64Emitter* emitter_; };