diff --git a/src/xenia/cpu/backend/x64/x64_backend.cc b/src/xenia/cpu/backend/x64/x64_backend.cc index 31e1dc9fd..7cb278e5d 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.cc +++ b/src/xenia/cpu/backend/x64/x64_backend.cc @@ -446,10 +446,11 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() { EmitSaveNonvolatileRegs(); mov(rax, rcx); - mov(rsi, rdx); // context - mov(rcx, r8); // return address + mov(rsi, rdx); // context + mov(rdi, ptr[rdx + offsetof(ppc::PPCContext, virtual_membase)]); // membase + mov(rcx, r8); // return address call(rax); - + vzeroupper(); EmitLoadNonvolatileRegs(); code_offsets.epilog = getSize(); @@ -500,7 +501,8 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() { code_offsets.prolog_stack_alloc = getSize(); code_offsets.body = getSize(); - + // chrispy: added this for proper vmsum impl, avx2 bitshifts + vzeroupper(); // Save off volatile registers. EmitSaveVolatileRegs(); diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index 707ab5642..c6f2d6180 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -101,13 +101,11 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator) TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL); TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW); TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ); - - - - + TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI); #undef TEST_EMIT_FEATURE /* - fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in latest version of xbyak + fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in + latest version of xbyak */ unsigned int data[4]; Xbyak::util::Cpu::getCpuid(0x80000001, data); @@ -117,21 +115,19 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator) } } if (cpu_.has(Xbyak::util::Cpu::tAMD)) { - - bool is_zennish = cpu_.displayFamily >= 0x17; + bool is_zennish = cpu_.displayFamily >= 0x17; - if (is_zennish) { - feature_flags_ |= kX64FastJrcx; + if (is_zennish) { + feature_flags_ |= kX64FastJrcx; - if (cpu_.displayFamily > 0x17) { - feature_flags_ |= kX64FastLoop; + if (cpu_.displayFamily > 0x17) { + feature_flags_ |= kX64FastLoop; - } else if (cpu_.displayFamily == 0x17 && cpu_.displayModel >= 0x31) { - feature_flags_ |= kX64FastLoop; - } // todo:figure out at model zen+ became zen2, this is just the model - // for my cpu, which is ripper90 - - } + } else if (cpu_.displayFamily == 0x17 && cpu_.displayModel >= 0x31) { + feature_flags_ |= kX64FastLoop; + } // todo:figure out at model zen+ became zen2, this is just the model + // for my cpu, which is ripper90 + } } } @@ -263,7 +259,10 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { code_offsets.prolog_stack_alloc = getSize(); code_offsets.body = getSize(); + /* + * chrispy: removed this, it serves no purpose mov(qword[rsp + StackLayout::GUEST_CTX_HOME], GetContextReg()); + */ mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rcx); mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], 0); @@ -296,9 +295,11 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { } // Load membase. - mov(GetMembaseReg(), + /* + * chrispy: removed this, as long as we load it in HostToGuestThunk we can + count on no other code modifying it. mov(GetMembaseReg(), qword[GetContextReg() + offsetof(ppc::PPCContext, virtual_membase)]); - + */ // Body. 
auto block = builder->first_block(); while (block) { @@ -318,7 +319,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { // NOTE: If you encounter this after adding a new instruction, do a full // rebuild! assert_always(); - XELOGE("Unable to process HIR opcode {}", instr->opcode->name); + XELOGE("Unable to process HIR opcode {}", GetOpcodeName(instr->opcode)); break; } instr = new_tail; @@ -331,8 +332,10 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { L(epilog_label); epilog_label_ = nullptr; EmitTraceUserCallReturn(); + /* + * chrispy: removed this, it serves no purpose mov(GetContextReg(), qword[rsp + StackLayout::GUEST_CTX_HOME]); - + */ code_offsets.epilog = getSize(); add(rsp, (uint32_t)stack_size); @@ -342,7 +345,6 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { if (cvars::emit_source_annotations) { nop(5); - } assert_zero(code_offsets.prolog); @@ -676,37 +678,9 @@ Xbyak::Reg64 X64Emitter::GetNativeParam(uint32_t param) { Xbyak::Reg64 X64Emitter::GetContextReg() { return rsi; } Xbyak::Reg64 X64Emitter::GetMembaseReg() { return rdi; } -void X64Emitter::ReloadContext() { - mov(GetContextReg(), qword[rsp + StackLayout::GUEST_CTX_HOME]); -} - void X64Emitter::ReloadMembase() { mov(GetMembaseReg(), qword[GetContextReg() + 8]); // membase } -#define __NH_CONCAT(x, y) x##y -#define _MH_CONCAT(cb, ...) cb (__VA_ARGS__) - -#define mh_concat2_m(x, y) __NH_CONCAT(x, y) - -#define DECLNOP(n, ...) \ - static constexpr unsigned char mh_concat2_m(nop_, n)[] = {__VA_ARGS__} - -DECLNOP(1, 0x90); -DECLNOP(2, 0x66, 0x90); -DECLNOP(3, 0x0F, 0x1F, 0x00); -DECLNOP(4, 0x0F, 0x1F, 0x40, 0x00); -DECLNOP(5, 0x0F, 0x1F, 0x44, 0x00, 0x00); -DECLNOP(6, 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00); -DECLNOP(7, 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00); -DECLNOP(8, 0x0F, 0x1F, 0x84, 00, 00, 00, 00, 00); -DECLNOP(9, 0x66, 0x0F, 0x1F, 0x84, 00, 00, 00, 00, 00); - -static constexpr const unsigned char* const g_noptable[] = { - &nop_1[0], &nop_1[0], &nop_2[0], &nop_3[0], &nop_4[0], - &nop_5[0], &nop_6[0], &nop_7[0], &nop_8[0], &nop_9[0]}; - -static constexpr unsigned LENGTHOF_NOPTABLE = - sizeof(g_noptable) / sizeof(g_noptable[0]); // Len Assembly Byte Sequence // ============================================================================ @@ -720,17 +694,8 @@ static constexpr unsigned LENGTHOF_NOPTABLE = // 8b NOP DWORD ptr [EAX + EAX*1 + 00000000H] 0F 1F 84 00 00 00 00 00H // 9b 66 NOP DWORD ptr [EAX + EAX*1 + 00000000H] 66 0F 1F 84 00 00 00 00 00H void X64Emitter::nop(size_t length) { - while (length != 0) { - unsigned patchsize = length % LENGTHOF_NOPTABLE; - - // patch_memory(locptr, size, (char*)g_noptable[patchsize]); - - for (unsigned i = 0; i < patchsize; ++i) { - db(g_noptable[patchsize][i]); - } - - //locptr += patchsize; - length -= patchsize; + for (size_t i = 0; i < length; ++i) { + db(0x90); } } @@ -912,8 +877,17 @@ static const vec128_t xmm_consts[] = { 0x80, 0x80, 0x80, 0x80), /*XMMShortsToBytes*/ v128_setr_bytes(0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80) -}; + 0x80, 0x80, 0x80), + /*XMMLVSLTableBase*/ + vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + /*XMMLVSRTableBase*/ + vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), + /* XMMSingleDenormalMask */ + vec128i(0x7f800000), + /* XMMThreeFloatMask */ + vec128i(~0U, ~0U, ~0U, 0U), + /*XMMXenosF16ExtRangeStart*/ + vec128f(65504)}; void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) { for (auto& 
vec : xmm_consts) { @@ -1013,7 +987,6 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) { // 1111... vpcmpeqb(dest, dest); } else { - for (size_t i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) { if (xmm_consts[i] == v) { vmovapd(dest, GetXmmConstPtr((XmmConst)i)); diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 66a02fcc1..d73d86fe1 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -118,7 +118,12 @@ enum XmmConst { XMM2To32, XMMFloatInf, XMMIntsToBytes, - XMMShortsToBytes + XMMShortsToBytes, + XMMLVSLTableBase, + XMMLVSRTableBase, + XMMSingleDenormalMask, + XMMThreeFloatMask, //for clearing the fourth float prior to DOT_PRODUCT_3 + XMMXenosF16ExtRangeStart }; // Unfortunately due to the design of xbyak we have to pass this to the ctor. @@ -147,6 +152,7 @@ enum X64EmitterFeatureFlags { kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ, kX64FastJrcx = 1 << 12, //jrcxz is as fast as any other jump ( >= Zen1) kX64FastLoop = 1 << 13, //loop/loope/loopne is as fast as any other jump ( >= Zen2) + kX64EmitAVX512VBMI = 1 << 14 }; class ResolvableGuestCall { public: @@ -225,7 +231,7 @@ class X64Emitter : public Xbyak::CodeGenerator { Xbyak::Reg64 GetContextReg(); Xbyak::Reg64 GetMembaseReg(); - void ReloadContext(); + void ReloadMembase(); void nop(size_t length = 1); diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index 299e7674f..1cca6469f 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -127,6 +127,26 @@ struct VECTOR_CONVERT_F2I }; EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_F2I, VECTOR_CONVERT_F2I); +struct VECTOR_DENORMFLUSH + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vxorps(e.xmm1, e.xmm1, e.xmm1); // 0.25 P0123 + + e.vandps(e.xmm0, i.src1, + e.GetXmmConstPtr(XMMSingleDenormalMask)); // 0.25 P0123 + e.vcmpneqps(e.xmm2, e.xmm0, e.xmm1); // 0.5 P01 + e.vandps(e.xmm1, i.src1, + e.GetXmmConstPtr(XMMSignMaskF32)); // 0.5 P0123 take signs, zeros + // must keep their signs + e.vandps(e.xmm0, i.src1, e.xmm2); // P0123 + e.vorps(i.dest, e.xmm0, e.xmm1); // P0123 make sure zeros keep signs + + // if it does not equal zero, we stay + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_DENORMFLUSH, VECTOR_DENORMFLUSH); + // ============================================================================ // OPCODE_LOAD_VECTOR_SHL // ============================================================================ @@ -154,15 +174,20 @@ struct LOAD_VECTOR_SHL_I8 if (i.src1.is_constant) { auto sh = i.src1.constant(); assert_true(sh < xe::countof(lvsl_table)); - e.mov(e.rax, (uintptr_t)&lvsl_table[sh]); - e.vmovaps(i.dest, e.ptr[e.rax]); + if (sh == 0) { + e.vmovdqa(i.dest, e.GetXmmConstPtr(XMMLVSLTableBase)); + } else { + // this is probably extremely rare + e.LoadConstantXmm(i.dest, lvsl_table[sh]); + } } else { // TODO(benvanik): find a cheaper way of doing this. - e.movzx(e.rdx, i.src1); - e.and_(e.dx, 0xF); - e.shl(e.dx, 4); - e.mov(e.rax, (uintptr_t)lvsl_table); - e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]); + // chrispy: removed mask, ppc_emit_altivec already pre-ands it. 
+ e.vmovd(e.xmm0, i.src1.reg().cvt32()); + // broadcast byte + // dont use broadcastb with avx2, its slower than shuf + e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMZero)); + e.vpaddb(i.dest, e.xmm0, e.GetXmmConstPtr(XMMLVSLTableBase)); } } }; @@ -195,15 +220,23 @@ struct LOAD_VECTOR_SHR_I8 if (i.src1.is_constant) { auto sh = i.src1.constant(); assert_true(sh < xe::countof(lvsr_table)); - e.mov(e.rax, (uintptr_t)&lvsr_table[sh]); - e.vmovaps(i.dest, e.ptr[e.rax]); + if (sh == 0) { + e.vmovdqa(i.dest, e.GetXmmConstPtr(XMMLVSRTableBase)); + } else { + e.LoadConstantXmm(i.dest, lvsr_table[sh]); + } } else { // TODO(benvanik): find a cheaper way of doing this. - e.movzx(e.rdx, i.src1); - e.and_(e.dx, 0xF); - e.shl(e.dx, 4); - e.mov(e.rax, (uintptr_t)lvsr_table); - e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]); + + // chrispy: removed mask, ppc_emit_altivec already pre-ands it. removed + // lookup as well, compute from LVSR base instead + e.vmovd(e.xmm0, i.src1.reg().cvt32()); + e.vmovdqa(e.xmm1, e.GetXmmConstPtr(XMMLVSRTableBase)); + // broadcast byte + // dont use broadcastb with avx2, its slower than shuf + e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMZero)); + + e.vpsubb(i.dest, e.xmm1, e.xmm0); } } }; @@ -728,7 +761,7 @@ struct VECTOR_SHL_V128 } } -static void EmitInt8(X64Emitter& e, const EmitArgType& i) { + static void EmitInt8(X64Emitter& e, const EmitArgType& i) { // TODO(benvanik): native version (with shift magic). if (e.IsFeatureEnabled(kX64EmitAVX2)) { @@ -1793,6 +1826,14 @@ struct PERMUTE_I32 } } }; +//todo: use this on const src1 +static vec128_t FixupConstantShuf8(vec128_t input) { + for (uint32_t i = 0; i < 16; ++i) { + input.u8[i] ^= 0x03; + input.u8[i] &= 0x1F; + } + return input; +} struct PERMUTE_V128 : Sequence> { @@ -1855,7 +1896,8 @@ struct PERMUTE_V128 } else { e.vpshufb(src3_shuf, i.src3, e.xmm2); } - // Build a mask with values in src2 having 0 and values in src3 having 1. + // Build a mask with values in src2 having 0 and values in src3 + // having 1. 
e.vpcmpgtb(i.dest, e.xmm2, e.GetXmmConstPtr(XMMPermuteControl15)); e.vpblendvb(i.dest, src2_shuf, src3_shuf, i.dest); } diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index b647ff404..5af7db24d 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -35,11 +35,14 @@ #include "xenia/cpu/backend/x64/x64_emitter.h" #include "xenia/cpu/backend/x64/x64_op.h" #include "xenia/cpu/backend/x64/x64_tracers.h" +// needed for stmxcsr +#include "xenia/cpu/backend/x64/x64_stack_layout.h" #include "xenia/cpu/hir/hir_builder.h" #include "xenia/cpu/processor.h" DEFINE_bool(use_fast_dot_product, false, - "Experimental optimization, much shorter sequence on dot products, treating inf as overflow instead of using mcxsr" + "Experimental optimization, much shorter sequence on dot products, " + "treating inf as overflow instead of using mcxsr" "four insn dotprod", "CPU"); namespace xe { @@ -1996,8 +1999,8 @@ struct DIV_V128 : Sequence> { assert_true(!i.instr->flags); EmitAssociativeBinaryXmmOp(e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - // e.vrcpps(e.xmm0, src2); - //e.vmulps(dest, src1, e.xmm0); + // e.vrcpps(e.xmm0, src2); + // e.vmulps(dest, src1, e.xmm0); e.vdivps(dest, src1, src2); }); } @@ -2607,68 +2610,84 @@ struct LOG2_V128 : Sequence> { }; EMITTER_OPCODE_TABLE(OPCODE_LOG2, LOG2_F32, LOG2_F64, LOG2_V128); -struct DOT_PRODUCT_V128 { - static void Emit(X64Emitter& e, Xmm dest, Xmm src1, Xmm src2, uint8_t imm) { - if (cvars::use_fast_dot_product) { - e.vdpps(dest, src1, src2, imm); - e.vandps(e.xmm0, dest, e.GetXmmConstPtr(XMMAbsMaskPS)); - e.vcmpgeps(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMFloatInf)); - e.vblendvps(dest, dest, e.GetXmmConstPtr(XMMQNaN), e.xmm0); - - } else { - // TODO(benvanik): apparently this is very slow - // - find alternative? - Xbyak::Label end; - e.inLocalLabel(); - - // Grab space to put MXCSR. - // TODO(gibbed): stick this in TLS or - // something? - e.sub(e.rsp, 8); - - // Grab MXCSR and mask off the overflow flag, - // because it's sticky. - e.vstmxcsr(e.dword[e.rsp]); - e.mov(e.eax, e.dword[e.rsp]); - e.and_(e.eax, uint32_t(~8)); - e.mov(e.dword[e.rsp], e.eax); - e.vldmxcsr(e.dword[e.rsp]); - - // Hey we can do the dot product now. - e.vdpps(dest, src1, src2, imm); - - // Load MXCSR... - e.vstmxcsr(e.dword[e.rsp]); - - // ..free our temporary space and get MXCSR at - // the same time - e.pop(e.rax); - - // Did we overflow? - e.test(e.al, 8); - e.jz(end); - - // Infinity? HA! Give NAN. - e.vmovdqa(dest, e.GetXmmConstPtr(XMMQNaN)); - - e.L(end); - e.outLocalLabel(); - } - } -}; - // ============================================================================ // OPCODE_DOT_PRODUCT_3 // ============================================================================ struct DOT_PRODUCT_3_V128 : Sequence> { + I> { static void Emit(X64Emitter& e, const EmitArgType& i) { - // https://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx - EmitCommutativeBinaryXmmOp( - e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - DOT_PRODUCT_V128::Emit(e, dest, src1, src2, 0b01110001); - }); + // todo: add fast_dot_product path that just checks for infinity instead of + // using mxcsr + auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH64]; + + // this is going to hurt a bit... + /* + this implementation is accurate, it matches the results of xb360 vmsum3 + except that vmsum3 is often off by 1 bit, but its extremely slow. 
it is a + long, unbroken chain of dependencies, and the three uses of mxcsr all cost + about 15-20 cycles at the very least on amd zen processors. on older amd the + figures agner has are pretty horrible. it looks like its just as bad on + modern intel cpus also up until just recently. perhaps a better way of + detecting overflow would be to just compare with inf. todo: test whether cmp + with inf can replace + */ + e.vstmxcsr(mxcsr_storage); + + e.mov(e.eax, 8); + + auto src1v = e.xmm0; + auto src2v = e.xmm1; + if (i.src1.is_constant) { + src1v = e.xmm0; + e.LoadConstantXmm(src1v, i.src1.constant()); + } else { + src1v = i.src1.reg(); + } + if (i.src2.is_constant) { + src2v = e.xmm1; + e.LoadConstantXmm(src2v, i.src2.constant()); + } else { + src2v = i.src2.reg(); + } + e.not_(e.eax); + // todo: maybe the top element should be cleared by the InstrEmit_ function + // so that in the future this could be optimized away if the top is known to + // be zero. Right now im not sure that happens often though and its + // currently not worth it also, maybe pre-and if constant + e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask)); + e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask)); + + e.and_(mxcsr_storage, e.eax); + e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to + // go + + e.vcvtps2pd(e.ymm0, e.xmm3); + e.vcvtps2pd(e.ymm1, e.xmm2); + /* + ymm0 = src1 as doubles, ele 3 cleared + ymm1 = src2 as doubles, ele 3 cleared + */ + e.vmulpd(e.ymm3, e.ymm0, e.ymm1); + e.vextractf128(e.xmm2, e.ymm3, 1); + e.vunpckhpd(e.xmm0, e.xmm3, e.xmm3); // get element [1] in xmm3 + e.vaddsd(e.xmm3, e.xmm3, e.xmm2); + e.not_(e.eax); + e.vaddsd(e.xmm2, e.xmm3, e.xmm0); + e.vcvtsd2ss(e.xmm1, e.xmm2); + + // this is awful + e.vstmxcsr(mxcsr_storage); + e.test(mxcsr_storage, e.eax); + Xbyak::Label ret_qnan; + Xbyak::Label done; + e.jnz(ret_qnan); + // e.vshufps(i.dest, e.xmm1,e.xmm1, 0); // broadcast + e.vbroadcastss(i.dest, e.xmm1); + e.jmp(done); + e.L(ret_qnan); + e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN)); + e.L(done); } }; EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_3, DOT_PRODUCT_3_V128); @@ -2678,13 +2697,81 @@ EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_3, DOT_PRODUCT_3_V128); // ============================================================================ struct DOT_PRODUCT_4_V128 : Sequence> { + I> { static void Emit(X64Emitter& e, const EmitArgType& i) { - // https://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx - EmitCommutativeBinaryXmmOp( - e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - DOT_PRODUCT_V128::Emit(e, dest, src1, src2, 0b11110001); - }); + // todo: add fast_dot_product path that just checks for infinity instead of + // using mxcsr + auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH64]; + + e.vstmxcsr(mxcsr_storage); + + e.mov(e.eax, 8); + + auto src1v = e.xmm3; + auto src2v = e.xmm2; + if (i.src1.is_constant) { + src1v = e.xmm3; + e.LoadConstantXmm(src1v, i.src1.constant()); + } else { + src1v = i.src1.reg(); + } + if (i.src2.is_constant) { + src2v = e.xmm2; + e.LoadConstantXmm(src2v, i.src2.constant()); + } else { + src2v = i.src2.reg(); + } + e.not_(e.eax); + + e.and_(mxcsr_storage, e.eax); + e.vldmxcsr(mxcsr_storage); + + e.vcvtps2pd(e.ymm0, src1v); + e.vcvtps2pd(e.ymm1, src2v); + /* + e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask)); + e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask)); + + e.and_(mxcsr_storage, e.eax); + e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to + // 
go + + e.vcvtps2pd(e.ymm0, e.xmm3); + e.vcvtps2pd(e.ymm1, e.xmm2); + + + e.vmulpd(e.ymm5, e.ymm0, e.ymm1); + e.vextractf128(e.xmm4, e.ymm5, 1); + e.vunpckhpd(e.xmm3, e.xmm5, e.xmm5); // get element [1] in xmm3 + e.vaddsd(e.xmm5, e.xmm5, e.xmm4); + e.not_(e.eax); + e.vaddsd(e.xmm2, e.xmm5, e.xmm3); + e.vcvtsd2ss(e.xmm1, e.xmm2); + + */ + e.vmulpd(e.ymm3, e.ymm0, e.ymm1); + e.vextractf128(e.xmm2, e.ymm3, 1); + e.vaddpd(e.xmm3, e.xmm3, e.xmm2); + + e.vunpckhpd(e.xmm0, e.xmm3, e.xmm3); + e.not_(e.eax); + e.vaddsd(e.xmm2, e.xmm3, e.xmm0); + e.vcvtsd2ss(e.xmm1, e.xmm2); + + e.vstmxcsr(mxcsr_storage); + + e.test(mxcsr_storage, e.eax); + + Xbyak::Label ret_qnan; + Xbyak::Label done; + e.jnz(ret_qnan); // reorder these jmps later, just want to get this fix in + // e.vshufps(i.dest, e.xmm1, e.xmm1, 0); + e.vbroadcastss(i.dest, e.xmm1); + e.jmp(done); + e.L(ret_qnan); + e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN)); + e.L(done); + // e.DebugBreak(); } }; EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_4, DOT_PRODUCT_4_V128); @@ -2759,7 +2846,6 @@ struct AND_I64 : Sequence> { }; struct AND_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp(e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { e.vpand(dest, src1, src2); @@ -3419,7 +3505,7 @@ bool SelectSequence(X64Emitter* e, const Instr* i, const Instr** new_tail) { return true; } } - XELOGE("No sequence match for variant {}", i->opcode->name); + XELOGE("No sequence match for variant {}", GetOpcodeName(i->opcode)); return false; } diff --git a/src/xenia/cpu/backend/x64/x64_stack_layout.h b/src/xenia/cpu/backend/x64/x64_stack_layout.h index 1736dc02a..5bd50a803 100644 --- a/src/xenia/cpu/backend/x64/x64_stack_layout.h +++ b/src/xenia/cpu/backend/x64/x64_stack_layout.h @@ -122,7 +122,8 @@ class StackLayout { * */ static const size_t GUEST_STACK_SIZE = 104; - static const size_t GUEST_CTX_HOME = 80; + //was GUEST_CTX_HOME, can't remove because that'd throw stack alignment off. 
instead, can be used as a temporary in sequences + static const size_t GUEST_SCRATCH64 = 80; static const size_t GUEST_RET_ADDR = 88; static const size_t GUEST_CALL_RET_ADDR = 96; }; diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc index df76cc25d..025b4114e 100644 --- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc +++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc @@ -312,13 +312,18 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { result = true; } else if (i->src2.value->IsConstant() && i->src3.value->IsConstant()) { - // TODO: Select - // v->set_from(i->src2.value); - // v->Select(i->src3.value, i->src1.value); - // i->Remove(); + v->set_from(i->src2.value); + v->Select(i->src3.value, i->src1.value); + i->Remove(); + result = true; } } else { - // TODO: vec128 select + if (i->src2.value->IsConstant() && i->src3.value->IsConstant()) { + v->set_from(i->src2.value); + v->Select(i->src3.value, i->src1.value); + i->Remove(); + result = true; + } } } break; @@ -744,8 +749,35 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { result = true; } break; - // TODO(benvanik): INSERT/EXTRACT - // TODO(benvanik): PERMUTE/SWIZZLE + + case OPCODE_PERMUTE: { + if (i->src1.value->IsConstant() && i->src2.value->IsConstant() && + i->src3.value->IsConstant() && + (i->flags == INT8_TYPE || i->flags == INT16_TYPE)) { + v->set_from(i->src1.value); + v->Permute(i->src2.value, i->src3.value, (TypeName)i->flags); + i->Remove(); + result = true; + } + break; + } + case OPCODE_INSERT: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant() && + i->src3.value->IsConstant()) { + v->set_from(i->src1.value); + v->Insert(i->src2.value, i->src3.value, (TypeName)i->flags); + i->Remove(); + result = true; + } + break; + case OPCODE_SWIZZLE: + if (i->src1.value->IsConstant()) { + v->set_from(i->src1.value); + v->Swizzle((uint32_t)i->src2.offset, (TypeName)i->flags); + i->Remove(); + result = true; + } + break; case OPCODE_EXTRACT: if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { v->set_zero(v->type); @@ -867,24 +899,6 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { } break; - case OPCODE_DOT_PRODUCT_3: - if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { - v->set_from(i->src1.value); - v->DotProduct3(i->src2.value); - i->Remove(); - result = true; - } - break; - - case OPCODE_DOT_PRODUCT_4: - if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { - v->set_from(i->src1.value); - v->DotProduct4(i->src2.value); - i->Remove(); - result = true; - } - break; - case OPCODE_VECTOR_AVERAGE: if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { v->set_from(i->src1.value); @@ -896,7 +910,14 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { result = true; } break; - + case OPCODE_VECTOR_DENORMFLUSH: + if (i->src1.value->IsConstant()) { + v->set_from(i->src1.value); + v->DenormalFlush(); + i->Remove(); + result = true; + } + break; default: // Ignored. 
break; diff --git a/src/xenia/cpu/compiler/passes/data_flow_analysis_pass.cc b/src/xenia/cpu/compiler/passes/data_flow_analysis_pass.cc index 56cf1c769..f1613b481 100644 --- a/src/xenia/cpu/compiler/passes/data_flow_analysis_pass.cc +++ b/src/xenia/cpu/compiler/passes/data_flow_analysis_pass.cc @@ -132,10 +132,10 @@ void DataFlowAnalysisPass::AnalyzeFlow(HIRBuilder* builder, while (outgoing_ordinal != -1) { Value* src_value = value_map[outgoing_ordinal]; assert_not_null(src_value); - if (!src_value->local_slot) { - src_value->local_slot = builder->AllocLocal(src_value->type); + if (!src_value->HasLocalSlot()) { + src_value->SetLocalSlot(builder->AllocLocal(src_value->type)); } - builder->StoreLocal(src_value->local_slot, src_value); + builder->StoreLocal(src_value->GetLocalSlot(), src_value); // If we are in the block the value was defined in: if (src_value->def->block == block) { @@ -168,10 +168,10 @@ void DataFlowAnalysisPass::AnalyzeFlow(HIRBuilder* builder, while (incoming_ordinal != -1) { Value* src_value = value_map[incoming_ordinal]; assert_not_null(src_value); - if (!src_value->local_slot) { - src_value->local_slot = builder->AllocLocal(src_value->type); + if (!src_value->HasLocalSlot()) { + src_value->SetLocalSlot(builder->AllocLocal(src_value->type)); } - Value* local_value = builder->LoadLocal(src_value->local_slot); + Value* local_value = builder->LoadLocal(src_value->GetLocalSlot()); builder->last_instr()->MoveBefore(block->instr_head); // Swap uses of original value with the local value. diff --git a/src/xenia/cpu/compiler/passes/register_allocation_pass.cc b/src/xenia/cpu/compiler/passes/register_allocation_pass.cc index bd7380184..439b35708 100644 --- a/src/xenia/cpu/compiler/passes/register_allocation_pass.cc +++ b/src/xenia/cpu/compiler/passes/register_allocation_pass.cc @@ -365,7 +365,7 @@ bool RegisterAllocationPass::SpillOneRegister(HIRBuilder* builder, Block* block, auto new_head_use = next_use; // Allocate local. - if (spill_value->local_slot) { + if (spill_value->HasLocalSlot()) { // Value is already assigned a slot. Since we allocate in order and this is // all SSA we know the stored value will be exactly what we want. Yay, // we can prevent the redundant store! @@ -373,10 +373,10 @@ bool RegisterAllocationPass::SpillOneRegister(HIRBuilder* builder, Block* block, // use the spilled value and prevent the need for more locals. } else { // Allocate a local slot. - spill_value->local_slot = builder->AllocLocal(spill_value->type); + spill_value->SetLocalSlot(builder->AllocLocal(spill_value->type)); // Add store. - builder->StoreLocal(spill_value->local_slot, spill_value); + builder->StoreLocal(spill_value->GetLocalSlot(), spill_value); auto spill_store = builder->last_instr(); auto spill_store_use = spill_store->src2_use; assert_null(spill_store_use->prev); @@ -417,7 +417,7 @@ bool RegisterAllocationPass::SpillOneRegister(HIRBuilder* builder, Block* block, // use is after the instruction requesting the spill we know we haven't // done allocation for that code yet and can let that be handled // automatically when we get to it. - auto new_value = builder->LoadLocal(spill_value->local_slot); + auto new_value = builder->LoadLocal(spill_value->GetLocalSlot()); auto spill_load = builder->last_instr(); spill_load->MoveBefore(next_use->instr); // Note: implicit first use added. @@ -429,7 +429,7 @@ bool RegisterAllocationPass::SpillOneRegister(HIRBuilder* builder, Block* block, // Set the local slot of the new value to our existing one. 
This way we will // reuse that same memory if needed. - new_value->local_slot = spill_value->local_slot; + new_value->SetLocalSlot( spill_value->GetLocalSlot()); // Rename all future uses of the SSA value to the new value as loaded // from the local. diff --git a/src/xenia/cpu/hir/hir_builder.cc b/src/xenia/cpu/hir/hir_builder.cc index 3cc2c9aaa..2665842a5 100644 --- a/src/xenia/cpu/hir/hir_builder.cc +++ b/src/xenia/cpu/hir/hir_builder.cc @@ -260,9 +260,9 @@ void HIRBuilder::Dump(StringBuffer* str) { str->Append(" = "); } if (i->flags) { - str->AppendFormat("{}.{}", info->name, i->flags); + str->AppendFormat("{}.{}", GetOpcodeName(info), i->flags); } else { - str->Append(info->name); + str->Append(GetOpcodeName(info)); } if (src1_type) { str->Append(' '); @@ -712,7 +712,6 @@ Value* HIRBuilder::AllocValue(TypeName type) { value->use_head = NULL; value->last_use = NULL; value->local_slot = NULL; - value->tag = NULL; value->reg.set = NULL; value->reg.index = -1; return value; @@ -723,12 +722,11 @@ Value* HIRBuilder::CloneValue(Value* source) { value->ordinal = next_value_ordinal_++; value->type = source->type; value->flags = source->flags; + value->local_slot = NULL; value->constant.v128 = source->constant.v128; value->def = NULL; value->use_head = NULL; value->last_use = NULL; - value->local_slot = NULL; - value->tag = NULL; value->reg.set = NULL; value->reg.index = -1; return value; @@ -1493,7 +1491,16 @@ Value* HIRBuilder::VectorCompareUGE(Value* value1, Value* value2, return VectorCompareXX(OPCODE_VECTOR_COMPARE_UGE_info, value1, value2, part_type); } - +Value* HIRBuilder::VectorDenormFlush(Value* value1) { + return value1; + ASSERT_VECTOR_TYPE(value1); + Instr* i = + AppendInstr(OPCODE_VECTOR_DENORMFLUSH_info, 0, AllocValue(VEC128_TYPE)); + i->set_src1(value1); + i->src2.value = nullptr; + i->src3.value = nullptr; + return i->dest; +} Value* HIRBuilder::Add(Value* value1, Value* value2, uint32_t arithmetic_flags) { ASSERT_TYPES_EQUAL(value1, value2); @@ -1713,13 +1720,13 @@ Value* HIRBuilder::Log2(Value* value) { return i->dest; } + Value* HIRBuilder::DotProduct3(Value* value1, Value* value2) { ASSERT_VECTOR_TYPE(value1); ASSERT_VECTOR_TYPE(value2); ASSERT_TYPES_EQUAL(value1, value2); - Instr* i = - AppendInstr(OPCODE_DOT_PRODUCT_3_info, 0, AllocValue(FLOAT32_TYPE)); + Instr* i = AppendInstr(OPCODE_DOT_PRODUCT_3_info, 0, AllocValue(VEC128_TYPE)); i->set_src1(value1); i->set_src2(value2); i->src3.value = NULL; @@ -1731,8 +1738,7 @@ Value* HIRBuilder::DotProduct4(Value* value1, Value* value2) { ASSERT_VECTOR_TYPE(value2); ASSERT_TYPES_EQUAL(value1, value2); - Instr* i = - AppendInstr(OPCODE_DOT_PRODUCT_4_info, 0, AllocValue(FLOAT32_TYPE)); + Instr* i = AppendInstr(OPCODE_DOT_PRODUCT_4_info, 0, AllocValue(VEC128_TYPE)); i->set_src1(value1); i->set_src2(value2); i->src3.value = NULL; diff --git a/src/xenia/cpu/hir/hir_builder.h b/src/xenia/cpu/hir/hir_builder.h index b2809d5d8..3b29867e9 100644 --- a/src/xenia/cpu/hir/hir_builder.h +++ b/src/xenia/cpu/hir/hir_builder.h @@ -199,6 +199,7 @@ class HIRBuilder { Value* VectorCompareSGE(Value* value1, Value* value2, TypeName part_type); Value* VectorCompareUGT(Value* value1, Value* value2, TypeName part_type); Value* VectorCompareUGE(Value* value1, Value* value2, TypeName part_type); + Value* VectorDenormFlush(Value* value1); Value* Add(Value* value1, Value* value2, uint32_t arithmetic_flags = 0); Value* AddWithCarry(Value* value1, Value* value2, Value* value3, diff --git a/src/xenia/cpu/hir/opcodes.cc b/src/xenia/cpu/hir/opcodes.cc index 
b3b14b198..00eb4f2f7 100644 --- a/src/xenia/cpu/hir/opcodes.cc +++ b/src/xenia/cpu/hir/opcodes.cc @@ -15,14 +15,23 @@ namespace hir { #define DEFINE_OPCODE(num, name, sig, flags) \ const OpcodeInfo num##_info = { \ + num, \ flags, \ sig, \ - name, \ - num, \ }; #include "xenia/cpu/hir/opcodes.inl" #undef DEFINE_OPCODE +const char* GetOpcodeName(Opcode num) { + switch (num) { +#define DEFINE_OPCODE(num, name, sig, flags) \ + case num: \ + return name; +#include "xenia/cpu/hir/opcodes.inl" +#undef DEFINE_OPCODE + } + return "invalid opcode"; +} } // namespace hir } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h index 8e681c757..acc61d047 100644 --- a/src/xenia/cpu/hir/opcodes.h +++ b/src/xenia/cpu/hir/opcodes.h @@ -280,7 +280,8 @@ enum Opcode { OPCODE_ATOMIC_EXCHANGE, OPCODE_ATOMIC_COMPARE_EXCHANGE, OPCODE_SET_ROUNDING_MODE, - __OPCODE_MAX_VALUE, // Keep at end. + OPCODE_VECTOR_DENORMFLUSH, // converts denormals to signed zeros in a vector + __OPCODE_MAX_VALUE, // Keep at end. }; enum OpcodeFlags { @@ -352,17 +353,42 @@ static bool IsOpcodeBinaryValue(uint32_t signature) { ((OPCODE_SIG_TYPE_V << 3) | (OPCODE_SIG_TYPE_V << 6)); } +static void UnpackOpcodeSig(uint32_t sig, OpcodeSignatureType& dest, + OpcodeSignatureType& src1, + OpcodeSignatureType& src2, + OpcodeSignatureType& src3) { + dest = GET_OPCODE_SIG_TYPE_DEST(sig); + src1 = GET_OPCODE_SIG_TYPE_SRC1(sig); + src2 = GET_OPCODE_SIG_TYPE_SRC2(sig); + src3 = GET_OPCODE_SIG_TYPE_SRC3(sig); +} + +constexpr uint32_t GetNumOperandsForSig(uint32_t sig) { + sig >>= 3; + + uint32_t result = 0; + while (sig) { + if (sig & 0x7) { + ++result; + } + sig >>= 3; + } + return result; +} typedef struct { + Opcode num; uint32_t flags; uint32_t signature; - const char* name; - Opcode num; } OpcodeInfo; #define DEFINE_OPCODE(num, name, sig, flags) extern const OpcodeInfo num##_info; #include "xenia/cpu/hir/opcodes.inl" #undef DEFINE_OPCODE +const char* GetOpcodeName(Opcode num); +static inline const char* GetOpcodeName(const OpcodeInfo* info) { + return GetOpcodeName(info->num); +} } // namespace hir } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/hir/opcodes.inl b/src/xenia/cpu/hir/opcodes.inl index 584b0ac55..a1ca73f7d 100644 --- a/src/xenia/cpu/hir/opcodes.inl +++ b/src/xenia/cpu/hir/opcodes.inl @@ -673,3 +673,10 @@ DEFINE_OPCODE( "set_rounding_mode", OPCODE_SIG_X_V, 0) + +DEFINE_OPCODE( + OPCODE_VECTOR_DENORMFLUSH, + "vector_denormflush", + OPCODE_SIG_V_V, + 0 +) \ No newline at end of file diff --git a/src/xenia/cpu/hir/value.cc b/src/xenia/cpu/hir/value.cc index 2bf52a05d..211cd18f9 100644 --- a/src/xenia/cpu/hir/value.cc +++ b/src/xenia/cpu/hir/value.cc @@ -864,10 +864,112 @@ void Value::Extract(Value* vec, Value* index) { break; } } +void Value::Permute(Value* src1, Value* src2, TypeName type) { + if (type == INT8_TYPE) { + uint8_t table[32]; + for (uint32_t i = 0; i < 16; ++i) { + table[i] = src1->constant.v128.u8[i]; + table[i + 16] = src2->constant.v128.u8[i]; + } + + for (uint32_t i = 0; i < 16; ++i) { + constant.v128.u8[i] = table[(constant.v128.u8[i] ^ 3) & 0x1f]; + } + } else if (type == INT16_TYPE) { + vec128_t perm = (constant.v128 & vec128s(0xF)) ^ vec128s(0x1); + vec128_t perm_ctrl = vec128b(0); + for (int i = 0; i < 8; i++) { + perm_ctrl.i16[i] = perm.i16[i] > 7 ? 
-1 : 0; + + auto v = uint8_t(perm.u16[i]); + perm.u8[i * 2] = v * 2; + perm.u8[i * 2 + 1] = v * 2 + 1; + } + auto lod = [](const vec128_t& v) { + return _mm_loadu_si128((const __m128i*)&v); + }; + auto sto = [](vec128_t& v, __m128i x) { + return _mm_storeu_si128((__m128i*)&v, x); + }; + + __m128i xmm1 = lod(src1->constant.v128); + __m128i xmm2 = lod(src2->constant.v128); + xmm1 = _mm_shuffle_epi8(xmm1, lod(perm)); + xmm2 = _mm_shuffle_epi8(xmm2, lod(perm)); + uint8_t mask = 0; + for (int i = 0; i < 8; i++) { + if (perm_ctrl.i16[i] == 0) { + mask |= 1 << (7 - i); + } + } + + vec128_t unp_mask = vec128b(0); + for (int i = 0; i < 8; i++) { + if (mask & (1 << i)) { + unp_mask.u16[i] = 0xFFFF; + } + } + + sto(constant.v128, _mm_blendv_epi8(xmm1, xmm2, lod(unp_mask))); + + } else { + assert_unhandled_case(type); + } +} +void Value::Insert(Value* index, Value* part, TypeName type) { + vec128_t* me = &constant.v128; + + switch (type) { + case INT8_TYPE: + me->u8[index->constant.u8 ^ 3] = part->constant.u8; + break; + case INT16_TYPE: + me->u16[index->constant.u8 ^ 1] = part->constant.u16; + break; + case INT32_TYPE: + me->u32[index->constant.u8] = part->constant.u32; + break; + } +} +void Value::Swizzle(uint32_t mask, TypeName type) { + if (type == INT32_TYPE || type == FLOAT32_TYPE) { + vec128_t result = vec128b(0); + for (uint32_t i = 0; i < 4; ++i) { + result.u32[i] = constant.v128.u32[(mask >> (i * 2)) & 0b11]; + } + constant.v128 = result; + } else { + assert_unhandled_case(type); + } +} void Value::Select(Value* other, Value* ctrl) { - // TODO - assert_always(); + if (ctrl->type == VEC128_TYPE) { + constant.v128.low = (constant.v128.low & ~ctrl->constant.v128.low) | + (other->constant.v128.low & ctrl->constant.v128.low); + constant.v128.high = (constant.v128.high & ~ctrl->constant.v128.high) | + (other->constant.v128.high & ctrl->constant.v128.high); + + } else { + if (ctrl->constant.u8) { + switch (other->type) { + case INT8_TYPE: + constant.u8 = other->constant.u8; + break; + case INT16_TYPE: + constant.u16 = other->constant.u16; + break; + case INT32_TYPE: + case FLOAT32_TYPE: + constant.u32 = other->constant.u32; + break; + case INT64_TYPE: + case FLOAT64_TYPE: + constant.u64 = other->constant.u64; + break; + } + } + } } void Value::Splat(Value* other) { @@ -1532,7 +1634,15 @@ void Value::ByteSwap() { break; } } - +void Value::DenormalFlush() { + for (int i = 0; i < 4; ++i) { + uint32_t current_element = constant.v128.u32[i]; + if ((current_element & 0x7f800000) == 0) { + current_element = current_element & 0x80000000; + } + constant.v128.u32[i] = current_element; + } +} void Value::CountLeadingZeros(const Value* other) { switch (other->type) { case INT8_TYPE: diff --git a/src/xenia/cpu/hir/value.h b/src/xenia/cpu/hir/value.h index f0312d424..1d8963b64 100644 --- a/src/xenia/cpu/hir/value.h +++ b/src/xenia/cpu/hir/value.h @@ -104,6 +104,9 @@ struct ValueMask { class Value { public: + /* + todo : this should be intrusive and be part of Instr instead. + */ typedef struct Use_s { Instr* instr; Use_s* prev; @@ -128,17 +131,16 @@ class Value { TypeName type; uint32_t flags; - RegAssignment reg; - ConstantValue constant; Instr* def; Use* use_head; // NOTE: for performance reasons this is not maintained during construction. Instr* last_use; - Value* local_slot; - - // TODO(benvanik): remove to shrink size. 
- void* tag; + RegAssignment reg; + union { + Value* local_slot; + ConstantValue constant; + }; Use* AddUse(Arena* arena, Instr* instr); void RemoveUse(Use* use); @@ -209,7 +211,20 @@ class Value { flags = other->flags; constant.v128 = other->constant.v128; } + bool HasLocalSlot() const { + return !(flags & VALUE_IS_CONSTANT) && local_slot; + } + void SetLocalSlot(Value* lslot) { + assert(!(flags & VALUE_IS_CONSTANT)); + local_slot = lslot; + } + Value* GetLocalSlot() { + return (flags & VALUE_IS_CONSTANT) ? nullptr : local_slot; + } + const Value* GetLocalSlot() const { + return (flags & VALUE_IS_CONSTANT) ? nullptr : local_slot; + } inline bool IsConstant() const { return !!(flags & VALUE_IS_CONSTANT); } bool IsConstantTrue() const { if (type == VEC128_TYPE) { @@ -555,7 +570,10 @@ class Value { void Shr(Value* other); void Sha(Value* other); void RotateLeft(Value* other); + void Insert(Value* index, Value* part, TypeName type); void Extract(Value* vec, Value* index); + void Permute(Value* src1, Value* src2, TypeName type); + void Swizzle(uint32_t mask, TypeName type); void Select(Value* other, Value* ctrl); void Splat(Value* other); void VectorCompareEQ(Value* other, TypeName type); @@ -575,6 +593,8 @@ class Value { void VectorAverage(Value* other, TypeName type, bool is_unsigned, bool saturate); void ByteSwap(); + void DenormalFlush(); + void CountLeadingZeros(const Value* other); bool Compare(Opcode opcode, Value* other); hir::Instr* GetDefSkipAssigns(); diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc index 3ca5bc40f..37ee10396 100644 --- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc +++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc @@ -279,14 +279,21 @@ int InstrEmit_stvlx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd, Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF)); // ea &= ~0xF ea = f.And(ea, f.LoadConstantUint64(~0xFull)); + Value* shrs = f.LoadVectorShr(eb); + Value* zerovec = f.LoadZeroVec128(); + // v = (old & ~mask) | ((new >> eb) & mask) - Value* new_value = f.Permute(f.LoadVectorShr(eb), f.LoadZeroVec128(), - f.LoadVR(vd), INT8_TYPE); + Value* new_value = f.Permute(shrs, zerovec, f.LoadVR(vd), INT8_TYPE); Value* old_value = f.ByteSwap(f.Load(ea, VEC128_TYPE)); + /* + these permutes need to be looked at closer. keep in mind Permute is meant to + emulate vmx's shuffles and does not generate particularly good code. The logic + here looks as if it might make more sense as a comparison ( +*/ // mask = FFFF... >> eb - Value* mask = f.Permute(f.LoadVectorShr(eb), f.LoadZeroVec128(), - f.Not(f.LoadZeroVec128()), INT8_TYPE); - Value* v = f.Or(f.AndNot(old_value, mask), f.And(new_value, mask)); + Value* mask = f.Permute(shrs, zerovec, f.Not(zerovec), INT8_TYPE); + + Value* v = f.Select(mask, old_value, new_value); // ea &= ~0xF (handled above) f.Store(ea, f.ByteSwap(v)); return 0; @@ -321,14 +328,14 @@ int InstrEmit_stvrx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd, ea = CalculateEA_0(f, ra, rb); eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF)); ea = f.And(ea, f.LoadConstantUint64(~0xFull)); + Value* shrs = f.LoadVectorShr(eb); + Value* zerovec = f.LoadZeroVec128(); // v = (old & ~mask) | ((new << eb) & mask) - Value* new_value = f.Permute(f.LoadVectorShr(eb), f.LoadVR(vd), - f.LoadZeroVec128(), INT8_TYPE); + Value* new_value = f.Permute(shrs, f.LoadVR(vd), zerovec, INT8_TYPE); Value* old_value = f.ByteSwap(f.Load(ea, VEC128_TYPE)); // mask = ~FFFF... 
>> eb - Value* mask = f.Permute(f.LoadVectorShr(eb), f.Not(f.LoadZeroVec128()), - f.LoadZeroVec128(), INT8_TYPE); - Value* v = f.Or(f.AndNot(old_value, mask), f.And(new_value, mask)); + Value* mask = f.Permute(shrs, f.Not(zerovec), zerovec, INT8_TYPE); + Value* v = f.Select(mask, old_value, new_value); // ea &= ~0xF (handled above) f.Store(ea, f.ByteSwap(v)); f.MarkLabel(skip_label); @@ -815,8 +822,16 @@ int InstrEmit_vlogefp128(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_vmaddfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb, uint32_t vc) { + /* + chrispy: testing on POWER8 revealed that altivec vmaddfp unconditionally + flushes denormal inputs to 0, regardless of NJM setting + */ + Value* a = f.VectorDenormFlush(f.LoadVR(va)); + Value* b = f.VectorDenormFlush(f.LoadVR(vb)); + Value* c = f.VectorDenormFlush(f.LoadVR(vc)); // (VD) <- ((VA) * (VC)) + (VB) - Value* v = f.MulAdd(f.LoadVR(va), f.LoadVR(vc), f.LoadVR(vb)); + Value* v = f.MulAdd(a, c, b); + // todo: do denormal results also unconditionally become 0? f.StoreVR(vd, v); return 0; } @@ -832,9 +847,14 @@ int InstrEmit_vmaddfp128(PPCHIRBuilder& f, const InstrData& i) { } int InstrEmit_vmaddcfp128(PPCHIRBuilder& f, const InstrData& i) { + /* + see vmaddfp about these denormflushes + */ + Value* a = f.VectorDenormFlush(f.LoadVR(VX128_VA128)); + Value* b = f.VectorDenormFlush(f.LoadVR(VX128_VB128)); + Value* d = f.VectorDenormFlush(f.LoadVR(VX128_VD128)); // (VD) <- ((VA) * (VD)) + (VB) - Value* v = f.MulAdd(f.LoadVR(VX128_VA128), f.LoadVR(VX128_VD128), - f.LoadVR(VX128_VB128)); + Value* v = f.MulAdd(a, d, b); f.StoreVR(VX128_VD128, v); return 0; } @@ -1085,7 +1105,8 @@ int InstrEmit_vmsum3fp128(PPCHIRBuilder& f, const InstrData& i) { // Dot product XYZ. // (VD.xyzw) = (VA.x * VB.x) + (VA.y * VB.y) + (VA.z * VB.z) Value* v = f.DotProduct3(f.LoadVR(VX128_VA128), f.LoadVR(VX128_VB128)); - v = f.Splat(v, VEC128_TYPE); + //chrispy: denormal outputs for Dot product are unconditionally made 0 + v = f.VectorDenormFlush(v); f.StoreVR(VX128_VD128, v); return 0; } @@ -1094,7 +1115,7 @@ int InstrEmit_vmsum4fp128(PPCHIRBuilder& f, const InstrData& i) { // Dot product XYZW. // (VD.xyzw) = (VA.x * VB.x) + (VA.y * VB.y) + (VA.z * VB.z) + (VA.w * VB.w) Value* v = f.DotProduct4(f.LoadVR(VX128_VA128), f.LoadVR(VX128_VB128)); - v = f.Splat(v, VEC128_TYPE); + v = f.VectorDenormFlush(v); f.StoreVR(VX128_VD128, v); return 0; } @@ -1151,7 +1172,19 @@ int InstrEmit_vnmsubfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb, // (VD) <- -(((VA) * (VC)) - (VB)) // NOTE: only one rounding should take place, but that's hard... // This really needs VFNMSUB132PS/VFNMSUB213PS/VFNMSUB231PS but that's AVX. - Value* v = f.Neg(f.MulSub(f.LoadVR(va), f.LoadVR(vc), f.LoadVR(vb))); + // NOTE2: we could make vnmsub a new opcode, and then do it in double + // precision, rounding after the neg + + /* + chrispy: this is untested, but i believe this has the same DAZ behavior for + inputs as vmadd + */ + + Value* a = f.VectorDenormFlush(f.LoadVR(va)); + Value* b = f.VectorDenormFlush(f.LoadVR(vb)); + Value* c = f.VectorDenormFlush(f.LoadVR(vc)); + + Value* v = f.Neg(f.MulSub(a, c, b)); f.StoreVR(vd, v); return 0; }
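
The new OPCODE_VECTOR_DENORMFLUSH (called from vmaddfp, vmaddcfp128, vnmsubfp and the vmsum results above, though HIRBuilder::VectorDenormFlush currently returns its input unchanged before ever emitting the opcode) flushes denormal single-precision lanes to signed zero, matching the VMX behavior described in the comments. Value::DenormalFlush and the x64 VECTOR_DENORMFLUSH sequence implement the same per-lane rule; a minimal scalar sketch (DenormFlushLane is an illustrative name, not part of the change):

#include <cstdint>

// Per-lane rule behind OPCODE_VECTOR_DENORMFLUSH / Value::DenormalFlush:
// a biased exponent field of zero (zero or denormal input) keeps only the
// sign bit, producing a signed zero; any other encoding passes through.
uint32_t DenormFlushLane(uint32_t bits) {
  return (bits & 0x7F800000u) == 0 ? (bits & 0x80000000u) : bits;
}

The x64 sequence reaches the same result without branches: it isolates the exponent bits with XMMSingleDenormalMask, compares them against zero to build a keep/flush mask, ANDs the source with that mask, and ORs the original sign bits back in so flushed lanes stay signed zeros.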
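
The LOAD_VECTOR_SHL/LOAD_VECTOR_SHR changes drop the lvsl_table/lvsr_table memory lookups: the shift amount (already pre-masked to 0..15 in ppc_emit_altivec) is splatted across all byte lanes with vpshufb and then added to XMMLVSLTableBase or subtracted from XMMLVSRTableBase. A scalar sketch of the per-byte result those sequences aim to produce (function names here are illustrative only):

#include <cstdint>

// lvsl: dest[i] = sh + i, with XMMLVSLTableBase = {0, 1, ..., 15}.
void LvslModel(uint8_t dest[16], uint8_t sh) {  // sh assumed pre-masked to 0..15
  for (int i = 0; i < 16; ++i) {
    dest[i] = static_cast<uint8_t>(sh + i);
  }
}

// lvsr: dest[i] = (16 + i) - sh, with XMMLVSRTableBase = {16, 17, ..., 31}.
void LvsrModel(uint8_t dest[16], uint8_t sh) {
  for (int i = 0; i < 16; ++i) {
    dest[i] = static_cast<uint8_t>((16 + i) - sh);
  }
}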
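
The rewritten DOT_PRODUCT_3/DOT_PRODUCT_4 sequences widen both operands to double with vcvtps2pd, multiply and accumulate in double precision, round back to single exactly once with vcvtsd2ss, and use the MXCSR overflow flag (cleared up front, spilled to GUEST_SCRATCH64, re-read afterwards) to decide whether to return QNaN. A rough scalar model of that behavior, assuming single-precision overflow is the only condition the flag check is meant to catch (DotProductModel is an illustrative name, and the hardware flag semantics may differ slightly at the edges):

#include <cmath>
#include <limits>

// Scalar model of the new dot-product sequences: products of floats are
// exact in double, the sums round only to double precision (mirroring the
// vaddsd chain), and the result is rounded to float once at the end.
// Overflow to single-precision infinity becomes QNaN, as the emitted code
// does when it finds the MXCSR overflow flag set.
float DotProductModel(const float* a, const float* b, int n /* 3 or 4 */) {
  double acc = 0.0;
  for (int i = 0; i < n; ++i) {
    acc += static_cast<double>(a[i]) * static_cast<double>(b[i]);
  }
  float rounded = static_cast<float>(acc);
  if (std::isinf(rounded) && !std::isinf(acc)) {
    return std::numeric_limits<float>::quiet_NaN();
  }
  return rounded;  // the emitter then broadcasts this across all four lanes
}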
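
In stvlx/stvrx the explicit f.Or(f.AndNot(old_value, mask), f.And(new_value, mask)) merge is replaced with f.Select(mask, old_value, new_value); for vec128 constants the Value::Select added in this change folds that as a plain bitwise blend of the two 64-bit halves, so the two forms compute the same value. A one-line sketch of the merge on a 64-bit half (SelectMerge is an illustrative name):

#include <cstdint>

// Bitwise select used by the new stvlx/stvrx lowering: bits where the mask
// is 0 come from 'older', bits where it is 1 come from 'newer'.
uint64_t SelectMerge(uint64_t mask, uint64_t older, uint64_t newer) {
  return (older & ~mask) | (newer & mask);
}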