diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index fb1fe138a..1cf4dc416 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -43,7 +43,10 @@ DEFINE_bool(ignore_undefined_externs, true, DEFINE_bool(emit_source_annotations, false, "Add extra movs and nops to make disassembly easier to read.", "CPU"); - +DEFINE_bool(resolve_rel32_guest_calls, false, + "Experimental optimization, directly call already resolved " + "functions via x86 rel32 call/jmp", + "CPU"); namespace xe { namespace cpu { namespace backend { @@ -99,7 +102,28 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator) TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW); TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ); + + + #undef TEST_EMIT_FEATURE + + if (cpu_.has(Xbyak::util::Cpu::tAMD)) { + + bool is_zennish = cpu_.displayFamily >= 0x17; + + if (is_zennish) { + feature_flags_ |= kX64FastJrcx; + + if (cpu_.displayFamily > 0x17) { + feature_flags_ |= kX64FastLoop; + + } else if (cpu_.displayFamily == 0x17 && cpu_.displayModel >= 0x31) { + feature_flags_ |= kX64FastLoop; + } // todo:figure out at model zen+ became zen2, this is just the model + // for my cpu, which is ripper90 + + } + } } X64Emitter::~X64Emitter() = default; @@ -149,6 +173,26 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info, if (function) { code_cache_->PlaceGuestCode(function->address(), top_, func_info, function, new_execute_address, new_write_address); + if (cvars::resolve_rel32_guest_calls) { + for (auto&& callsite : call_sites_) { +#pragma pack(push, 1) + struct RGCEmitted { + uint8_t ff_; + uint32_t rgcid_; + }; +#pragma pack(pop) + RGCEmitted* hunter = (RGCEmitted*)new_execute_address; + while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) { + hunter = reinterpret_cast( + reinterpret_cast(hunter) + 1); + } + + hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8; + hunter->rgcid_ = + static_cast(static_cast(callsite.destination_) - + reinterpret_cast(hunter + 1)); + } + } } else { code_cache_->PlaceHostCode(0, top_, func_info, new_execute_address, new_write_address); @@ -157,6 +201,7 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info, ready(); top_ = old_address; reset(); + call_sites_.clear(); return new_execute_address; } @@ -287,11 +332,8 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { code_offsets.tail = getSize(); if (cvars::emit_source_annotations) { - nop(); - nop(); - nop(); - nop(); - nop(); + nop(5); + } assert_zero(code_offsets.prolog); @@ -313,11 +355,9 @@ void X64Emitter::MarkSourceOffset(const Instr* i) { entry->code_offset = static_cast(getSize()); if (cvars::emit_source_annotations) { - nop(); - nop(); + nop(2); mov(eax, entry->guest_address); - nop(); - nop(); + nop(2); } if (debug_info_flags_ & DebugInfoFlags::kDebugInfoTraceFunctionCoverage) { @@ -414,10 +454,44 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) { assert_not_null(function); auto fn = static_cast(function); // Resolve address to the function to call and store in rax. 
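Reviewer note on the rel32 fixup in Emplace() above: the emitter records a 0xFF placeholder byte followed by a 32-bit site id, then after the code is placed it scans the executable copy for that pattern and rewrites it into a real x86 `call rel32` (E8) or `jmp rel32` (E9). The displacement is measured from the end of the 5-byte instruction, which is what the `hunter + 1` arithmetic expresses. A minimal standalone sketch of that math (helper name and free-function form are mine, not part of the patch):

```c++
#include <cstdint>
#include <cstring>

// Rewrites a 5-byte placeholder at `site` into "call/jmp rel32" targeting
// `destination` (guest machine code is guaranteed to sit in the low 4 GB,
// matching the uint32_t cast in the patch).
inline void patch_rel32_call(uint8_t* site, uint32_t destination, bool is_jump) {
  site[0] = is_jump ? 0xE9 : 0xE8;  // E9 = jmp rel32, E8 = call rel32
  // rel32 is relative to the address of the *next* instruction (site + 5).
  uint32_t rel32 = destination -
                   static_cast<uint32_t>(reinterpret_cast<uintptr_t>(site) + 5);
  std::memcpy(site + 1, &rel32, sizeof(rel32));
}
```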
+ + if (cvars::resolve_rel32_guest_calls && fn->machine_code()) { + ResolvableGuestCall rgc; + rgc.destination_ = uint32_t(uint64_t(fn->machine_code())); + rgc.offset_ = current_rgc_id_; + current_rgc_id_++; + + if (!(instr->flags & hir::CALL_TAIL)) { + mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]); + + db(0xFF); + rgc.is_jump_ = false; + + dd(rgc.offset_); + + } else { + // tail call + EmitTraceUserCallReturn(); + + rgc.is_jump_ = true; + // Pass the callers return address over. + mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]); + + add(rsp, static_cast(stack_size())); + db(0xFF); + dd(rgc.offset_); + } + call_sites_.push_back(rgc); + return; + } + if (fn->machine_code()) { // TODO(benvanik): is it worth it to do this? It removes the need for // a ResolveFunction call, but makes the table less useful. assert_zero(uint64_t(fn->machine_code()) & 0xFFFFFFFF00000000); + // todo: this should be changed so that we can actually do a call to + // fn->machine_code. the code will be emitted near us, so 32 bit rel jmp + // should be possible mov(eax, uint32_t(uint64_t(fn->machine_code()))); } else if (code_cache_->has_indirection_table()) { // Load the pointer to the indirection table maintained in X64CodeCache. @@ -600,6 +674,30 @@ void X64Emitter::ReloadContext() { void X64Emitter::ReloadMembase() { mov(GetMembaseReg(), qword[GetContextReg() + 8]); // membase } +#define __NH_CONCAT(x, y) x##y +#define _MH_CONCAT(cb, ...) cb (__VA_ARGS__) + +#define mh_concat2_m(x, y) __NH_CONCAT(x, y) + +#define DECLNOP(n, ...) \ + static constexpr unsigned char mh_concat2_m(nop_, n)[] = {__VA_ARGS__} + +DECLNOP(1, 0x90); +DECLNOP(2, 0x66, 0x90); +DECLNOP(3, 0x0F, 0x1F, 0x00); +DECLNOP(4, 0x0F, 0x1F, 0x40, 0x00); +DECLNOP(5, 0x0F, 0x1F, 0x44, 0x00, 0x00); +DECLNOP(6, 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00); +DECLNOP(7, 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00); +DECLNOP(8, 0x0F, 0x1F, 0x84, 00, 00, 00, 00, 00); +DECLNOP(9, 0x66, 0x0F, 0x1F, 0x84, 00, 00, 00, 00, 00); + +static constexpr const unsigned char* const g_noptable[] = { + &nop_1[0], &nop_1[0], &nop_2[0], &nop_3[0], &nop_4[0], + &nop_5[0], &nop_6[0], &nop_7[0], &nop_8[0], &nop_9[0]}; + +static constexpr unsigned LENGTHOF_NOPTABLE = + sizeof(g_noptable) / sizeof(g_noptable[0]); // Len Assembly Byte Sequence // ============================================================================ @@ -613,9 +711,17 @@ void X64Emitter::ReloadMembase() { // 8b NOP DWORD ptr [EAX + EAX*1 + 00000000H] 0F 1F 84 00 00 00 00 00H // 9b 66 NOP DWORD ptr [EAX + EAX*1 + 00000000H] 66 0F 1F 84 00 00 00 00 00H void X64Emitter::nop(size_t length) { - // TODO(benvanik): fat nop - for (size_t i = 0; i < length; ++i) { - db(0x90); + while (length != 0) { + unsigned patchsize = length % LENGTHOF_NOPTABLE; + + // patch_memory(locptr, size, (char*)g_noptable[patchsize]); + + for (unsigned i = 0; i < patchsize; ++i) { + db(g_noptable[patchsize][i]); + } + + //locptr += patchsize; + length -= patchsize; } } @@ -649,6 +755,35 @@ void X64Emitter::MovMem64(const Xbyak::RegExp& addr, uint64_t v) { mov(dword[addr + 4], static_cast(v >> 32)); } } +static inline vec128_t v128_setr_bytes(unsigned char v0, unsigned char v1, + unsigned char v2, unsigned char v3, + unsigned char v4, unsigned char v5, + unsigned char v6, unsigned char v7, + unsigned char v8, unsigned char v9, + unsigned char v10, unsigned char v11, + unsigned char v12, unsigned char v13, + unsigned char v14, unsigned char v15) { + vec128_t result; + + result.u8[0] = v0; + result.u8[1] = v1; + result.u8[2] = v2; + result.u8[3] 
= v3; + result.u8[4] = v4; + result.u8[5] = v5; + result.u8[6] = v6; + result.u8[7] = v7; + result.u8[8] = v8; + result.u8[9] = v9; + result.u8[10] = v10; + result.u8[11] = v11; + result.u8[12] = v12; + result.u8[13] = v13; + result.u8[14] = v14; + + result.u8[15] = v15; + return result; +} static const vec128_t xmm_consts[] = { /* XMMZero */ vec128f(0.0f), @@ -761,8 +896,60 @@ static const vec128_t xmm_consts[] = { /* XMMQNaN */ vec128i(0x7FC00000u), /* XMMInt127 */ vec128i(0x7Fu), /* XMM2To32 */ vec128f(0x1.0p32f), + /* xmminf */ vec128i(0x7f800000), + + /* XMMIntsToBytes*/ + v128_setr_bytes(0, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80), + /*XMMShortsToBytes*/ + v128_setr_bytes(0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80) }; +void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) { + for (auto& vec : xmm_consts) { + for (auto& u8 : vec.u8) { + if (u8 == bytevalue) { + return reinterpret_cast(backend_->emitter_data() + + (&u8 - &xmm_consts[0].u8[0])); + } + } + } + return nullptr; +} +void* X64Emitter::FindWordConstantOffset(unsigned wordvalue) { + for (auto& vec : xmm_consts) { + for (auto& u16 : vec.u16) { + if (u16 == wordvalue) { + return reinterpret_cast(backend_->emitter_data() + + ((&u16 - &xmm_consts[0].u16[0]) * 2)); + } + } + } + return nullptr; +} +void* X64Emitter::FindDwordConstantOffset(unsigned dwordvalue) { + for (auto& vec : xmm_consts) { + for (auto& u32 : vec.u32) { + if (u32 == dwordvalue) { + return reinterpret_cast(backend_->emitter_data() + + ((&u32 - &xmm_consts[0].u32[0]) * 4)); + } + } + } + return nullptr; +} +void* X64Emitter::FindQwordConstantOffset(uint64_t qwordvalue) { + for (auto& vec : xmm_consts) { + for (auto& u64 : vec.u64) { + if (u64 == qwordvalue) { + return reinterpret_cast(backend_->emitter_data() + + ((&u64 - &xmm_consts[0].u64[0]) * 8)); + } + } + } + return nullptr; +} // First location to try and place constants. static const uintptr_t kConstDataLocation = 0x20000000; static const uintptr_t kConstDataSize = sizeof(xmm_consts); @@ -806,7 +993,6 @@ Xbyak::Address X64Emitter::GetXmmConstPtr(XmmConst id) { return ptr[reinterpret_cast(backend_->emitter_data() + sizeof(vec128_t) * id)]; } - // Implies possible StashXmm(0, ...)! void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) { // https://www.agner.org/optimize/optimizing_assembly.pdf @@ -818,12 +1004,115 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) { // 1111... 
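The FindByte/Word/Dword/QwordConstantOffset helpers above exploit the fact that a `vpbroadcast{b,w,d,q}` source can be *any* matching scalar anywhere in the existing xmm_consts table, not just a lane-0 value, so a flat scan over every lane is enough to avoid emitting a new constant. A standalone sketch of the dword variant (types and names here are illustrative, not the patch's exact signatures):

```c++
#include <cstddef>
#include <cstdint>

struct vec128_sketch {
  union {
    uint8_t u8[16];
    uint32_t u32[4];
  };
};

// Returns the byte offset of the first dword equal to `value` within the
// constant table, or -1 if no lane matches.
inline ptrdiff_t find_dword_offset(const vec128_sketch* consts, size_t count,
                                   uint32_t value) {
  for (size_t i = 0; i < count; ++i) {
    for (size_t j = 0; j < 4; ++j) {
      if (consts[i].u32[j] == value) {
        return static_cast<ptrdiff_t>(i * 16 + j * 4);
      }
    }
  }
  return -1;  // caller falls back to spilling the scalar to the stash slot
}
```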
vpcmpeqb(dest, dest); } else { + for (size_t i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) { if (xmm_consts[i] == v) { vmovapd(dest, GetXmmConstPtr((XmmConst)i)); return; } } + if (IsFeatureEnabled(kX64EmitAVX2)) { + bool all_equal_bytes = true; + + unsigned firstbyte = v.u8[0]; + for (unsigned i = 1; i < 16; ++i) { + if (v.u8[i] != firstbyte) { + all_equal_bytes = false; + break; + } + } + + if (all_equal_bytes) { + void* bval = FindByteConstantOffset(firstbyte); + + if (bval) { + vpbroadcastb(dest, byte[bval]); + return; + } + // didnt find existing mem with the value + mov(byte[rsp + kStashOffset], firstbyte); + vpbroadcastb(dest, byte[rsp + kStashOffset]); + return; + } + + bool all_equal_words = true; + unsigned firstword = v.u16[0]; + for (unsigned i = 1; i < 8; ++i) { + if (v.u16[i] != firstword) { + all_equal_words = false; + break; + } + } + if (all_equal_words) { + void* wval = FindWordConstantOffset(firstword); + if (wval) { + vpbroadcastw(dest, word[wval]); + return; + } + // didnt find existing mem with the value + mov(word[rsp + kStashOffset], firstword); + vpbroadcastw(dest, word[rsp + kStashOffset]); + return; + } + + bool all_equal_dwords = true; + unsigned firstdword = v.u32[0]; + for (unsigned i = 1; i < 4; ++i) { + if (v.u32[i] != firstdword) { + all_equal_dwords = false; + break; + } + } + if (all_equal_dwords) { + void* dwval = FindDwordConstantOffset(firstdword); + if (dwval) { + vpbroadcastd(dest, dword[dwval]); + return; + } + mov(dword[rsp + kStashOffset], firstdword); + vpbroadcastd(dest, dword[rsp + kStashOffset]); + return; + } + + bool all_equal_qwords = v.low == v.high; + + if (all_equal_qwords) { + void* qwval = FindQwordConstantOffset(v.low); + if (qwval) { + vpbroadcastq(dest, qword[qwval]); + return; + } + MovMem64(rsp + kStashOffset, v.low); + vpbroadcastq(dest, qword[rsp + kStashOffset]); + return; + } + } + + for (auto& vec : xmm_consts) { + if (vec.low == v.low && vec.high == v.high) { + vmovdqa(dest, + ptr[reinterpret_cast(backend_->emitter_data() + + ((&vec - &xmm_consts[0]) * 16))]); + return; + } + } + + if (v.high == 0 && v.low == ~0ULL) { + vpcmpeqb(dest, dest); + movq(dest, dest); + return; + } + if (v.high == 0) { + if ((v.low & 0xFFFFFFFF) == v.low) { + mov(dword[rsp + kStashOffset], static_cast(v.low)); + movd(dest, dword[rsp + kStashOffset]); + return; + } + MovMem64(rsp + kStashOffset, v.low); + movq(dest, qword[rsp + kStashOffset]); + return; + } + // TODO(benvanik): see what other common values are. // TODO(benvanik): build constant table - 99% are reused. MovMem64(rsp + kStashOffset, v.low); diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 247d6175c..66a02fcc1 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -116,6 +116,9 @@ enum XmmConst { XMMQNaN, XMMInt127, XMM2To32, + XMMFloatInf, + XMMIntsToBytes, + XMMShortsToBytes }; // Unfortunately due to the design of xbyak we have to pass this to the ctor. 
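For readers of the LoadConstantXmm() changes above: before falling back to a full 128-bit load, the AVX2 path classifies the constant as a byte/word/dword/qword splat, because a splat only needs one scalar (either found in the constant table or spilled to the stash slot) plus a single `vpbroadcast{b,w,d,q}`. A plain-C++ sketch of that classification, with illustrative types:

```c++
#include <cstdint>

enum class SplatKind { kByte, kWord, kDword, kQword, kNone };

struct vec128_sketch {
  union {
    uint8_t u8[16];
    uint16_t u16[8];
    uint32_t u32[4];
    uint64_t u64[2];
  };
};

inline SplatKind classify_splat(const vec128_sketch& v) {
  auto all_equal = [](const auto* lanes, int n) {
    for (int i = 1; i < n; ++i) {
      if (lanes[i] != lanes[0]) return false;
    }
    return true;
  };
  if (all_equal(v.u8, 16)) return SplatKind::kByte;   // vpbroadcastb
  if (all_equal(v.u16, 8)) return SplatKind::kWord;   // vpbroadcastw
  if (all_equal(v.u32, 4)) return SplatKind::kDword;  // vpbroadcastd
  if (all_equal(v.u64, 2)) return SplatKind::kQword;  // vpbroadcastq
  return SplatKind::kNone;  // fall back to a full 128-bit constant load
}
```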
@@ -141,7 +144,16 @@ enum X64EmitterFeatureFlags { kX64EmitAVX512DQ = 1 << 11, kX64EmitAVX512Ortho = kX64EmitAVX512F | kX64EmitAVX512VL, - kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ + kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ, + kX64FastJrcx = 1 << 12, //jrcxz is as fast as any other jump ( >= Zen1) + kX64FastLoop = 1 << 13, //loop/loope/loopne is as fast as any other jump ( >= Zen2) +}; +class ResolvableGuestCall { + public: + bool is_jump_; + uintptr_t destination_; + // rgcid + unsigned offset_; }; class X64Emitter : public Xbyak::CodeGenerator { @@ -230,7 +242,10 @@ class X64Emitter : public Xbyak::CodeGenerator { Xbyak::Address StashConstantXmm(int index, float v); Xbyak::Address StashConstantXmm(int index, double v); Xbyak::Address StashConstantXmm(int index, const vec128_t& v); - + void* FindByteConstantOffset(unsigned bytevalue); + void* FindWordConstantOffset(unsigned wordvalue); + void* FindDwordConstantOffset(unsigned bytevalue); + void* FindQwordConstantOffset(uint64_t bytevalue); bool IsFeatureEnabled(uint32_t feature_flag) const { return (feature_flags_ & feature_flag) == feature_flag; } @@ -267,6 +282,8 @@ class X64Emitter : public Xbyak::CodeGenerator { static const uint32_t gpr_reg_map_[GPR_COUNT]; static const uint32_t xmm_reg_map_[XMM_COUNT]; + uint32_t current_rgc_id_ = 0xEEDDF00F; + std::vector call_sites_; }; } // namespace x64 diff --git a/src/xenia/cpu/backend/x64/x64_seq_control.cc b/src/xenia/cpu/backend/x64/x64_seq_control.cc index 46c879218..715614753 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_control.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_control.cc @@ -109,22 +109,39 @@ struct DEBUG_BREAK_TRUE_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.DebugBreak(); - e.L(skip); + + if (e.IsFeatureEnabled(kX64FastJrcx)) { + e.mov(e.ecx, i.src1); + Xbyak::Label skip; + e.jrcxz(skip); + e.DebugBreak(); + e.L(skip); + } else { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } } }; struct DEBUG_BREAK_TRUE_I64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.DebugBreak(); - e.L(skip); + if (e.IsFeatureEnabled(kX64FastJrcx)) { + e.mov(e.rcx, i.src1); + Xbyak::Label skip; + e.jrcxz(skip); + e.DebugBreak(); + e.L(skip); + } else { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } } }; struct DEBUG_BREAK_TRUE_F32 @@ -190,21 +207,37 @@ struct TRAP_TRUE_I16 struct TRAP_TRUE_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Trap(i.instr->flags); - e.L(skip); + if (e.IsFeatureEnabled(kX64FastJrcx)) { + e.mov(e.ecx, i.src1); + Xbyak::Label skip; + e.jrcxz(skip); + e.Trap(i.instr->flags); + e.L(skip); + } else { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(i.instr->flags); + e.L(skip); + } } }; struct TRAP_TRUE_I64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Trap(i.instr->flags); - e.L(skip); + if (e.IsFeatureEnabled(kX64FastJrcx)) { + e.mov(e.rcx, i.src1); + Xbyak::Label skip; + e.jrcxz(skip); + e.Trap(i.instr->flags); + e.L(skip); + } else { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(i.instr->flags); + e.L(skip); + } } }; struct TRAP_TRUE_F32 @@ 
-355,22 +388,39 @@ struct CALL_INDIRECT_TRUE_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip, CodeGenerator::T_NEAR); - e.CallIndirect(i.instr, i.src2); - e.L(skip); + + if (e.IsFeatureEnabled(kX64FastJrcx)) { + e.mov(e.ecx, i.src1); + Xbyak::Label skip; + e.jrcxz(skip); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } else { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip, CodeGenerator::T_NEAR); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } } }; struct CALL_INDIRECT_TRUE_I64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip, CodeGenerator::T_NEAR); - e.CallIndirect(i.instr, i.src2); - e.L(skip); + if (e.IsFeatureEnabled(kX64FastJrcx)) { + e.mov(e.rcx, i.src1); + Xbyak::Label skip; + e.jrcxz(skip); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } else { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip, CodeGenerator::T_NEAR); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } } }; struct CALL_INDIRECT_TRUE_F32 diff --git a/src/xenia/cpu/backend/x64/x64_seq_memory.cc b/src/xenia/cpu/backend/x64/x64_seq_memory.cc index 191146e6f..0646fdb39 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_memory.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_memory.cc @@ -15,6 +15,13 @@ #include "xenia/base/memory.h" #include "xenia/cpu/backend/x64/x64_op.h" #include "xenia/cpu/backend/x64/x64_tracers.h" +#include "xenia/cpu/ppc/ppc_context.h" +#include "xenia/base/cvar.h" + +DEFINE_bool( + elide_e0_check, false, + "Eliminate e0 check on some memory accesses, like to r13(tls) or r1(sp)", + "CPU"); namespace xe { namespace cpu { @@ -27,7 +34,30 @@ volatile int anchor_memory = 0; RegExp ComputeContextAddress(X64Emitter& e, const OffsetOp& offset) { return e.GetContextReg() + offset.value; } +static bool is_eo_def(const hir::Value* v) { + if (v->def) { + auto df = v->def; + if (df->opcode == &OPCODE_LOAD_CONTEXT_info) { + size_t offs = df->src1.offset; + if (offs == offsetof(ppc::PPCContext_s, r[1]) || + offs == offsetof(ppc::PPCContext_s, r[13])) { + return true; + } + } else if (df->opcode == &OPCODE_ASSIGN_info) { + return is_eo_def(df->src1.value); + } + } + return false; +} +template +static bool is_definitely_not_eo(const T& v) { + if (!cvars::elide_e0_check) { + return false; + } + + return is_eo_def(v.value); +} template RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest, const T& offset) { @@ -49,7 +79,8 @@ RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest, return e.GetMembaseReg() + e.rax; } } else { - if (xe::memory::allocation_granularity() > 0x1000) { + if (xe::memory::allocation_granularity() > 0x1000 && + !is_definitely_not_eo(guest)) { // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do // it via memory mapping. e.xor_(e.eax, e.eax); @@ -60,12 +91,12 @@ RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest, } else { // Clear the top 32 bits, as they are likely garbage. // TODO(benvanik): find a way to avoid doing this. + e.mov(e.eax, guest.reg().cvt32()); } return e.GetMembaseReg() + e.rax + offset_const; } } - // Note: most *should* be aligned, but needs to be checked! 
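On the `elide_e0_check` change above: when the host's allocation granularity is larger than 4 KiB, the backend cannot map the 0xE0000000+ window with a page shift, so every guest access emits an "add 0x1000 if address >= 0xE0000000" adjustment. Values proven by is_eo_def() to originate from r1 (guest stack pointer) or r13 (TLS block pointer) never point into that window, so the adjustment can be skipped. A plain-C++ model of the translation the emitted code performs (names are mine):

```c++
#include <cstdint>

// Models the guest->host offset that gets added to the membase register.
inline uint64_t guest_to_host_offset(uint32_t guest_addr,
                                     bool may_touch_e0_range) {
  uint64_t offset = guest_addr;
  if (may_touch_e0_range && guest_addr >= 0xE0000000u) {
    offset += 0x1000;  // emulate the 4 KiB physical-address shift
  }
  return offset;
}
```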
template RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) { @@ -86,7 +117,8 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) { return e.GetMembaseReg() + e.rax; } } else { - if (xe::memory::allocation_granularity() > 0x1000) { + if (xe::memory::allocation_granularity() > 0x1000 && + !is_definitely_not_eo(guest)) { // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do // it via memory mapping. e.xor_(e.eax, e.eax); diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index 09eb2b00e..299e7674f 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -728,28 +728,103 @@ struct VECTOR_SHL_V128 } } - static void EmitInt8(X64Emitter& e, const EmitArgType& i) { +static void EmitInt8(X64Emitter& e, const EmitArgType& i) { // TODO(benvanik): native version (with shift magic). - if (i.src2.is_constant) { - if (e.IsFeatureEnabled(kX64EmitGFNI)) { - const auto& shamt = i.src2.constant(); + + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + if (!i.src2.is_constant) { + // get high 8 bytes + e.vpunpckhqdq(e.xmm1, i.src1, i.src1); + e.vpunpckhqdq(e.xmm3, i.src2, i.src2); + + e.vpmovzxbd(e.ymm0, i.src1); + e.vpmovzxbd(e.ymm1, e.xmm1); + + e.vpmovzxbd(e.ymm2, i.src2); + e.vpmovzxbd(e.ymm3, e.xmm3); + + e.vpsllvd(e.ymm0, e.ymm0, e.ymm2); + e.vpsllvd(e.ymm1, e.ymm1, e.ymm3); + e.vextracti128(e.xmm2, e.ymm0, 1); + e.vextracti128(e.xmm3, e.ymm1, 1); + e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMIntsToBytes)); + e.vpshufb(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMIntsToBytes)); + e.vpshufb(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMIntsToBytes)); + e.vpshufb(e.xmm3, e.xmm3, e.GetXmmConstPtr(XMMIntsToBytes)); + + e.vpunpckldq(e.xmm0, e.xmm0, e.xmm1); + e.vpunpckldq(e.xmm2, e.xmm2, e.xmm3); + e.vpunpcklqdq(i.dest, e.xmm0, e.xmm2); + return; + } else { + vec128_t constmask = i.src2.constant(); + + for (unsigned i = 0; i < 16; ++i) { + constmask.u8[i] &= 7; + } + + unsigned seenvalue = constmask.u8[0]; bool all_same = true; - for (size_t n = 0; n < 16 - n; ++n) { - if (shamt.u8[n] != shamt.u8[n + 1]) { + for (unsigned i = 1; i < 16; ++i) { + if (constmask.u8[i] != seenvalue) { all_same = false; break; } } if (all_same) { - // Every count is the same, so we can use gf2p8affineqb. 
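Context for the VECTOR_SHL_V128 byte path above: x86 has no per-byte variable shift, so the AVX2 sequence zero-extends each byte to a 32-bit lane (`vpmovzxbd`), shifts with `vpsllvd`, and gathers the low byte of every lane back with `vpshufb` against the new XMMIntsToBytes shuffle constant. A scalar reference loop for the result that sequence is meant to produce (counts taken mod 8, as the constant path's pre-masking shows; this loop is mine, not part of the patch):

```c++
#include <cstdint>

inline void vector_shl_bytes_reference(uint8_t dst[16], const uint8_t src[16],
                                       const uint8_t shamt[16]) {
  for (int i = 0; i < 16; ++i) {
    uint32_t widened = src[i];               // vpmovzxbd
    widened <<= (shamt[i] & 7);              // vpsllvd, low 3 bits of the count
    dst[i] = static_cast<uint8_t>(widened);  // vpshufb repack keeps the low byte
  }
}
```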
- const uint8_t shift_amount = shamt.u8[0] & 0b111; - const uint64_t shift_matrix = - UINT64_C(0x0102040810204080) >> (shift_amount * 8); - e.vgf2p8affineqb(i.dest, i.src1, - e.StashConstantXmm(0, vec128q(shift_matrix)), 0); + // mul by two + /*if (seenvalue == 1) { + e.vpaddb(i.dest, i.src1, i.src1); + } else if (seenvalue == 2) { + e.vpaddb(i.dest, i.src1, i.src1); + e.vpaddb(i.dest, i.dest, i.dest); + } else if (seenvalue == 3) { + // mul by 8 + e.vpaddb(i.dest, i.src1, i.src1); + e.vpaddb(i.dest, i.dest, i.dest); + e.vpaddb(i.dest, i.dest, i.dest); + } else*/ + { + e.vpmovzxbw(e.ymm0, i.src1); + e.vpsllw(e.ymm0, e.ymm0, seenvalue); + e.vextracti128(e.xmm1, e.ymm0, 1); + + e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMShortsToBytes)); + e.vpshufb(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMShortsToBytes)); + e.vpunpcklqdq(i.dest, e.xmm0, e.xmm1); + return; + } + + } else { + e.LoadConstantXmm(e.xmm2, constmask); + + e.vpunpckhqdq(e.xmm1, i.src1, i.src1); + e.vpunpckhqdq(e.xmm3, e.xmm2, e.xmm2); + + e.vpmovzxbd(e.ymm0, i.src1); + e.vpmovzxbd(e.ymm1, e.xmm1); + + e.vpmovzxbd(e.ymm2, e.xmm2); + e.vpmovzxbd(e.ymm3, e.xmm3); + + e.vpsllvd(e.ymm0, e.ymm0, e.ymm2); + e.vpsllvd(e.ymm1, e.ymm1, e.ymm3); + e.vextracti128(e.xmm2, e.ymm0, 1); + e.vextracti128(e.xmm3, e.ymm1, 1); + e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMIntsToBytes)); + e.vpshufb(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMIntsToBytes)); + e.vpshufb(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMIntsToBytes)); + e.vpshufb(e.xmm3, e.xmm3, e.GetXmmConstPtr(XMMIntsToBytes)); + + e.vpunpckldq(e.xmm0, e.xmm0, e.xmm1); + e.vpunpckldq(e.xmm2, e.xmm2, e.xmm3); + e.vpunpcklqdq(i.dest, e.xmm0, e.xmm2); + return; } } + } + if (i.src2.is_constant) { e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); } else { e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); @@ -758,7 +833,6 @@ struct VECTOR_SHL_V128 e.CallNativeSafe(reinterpret_cast(EmulateVectorShl)); e.vmovaps(i.dest, e.xmm0); } - static void EmitInt16(X64Emitter& e, const EmitArgType& i) { Xmm src1; if (i.src1.is_constant) { diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 77aa235cd..b4b9a70e2 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -38,6 +38,10 @@ #include "xenia/cpu/hir/hir_builder.h" #include "xenia/cpu/processor.h" +DEFINE_bool(use_fast_dot_product, false, + "Experimental optimization, much shorter sequence on dot products, treating inf as overflow instead of using mcxsr" + "four insn dotprod", + "CPU"); namespace xe { namespace cpu { namespace backend { @@ -886,7 +890,10 @@ struct COMPARE_EQ_I8 e.cmp(src1, src2); }, [](X64Emitter& e, const Reg8& src1, int32_t constant) { - e.cmp(src1, constant); + if (constant == 0) { + e.test(src1, src1); + } else + e.cmp(src1, constant); }); e.sete(i.dest); } @@ -900,7 +907,10 @@ struct COMPARE_EQ_I16 e.cmp(src1, src2); }, [](X64Emitter& e, const Reg16& src1, int32_t constant) { - e.cmp(src1, constant); + if (constant == 0) { + e.test(src1, src1); + } else + e.cmp(src1, constant); }); e.sete(i.dest); } @@ -914,7 +924,10 @@ struct COMPARE_EQ_I32 e.cmp(src1, src2); }, [](X64Emitter& e, const Reg32& src1, int32_t constant) { - e.cmp(src1, constant); + if (constant == 0) { + e.test(src1, src1); + } else + e.cmp(src1, constant); }); e.sete(i.dest); } @@ -928,7 +941,10 @@ struct COMPARE_EQ_I64 e.cmp(src1, src2); }, [](X64Emitter& e, const Reg64& src1, int32_t constant) { - e.cmp(src1, constant); + if (constant == 0) { + e.test(src1, 
src1); + } else + e.cmp(src1, constant); }); e.sete(i.dest); } @@ -1980,6 +1996,8 @@ struct DIV_V128 : Sequence> { assert_true(!i.instr->flags); EmitAssociativeBinaryXmmOp(e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + // e.vrcpps(e.xmm0, src2); + //e.vmulps(dest, src1, e.xmm0); e.vdivps(dest, src1, src2); }); } @@ -2591,43 +2609,51 @@ EMITTER_OPCODE_TABLE(OPCODE_LOG2, LOG2_F32, LOG2_F64, LOG2_V128); struct DOT_PRODUCT_V128 { static void Emit(X64Emitter& e, Xmm dest, Xmm src1, Xmm src2, uint8_t imm) { - // TODO(benvanik): apparently this is very slow - // - find alternative? - Xbyak::Label end; - e.inLocalLabel(); + if (cvars::use_fast_dot_product) { + e.vdpps(dest, src1, src2, imm); + e.vandps(e.xmm0, dest, e.GetXmmConstPtr(XMMAbsMaskPS)); + e.vcmpgeps(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMFloatInf)); + e.vblendvps(dest, dest, e.GetXmmConstPtr(XMMQNaN), e.xmm0); - // Grab space to put MXCSR. - // TODO(gibbed): stick this in TLS or - // something? - e.sub(e.rsp, 8); + } else { + // TODO(benvanik): apparently this is very slow + // - find alternative? + Xbyak::Label end; + e.inLocalLabel(); - // Grab MXCSR and mask off the overflow flag, - // because it's sticky. - e.vstmxcsr(e.dword[e.rsp]); - e.mov(e.eax, e.dword[e.rsp]); - e.and_(e.eax, uint32_t(~8)); - e.mov(e.dword[e.rsp], e.eax); - e.vldmxcsr(e.dword[e.rsp]); + // Grab space to put MXCSR. + // TODO(gibbed): stick this in TLS or + // something? + e.sub(e.rsp, 8); - // Hey we can do the dot product now. - e.vdpps(dest, src1, src2, imm); + // Grab MXCSR and mask off the overflow flag, + // because it's sticky. + e.vstmxcsr(e.dword[e.rsp]); + e.mov(e.eax, e.dword[e.rsp]); + e.and_(e.eax, uint32_t(~8)); + e.mov(e.dword[e.rsp], e.eax); + e.vldmxcsr(e.dword[e.rsp]); - // Load MXCSR... - e.vstmxcsr(e.dword[e.rsp]); + // Hey we can do the dot product now. + e.vdpps(dest, src1, src2, imm); - // ..free our temporary space and get MXCSR at - // the same time - e.pop(e.rax); + // Load MXCSR... + e.vstmxcsr(e.dword[e.rsp]); - // Did we overflow? - e.test(e.al, 8); - e.jz(end); + // ..free our temporary space and get MXCSR at + // the same time + e.pop(e.rax); - // Infinity? HA! Give NAN. - e.vmovdqa(dest, e.GetXmmConstPtr(XMMQNaN)); + // Did we overflow? + e.test(e.al, 8); + e.jz(end); - e.L(end); - e.outLocalLabel(); + // Infinity? HA! Give NAN. + e.vmovdqa(dest, e.GetXmmConstPtr(XMMQNaN)); + + e.L(end); + e.outLocalLabel(); + } } };
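Summary of the `use_fast_dot_product` path added above: instead of saving MXCSR, clearing the sticky overflow flag, and re-reading it after `vdpps`, the result is masked to its magnitude (`vandps` with XMMAbsMaskPS), compared against the new XMMFloatInf constant, and any overflowed lane is replaced with QNaN via `vblendvps`. A scalar model of a single lane of that behavior (standalone sketch, not the emitted code):

```c++
#include <cmath>
#include <limits>

inline float fast_dot_product_lane(float dot_result) {
  // vcmpgeps against +inf flags lanes that overflowed; vblendvps substitutes
  // QNaN for those lanes. NaN inputs compare false and pass through unchanged.
  if (std::fabs(dot_result) >= std::numeric_limits<float>::infinity()) {
    return std::numeric_limits<float>::quiet_NaN();
  }
  return dot_result;
}
```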