From c1d922eebf3fcb720b9bc9a452b6529ba1180d27 Mon Sep 17 00:00:00 2001 From: "chss95cs@gmail.com" Date: Sat, 5 Nov 2022 10:50:33 -0700 Subject: [PATCH 1/2] Minor decoder optimizations, kernel fixes, cpu backend fixes --- .gitmodules | 2 +- src/xenia/app/emulator_window.cc | 4 +- src/xenia/apu/conversion.h | 2 - src/xenia/apu/xaudio2/xaudio2_audio_driver.cc | 14 +- src/xenia/base/assert.h | 5 +- src/xenia/base/cvar.h | 6 +- src/xenia/base/filesystem_win.cc | 2 +- src/xenia/base/memory.cc | 3 +- src/xenia/base/memory.h | 10 +- src/xenia/base/mutex.h | 4 +- src/xenia/base/platform.h | 3 + src/xenia/base/ring_buffer.cc | 3 + src/xenia/base/ring_buffer.h | 1 - src/xenia/base/split_map.h | 2 - src/xenia/base/threading_win.cc | 7 +- src/xenia/cpu/backend/x64/x64_code_cache.h | 3 +- .../cpu/backend/x64/x64_code_cache_win.cc | 9 + src/xenia/cpu/backend/x64/x64_emitter.cc | 96 +-- src/xenia/cpu/backend/x64/x64_emitter.h | 18 +- src/xenia/cpu/backend/x64/x64_op.h | 9 +- src/xenia/cpu/backend/x64/x64_seq_control.cc | 44 +- src/xenia/cpu/backend/x64/x64_seq_vector.cc | 778 +++++++++++------- src/xenia/cpu/backend/x64/x64_sequences.cc | 104 ++- .../compiler/passes/conditional_group_pass.cc | 3 - .../cpu/compiler/passes/finalization_pass.cc | 12 - .../compiler/passes/simplification_pass.cc | 81 +- src/xenia/cpu/hir/label.h | 7 + src/xenia/cpu/hir/opcodes.h | 5 +- src/xenia/cpu/mmio_handler.cc | 3 +- src/xenia/cpu/module.h | 1 + src/xenia/cpu/ppc/ppc_context.h | 21 + src/xenia/cpu/ppc/ppc_decode_data.h | 24 + src/xenia/cpu/ppc/ppc_emit_altivec.cc | 6 +- src/xenia/cpu/ppc/ppc_emit_memory.cc | 2 +- src/xenia/cpu/ppc/ppc_frontend.cc | 5 + src/xenia/cpu/ppc/ppc_hir_builder.cc | 60 +- src/xenia/cpu/ppc/ppc_instr.h | 16 +- src/xenia/cpu/xex_module.cc | 265 +++++- src/xenia/cpu/xex_module.h | 8 +- src/xenia/debug/ui/debug_window.h | 2 +- .../gpu/d3d12/d3d12_command_processor.cc | 5 +- src/xenia/gpu/d3d12/d3d12_command_processor.h | 5 +- src/xenia/gpu/d3d12/d3d12_nvapi.hpp | 11 - src/xenia/gpu/d3d12/d3d12_texture_cache.h | 2 +- src/xenia/gpu/pm4_command_processor_declare.h | 17 +- .../gpu/pm4_command_processor_implement.h | 50 +- src/xenia/gpu/render_target_cache.h | 5 +- src/xenia/gpu/trace_viewer.cc | 5 - .../gpu/vulkan/vulkan_command_processor.h | 3 +- src/xenia/kernel/kernel_state.cc | 3 + src/xenia/kernel/util/shim_utils.h | 13 +- src/xenia/kernel/xam/xam_info.cc | 12 +- src/xenia/kernel/xam/xam_ui.cc | 26 +- src/xenia/kernel/xbdm/xbdm_misc.cc | 47 +- src/xenia/kernel/xboxkrnl/xboxkrnl_rtl.cc | 125 ++- .../kernel/xboxkrnl/xboxkrnl_threading.cc | 41 +- src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc | 2 +- src/xenia/kernel/xthread.h | 4 +- src/xenia/memory.cc | 13 +- src/xenia/memory.h | 13 +- src/xenia/vfs/devices/null_device.cc | 2 +- third_party/FFmpeg | 2 +- 62 files changed, 1254 insertions(+), 802 deletions(-) diff --git a/.gitmodules b/.gitmodules index 6c356ec38..b4e3119b2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -36,7 +36,7 @@ url = https://github.com/skystrife/cpptoml.git [submodule "third_party/cxxopts"] path = third_party/cxxopts - url = https://github.com/chrisps/cxxopts.git + url = https://github.com/jarro2783/cxxopts.git [submodule "third_party/SDL2"] path = third_party/SDL2 url = https://github.com/libsdl-org/SDL.git diff --git a/src/xenia/app/emulator_window.cc b/src/xenia/app/emulator_window.cc index d401eaed2..a9688aecb 100644 --- a/src/xenia/app/emulator_window.cc +++ b/src/xenia/app/emulator_window.cc @@ -614,7 +614,7 @@ bool EmulatorWindow::Initialize() { MenuItem::Type::kString, "Build 
commit on GitHub...", "F2", std::bind(&EmulatorWindow::ShowBuildCommit, this))); help_menu->AddChild(MenuItem::Create( - MenuItem::Type::kString, "Recent changes on GitHub...", [this]() { + MenuItem::Type::kString, "Recent changes on GitHub...", []() { LaunchWebBrowser( "https://github.com/xenia-project/xenia/compare/" XE_BUILD_COMMIT "..." XE_BUILD_BRANCH); @@ -622,7 +622,7 @@ bool EmulatorWindow::Initialize() { help_menu->AddChild(MenuItem::Create(MenuItem::Type::kSeparator)); help_menu->AddChild(MenuItem::Create( MenuItem::Type::kString, "&About...", - [this]() { LaunchWebBrowser("https://xenia.jp/about/"); })); + []() { LaunchWebBrowser("https://xenia.jp/about/"); })); } main_menu->AddChild(std::move(help_menu)); diff --git a/src/xenia/apu/conversion.h b/src/xenia/apu/conversion.h index da9e761f3..672a6e0c6 100644 --- a/src/xenia/apu/conversion.h +++ b/src/xenia/apu/conversion.h @@ -71,8 +71,6 @@ inline void sequential_6_BE_to_interleaved_2_LE(float* output, const float* input, size_t ch_sample_count) { assert_true(ch_sample_count % 4 == 0); - const uint32_t* in = reinterpret_cast(input); - uint32_t* out = reinterpret_cast(output); const __m128i byte_swap_shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3); const __m128 half = _mm_set1_ps(0.5f); diff --git a/src/xenia/apu/xaudio2/xaudio2_audio_driver.cc b/src/xenia/apu/xaudio2/xaudio2_audio_driver.cc index aa150e797..0c869fae3 100644 --- a/src/xenia/apu/xaudio2/xaudio2_audio_driver.cc +++ b/src/xenia/apu/xaudio2/xaudio2_audio_driver.cc @@ -28,16 +28,16 @@ class XAudio2AudioDriver::VoiceCallback : public api::IXAudio2VoiceCallback { : semaphore_(semaphore) {} ~VoiceCallback() {} - void OnStreamEnd() {} - void OnVoiceProcessingPassEnd() {} - void OnVoiceProcessingPassStart(uint32_t samples_required) {} - void OnBufferEnd(void* context) { + void OnStreamEnd() noexcept {} + void OnVoiceProcessingPassEnd() noexcept {} + void OnVoiceProcessingPassStart(uint32_t samples_required) noexcept {} + void OnBufferEnd(void* context) noexcept { auto ret = semaphore_->Release(1, nullptr); assert_true(ret); } - void OnBufferStart(void* context) {} - void OnLoopEnd(void* context) {} - void OnVoiceError(void* context, HRESULT result) {} + void OnBufferStart(void* context) noexcept {} + void OnLoopEnd(void* context) noexcept {} + void OnVoiceError(void* context, HRESULT result) noexcept {} private: xe::threading::Semaphore* semaphore_ = nullptr; diff --git a/src/xenia/base/assert.h b/src/xenia/base/assert.h index 83dcf1f73..ab78d353d 100644 --- a/src/xenia/base/assert.h +++ b/src/xenia/base/assert.h @@ -21,8 +21,11 @@ namespace xe { "bad definition for " #type ": must be " #size " bytes") // We rely on assert being compiled out in NDEBUG. +#if defined(NDEBUG) +#define xenia_assert static_cast +#else #define xenia_assert assert - +#endif #define __XENIA_EXPAND(x) x #define __XENIA_ARGC(...) 
\ __XENIA_EXPAND(__XENIA_ARGC_IMPL(__VA_ARGS__, 15, 14, 13, 12, 11, 10, 9, 8, \ diff --git a/src/xenia/base/cvar.h b/src/xenia/base/cvar.h index 144703665..e1e83f5d4 100644 --- a/src/xenia/base/cvar.h +++ b/src/xenia/base/cvar.h @@ -170,8 +170,10 @@ CommandVar::CommandVar(const char* name, T* default_value, const char* description) : name_(name), default_value_(*default_value), - description_(description), - current_value_(default_value) {} + current_value_(default_value), + commandline_value_(), + description_(description) + {} template ConfigVar::ConfigVar(const char* name, T* default_value, diff --git a/src/xenia/base/filesystem_win.cc b/src/xenia/base/filesystem_win.cc index f1f16c063..fb6edbb91 100644 --- a/src/xenia/base/filesystem_win.cc +++ b/src/xenia/base/filesystem_win.cc @@ -149,7 +149,7 @@ class Win32FileHandle : public FileHandle { return false; } } - bool SetLength(size_t length) { + bool SetLength(size_t length) override { LARGE_INTEGER position; position.QuadPart = length; if (!SetFilePointerEx(handle_, position, nullptr, SEEK_SET)) { diff --git a/src/xenia/base/memory.cc b/src/xenia/base/memory.cc index 26f34318e..b83e545d2 100644 --- a/src/xenia/base/memory.cc +++ b/src/xenia/base/memory.cc @@ -59,7 +59,7 @@ static void XeCopy16384StreamingAVX(CacheLine* XE_RESTRICT to, CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3); CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3); -#pragma loop(no_vector) + for (uint32_t i = 0; i < num_lines_for_8k; ++i) { xe::swcache::CacheLine line0, line1, line2, line3; @@ -92,7 +92,6 @@ static void XeCopy16384Movdir64M(CacheLine* XE_RESTRICT to, CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3); CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3); -#pragma loop(no_vector) for (uint32_t i = 0; i < num_lines_for_8k; ++i) { _movdir64b(dest1 + i, src1 + i); _movdir64b(dest2 + i, src2 + i); diff --git a/src/xenia/base/memory.h b/src/xenia/base/memory.h index bd7081418..178d88fb7 100644 --- a/src/xenia/base/memory.h +++ b/src/xenia/base/memory.h @@ -620,23 +620,23 @@ static void Prefetch(const void* addr) { } template <> -void Prefetch(const void* addr) { +XE_MAYBE_UNUSED void Prefetch(const void* addr) { PrefetchW(addr); } template <> -void Prefetch(const void* addr) { +XE_MAYBE_UNUSED void Prefetch(const void* addr) { PrefetchNTA(addr); } template <> -void Prefetch(const void* addr) { +XE_MAYBE_UNUSED void Prefetch(const void* addr) { PrefetchL3(addr); } template <> -void Prefetch(const void* addr) { +XE_MAYBE_UNUSED void Prefetch(const void* addr) { PrefetchL2(addr); } template <> -void Prefetch(const void* addr) { +XE_MAYBE_UNUSED void Prefetch(const void* addr) { PrefetchL1(addr); } // todo: does aarch64 have streaming stores/loads? 
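The assert.h hunk above relies on a small macro trick: in NDEBUG builds the checked expression is still compiled (so any names it uses stay "used" and do not trigger unused-variable warnings), only its value is discarded. A minimal sketch of that pattern follows; the bare static_cast in the diff has presumably lost a <void> to formatting, so the reconstruction here is an assumption.

#include <cassert>

#if defined(NDEBUG)
// Release: cast the condition to void; it still names its operands but
// produces no runtime check.
#define xenia_assert static_cast<void>
#else
// Debug: defer to the standard assert and abort on failure.
#define xenia_assert assert
#endif

int Demo(int count) {
  xenia_assert(count >= 0);  // checked in debug, discarded expression in release
  return count * 2;
}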
diff --git a/src/xenia/base/mutex.h b/src/xenia/base/mutex.h index b7fc09896..5f1bc8a60 100644 --- a/src/xenia/base/mutex.h +++ b/src/xenia/base/mutex.h @@ -25,6 +25,7 @@ namespace xe { */ class alignas(4096) xe_global_mutex { + XE_MAYBE_UNUSED char detail[64]; public: @@ -38,6 +39,7 @@ class alignas(4096) xe_global_mutex { using global_mutex_type = xe_global_mutex; class alignas(64) xe_fast_mutex { + XE_MAYBE_UNUSED char detail[64]; public: @@ -62,8 +64,6 @@ class xe_unlikely_mutex { ~xe_unlikely_mutex() { mut = 0; } void lock() { - uint32_t lock_expected = 0; - if (XE_LIKELY(_tryget())) { return; } else { diff --git a/src/xenia/base/platform.h b/src/xenia/base/platform.h index 61749e4c7..c258ad08f 100644 --- a/src/xenia/base/platform.h +++ b/src/xenia/base/platform.h @@ -144,9 +144,11 @@ #define XE_MSVC_OPTIMIZE_SMALL() #define XE_MSVC_OPTIMIZE_REVERT() #endif + #if XE_COMPILER_HAS_GNU_EXTENSIONS == 1 #define XE_LIKELY_IF(...) if (XE_LIKELY(__VA_ARGS__)) #define XE_UNLIKELY_IF(...) if (XE_UNLIKELY(__VA_ARGS__)) +#define XE_MAYBE_UNUSED __attribute__((unused)) #else #if __cplusplus >= 202002 #define XE_LIKELY_IF(...) if (!!(__VA_ARGS__)) [[likely]] @@ -155,6 +157,7 @@ #define XE_LIKELY_IF(...) if (!!(__VA_ARGS__)) #define XE_UNLIKELY_IF(...) if (!!(__VA_ARGS__)) #endif +#define XE_MAYBE_UNUSED #endif // only use __restrict if MSVC, for clang/gcc we can use -fstrict-aliasing which // acts as __restrict across the board todo: __restrict is part of the type diff --git a/src/xenia/base/ring_buffer.cc b/src/xenia/base/ring_buffer.cc index 53cd4d703..4f40b0670 100644 --- a/src/xenia/base/ring_buffer.cc +++ b/src/xenia/base/ring_buffer.cc @@ -78,7 +78,9 @@ size_t RingBuffer::Read(uint8_t* buffer, size_t _count) { if (read_offset_ < write_offset_) { assert_true(read_offset_ + count <= write_offset_); } else if (read_offset_ + count >= capacity_) { + XE_MAYBE_UNUSED ring_size_t left_half = capacity_ - read_offset_; + assert_true(count - left_half <= write_offset_); } @@ -107,6 +109,7 @@ size_t RingBuffer::Write(const uint8_t* buffer, size_t _count) { if (write_offset_ < read_offset_) { assert_true(write_offset_ + count <= read_offset_); } else if (write_offset_ + count >= capacity_) { + XE_MAYBE_UNUSED size_t left_half = capacity_ - write_offset_; assert_true(count - left_half <= read_offset_); } diff --git a/src/xenia/base/ring_buffer.h b/src/xenia/base/ring_buffer.h index e481f4f27..e914e226f 100644 --- a/src/xenia/base/ring_buffer.h +++ b/src/xenia/base/ring_buffer.h @@ -68,7 +68,6 @@ class RingBuffer { ring_size_t offset_delta = write_offs - read_offs; ring_size_t wrap_read_count = (cap - read_offs) + write_offs; - ring_size_t comparison_value = read_offs <= write_offs; if (XE_LIKELY(read_offs <= write_offs)) { return offset_delta; // will be 0 if they are equal, semantically diff --git a/src/xenia/base/split_map.h b/src/xenia/base/split_map.h index 510c2ed70..e52857020 100644 --- a/src/xenia/base/split_map.h +++ b/src/xenia/base/split_map.h @@ -67,8 +67,6 @@ class split_map { void InsertAt(TKey k, TValue v, uint32_t idx) { uint32_t old_size = size(); - bool needs_shiftup = idx != old_size; - values_.insert(values_.begin() + idx, v); keys_.insert(keys_.begin() + idx, k); } diff --git a/src/xenia/base/threading_win.cc b/src/xenia/base/threading_win.cc index ed7874458..a8aa7889c 100644 --- a/src/xenia/base/threading_win.cc +++ b/src/xenia/base/threading_win.cc @@ -117,7 +117,7 @@ void set_name(const std::string_view name) { // checked ntoskrnl, it does not modify delay, so we can place this as a 
// constant and avoid creating a stack variable -static const LARGE_INTEGER sleepdelay0_for_maybeyield{0LL}; +static const LARGE_INTEGER sleepdelay0_for_maybeyield{{0LL}}; void MaybeYield() { #if 0 @@ -314,7 +314,8 @@ class Win32Event : public Win32Handle { } #endif - EventInfo Query() { EventInfo result{}; + EventInfo Query() override { + EventInfo result{}; NtQueryEventPointer.invoke(handle_, 0, &result, sizeof(EventInfo), nullptr); return result; } @@ -429,7 +430,7 @@ class Win32Timer : public Win32Handle { } bool SetRepeatingAt(GClock_::time_point due_time, std::chrono::milliseconds period, - std::function opt_callback = nullptr) { + std::function opt_callback = nullptr) override { return SetRepeatingAt(date::clock_cast(due_time), period, std::move(opt_callback)); } diff --git a/src/xenia/cpu/backend/x64/x64_code_cache.h b/src/xenia/cpu/backend/x64/x64_code_cache.h index 021e5e684..9667425bd 100644 --- a/src/xenia/cpu/backend/x64/x64_code_cache.h +++ b/src/xenia/cpu/backend/x64/x64_code_cache.h @@ -93,7 +93,8 @@ class X64CodeCache : public CodeCache { // This is picked to be high enough to cover whatever we can reasonably // expect. If we hit issues with this it probably means some corner case // in analysis triggering. - static const size_t kMaximumFunctionCount = 100000; + //chrispy: raised this, some games that were compiled with low optimization levels can exceed this + static const size_t kMaximumFunctionCount = 1000000; struct UnwindReservation { size_t data_size = 0; diff --git a/src/xenia/cpu/backend/x64/x64_code_cache_win.cc b/src/xenia/cpu/backend/x64/x64_code_cache_win.cc index 0aff67034..2da73345f 100644 --- a/src/xenia/cpu/backend/x64/x64_code_cache_win.cc +++ b/src/xenia/cpu/backend/x64/x64_code_cache_win.cc @@ -209,7 +209,16 @@ bool Win32X64CodeCache::Initialize() { Win32X64CodeCache::UnwindReservation Win32X64CodeCache::RequestUnwindReservation(uint8_t* entry_address) { +#if defined(NDEBUG) + if (unwind_table_count_ >= kMaximumFunctionCount) { + // we should not just be ignoring this in release if it happens + xe::FatalError( + "Unwind table count (unwind_table_count_) exceeded maximum! 
Please report this to " + "Xenia/Canary developers"); + } +#else assert_false(unwind_table_count_ >= kMaximumFunctionCount); +#endif UnwindReservation unwind_reservation; unwind_reservation.data_size = xe::round_up(kUnwindInfoSize, 16); unwind_reservation.table_slot = unwind_table_count_++; diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index 3ba47cad4..bc9224ab6 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -46,10 +46,6 @@ DEFINE_bool(ignore_undefined_externs, true, DEFINE_bool(emit_source_annotations, false, "Add extra movs and nops to make disassembly easier to read.", "CPU"); -DEFINE_bool(resolve_rel32_guest_calls, true, - "Experimental optimization, directly call already resolved " - "functions via x86 rel32 call/jmp", - "CPU"); DEFINE_bool(enable_incorrect_roundingmode_behavior, false, "Disables the FPU/VMX MXCSR sharing workaround, potentially " @@ -78,7 +74,6 @@ using namespace xe::literals; static const size_t kMaxCodeSize = 1_MiB; -static const size_t kStashOffset = 32; // static const size_t kStashOffsetHigh = 32 + 32; const uint32_t X64Emitter::gpr_reg_map_[X64Emitter::GPR_COUNT] = { @@ -141,55 +136,6 @@ bool X64Emitter::Emit(GuestFunction* function, HIRBuilder* builder, return true; } -#pragma pack(push, 1) -struct RGCEmitted { - uint8_t ff_; - uint32_t rgcid_; -}; -#pragma pack(pop) - -#if 0 -void X64Emitter::InjectCallAddresses(void* new_execute_address) { - for (auto&& callsite : call_sites_) { - RGCEmitted* hunter = (RGCEmitted*)new_execute_address; - while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) { - hunter = - reinterpret_cast(reinterpret_cast(hunter) + 1); - } - - hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8; - hunter->rgcid_ = - static_cast(static_cast(callsite.destination_) - - reinterpret_cast(hunter + 1)); - } -} - -#else -void X64Emitter::InjectCallAddresses(void* new_execute_address) { -#if 0 - RGCEmitted* hunter = (RGCEmitted*)new_execute_address; - - std::map id_to_rgc{}; - - for (auto&& callsite : call_sites_) { - id_to_rgc[callsite.offset_] = &callsite; - } -#else - RGCEmitted* hunter = (RGCEmitted*)new_execute_address; - for (auto&& callsite : call_sites_) { - while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) { - hunter = - reinterpret_cast(reinterpret_cast(hunter) + 1); - } - - hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8; - hunter->rgcid_ = - static_cast(static_cast(callsite.destination_) - - reinterpret_cast(hunter + 1)); - } -#endif -} -#endif void* X64Emitter::Emplace(const EmitFunctionInfo& func_info, GuestFunction* function) { // To avoid changing xbyak, we do a switcharoo here. @@ -207,10 +153,6 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info, if (function) { code_cache_->PlaceGuestCode(function->address(), top_, func_info, function, new_execute_address, new_write_address); - - if (cvars::resolve_rel32_guest_calls) { - InjectCallAddresses(new_execute_address); - } } else { code_cache_->PlaceHostCode(0, top_, func_info, new_execute_address, new_write_address); @@ -219,7 +161,6 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info, ready(); top_ = old_address; reset(); - call_sites_.clear(); tail_code_.clear(); for (auto&& cached_label : label_cache_) { delete cached_label; @@ -336,7 +277,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { // Mark block labels. 
auto label = block->label_head; while (label) { - L(label->name); + L(std::to_string(label->id)); label = label->next; } @@ -418,7 +359,6 @@ void X64Emitter::EmitProfilerEpilogue() { // actually... lets just try without atomics lol // lock(); add(qword[r10], rdx); - } #endif } @@ -534,44 +474,23 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) { auto fn = static_cast(function); // Resolve address to the function to call and store in rax. - if (cvars::resolve_rel32_guest_calls && fn->machine_code()) { - ResolvableGuestCall rgc; - rgc.destination_ = uint32_t(uint64_t(fn->machine_code())); - rgc.offset_ = current_rgc_id_; - current_rgc_id_++; - + if (fn->machine_code()) { if (!(instr->flags & hir::CALL_TAIL)) { mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]); - db(0xFF); - rgc.is_jump_ = false; - - dd(rgc.offset_); + call((void*)fn->machine_code()); } else { // tail call EmitTraceUserCallReturn(); - - rgc.is_jump_ = true; + EmitProfilerEpilogue(); // Pass the callers return address over. mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]); add(rsp, static_cast(stack_size())); - db(0xFF); - dd(rgc.offset_); + jmp((void*)fn->machine_code(), T_NEAR); } - call_sites_.push_back(rgc); return; - } - - if (fn->machine_code()) { - // TODO(benvanik): is it worth it to do this? It removes the need for - // a ResolveFunction call, but makes the table less useful. - assert_zero(uint64_t(fn->machine_code()) & 0xFFFFFFFF00000000); - // todo: this should be changed so that we can actually do a call to - // fn->machine_code. the code will be emitted near us, so 32 bit rel jmp - // should be possible - mov(eax, uint32_t(uint64_t(fn->machine_code()))); } else if (code_cache_->has_indirection_table()) { // Load the pointer to the indirection table maintained in X64CodeCache. 
// The target dword will either contain the address of the generated code @@ -1017,7 +936,10 @@ static const vec128_t xmm_consts[] = { /*XMMSTVLShuffle*/ v128_setr_bytes(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), /* XMMSTVRSwapMask*/ - vec128b((uint8_t)0x83)}; + vec128b((uint8_t)0x83), /*XMMVSRShlByteshuf*/ + v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80), + // XMMVSRMask + vec128b(1)}; void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) { for (auto& vec : xmm_consts) { diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 91f4016c1..155994bf9 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -66,7 +66,7 @@ enum class SimdDomain : uint32_t { }; enum class MXCSRMode : uint32_t { Unknown, Fpu, Vmx }; - +XE_MAYBE_UNUSED static SimdDomain PickDomain2(SimdDomain dom1, SimdDomain dom2) { if (dom1 == dom2) { return dom1; @@ -172,7 +172,9 @@ enum XmmConst { XMMLVLShuffle, XMMLVRCmp16, XMMSTVLShuffle, - XMMSTVRSwapMask // swapwordmask with bit 7 set + XMMSTVRSwapMask, // swapwordmask with bit 7 set + XMMVSRShlByteshuf, + XMMVSRMask }; using amdfx::xopcompare_e; @@ -190,13 +192,6 @@ class XbyakAllocator : public Xbyak::Allocator { virtual bool useProtect() const { return false; } }; -class ResolvableGuestCall { - public: - bool is_jump_; - uintptr_t destination_; - // rgcid - unsigned offset_; -}; class X64Emitter; using TailEmitCallback = std::function; struct TailEmitter { @@ -220,7 +215,6 @@ class X64Emitter : public Xbyak::CodeGenerator { uint32_t debug_info_flags, FunctionDebugInfo* debug_info, void** out_code_address, size_t* out_code_size, std::vector* out_source_map); - void InjectCallAddresses(void* new_execute_addr); public: // Reserved: rsp, rsi, rdi @@ -230,7 +224,7 @@ class X64Emitter : public Xbyak::CodeGenerator { // xmm4-xmm15 (save to get xmm3) static const int GPR_COUNT = 7; static const int XMM_COUNT = 12; - + static constexpr size_t kStashOffset = 32; static void SetupReg(const hir::Value* v, Xbyak::Reg8& r) { auto idx = gpr_reg_map_[v->reg.index]; r = Xbyak::Reg8(idx); @@ -410,8 +404,6 @@ class X64Emitter : public Xbyak::CodeGenerator { static const uint32_t gpr_reg_map_[GPR_COUNT]; static const uint32_t xmm_reg_map_[XMM_COUNT]; - uint32_t current_rgc_id_ = 0xEEDDF00F; - std::vector call_sites_; /* set to true if the low 32 bits of membase == 0. 
only really advantageous if you are storing 32 bit 0 to a displaced address, diff --git a/src/xenia/cpu/backend/x64/x64_op.h b/src/xenia/cpu/backend/x64/x64_op.h index 78c459101..654119bfa 100644 --- a/src/xenia/cpu/backend/x64/x64_op.h +++ b/src/xenia/cpu/backend/x64/x64_op.h @@ -398,21 +398,22 @@ struct I : DestField { }; template +XE_MAYBE_UNUSED static const T GetTempReg(X64Emitter& e); template <> -const Reg8 GetTempReg(X64Emitter& e) { +XE_MAYBE_UNUSED const Reg8 GetTempReg(X64Emitter& e) { return e.al; } template <> -const Reg16 GetTempReg(X64Emitter& e) { +XE_MAYBE_UNUSED const Reg16 GetTempReg(X64Emitter& e) { return e.ax; } template <> -const Reg32 GetTempReg(X64Emitter& e) { +XE_MAYBE_UNUSED const Reg32 GetTempReg(X64Emitter& e) { return e.eax; } template <> -const Reg64 GetTempReg(X64Emitter& e) { +XE_MAYBE_UNUSED const Reg64 GetTempReg(X64Emitter& e) { return e.rax; } diff --git a/src/xenia/cpu/backend/x64/x64_seq_control.cc b/src/xenia/cpu/backend/x64/x64_seq_control.cc index 54e7ac8a0..2e2d273cc 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_control.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_control.cc @@ -25,46 +25,46 @@ static void EmitFusedBranch(X64Emitter& e, const T& i) { bool valid = i.instr->prev && i.instr->prev->dest == i.src1.value; auto opcode = valid ? i.instr->prev->opcode->num : -1; if (valid) { - auto name = i.src2.value->name; + std::string name = i.src2.value->GetIdString(); switch (opcode) { case OPCODE_COMPARE_EQ: - e.je(name, e.T_NEAR); + e.je(std::move(name), e.T_NEAR); break; case OPCODE_COMPARE_NE: - e.jne(name, e.T_NEAR); + e.jne(std::move(name), e.T_NEAR); break; case OPCODE_COMPARE_SLT: - e.jl(name, e.T_NEAR); + e.jl(std::move(name), e.T_NEAR); break; case OPCODE_COMPARE_SLE: - e.jle(name, e.T_NEAR); + e.jle(std::move(name), e.T_NEAR); break; case OPCODE_COMPARE_SGT: - e.jg(name, e.T_NEAR); + e.jg(std::move(name), e.T_NEAR); break; case OPCODE_COMPARE_SGE: - e.jge(name, e.T_NEAR); + e.jge(std::move(name), e.T_NEAR); break; case OPCODE_COMPARE_ULT: - e.jb(name, e.T_NEAR); + e.jb(std::move(name), e.T_NEAR); break; case OPCODE_COMPARE_ULE: - e.jbe(name, e.T_NEAR); + e.jbe(std::move(name), e.T_NEAR); break; case OPCODE_COMPARE_UGT: - e.ja(name, e.T_NEAR); + e.ja(std::move(name), e.T_NEAR); break; case OPCODE_COMPARE_UGE: - e.jae(name, e.T_NEAR); + e.jae(std::move(name), e.T_NEAR); break; default: e.test(i.src1, i.src1); - e.jnz(name, e.T_NEAR); + e.jnz(std::move(name), e.T_NEAR); break; } } else { e.test(i.src1, i.src1); - e.jnz(i.src2.value->name, e.T_NEAR); + e.jnz(i.src2.value->GetIdString(), e.T_NEAR); } } // ============================================================================ @@ -490,7 +490,7 @@ EMITTER_OPCODE_TABLE(OPCODE_SET_RETURN_ADDRESS, SET_RETURN_ADDRESS); // ============================================================================ struct BRANCH : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.jmp(i.src1.value->name, e.T_NEAR); + e.jmp(i.src1.value->GetIdString(), e.T_NEAR); } }; EMITTER_OPCODE_TABLE(OPCODE_BRANCH, BRANCH); @@ -534,7 +534,7 @@ struct BRANCH_TRUE_F32 Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); e.vmovd(e.eax, input); e.test(e.eax, e.eax); - e.jnz(i.src2.value->name, e.T_NEAR); + e.jnz(i.src2.value->GetIdString(), e.T_NEAR); } }; struct BRANCH_TRUE_F64 @@ -543,7 +543,7 @@ struct BRANCH_TRUE_F64 Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); e.vmovq(e.rax, input); e.test(e.rax, e.rax); - e.jnz(i.src2.value->name, e.T_NEAR); + e.jnz(i.src2.value->GetIdString(), e.T_NEAR); } }; 
EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE, BRANCH_TRUE_I8, BRANCH_TRUE_I16, @@ -557,7 +557,7 @@ struct BRANCH_FALSE_I8 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { e.test(i.src1, i.src1); - e.jz(i.src2.value->name, e.T_NEAR); + e.jz(i.src2.value->GetIdString(), e.T_NEAR); } }; struct BRANCH_FALSE_I16 @@ -565,7 +565,7 @@ struct BRANCH_FALSE_I16 I> { static void Emit(X64Emitter& e, const EmitArgType& i) { e.test(i.src1, i.src1); - e.jz(i.src2.value->name, e.T_NEAR); + e.jz(i.src2.value->GetIdString(), e.T_NEAR); } }; struct BRANCH_FALSE_I32 @@ -573,7 +573,7 @@ struct BRANCH_FALSE_I32 I> { static void Emit(X64Emitter& e, const EmitArgType& i) { e.test(i.src1, i.src1); - e.jz(i.src2.value->name, e.T_NEAR); + e.jz(i.src2.value->GetIdString(), e.T_NEAR); } }; struct BRANCH_FALSE_I64 @@ -581,7 +581,7 @@ struct BRANCH_FALSE_I64 I> { static void Emit(X64Emitter& e, const EmitArgType& i) { e.test(i.src1, i.src1); - e.jz(i.src2.value->name, e.T_NEAR); + e.jz(i.src2.value->GetIdString(), e.T_NEAR); } }; struct BRANCH_FALSE_F32 @@ -591,7 +591,7 @@ struct BRANCH_FALSE_F32 Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); e.vmovd(e.eax, input); e.test(e.eax, e.eax); - e.jz(i.src2.value->name, e.T_NEAR); + e.jz(i.src2.value->GetIdString(), e.T_NEAR); } }; struct BRANCH_FALSE_F64 @@ -601,7 +601,7 @@ struct BRANCH_FALSE_F64 Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); e.vmovq(e.rax, input); e.test(e.rax, e.rax); - e.jz(i.src2.value->name, e.T_NEAR); + e.jz(i.src2.value->GetIdString(), e.T_NEAR); } }; EMITTER_OPCODE_TABLE(OPCODE_BRANCH_FALSE, BRANCH_FALSE_I8, BRANCH_FALSE_I16, diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index 2b4657c36..82d56ded6 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -805,22 +805,7 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SUB, VECTOR_SUB); // ============================================================================ // OPCODE_VECTOR_SHL // ============================================================================ -template ::value, int> = 0> -static __m128i EmulateVectorShl(void*, __m128i src1, __m128i src2) { - alignas(16) T value[16 / sizeof(T)]; - alignas(16) T shamt[16 / sizeof(T)]; - // Load SSE registers into a C array. - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - - for (size_t i = 0; i < (16 / sizeof(T)); ++i) { - value[i] = value[i] << (shamt[i] & ((sizeof(T) * 8) - 1)); - } - - // Store result and return it. 
- return _mm_load_si128(reinterpret_cast<__m128i*>(value)); -} static XmmConst GetShiftmaskForType(unsigned typ) { if (typ == INT8_TYPE) { return XMMXOPByteShiftMask; @@ -914,28 +899,14 @@ struct VECTOR_SHL_V128 } } if (all_same) { - // mul by two - /*if (seenvalue == 1) { - e.vpaddb(i.dest, i.src1, i.src1); - } else if (seenvalue == 2) { - e.vpaddb(i.dest, i.src1, i.src1); - e.vpaddb(i.dest, i.dest, i.dest); - } else if (seenvalue == 3) { - // mul by 8 - e.vpaddb(i.dest, i.src1, i.src1); - e.vpaddb(i.dest, i.dest, i.dest); - e.vpaddb(i.dest, i.dest, i.dest); - } else*/ - { - e.vpmovzxbw(e.ymm0, i.src1); - e.vpsllw(e.ymm0, e.ymm0, seenvalue); - e.vextracti128(e.xmm1, e.ymm0, 1); + e.vpmovzxbw(e.ymm0, i.src1); + e.vpsllw(e.ymm0, e.ymm0, seenvalue); + e.vextracti128(e.xmm1, e.ymm0, 1); - e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMShortsToBytes)); - e.vpshufb(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMShortsToBytes)); - e.vpunpcklqdq(i.dest, e.xmm0, e.xmm1); - return; - } + e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMShortsToBytes)); + e.vpshufb(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMShortsToBytes)); + e.vpunpcklqdq(i.dest, e.xmm0, e.xmm1); + return; } else { e.LoadConstantXmm(e.xmm2, constmask); @@ -966,14 +937,41 @@ struct VECTOR_SHL_V128 } } } - if (i.src2.is_constant) { - e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); + + unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH; + unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16; + + if (i.src1.is_constant) { + e.StashConstantXmm(0, i.src1.constant()); + stack_offset_src1 = X64Emitter::kStashOffset; } else { - e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1); } - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShl)); - e.vmovaps(i.dest, e.xmm0); + if (i.src2.is_constant) { + e.StashConstantXmm(1, i.src2.constant()); + stack_offset_src2 = X64Emitter::kStashOffset + 16; + } else { + e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2); + } + + Xbyak::Label looper; + + e.xor_(e.edx, e.edx); + + e.L(looper); + e.movzx(e.ecx, e.byte[e.rsp + stack_offset_src2 + e.rdx]); + + e.shl(e.byte[e.rsp + stack_offset_src1 + e.rdx], e.cl); + + if (e.IsFeatureEnabled(kX64FlagsIndependentVars)) { + e.inc(e.edx); + } else { + e.add(e.edx, 1); + } + + e.cmp(e.edx, 16); + e.jnz(looper); + e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]); } static void EmitInt16(X64Emitter& e, const EmitArgType& i) { Xmm src1; @@ -1022,14 +1020,32 @@ struct VECTOR_SHL_V128 // TODO(benvanik): native version (with shift magic). 
e.L(emu); + + unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH; + unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16; + + e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], src1); if (i.src2.is_constant) { - e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); + e.StashConstantXmm(1, i.src2.constant()); + stack_offset_src2 = X64Emitter::kStashOffset + 16; } else { - e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2); } - e.lea(e.GetNativeParam(0), e.StashXmm(0, src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShl)); - e.vmovaps(i.dest, e.xmm0); + + Xbyak::Label looper; + + e.xor_(e.edx, e.edx); + + e.L(looper); + e.movzx(e.ecx, e.word[e.rsp + stack_offset_src2 + e.rdx]); + + e.shl(e.word[e.rsp + stack_offset_src1 + e.rdx], e.cl); + + e.add(e.edx, 2); + + e.cmp(e.edx, 16); + e.jnz(looper); + e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]); e.L(end); } @@ -1098,14 +1114,32 @@ struct VECTOR_SHL_V128 // TODO(benvanik): native version (with shift magic). e.L(emu); + + unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH; + unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16; + + e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], src1); if (i.src2.is_constant) { - e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); + e.StashConstantXmm(1, i.src2.constant()); + stack_offset_src2 = X64Emitter::kStashOffset + 16; } else { - e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2); } - e.lea(e.GetNativeParam(0), e.StashXmm(0, src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShl)); - e.vmovaps(i.dest, e.xmm0); + + Xbyak::Label looper; + + e.xor_(e.edx, e.edx); + + e.L(looper); + e.mov(e.ecx, e.dword[e.rsp + stack_offset_src2 + e.rdx]); + + e.shl(e.dword[e.rsp + stack_offset_src1 + e.rdx], e.cl); + + e.add(e.edx, 4); + + e.cmp(e.edx, 16); + e.jnz(looper); + e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]); e.L(end); } @@ -1116,22 +1150,6 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHL, VECTOR_SHL_V128); // ============================================================================ // OPCODE_VECTOR_SHR // ============================================================================ -template ::value, int> = 0> -static __m128i EmulateVectorShr(void*, __m128i src1, __m128i src2) { - alignas(16) T value[16 / sizeof(T)]; - alignas(16) T shamt[16 / sizeof(T)]; - - // Load SSE registers into a C array. - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - - for (size_t i = 0; i < (16 / sizeof(T)); ++i) { - value[i] = value[i] >> (shamt[i] & ((sizeof(T) * 8) - 1)); - } - - // Store result and return it. - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); -} struct VECTOR_SHR_V128 : Sequence> { @@ -1179,34 +1197,63 @@ struct VECTOR_SHR_V128 } static void EmitInt8(X64Emitter& e, const EmitArgType& i) { - // TODO(benvanik): native version (with shift magic). - if (i.src2.is_constant) { - if (e.IsFeatureEnabled(kX64EmitGFNI)) { - const auto& shamt = i.src2.constant(); - bool all_same = true; - for (size_t n = 0; n < 16 - n; ++n) { - if (shamt.u8[n] != shamt.u8[n + 1]) { - all_same = false; - break; - } - } - if (all_same) { - // Every count is the same, so we can use gf2p8affineqb. 
- const uint8_t shift_amount = shamt.u8[0] & 0b111; - const uint64_t shift_matrix = UINT64_C(0x0102040810204080) - << (shift_amount * 8); - e.vgf2p8affineqb(i.dest, i.src1, - e.StashConstantXmm(0, vec128q(shift_matrix)), 0); - return; + if (i.src2.is_constant && e.IsFeatureEnabled(kX64EmitGFNI)) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 16 - n; ++n) { + if (shamt.u8[n] != shamt.u8[n + 1]) { + all_same = false; + break; } } - e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); - } else { - e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + if (all_same) { + // Every count is the same, so we can use gf2p8affineqb. + const uint8_t shift_amount = shamt.u8[0] & 0b111; + const uint64_t shift_matrix = UINT64_C(0x0102040810204080) + << (shift_amount * 8); + e.vgf2p8affineqb(i.dest, i.src1, + e.StashConstantXmm(0, vec128q(shift_matrix)), 0); + return; + } } - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); - e.vmovaps(i.dest, e.xmm0); + unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH; + unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16; + + if (i.src1.is_constant) { + e.StashConstantXmm(0, i.src1.constant()); + stack_offset_src1 = X64Emitter::kStashOffset; + } else { + e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1); + } + if (i.src2.is_constant) { + e.StashConstantXmm(1, i.src2.constant()); + stack_offset_src2 = X64Emitter::kStashOffset + 16; + } else { + e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2); + } + + Xbyak::Label looper; + + e.xor_(e.edx, e.edx); + + e.L(looper); + // movzx is to eliminate any possible dep on previous value of rcx at start + // of loop + e.movzx(e.ecx, e.byte[e.rsp + stack_offset_src2 + e.rdx]); + // maybe using a memory operand as the left side isn't the best idea lol, + // still better than callnativesafe though agners docs have no timing info + // on shx [m], cl so shrug + e.shr(e.byte[e.rsp + stack_offset_src1 + e.rdx], e.cl); + + if (e.IsFeatureEnabled(kX64FlagsIndependentVars)) { + e.inc(e.edx); + } else { + e.add(e.edx, 1); + } + + e.cmp(e.edx, 16); + e.jnz(looper); + e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]); } static void EmitInt16(X64Emitter& e, const EmitArgType& i) { @@ -1248,14 +1295,38 @@ struct VECTOR_SHR_V128 // TODO(benvanik): native version (with shift magic). 
e.L(emu); - if (i.src2.is_constant) { - e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); + + unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH; + unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16; + if (i.src1.is_constant) { + e.StashConstantXmm(0, i.src1.constant()); + stack_offset_src1 = X64Emitter::kStashOffset; + } else { - e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1); } - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); - e.vmovaps(i.dest, e.xmm0); + + if (i.src2.is_constant) { + e.StashConstantXmm(1, i.src2.constant()); + stack_offset_src2 = X64Emitter::kStashOffset + 16; + } else { + e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2); + } + + Xbyak::Label looper; + + e.xor_(e.edx, e.edx); + + e.L(looper); + e.movzx(e.ecx, e.word[e.rsp + stack_offset_src2 + e.rdx]); + + e.shr(e.word[e.rsp + stack_offset_src1 + e.rdx], e.cl); + + e.add(e.edx, 2); + + e.cmp(e.edx, 16); + e.jnz(looper); + e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]); e.L(end); } @@ -1324,14 +1395,37 @@ struct VECTOR_SHR_V128 // TODO(benvanik): native version. e.L(emu); - if (i.src2.is_constant) { - e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); + + unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH; + unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16; + if (i.src1.is_constant) { + e.StashConstantXmm(0, i.src1.constant()); + stack_offset_src1 = X64Emitter::kStashOffset; + } else { - e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1); } - e.lea(e.GetNativeParam(0), e.StashXmm(0, src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); - e.vmovaps(i.dest, e.xmm0); + + if (i.src2.is_constant) { + e.StashConstantXmm(1, i.src2.constant()); + stack_offset_src2 = X64Emitter::kStashOffset + 16; + } else { + e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2); + } + + Xbyak::Label looper; + + e.xor_(e.edx, e.edx); + + e.L(looper); + e.mov(e.ecx, e.dword[e.rsp + stack_offset_src2 + e.rdx]); + e.shr(e.dword[e.rsp + stack_offset_src1 + e.rdx], e.cl); + + e.add(e.edx, 4); + + e.cmp(e.edx, 16); + e.jnz(looper); + e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]); e.L(end); } @@ -1388,7 +1482,8 @@ struct VECTOR_SHA_V128 } static void EmitInt8(X64Emitter& e, const EmitArgType& i) { - // TODO(benvanik): native version (with shift magic). + unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH; + unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16; if (i.src2.is_constant) { const auto& shamt = i.src2.constant(); bool all_same = true; @@ -1399,7 +1494,6 @@ struct VECTOR_SHA_V128 } } - if (e.IsFeatureEnabled(kX64EmitGFNI)) { if (all_same) { // Every count is the same, so we can use gf2p8affineqb. 
@@ -1412,8 +1506,7 @@ struct VECTOR_SHA_V128 e.StashConstantXmm(0, vec128q(shift_matrix)), 0); return; } - } - else if (all_same) { + } else if (all_same) { Xmm to_be_shifted = GetInputRegOrConstant(e, i.src1, e.xmm1); e.vpmovsxbw(e.xmm0, to_be_shifted); //_mm_srai_epi16 / psraw @@ -1425,14 +1518,41 @@ struct VECTOR_SHA_V128 return; } - - e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); + e.StashConstantXmm(1, i.src2.constant()); + stack_offset_src2 = X64Emitter::kStashOffset + 16; } else { - e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2); } - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); - e.vmovaps(i.dest, e.xmm0); + + if (i.src1.is_constant) { + e.StashConstantXmm(0, i.src1.constant()); + stack_offset_src1 = X64Emitter::kStashOffset; + } else { + e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1); + } + + Xbyak::Label looper; + + e.xor_(e.edx, e.edx); + + e.L(looper); + // movzx is to eliminate any possible dep on previous value of rcx at start + // of loop + e.movzx(e.ecx, e.byte[e.rsp + stack_offset_src2 + e.rdx]); + // maybe using a memory operand as the left side isn't the best idea lol, + // still better than callnativesafe though agners docs have no timing info + // on shx [m], cl so shrug + e.sar(e.byte[e.rsp + stack_offset_src1 + e.rdx], e.cl); + + if (e.IsFeatureEnabled(kX64FlagsIndependentVars)) { + e.inc(e.edx); + } else { + e.add(e.edx, 1); + } + + e.cmp(e.edx, 16); + e.jnz(looper); + e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]); } static void EmitInt16(X64Emitter& e, const EmitArgType& i) { @@ -1474,14 +1594,38 @@ struct VECTOR_SHA_V128 // TODO(benvanik): native version (with shift magic). e.L(emu); - if (i.src2.is_constant) { - e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); + + unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH; + unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16; + if (i.src1.is_constant) { + e.StashConstantXmm(0, i.src1.constant()); + stack_offset_src1 = X64Emitter::kStashOffset; + } else { - e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1); } - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); - e.vmovaps(i.dest, e.xmm0); + + if (i.src2.is_constant) { + e.StashConstantXmm(1, i.src2.constant()); + stack_offset_src2 = X64Emitter::kStashOffset + 16; + } else { + e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2); + } + + Xbyak::Label looper; + + e.xor_(e.edx, e.edx); + + e.L(looper); + e.movzx(e.ecx, e.word[e.rsp + stack_offset_src2 + e.rdx]); + + e.sar(e.word[e.rsp + stack_offset_src1 + e.rdx], e.cl); + + e.add(e.edx, 2); + + e.cmp(e.edx, 16); + e.jnz(looper); + e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]); e.L(end); } @@ -1508,9 +1652,9 @@ struct VECTOR_SHA_V128 // that happens so we mask. if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.vandps(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskPS)); + e.vpand(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskPS)); } else { - e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); } e.vpsravd(i.dest, i.src1, e.xmm0); } else { @@ -1535,14 +1679,36 @@ struct VECTOR_SHA_V128 // TODO(benvanik): native version. 
e.L(emu); - if (i.src2.is_constant) { - e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); + unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH; + unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16; + if (i.src1.is_constant) { + e.StashConstantXmm(0, i.src1.constant()); + stack_offset_src1 = X64Emitter::kStashOffset; + } else { - e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1); } - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); - e.vmovaps(i.dest, e.xmm0); + + if (i.src2.is_constant) { + e.StashConstantXmm(1, i.src2.constant()); + stack_offset_src2 = X64Emitter::kStashOffset + 16; + } else { + e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2); + } + + Xbyak::Label looper; + + e.xor_(e.edx, e.edx); + + e.L(looper); + e.mov(e.ecx, e.dword[e.rsp + stack_offset_src2 + e.rdx]); + e.sar(e.dword[e.rsp + stack_offset_src1 + e.rdx], e.cl); + + e.add(e.edx, 4); + + e.cmp(e.edx, 16); + e.jnz(looper); + e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]); e.L(end); } @@ -1550,26 +1716,6 @@ struct VECTOR_SHA_V128 }; EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA, VECTOR_SHA_V128); -// ============================================================================ -// OPCODE_VECTOR_ROTATE_LEFT -// ============================================================================ -template ::value, int> = 0> -static __m128i EmulateVectorRotateLeft(void*, __m128i src1, __m128i src2) { - alignas(16) T value[16 / sizeof(T)]; - alignas(16) T shamt[16 / sizeof(T)]; - - // Load SSE registers into a C array. - _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); - - for (size_t i = 0; i < (16 / sizeof(T)); ++i) { - value[i] = xe::rotate_left(value[i], shamt[i] & ((sizeof(T) * 8) - 1)); - } - - // Store result and return it. - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); -} - struct VECTOR_ROTATE_LEFT_V128 : Sequence> { @@ -1594,33 +1740,72 @@ struct VECTOR_ROTATE_LEFT_V128 } } else { + unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH; + unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16; switch (i.instr->flags) { - case INT8_TYPE: - // TODO(benvanik): native version (with shift magic). - if (i.src2.is_constant) { - e.lea(e.GetNativeParam(1), - e.StashConstantXmm(1, i.src2.constant())); + case INT8_TYPE: { + if (i.src1.is_constant) { + e.StashConstantXmm(0, i.src1.constant()); + stack_offset_src1 = X64Emitter::kStashOffset; + } else { - e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1); } - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); - e.CallNativeSafe( - reinterpret_cast(EmulateVectorRotateLeft)); - e.vmovaps(i.dest, e.xmm0); - break; - case INT16_TYPE: - // TODO(benvanik): native version (with shift magic). 
+ if (i.src2.is_constant) { - e.lea(e.GetNativeParam(1), - e.StashConstantXmm(1, i.src2.constant())); + e.StashConstantXmm(1, i.src2.constant()); + stack_offset_src2 = X64Emitter::kStashOffset + 16; } else { - e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2); } - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); - e.CallNativeSafe( - reinterpret_cast(EmulateVectorRotateLeft)); - e.vmovaps(i.dest, e.xmm0); - break; + + Xbyak::Label rotate_iter; + + e.xor_(e.edx, e.edx); + + e.L(rotate_iter); + e.movzx(e.ecx, e.byte[e.rsp + stack_offset_src2 + e.rdx]); + + e.rol(e.byte[e.rsp + stack_offset_src1 + e.rdx], e.cl); + + e.add(e.edx, 1); + + e.cmp(e.edx, 16); + e.jnz(rotate_iter); + e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]); + + } break; + case INT16_TYPE: { + if (i.src1.is_constant) { + e.StashConstantXmm(0, i.src1.constant()); + stack_offset_src1 = X64Emitter::kStashOffset; + + } else { + e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1); + } + + if (i.src2.is_constant) { + e.StashConstantXmm(1, i.src2.constant()); + stack_offset_src2 = X64Emitter::kStashOffset + 16; + } else { + e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2); + } + + Xbyak::Label rotate_iter; + + e.xor_(e.edx, e.edx); + + e.L(rotate_iter); + e.movzx(e.ecx, e.word[e.rsp + stack_offset_src2 + e.rdx]); + e.rol(e.word[e.rsp + stack_offset_src1 + e.rdx], e.cl); + + e.add(e.edx, 2); + + e.cmp(e.edx, 16); + e.jnz(rotate_iter); + e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]); + + } break; case INT32_TYPE: { if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) { e.vprolvd(i.dest, i.src1, i.src2); @@ -1638,23 +1823,40 @@ struct VECTOR_ROTATE_LEFT_V128 } e.vpsllvd(e.xmm1, i.src1, e.xmm0); // Shift right (to get low bits): - e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32)); + e.vmovdqa(temp, e.GetXmmConstPtr(XMMPI32)); e.vpsubd(temp, e.xmm0); e.vpsrlvd(i.dest, i.src1, temp); // Merge: e.vpor(i.dest, e.xmm1); } else { - // TODO(benvanik): non-AVX2 native version. 
- if (i.src2.is_constant) { - e.lea(e.GetNativeParam(1), - e.StashConstantXmm(1, i.src2.constant())); + if (i.src1.is_constant) { + e.StashConstantXmm(0, i.src1.constant()); + stack_offset_src1 = X64Emitter::kStashOffset; + } else { - e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); + e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1); } - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); - e.CallNativeSafe( - reinterpret_cast(EmulateVectorRotateLeft)); - e.vmovaps(i.dest, e.xmm0); + + if (i.src2.is_constant) { + e.StashConstantXmm(1, i.src2.constant()); + stack_offset_src2 = X64Emitter::kStashOffset + 16; + } else { + e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2); + } + + Xbyak::Label rotate_iter; + + e.xor_(e.edx, e.edx); + + e.L(rotate_iter); + e.mov(e.ecx, e.dword[e.rsp + stack_offset_src2 + e.rdx]); + e.rol(e.dword[e.rsp + stack_offset_src1 + e.rdx], e.cl); + + e.add(e.edx, 4); + + e.cmp(e.edx, 16); + e.jnz(rotate_iter); + e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]); } break; } @@ -1667,80 +1869,120 @@ struct VECTOR_ROTATE_LEFT_V128 }; EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ROTATE_LEFT, VECTOR_ROTATE_LEFT_V128); -// ============================================================================ -// OPCODE_VECTOR_AVERAGE -// ============================================================================ -template ::value, int> = 0> -static __m128i EmulateVectorAverage(void*, __m128i src1, __m128i src2) { - alignas(16) T src1v[16 / sizeof(T)]; - alignas(16) T src2v[16 / sizeof(T)]; - alignas(16) T value[16 / sizeof(T)]; - - // Load SSE registers into a C array. - _mm_store_si128(reinterpret_cast<__m128i*>(src1v), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(src2v), src2); - - for (size_t i = 0; i < (16 / sizeof(T)); ++i) { - auto t = (uint64_t(src1v[i]) + uint64_t(src2v[i]) + 1) / 2; - value[i] = T(t); - } - - // Store result and return it. 
- return _mm_load_si128(reinterpret_cast<__m128i*>(value)); -} - struct VECTOR_AVERAGE : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + auto i_flags = i.instr->flags; EmitCommutativeBinaryXmmOp( e, i, - [&i](X64Emitter& e, const Xmm& dest, const Xmm& src1, const Xmm& src2) { - const TypeName part_type = - static_cast(i.instr->flags & 0xFF); - const uint32_t arithmetic_flags = i.instr->flags >> 8; + [i_flags](X64Emitter& e, const Xmm& dest, const Xmm& src1, + const Xmm& src2) { + const TypeName part_type = static_cast(i_flags & 0xFF); + const uint32_t arithmetic_flags = i_flags >> 8; bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); + unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH; + unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16; switch (part_type) { case INT8_TYPE: if (is_unsigned) { e.vpavgb(dest, src1, src2); } else { - assert_always(); + // todo: avx2 version or version that sign extends to two __m128 + + e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], src1); + e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], src2); + + Xbyak::Label looper; + + e.xor_(e.edx, e.edx); + + e.L(looper); + + e.movsx(e.ecx, e.byte[e.rsp + stack_offset_src2 + e.rdx]); + e.movsx(e.eax, e.byte[e.rsp + stack_offset_src1 + e.rdx]); + + e.lea(e.ecx, e.ptr[e.ecx + e.eax + 1]); + e.sar(e.ecx, 1); + e.mov(e.byte[e.rsp + stack_offset_src1 + e.rdx], e.cl); + + if (e.IsFeatureEnabled(kX64FlagsIndependentVars)) { + e.inc(e.edx); + } else { + e.add(e.edx, 1); + } + + e.cmp(e.edx, 16); + e.jnz(looper); + e.vmovdqa(dest, e.ptr[e.rsp + stack_offset_src1]); } break; case INT16_TYPE: if (is_unsigned) { e.vpavgw(dest, src1, src2); } else { - assert_always(); + e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], src1); + e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], src2); + + Xbyak::Label looper; + + e.xor_(e.edx, e.edx); + + e.L(looper); + + e.movsx(e.ecx, e.word[e.rsp + stack_offset_src2 + e.rdx]); + e.movsx(e.eax, e.word[e.rsp + stack_offset_src1 + e.rdx]); + + e.lea(e.ecx, e.ptr[e.ecx + e.eax + 1]); + e.sar(e.ecx, 1); + e.mov(e.word[e.rsp + stack_offset_src1 + e.rdx], e.cx); + + e.add(e.edx, 2); + + e.cmp(e.edx, 16); + e.jnz(looper); + e.vmovdqa(dest, e.ptr[e.rsp + stack_offset_src1]); } break; - case INT32_TYPE: + case INT32_TYPE: { // No 32bit averages in AVX. 
+ e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], src1); + e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], src2); + + Xbyak::Label looper; + + e.xor_(e.edx, e.edx); + + e.L(looper); + auto src2_current_ptr = + e.dword[e.rsp + stack_offset_src2 + e.rdx]; + auto src1_current_ptr = + e.dword[e.rsp + stack_offset_src1 + e.rdx]; + if (is_unsigned) { - if (i.src2.is_constant) { - e.lea(e.GetNativeParam(1), - e.StashConstantXmm(1, i.src2.constant())); - } else { - e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); - } - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); - e.CallNativeSafe( - reinterpret_cast(EmulateVectorAverage)); - e.vmovaps(i.dest, e.xmm0); + // implicit zero-ext + e.mov(e.ecx, src2_current_ptr); + e.mov(e.eax, src1_current_ptr); } else { - if (i.src2.is_constant) { - e.lea(e.GetNativeParam(1), - e.StashConstantXmm(1, i.src2.constant())); - } else { - e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); - } - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); - e.CallNativeSafe( - reinterpret_cast(EmulateVectorAverage)); - e.vmovaps(i.dest, e.xmm0); + e.movsxd(e.rcx, src2_current_ptr); + e.movsxd(e.rax, src1_current_ptr); } - break; + + e.lea(e.rcx, e.ptr[e.rcx + e.rax + 1]); + if (is_unsigned) { + e.shr(e.rcx, 1); + } else { + e.sar(e.rcx, 1); + } + e.mov(e.dword[e.rsp + stack_offset_src1 + e.rdx], e.ecx); + + e.add(e.edx, 4); + + e.cmp(e.edx, 16); + e.jnz(looper); + e.vmovdqa(dest, e.ptr[e.rsp + stack_offset_src1]); + } break; + default: assert_unhandled_case(part_type); break; @@ -2163,82 +2405,6 @@ struct PERMUTE_V128 }; EMITTER_OPCODE_TABLE(OPCODE_PERMUTE, PERMUTE_I32, PERMUTE_V128); -#define LCPI(name, quad1) const __m128i name = _mm_set1_epi32(quad1) -// xmm0 is precasted to int, but contains float -// chrispy: todo: make available to gpu code -static __m128i xenos_float4_to_float16_x4(__m128i xmm0) { - LCPI(LCPI0_0, 2147483647); - LCPI(LCPI0_1, 1207951360); - LCPI(LCPI0_2, 134217728); - LCPI(LCPI0_3, 3347054592); - LCPI(LCPI0_4, 260038655); - LCPI(LCPI0_5, 32767); - LCPI(LCPI0_6, 4294934528); - - __m128i xmm1 = _mm_and_si128(xmm0, LCPI0_0); - - __m128i xmm2 = LCPI0_1; - - __m128i xmm3 = _mm_add_epi32(xmm0, LCPI0_2); - xmm2 = _mm_cmpgt_epi32(xmm2, xmm1); - xmm3 = _mm_srli_epi32(xmm3, 13); - xmm1 = _mm_add_epi32(xmm1, LCPI0_3); - __m128i xmm4 = _mm_min_epu32(xmm1, LCPI0_4); - xmm1 = _mm_cmpeq_epi32(xmm1, xmm4); - xmm4 = LCPI0_5; - xmm3 = _mm_and_si128(xmm3, xmm4); - xmm1 = _mm_and_si128(xmm1, xmm3); - - xmm1 = _mm_castps_si128(_mm_blendv_ps( - _mm_castsi128_ps(xmm4), _mm_castsi128_ps(xmm1), _mm_castsi128_ps(xmm2))); - xmm0 = _mm_srli_epi32(xmm0, 16); - xmm0 = _mm_and_si128(xmm0, LCPI0_6); - xmm0 = _mm_or_si128(xmm1, xmm0); - xmm0 = _mm_packus_epi32(xmm0, _mm_setzero_si128()); - return xmm0; -} -// returns floats, uncasted -// chrispy: todo, make this available to gpu code? 
-static __m128i xenos_halves_to_floats(__m128i xmm0) { - LCPI(LCPI3_0, 0x1f); - LCPI(LCPI3_1, 0x80000000); - LCPI(LCPI3_2, 0x38000000); - LCPI(LCPI3_3, 0x7fe000); - - __m128i xmm1, xmm2, xmm3, xmm4; - - xmm1 = _mm_cvtepu16_epi32(xmm0); - - xmm2 = _mm_srli_epi32(xmm1, 10); - - xmm2 = _mm_and_si128(xmm2, LCPI3_0); - - xmm0 = _mm_cvtepi16_epi32(xmm0); - - xmm0 = _mm_and_si128(xmm0, LCPI3_1); - - xmm3 = _mm_setzero_si128(); - - xmm4 = _mm_slli_epi32(xmm2, 23); - - xmm4 = _mm_add_epi32(xmm4, LCPI3_2); - - xmm2 = _mm_cmpeq_epi32(xmm2, xmm3); - - xmm1 = _mm_slli_epi32(xmm1, 13); - - xmm1 = _mm_and_si128(xmm1, LCPI3_3); - - xmm3 = _mm_andnot_si128(xmm2, xmm4); - - xmm1 = _mm_andnot_si128(xmm2, xmm1); - - xmm0 = _mm_or_si128(xmm1, xmm0); - xmm0 = _mm_or_si128(xmm0, xmm3); - return xmm0; -} - -#undef LCPI template static void emit_fast_f16_unpack(X64Emitter& e, const Inst& i, XmmConst initial_shuffle) { diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 1b6c40f44..820420451 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -54,6 +54,10 @@ DEFINE_bool(inline_loadclock, false, "Directly read cached guest clock without calling the LoadClock " "method (it gets repeatedly updated by calls from other threads)", "CPU"); +DEFINE_bool(delay_via_maybeyield, false, + "implement the db16cyc instruction via MaybeYield, may improve " + "scheduling of guest threads", + "x64"); namespace xe { namespace cpu { namespace backend { @@ -804,7 +808,7 @@ static const hir::Instr* GetFirstPrecedingInstrWithPossibleFlagEffects( go_further: i = i->GetNonFakePrev(); if (!i) { - return false; + return nullptr; } iop = i->opcode->num; // context/local loads are just movs from mem. we know they will not spoil the @@ -985,7 +989,7 @@ struct COMPARE_EQ_F32 if (!HasPrecedingCmpOfSameValues(i.instr)) { EmitCommutativeBinaryXmmOp( e, i, - [&i](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) { + [](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) { e.vcomiss(src1, src2); }); } @@ -999,7 +1003,7 @@ struct COMPARE_EQ_F64 if (!HasPrecedingCmpOfSameValues(i.instr)) { EmitCommutativeBinaryXmmOp( e, i, - [&i](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) { + [](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) { e.vcomisd(src1, src2); }); } @@ -1065,7 +1069,11 @@ struct COMPARE_NE_I32 e.cmp(src1, src2); }, [](X64Emitter& e, const Reg32& src1, int32_t constant) { - e.cmp(src1, constant); + if (constant == 0 && e.CanUseMembaseLow32As0()) { + e.cmp(src1, e.GetMembaseReg().cvt32()); + } else { + e.cmp(src1, constant); + } }); } CompareNeDoSetne(e, i.instr, i.dest); @@ -2603,25 +2611,16 @@ void EmitAndNotXX(X64Emitter& e, const ARGS& i) { // src1 constant. 
// `and` instruction only supports up to 32-bit immediate constants // 64-bit constants will need a temp register - if (i.dest.reg().getBit() == 64) { - auto temp = GetTempReg(e); - e.mov(temp, i.src1.constant()); + //only possible with 64 bit inputs, andc is the only instruction that generates this + auto temp = GetTempReg(e); + e.mov(temp, i.src1.constant()); - if (e.IsFeatureEnabled(kX64EmitBMI1)) { - if (i.dest.reg().getBit() == 64) { - e.andn(i.dest.reg().cvt64(), i.src2.reg().cvt64(), temp.cvt64()); - } else { - e.andn(i.dest.reg().cvt32(), i.src2.reg().cvt32(), temp.cvt32()); - } - } else { - e.mov(i.dest, i.src2); - e.not_(i.dest); - e.and_(i.dest, temp); - } + if (e.IsFeatureEnabled(kX64EmitBMI1)) { + e.andn(i.dest.reg().cvt64(), i.src2.reg().cvt64(), temp.cvt64()); } else { e.mov(i.dest, i.src2); e.not_(i.dest); - e.and_(i.dest, uint32_t(i.src1.constant())); + e.and_(i.dest, temp); } } else if (i.src2.is_constant) { // src2 constant. @@ -2638,13 +2637,7 @@ void EmitAndNotXX(X64Emitter& e, const ARGS& i) { } else { // neither are constant if (e.IsFeatureEnabled(kX64EmitBMI1)) { - if (i.dest.reg().getBit() == 64) { - e.andn(i.dest.reg().cvt64(), i.src2.reg().cvt64(), - i.src1.reg().cvt64()); - } else { - e.andn(i.dest.reg().cvt32(), i.src2.reg().cvt32(), - i.src1.reg().cvt32()); - } + e.andn(i.dest.reg().cvt64(), i.src2.reg().cvt64(), i.src1.reg().cvt64()); } else { if (i.dest == i.src2) { e.not_(i.dest); @@ -2982,15 +2975,52 @@ struct SHR_I64 : Sequence> { }; struct SHR_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - // TODO(benvanik): native version (with shift magic). - if (i.src2.is_constant) { - e.mov(e.GetNativeParam(1), i.src2.constant()); + /* + godbolt link: + https://godbolt.org/#z:OYLghAFBqd5QCxAYwPYBMCmBRdBLAF1QCcAaPECAMzwBtMA7AQwFtMQByARg9KtQYEAysib0QXACx8BBAKoBnTAAUAHpwAMvAFYTStJg1DIApACYAQuYukl9ZATwDKjdAGFUtAK4sGIMwAcpK4AMngMmAByPgBGmMQgAGwaAJykAA6oCoRODB7evv5BmdmOAmER0SxxCclpdpgOuUIETMQE%2BT5%2BgbaY9mUMLW0EFVGx8Umptq3tnYU9CjMj4WPVE3UAlLaoXsTI7BzmAMzhyN5YANQmR26qAYnhBMThAHQI19gmGgCCx6fnmCuNxihA%2BX1%2BZhODDOXku1zci3wgjeYJ%2B4L%2BVAuGnRkLwmKwNAi6AgAH0SQBxSJyNxkjbgi4XAgAT3SmAJDI5nIutAEwG5vO5tGuVh%2BDOZrPZXgY2WARP5RnlfK8tCFRxF3wZxwJKwuZMeiUkisV9KukO1EV1JMeRzMF0eJq1mEJgL1gi4iQuCgQJAIDrNTp1roIAQZyAQbT9R3NgIAst8ANLYEIhCAMHwbC5plimo7HC7JyPRi4AMRjABUSQbTWYVeYzDijn08Rdo8SSTGhDSAGrYABKdNFjJZbKdXK5QartbVJvFI8xUplconhuVqvVmv9zouccTydT6czPhzebwBsLAYtpYrVbrAEkz2Z62jIU38Re2RdSSSLAB5Xshb5IgAERpEkBw1IcJVHMcOWXQVhRnYdJWlPBZQ/ODVwQwdHS3HckxTLMMyzY9ITtM9sM3HUr0rQ06xCOsGz6JRI3iYgSGrKUAGsGFQAB3BgLjQFh0joeIGOfRsGHwKhwVnZDFw/R4Li8e1px%2BOTRwXVC5TDNplN04gsO%2BDT5xQtD0E9b12mUr0fSMkzlLMuUeQVZVeSM2SkOgmDBPDYgOUeAJ7K8zEGQUiyDI5bJBCCtTjJCxzwt8vSGRUmLgqg0KfNs6y7TdRIMrnKLtI/HKCDCx53UK%2BSSossrUsqgq4ocny8vKgLBBtarvKSpTis6%2BtmoSrTzLazk0oILqhsywVWq5fVJG6zEVTmzlooIM9pqK1dVoawRNvVcEAHojouZRhjwMRaCZFt3ws2cFBeC4ywQTAbraQEvCUCzeNegSCFe26hJE%2Bh/PQVBMAUTNUHK7i%2BOO07DCZAHwj5JgYh2cqAcBWcLkwVR9nScrCCh7IAC9MBeBsi2/ABNMtsD24NqffXUAHU/yApmqokmmgI53suYmrredZkkAEUBaFhaG2bMAwFbUkO27PtwJwwMQh/SJyU17XLUqwJGKkvF0R%2BE6LkiAQAFpFkMdA2gsjHPEwQxIMhp6Xrei4Ppsj9fsYRlAawYHRP80QGB48qvswBHA8BW2pId6snaFR83YuOJRGji5UExbHPTwCmLhYPBFhYJgCDDDOvCxwGSmyGJ6FjgA3MQvEh73iEBARrqxb2pIuLgnqETBATEBRUAuMAOF/H8Qmn9O4h5XiqfUhLAt1WeQi4Ja2vdTefznwb1Qc61bW/Q%2BQkWrb2QWg%2B59iw6JLxKTRxJNnb2An82aEElPJmjeFh6afBvqORqFwpa7zPhcfmnMoEDXzFrck8Dypb2FDBc2Xh0isj2EwJQFwt52ihl9LwV0bqGhiMjSGRtpL/yKnfSWcC4oYlfpiMkyB0jeAUJwr6dDb6CAzqgTw6Cxzm14oCXihgsaT2zinPKOddgXDcBcdIbFgDEFYAoGhJszanSEKgNggkBDN0YHgRg%2Bxi5MGQGxKGRBLGcUBOkC6YhvbIH2Ao
JQUMGB4H2IZUWW4AJCArJ/ICEBVAZGGCSWcGYOQQHJpgXOYSNhHXiYkpx7QonDgzFbQeatcRvmdG2OmDMSSc1VqaAqZgPRkiASUspvYgRAWuFzGpt5yQkmwMBW8gEGwMiLJrNmJIQlhIiRk6JHJAnBOAiM9JBBMmsjyUcPprMAASbSVlDOmeE2Z8zMAxOxBJJiMcJLLK3Gs8kGzhnbMieM/M3wgmbNCdcsZWTem3QCd/R5MyblZI5AciEklaH%2BJ1LU7ADARmZhiZ%2BAAVFAYp2BoV0iqUk6wDANiLKLFLcF4TIWxNhaSKWiLzCJBRZYNFGLWawMFti0guKYVwqpUBIlyLVBIosOS02AL%2Bk/lBUkhkoKaUDK%2BeE%2BF6KWYfKlnyiBnNBWfKuaQd%2BnMxXAotJrRlfLGWysGfKkkjLlVctWbeXlrL%2BXAJpecy5WyFWgv1erC0azJUmuldSkZFrhUKqlrayi9rbzqpNZq116z3W6s9RSrcoKuBSoIWaiFuTWrm0oQQQEXBPxoClI4BUVA2LZg0GGkFwCzBRoFbGsweaLSgqOEWmNOKLhHDLYCUFkgq0MxpQySQ9bo0MwAKzNrBbGrtHbQUkqdZ2vtNbEiDuAQAdl7a2i4U7J0MwCLO2NARF3YBSCumtKR11cA0FK4tOK927sjU6w9tKuBcF3YWs91aL2lvFfmhmXBK23pbRCl9u6m1vrHRe9tj7y3AK4D2n9rbgMdqlqeqFWLY1XoA4CKWN7oMypLVC0Rp0UbEB%2BiQCyuc445xiNoRoBBaUjSJPB51QFX3IZdTWutFGpbfpo0BOd/6VUIc5iB5jc6B0Mc5sO7jsaJ18cFjOkdMGa0Ls5ebHivEC6jXLtYrIn584KFYICGINcLi8UIAgeTAl8ZJpQgIDtQhz10vpRAQKzKBOoq9VGVmQgJO0rRXiqAjUbOkvZfZosQgA04tc5Zs%2BnnWV2bVuxi4QhNbGpiWZu9Qr5WBR845gZnMpVOZQ%2BEhLVrGrJa3FFn8fqMx%2Bec9lp55ABp5Z1EINZMWGRxffeEt1iWYpVYtDV28jrYvOeazl/KbXAQdaK5F/zpBevlbPgNyLEao0Nd/QyODEW5tIY5HNudD6lsVtm%2BZ2tpnG3bbvW2vbwCuOrZ27xzbwCBNncOxcYTl2GZiahWt2NUmHvYGXSOl7Na10Ubm5ur7O2d1/Yjfup132L25pB0BqD9XzOXuO8%2Blb03DtcA2wa/LEbqNw9R/R97Uh0vw7Yxj6rEbTso8axei7JP2uQdm85hbpnEP08y7Si46O7WDaltj%2BrDPdt/cYyz2jbPiec8i1Lcn4vWcMmp2LjLgtru8%2Bl3dpnnMnurb52934uiLjkkYPECuY8VFMDwP5PDqAcF20epFz0rQpJQ34P5ae4Vp4UbJEIZQ3xby9ndGSCACBUIIFpcvGJUArP9YZP7wPGZ4TwgZGuq4U7lEQAmgniAIeO3u8997m0fuA/ACD/yXiof3OVcj/nhAMebhx/dDHpPn4Jq1/T3xKbWeve9gNHnwPweW%2BR9Lxtdt5fo9AjcHHm0dfk/C1Lc34vmeSQe/b2jgIXeC89%2BL5%2BfvS%2BMxR4L1X0fNw7uD5MPXlPC0Ngz9bySbPPvEgr8LyH2JUBG8Ts/BXvfceLgJ%2BP5PpLn4M9u6v3b1zxJB33v17z71PzL1APfwP1r0Tx/36wvzn2v07xAIrzXyhTDwmgNG3zfxHzH1LXgIb0myQIAOvyXzvwwIgMb0CHPzwNjwPxwKIMgIH3P3/zRB%2BC%2BlRmN2QAcXQCiXxhJDQBwwUCiUaUSlqjag8h%2BEWGIC8AcAuBMWQDMDEOP3XA5CoB5ArguBxSZA8inSaWYWfkxB3h%2BHBi8EbkBGDgMXSC/Dvi7gUGVBIxbB2EsO9l%2BzRCnXUKUmbmPguHNmIBSBNCDH3mblzDVH8NOmIEvRNB8OvgsEiIuGICCkHB8K7XQQCL3WCKtHykUKagSMyNMIgnMLcJiCsU4iEPDHCAyNOhMC7QsG4WsA0HeC7S5jqIsCtj3RaKaUHBKPoEUMfkSPaMaMsDMGaLqLaPqOsC6ImM5QZGbhDGaXcKMgZGbAgHcMaSWI0BeA0AuHAk1C8JNHmNtC2JWMTx6IgiOQdEOMHHmKWSWIdTSyYF%2Bzik5DWM/EeMFggGeJjyqSxFUCnWLGLFzU2KOC5n%2BTHGJWJQ32blojBIuDWXVR%2BNpWbi7XELVUlWRI%2BN9UxK/z%2BI0FUCBKJIzHli2In2/0QSRLXQzH2I5DUKOI5F8PEM6I0DMB3lePmkxHWIgBmx%2BIqX%2BOPVBPBL2IZIOPULHHBlFLpJuIgh8lhIuGhSWOPilM5ERMlQWKry5lhLOJ8neNRJHz7lpJ8npNuNanlO/yWK4C8B1NajVLSw1PEO1I5ONIMJVMZPuPhM%2BNCQ1JtLHH1MVPhOVNNLHCtitl8N9OlIuJ8l%2BlEk/E/BmwdOhIJMFOaS2JFOdK5AxPtK/3hNRIjOPyjK5Gbg9CWLCP5IJKBOwGLAjK9IgETNzPyKlPeOeINO2N2KNK5FrPrK1JLPrwJICA0EHIjLKN4MqJNwElLMfilNrJHIqN0nCE1IRM62zN%2BI9H7MHOLCIIJKOGLGwGxAzILIZKuNNJNNlI5FnLHPCHEOeJrOXK%2BIvPnNcweLvNCT5KTLuA3K3NUB3L3IjKZKWKfycnQhyIICb1rLfPxIBKBJBLBCOEZkHxyT3UfBtMPNNJbKWIfKqIYDONQoglhRDU5gVI2AcKcMdKDIgi7gIF2AEhvOYVdMOWNhkh%2BD6kcJiBJACDMHqMSBSCOH3RCI9GhSYC4FpSUiYDMBEoESYCOAkvKiYGkDGiYC7Rku9kSGUqYC11PNWIEQWJqKSKyNSIERItoF9AiICNzAMvKmbiyNMqiPMogh8JiJsqSKOCKK0ssrR10uIF4tiO0pfU8rMCCIssUKkH8pSO1wggWOvJ3nqPrFaLOPeO%2BJOIP0TJPykoNKEvaNzFaIn0/DkvSuiosEWmyuYMUqBEZgyvqPSOKopO%2BJLLgu9gKoKmqtSqnTKoavaKnQmJpOuPFLtC5O9iSuUUisQpFODIuGABhkngAgsCTBJACptKst2KWIqosCysMN6ubI9KGr3QuBGo7IZAmrsWmtmvmozKspTWWoKqKvWoZObLhO2pTT2qlMOqmu%2BBmpCDmqCLOrRyivaKqputNObLRI%2BGCpOMsEHlGrPIOsmruXes%2BoWpfV%2Bq4tmIzObLqsZisvuPBpTX2vGphuOo%2BtOt6qsrhK5hWs6ritRv6vUrapJtNASJxuevxrepOq%2Bo2upqSoxpDFxrEsdKnMBupo9Ixp2p5o9LJoKrWptLutpsvUhrHDysuvaOuqlupuBuSp%2Bp5rVvFr%2BpRvZrjKYDqu2qWR5rqu1uRspr1vWJprjzpp5tasVvqIpoBrPIStKpyTEozGhVyo9HdrrQVNytavdoQs/DppyXO
s9pDsRrDsFK9rrJ%2BrDpzMZltA7NPIJs%2BpAClKMvKlOIPPorPOPLPM0q5BWtiraPhLEojKzvEPZN6oZBWrWuvPCNrvavqOuuvPiKlJWv%2BuvPSIzLrsaomOvMbLGvJsHuWs6tzsLNwoZALrFIZKrstJwrzoZEouoqSMhicP0IuPBDkQUA0R0xTyAoskwBiBNSLs1E4KPsUPEJPtUH5KgAuur1gOyVySMjkVPsdKnjABzvVDIw/GiK4AsiWNvrKpBuTJkjilXuIAEg/uhIAfQC3uk0VMvqShcj5DQaSNoB0wYBYEPtQYFAwfCHSBrlIyvrYmcL/osnLgUE4jr16soaNAwbOBdi7iAfhMAt6kEA2CIZrjpCMLPOYY%2BiAaTyWMEdYYYLcCmifnCs5CgYEjJE0zoAzRJHIcN3oCoAHwgDEbZFpXIaRUSGoc4kQY4O%2BAYaUiTUWEjQYYwcIYYGIYIDofBDkcwewdwZ4ecOTKYAvT4cOjzuhSOk5TMYEXBkICujwckKUhxQMi9teiZE03QHQCZEcdNLkTDEaE4kIHEKoHHhjiprCQNJ3UT2btDFel4MyaWKeHbgjIZBOmcaJoZOno5GhVificSfELSbKZMukbeKoC0dKYyYcaKbdIUZUmMvCBJClC7isXDEsIgB8YaeXo5Gcd4rZvRDzttkcGQAuE0NQG0PDFoCoDMB2YrnCdGh0OSbPOcZJBYBYCEObgIA8RJCoBtFJBubuYIHSAQBJHSAUFedueQHueyFz2yCXw4Yia4dZTFToouI2Z8QkNGnqmOYIDMH2d6aRe2YufnswCougdtwrh8SELwRMrcAYbKg%2BD%2BfeZBcCBJCBcLWuf%2BfuZ%2Be%2BYQApaUDmQ8WoC2F1BjBjBJGLF7B/DkGAhJEiB/A6QAA1lEKT2xeX%2BXBXhWywxWAAtPsH8dFeZzwi49F1QRgAQEkVFo5rQggU5uUc5oZkx8aK%2BhQWUcQu%2B98gcoc/hy1pKfGdIcQvlaEiAFy/A2AqHKmz8eWV1i5nyZxxEEAEAEEOZUQRYeEJF8l61icp%2B5RWvFCvO26gRZAYSQxm%2B1QN11M9hlyiRlMnc3alNEALEG0k6V13agCzN9IbNz19k/fNwY9DVs8hh9etAaURwKipNNhrmO%2BnJLRrNvBTie%2Bpt%2BEVt6Qs89FlgJwvAThMxfyJY2F5AQlmNm4ONuCqAOthtv4wgmqiAAAP13dHdXNyTbbmI0KNcEnuaiWEnEINaRYgC7i7cWEIBrjZDbdkexbXoBbmQIAfa9rnbGcXbEn4bWe3pfEYs5QxArZMfpDJBYCXx0N1YYH1bEEOefaQ6XzwDNfPv8OQaho5DfbtweYZP8YZJw8CDw9RZJFdYEEYDmQbfhPpZpexa4Ho5EndC9a4DkB9eTb9d6uo4CDw8XFEJY5aTebZc48wG45JXXMHLkG/dgmudw7xlzcY8EBJGDi7jYG08k91Gk44647wB48fubcHiE6o7U5o%2B2aNfo804iG09OD06Y%2BudHfELY5k9M5eb44E/Z3oyddU%2BQ7s4pjYi7j5CWO8%2BxfC9QBpbM8CDmZtJE7w9dahmi7ebtgS9BZxTo4Y%2Bc%2BY7PZS9s9E9uh4mIHLloAk888y/%2BeEkwAAEdTOeP0vaU4vIvL2LWQv1OsAKuquauaGSQbQvO3mpQnFeCeQWuSU%2BuSABuPOaHaVZvKvx4FvaGajmwKyiTaVM4DFAQtuiSiSnoIgRwLJHcM56ApJm5uRDc/EZGOQq3c2hTa0LOa8ISeu7Ou5GuVIHDHO5kNEjBMA1vRv6vWRmu5OzOSU2uLh8unP3PDGVOGRUuNP0gdPGg3ODPR2Pw6uSRsuqXl0vufvIY/uVHDBgAgfDHaUCv3PdPMB9OiuaHEfLRQuyudMAZHPMhCvcf0BtAvo%2B2QeaWLDpuIVXW0frE6f4esf0AmeROjEGATF2g2QSQ2fvQa4SeAfyfufefFgBf/3PnvnfmVeOetO5kmAee%2Bev3p3ORkfxOBe8fEvPtaVxO1uuuzybftIFARucfxvyipuIeeOOvMBgAnePefHIjy4HELhtfypxOr2kfSu8PCfTdifDNSfAevfWOxu7Hff4v/eSVA/g/17vvk/RDU%2BNeKfiuMzkey%2ByegekWcjxf6e7epIcukuk/fua/0/xL7Pdm5lqeXPoQMeGf1vgv4%2BWe8Ok1iBnhLCVHMBHh4hVFsWhDjF4h%2Becfo2PnRD8etGV/Ff%2BCVfMZ1fa%2Bte%2Bew%2BTpvRe4vRR5iZBA%2BhaBS5GB8E8Abm2RLok1L%2BvBgByfFgPwUYoZtC2OcfZnup0n7T96ATzXPP3zmQ0BVAWCO3gk186FpQBeAGfl3Hn7EBF%2BUbXfn2yp6qB/ux/evq5wl5cNp2NncfpYiwBo9%2BuW/WqCSDi4dwcelhK7qZ2XTICZ%2BzzFQlAKeZ4BYB6QEPrKE97d9lu83QxsNwfSj916OLeRm8w3748DeEAcuJQOEE0DzIdA%2BIBDAUCu9YOedI6ER05BEBwY5bZ/qolQAmJA4pccEJR1FDI8dWPEdDkixRaYcvwLPDFua3u5j91O9fdGINzHY48fOEPF5gSUBKHcgSXgV3tbwT76ZRBPzAXv4JEiBCPyg5JIdZ1NLI9TB8QHZrxDT7k9YhJnAIYWgJKSBghe5QcmEJK7kCpQWAYgJkOyGAg/BeQ%2BIQUO/IBAHWSQsoVX0iHpDqhy8Wfo4WMq5DQKwvIIUSXaHCdIhi4UQfvHqGDC8%2BEARTikLd6lc8WGSQzK4CV7l8BhnHH5hAESBdou0GgSQDsSoAy9IhovWnvTwz5SdbmcQvAH5344SNAuJw8gaLyIH08phmfa4Q0Mh4QB/ODw49Fbw%2B5lcC%2BAwuLq32XRPD1OkbSOJF114NdweIkZdAX3a7qDOu5QlDlQFx4xBvoTzEgEIQfY49su2wqgLSjY6yCHe8grwViJd7hDAReHDEd4KV7z91ELidfkSyZbb96RWIpXo7jxHpAmeugt0lYiopkJaUaaWgBZGzglwGAR0cuHfXCA/8zeOcTEHW36F2I6wbAYgOTzrB5QJmr8boXxFqFHRxmXQmoeXwo4BNUhkQyODkRJAmiehGw9fsJGMqIDqAmI7EUyI0S0BaUdog0eXwhF2dxmlQjIfaNr6wjOEffRoa6IZH8EPRYgUhlUNNG19/RZXY0Qr0TGA8BeqgXEdv2tGpjgxvo2vjt0IDQig%2BNIjwYECL5E9S%2BeAtYfwQdEfDeRwAOZESLdHcjcRdbJ3ksD%2B61jahZYq4Ah1uLmw0haYnobiKDH6i5MOPbMQ73Dy2iRxBYwHhJV1EJiQxgPPscj3b4p8axr8OscfzQG38MBlFbHg2I37siZxm46sUZiwC7i1xaIuzhiP3GT9MBx4q4eu035gjOWAI8sWV3OFMd4gzffgtvwvHdidxtQ2fugOfHoBcBqPX8ZVi/HAC7OsEvtjiOIBH8MxfgoXvkNdGP
iF%2BR4qCUDCH7xB1xSwwwDDFeioSmAvEKgMqHAkHjMBuvIlnIO2FIS2xqEzvuTz7FkD1O%2BzExKIWtEwClecA9foy1EKfNvikccibjyok0ScJh47FiSJ5Z8sBWQrICCKzFbYBJWRBGVkpPlaqTFWJIFVgK35FHRsY%2BCH2JgOZDjUbECgWlEyGoSWjyBwE1YaBPL7vDXxE3TiKX0jFOTtx14sCbJMgnIiIupYu8WVx9G8Q3JRnW5h5K8m3DC0eYicbUKCkkAQpHQioXqMyGRS2OMUl0YGIymrjyeyU1EWlPU7hTfcY4/KXxCylvNpxoLMqapUcgriqpwmCDg5PU6utXO1U25vb1BY%2BSrx6w2vu6GgkN8h%2BKbCQdX1zbcCdW/bKKdz34KzCeJxPfiTwMEl8CUenUlqb1ROiyZPQfQYjJ6FZDIBLotARQm3GOSLDyB3CRoMdJJCtxvAdQhsT1KS71TvR84rIVnSZ5XSjpYgW6WdIF61SkuX0m6XdPbgkisuLfbfqLwEn4SJho7EkLXmInkDoZKEmlodOOkZcGxTA9AM3BYEQAoZK0mGWjJ%2BkgzMAr0/MRFNvwVSmpFMvsc42Rk8igZ48HCpq3BDYFJAYhEDgPiICgs2ZYhC3MJWUgLQ%2BZ6QDnDKXGhCyo4j9LmBAH5nllVAVABWYrIVlM9eZ7ld1hbjBprl5ZSsxWSrIlmMh3WXgFNF7Ssp6z9o7M8qLxDFoqY5ZOs3WTaVVkXBfB7DbOnBRBrj54Jss2CiAi5hSMGSrs6WbLJNlXp6aTsm0s7OlnZ1IKds5WQ7P1kRT3WrskBIzA9kSCNZoDN2fCT9mmkA5DeY2TnBFm/EEiRjDMhHJtneza04g7pks1/a4sZZ6QfORrKLnaYLq4NEudI1wrHBXAOicKg5FkLyFZK9/WUHgh%2BEv4lCFBDLmLMci5A3Ba0SpgoQI4MhtWEZdFnoT7o99tCZMFeTe14iptnac8uQgvKnlvEBoYhHgRGTPhny15zdS%2BcTC3nrzb5doXeZPRtL9yj5zdKfGIS8CqAIyn8qONfKlJ/zlI98j%2BY/K8DPzeqh5DMuiw4FK1daDJPooCCoBngYq8CwGjFDPlBR6iVUC2gyVTxfysFFgHBfvI6igVb8xMHBRYCwUkLuY5C5SJQuoVS1T5xMLqK3TQXtswFrCwquwpPkD4z5KCiwKXTjnmyv5AioRRmTfnZ1j5HIR2cvF/n6yK8e8pes7RNByILcBuIgIZDBLNId%2B3bRQo0AnnQkYgGYUjrPJrlSCi8z3GIC8GXgVJoSFeKxW8EDzKLVFQiE6eoo0SaLSSYJXRTr3HnUstEfxYxXLx14EdnGti7xVzGsW2KLS9eBxZEozhOKC8LiwcP4uXxQwPFFcEgHUWwC%2BLLKBigJUYpMUCBQl0itJTS29jiFoUf/G0kwCSV6YclGFepbUpsV8QrgXaH2YkrkUZlnGTAFxRyXKWBKPQmSzRY0p8WmLBlRSsxQyGqUB4MZXMWZaXHaWMwYgNpZxosq0QckoF6hQZQXI0XZLkUEygpekqmWmKCOuypgFUpqUZk6lDiqEvCWsWKKblrSycVUiaXdLeqvS/pTsuOU0toSIyg5U1FCRHLkAhioJcGyaZ/8rlcy/kqsp6W1yBIGy75SaF2UArDISePJfotBWFLwVISqRcTV%2BX/9oVFg55XcpEYPLmlzymJeSqiUvK1lCK72MitSWEr/lluUZTSsxWTLcVBHDZcSsnnKI4VnyhlUiq2WuloWWKigl0zmKrsEJZXbACB0NxCAEAxALsKCxt57ABZ%2BCmPnsAPbSLzY3wWgCwCyCDyTp8ou3O4k7jxxwwLAXOQLM0zlQ7BVsUjlIlXhnlzYXoVgMTAy71ENApAJ2ngsChiEPVNq8QnvWUJyyJ6vVMQNpBHlwFylllM6Sl2k6aKgeQEufnRMopvj4QqXAMrkuJQkyuWYauDAeUhCqImAwAcuIJAMCuQGAbEFUCSB5AW4GS53cPMTGrq90EieHMBuglDJ4AIVnIEmS8C8CEKu1HS/4g3QAqDrh17RUdYzG3Jj1M5jMYNYMwtJjVdoPkOsmdKHWEKrMu1ONDkvHULrLOEAEMDkmXVQtIFizeYluunU%2Brllh61oo6RvWEKlqY6%2BdY%2BornLr6VFitjloUAnnj01T4zNRv2zUJ9c1EAfNWdIvXXFt61gyIRY0n5zM3BXEz7hvX6HTC2JCIuYdrIvS0pu%2B0lC4PJSUp3ZaUU6WlEEAuBpArOF6AWWjgvQEapAVOJns43lXKhFVyq1VW3zQ3OEpM0LPhJpCvqRRE0mAJQmITJAVwngKAz9n7jrLEZpxFMCAIaC9qpJemjwdVm2xt4IAvACs0GM4PU5FrSMga7VcoUxYyEdV9yoCEEIWq/KY%2BWmnTUDyzr0kxVBK7Fcvls3abNCqa5VU5poUSrqW7m%2BzTSwQBChDizmvBVfX67RrxC%2B5Xqi2qUhdr4SwGdBAlrnU9qrYeAUgJFvv7WBrA/arkF6A83gCs626mdfevfVl0uYWWvANU0siFavNxAErfUSq0LquYs6w9evLCh2bPNQW2gI1vZRzdo1LWz8Hh0Qox43135JRaaQK2Bbitt6iwK%2BrnUTaP18JZrRIq63gCvQDWubc1uW3CkD1TdBktNu61ehet22gbdlt23/EByNpYTaJts3EBptL42aUdvAEQ9l0BmhUgFOA1EtQN4/cDcShe31boNKTKhHdssjVc7NT27zutqB5va4kGq2lNCi%2B1L8QNNwHNeSwB0w6etwOs8ptse1sNk5K7czWtoh3abjxzbACqes9A6rEZ6nbQKgBWAzS8dkO4RsohO347v1a9aHXVtxn07GdiO5HVGx%2B1o6wNGOqpIDt6FOEcdncn4Mj2bh714ZemuzkWry2hsYdxAeHaCrw2XsDCHALYLQE4BdpeAfgDgFoFICoBOAo%2BSwNYE9A7A9gdQyEDwFIAEBNAeurYJxBACSBEgLwSQCkD3QpANARwAcgFUSA9ADdHASQMbtd3m7OAvABQCAF9Uu7Tdeu0gHAFgBIAQ4oMcgJQEz0TAzgZPKJHIQYCcQ%2BASjeIPHogAxBo9IIZgMQCZCcAndQkenj%2BAYDXRo9WAcuEYHEDJ7SA%2BAV9o4F4nR78YjQT9g3t4Dz9w9Zu%2B/jEA0R16PAWAaPZJpYBj6tgmhctQoC7BmJeIP4VkCbqd38BBAIgMQOwEY0H75ASgNQNHt0DCVq1xgHLZYH0AoD49kALYBbgGDx6OAVsH8Nkx167Vy4ewd4GCTJjy94gtoG2AQAQZglZQ1pa3aMSYBx69pTQZwHjKkhzA/AwlUICsCqA1A9AJQHIAIDQO4Gsg%2BBhgKMGwMTBhKDQJA4MCWCEHKDiBgYEMHaBkHxgCQSg7Qc8BdA9AtsZg1gdYMSAtgE8XYPsD0BPBR4K%2B/QIbqj096LdHAO4IkCthVh89CoCAJU2L3b4rdVgB/RcFwCEB2IxwAWR4GEihwcwV6XgEn
q0DopSAr0M3hMDmakAPdXaMwC8CnQvpJARwJw5ICKEcUihkhiPdIbN2yG49Ce53a7q2Bp7EAIATGPY2z079jDoMSIOpk4CqAqwLABQM3C2ZWUUgLwKQJ%2BA/iRBsAGwXgK/00V4B0AegM/UfvECn7ZAigFQOoB73X7SAvEDROkAkPh6jdmW6PbIZ/A1x7GSonQvcEUOGhlDfIVQ0XtoafgjDIMf8QYaKOhHk9Vhj3dsSnQpAu0RQ9w1Ol91skqkRwaQOHsj1dGZDse2wCEYsNu6/DZgAI7wCCMLHLDWwBXtkGcCSAgAA%3D%3D%3D + */ + // https://github.com/xenia-canary/xenia-canary/blob/968f656d96b3ca9c14d6467423df77d3583f7d18/src/xenia/cpu/backend/x64/x64_sequences.cc + /* + todo: this is a naive version, we can do far more optimizations for + constant src2 + */ + bool consts2 = false; + + if (i.src1.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src1.constant()); } else { - e.mov(e.GetNativeParam(1), i.src2); + e.vmovdqa(e.xmm0, i.src1); } - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateShrV128)); - e.vmovaps(i.dest, e.xmm0); + if (i.src2.is_constant) { + consts2 = true; + e.mov(e.r8d, i.src2.constant() & 7); + e.mov(e.eax, 8 - (i.src2.constant() & 7)); + } else { + e.movzx(e.r8d, i.src2); + e.and_(e.r8d, 7); + } + + e.vpshufd(e.xmm1, e.xmm0, 27); + e.vpcmpeqd(e.xmm3, e.xmm3, e.xmm3); + e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMVSRShlByteshuf)); + if (!consts2) { + e.mov(e.eax, 8); + } + e.vmovd(e.xmm2, e.r8d); + if (!consts2) { + e.sub(e.eax, e.r8d); + } + e.vpsrlw(e.xmm1, e.xmm1, e.xmm2); + e.vpsrlw(e.xmm2, e.xmm3, e.xmm2); + e.vpshufb(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMVSRMask)); + e.vpand(e.xmm1, e.xmm1, e.xmm2); + e.vmovd(e.xmm2, e.eax); + e.vpsllw(e.xmm0, e.xmm0, e.xmm2); + e.vpsllw(e.xmm2, e.xmm3, e.xmm2); + e.vpshufb(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMZero)); + e.vpand(e.xmm0, e.xmm0, e.xmm2); + e.vpor(e.xmm0, e.xmm0, e.xmm1); + e.vpshufd(i.dest, e.xmm0, 27); } static __m128i EmulateShrV128(void*, __m128i src1, uint8_t src2) { // Almost all instances are shamt = 1, but non-constant. @@ -3238,6 +3268,8 @@ struct SET_ROUNDING_MODE_I32 } }; EMITTER_OPCODE_TABLE(OPCODE_SET_ROUNDING_MODE, SET_ROUNDING_MODE_I32); + +static void MaybeYieldForwarder(void* ctx) { xe::threading::MaybeYield(); } // ============================================================================ // OPCODE_DELAY_EXECUTION // ============================================================================ @@ -3245,7 +3277,11 @@ struct DELAY_EXECUTION : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { // todo: what if they dont have smt? 
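The db16cyc handling just below can now either keep spinning with a pause hint or give the timeslice away, selected by the delay_via_maybeyield cvar defined earlier and forwarded through MaybeYieldForwarder above. A rough host-side illustration of the two behaviours being chosen between, with std::this_thread::yield() standing in for xe::threading::MaybeYield(); this is a sketch, not the emitter code itself:

#include <thread>
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <immintrin.h>
#endif

// Illustrative only: a short guest delay can be approximated either by a CPU
// spin hint (cheap, friendly to an SMT sibling) or by yielding so another
// guest thread gets scheduled.
inline void HostShortDelay(bool via_yield) {
  if (via_yield) {
    std::this_thread::yield();  // stand-in for xe::threading::MaybeYield()
  } else {
    _mm_pause();                // same hint the emitted pause gives
  }
}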
- e.pause(); + if (cvars::delay_via_maybeyield) { + e.CallNativeSafe((void*)MaybeYieldForwarder); + } else { + e.pause(); + } } }; EMITTER_OPCODE_TABLE(OPCODE_DELAY_EXECUTION, DELAY_EXECUTION); diff --git a/src/xenia/cpu/compiler/passes/conditional_group_pass.cc b/src/xenia/cpu/compiler/passes/conditional_group_pass.cc index e29eaae45..348144b72 100644 --- a/src/xenia/cpu/compiler/passes/conditional_group_pass.cc +++ b/src/xenia/cpu/compiler/passes/conditional_group_pass.cc @@ -48,9 +48,7 @@ bool ConditionalGroupPass::Initialize(Compiler* compiler) { bool ConditionalGroupPass::Run(HIRBuilder* builder) { bool dirty; - int loops = 0; do { - assert_true(loops < 20); // arbitrary number dirty = false; for (size_t i = 0; i < passes_.size(); ++i) { scratch_arena()->Reset(); @@ -68,7 +66,6 @@ bool ConditionalGroupPass::Run(HIRBuilder* builder) { dirty |= result; } } - loops++; } while (dirty); return true; } diff --git a/src/xenia/cpu/compiler/passes/finalization_pass.cc b/src/xenia/cpu/compiler/passes/finalization_pass.cc index 1b409430c..d9e2846eb 100644 --- a/src/xenia/cpu/compiler/passes/finalization_pass.cc +++ b/src/xenia/cpu/compiler/passes/finalization_pass.cc @@ -41,18 +41,6 @@ bool FinalizationPass::Run(HIRBuilder* builder) { block->ordinal = block_ordinal++; // Ensure all labels have names. - auto label = block->label_head; - while (label) { - if (!label->name) { - const size_t label_len = 6 + 4; - char* name = reinterpret_cast(arena->Alloc(label_len + 1, 1)); - assert_true(label->id <= 9999); - auto end = fmt::format_to_n(name, label_len, "_label{}", label->id); - name[end.size] = '\0'; - label->name = name; - } - label = label->next; - } // Remove unneeded jumps. auto tail = block->instr_tail; diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.cc b/src/xenia/cpu/compiler/passes/simplification_pass.cc index a8c93c3b6..da6f0dfe1 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.cc +++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc @@ -23,52 +23,6 @@ using namespace xe::cpu::hir; using xe::cpu::hir::HIRBuilder; using xe::cpu::hir::Instr; using xe::cpu::hir::Value; -using vmask_portion_t = uint64_t; -template -struct Valuemask_t { - vmask_portion_t bits[Ndwords]; - - static Valuemask_t create_empty(vmask_portion_t fill = 0) { - Valuemask_t result; - for (uint32_t i = 0; i < Ndwords; ++i) { - result.bits[i] = fill; - } - return result; - } - template - Valuemask_t operate(TCallable&& oper) const { - Valuemask_t result = create_empty(); - - for (uint32_t i = 0; i < Ndwords; ++i) { - result.bits[i] = oper(bits[i]); - } - return result; - } - template - Valuemask_t operate(TCallable&& oper, Valuemask_t other) const { - Valuemask_t result = create_empty(); - - for (uint32_t i = 0; i < Ndwords; ++i) { - result.bits[i] = oper(bits[i], other.bits[i]); - } - return result; - } - Valuemask_t operator&(ValueMask other) const { - return operate([](vmask_portion_t x, vmask_portion_t y) { return x & y; }, - other); - } - Valuemask_t operator|(ValueMask other) const { - return operate([](vmask_portion_t x, vmask_portion_t y) { return x | y; }, - other); - } - Valuemask_t operator^(ValueMask other) const { - return operate([](vmask_portion_t x, vmask_portion_t y) { return x ^ y; }, - other); - } - Valuemask_t operator~() const { - return operate([](vmask_portion_t x) { return ~x; }, other); - } -}; SimplificationPass::SimplificationPass() : ConditionalGroupSubpass() {} @@ -76,17 +30,13 @@ SimplificationPass::~SimplificationPass() {} bool 
SimplificationPass::Run(HIRBuilder* builder, bool& result) { result = false; - bool iter_result = false; - do { - iter_result = false; - iter_result |= SimplifyBitArith(builder); - iter_result |= EliminateConversions(builder); - iter_result |= SimplifyAssignments(builder); - iter_result |= SimplifyBasicArith(builder); - iter_result |= SimplifyVectorOps(builder); - result |= iter_result; - } while (iter_result); + result |= SimplifyBitArith(builder); + result |= EliminateConversions(builder); + result |= SimplifyAssignments(builder); + result |= SimplifyBasicArith(builder); + result |= SimplifyVectorOps(builder); + return true; } // simplifications that apply to both or and xor @@ -735,7 +685,9 @@ bool SimplificationPass::CheckAdd(hir::Instr* i, hir::HIRBuilder* builder) { auto [added_constant_neg, added_var_neg] = i->BinaryValueArrangeAsConstAndVar(); - if (!added_constant_neg) return false; + if (!added_constant_neg) { + return false; + } if (added_constant_neg->AsUint64() & GetScalarSignbitMask(added_constant_neg->type)) { // adding a value that has its signbit set! @@ -882,11 +834,6 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, } else if (cmpop == OPCODE_COMPARE_UGT) { // impossible, cannot be greater than mask - - /* i->Replace(&OPCODE_ASSIGN_info, 0); - i->set_src1(builder->LoadZeroInt8()); - return true; - */ constant_replacement = builder->LoadZeroInt8(); } else if (cmpop == OPCODE_COMPARE_ULE) { // less than or equal to mask = @@ -914,9 +861,9 @@ bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i, bool istrue = i->opcode == &OPCODE_COMPARE_NE_info; bool isfalse = i->opcode == &OPCODE_COMPARE_EQ_info; - auto [input_cosntant, input] = i->BinaryValueArrangeAsConstAndVar(); + auto [input_constant, input] = i->BinaryValueArrangeAsConstAndVar(); - if (!input_cosntant || input_cosntant->AsUint64() != 0) { + if (!input_constant || input_constant->AsUint64() != 0) { return false; } @@ -957,12 +904,6 @@ bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i, } } - /* Instr* input_def = input->def; - if (!input_def) { - return false; - } - - input_def = input_def->GetDestDefSkipAssigns();*/ return false; } bool SimplificationPass::CheckSHRByConst(hir::Instr* i, diff --git a/src/xenia/cpu/hir/label.h b/src/xenia/cpu/hir/label.h index c57fd0154..cc4f4146e 100644 --- a/src/xenia/cpu/hir/label.h +++ b/src/xenia/cpu/hir/label.h @@ -26,6 +26,13 @@ class Label { char* name; void* tag; + // just use stringification of label id + // this will later be used as an input to xbyak. 
xbyak only accepts + // std::string as a value, not passed by reference, so precomputing the + // stringification does not help + std::string GetIdString() { + return std::to_string(id); + } }; } // namespace hir diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h index 6d90f1811..1bd85cae9 100644 --- a/src/xenia/cpu/hir/opcodes.h +++ b/src/xenia/cpu/hir/opcodes.h @@ -11,7 +11,7 @@ #define XENIA_CPU_HIR_OPCODES_H_ #include - +#include "xenia/base/platform.h" namespace xe { namespace cpu { namespace hir { @@ -361,13 +361,16 @@ enum OpcodeSignature { #define GET_OPCODE_SIG_TYPE_SRC1(sig) (OpcodeSignatureType)((sig >> 3) & 0x7) #define GET_OPCODE_SIG_TYPE_SRC2(sig) (OpcodeSignatureType)((sig >> 6) & 0x7) #define GET_OPCODE_SIG_TYPE_SRC3(sig) (OpcodeSignatureType)((sig >> 9) & 0x7) +XE_MAYBE_UNUSED static bool IsOpcodeBinaryValue(uint32_t signature) { return (signature & ~(0x7)) == ((OPCODE_SIG_TYPE_V << 3) | (OPCODE_SIG_TYPE_V << 6)); } +XE_MAYBE_UNUSED static bool IsOpcodeUnaryValue(uint32_t signature) { return (signature & ~(0x7)) == ((OPCODE_SIG_TYPE_V << 3)); } +XE_MAYBE_UNUSED static void UnpackOpcodeSig(uint32_t sig, OpcodeSignatureType& dest, OpcodeSignatureType& src1, OpcodeSignatureType& src2, diff --git a/src/xenia/cpu/mmio_handler.cc b/src/xenia/cpu/mmio_handler.cc index 61f420eaa..b1e2d2964 100644 --- a/src/xenia/cpu/mmio_handler.cc +++ b/src/xenia/cpu/mmio_handler.cc @@ -185,7 +185,7 @@ bool MMIOHandler::TryDecodeLoadStore(const uint8_t* p, uint8_t rex_b = rex & 0b0001; uint8_t rex_x = rex & 0b0010; uint8_t rex_r = rex & 0b0100; - uint8_t rex_w = rex & 0b1000; + //uint8_t rex_w = rex & 0b1000; // http://www.sandpile.org/x86/opc_rm.htm // http://www.sandpile.org/x86/opc_sib.htm @@ -418,7 +418,6 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) { // Quick kill anything outside our mapping. 
return false; } - uint64_t hostip = ex->pc(); void* fault_host_address = reinterpret_cast(ex->fault_address()); diff --git a/src/xenia/cpu/module.h b/src/xenia/cpu/module.h index fd85ab9ed..3f6b139ec 100644 --- a/src/xenia/cpu/module.h +++ b/src/xenia/cpu/module.h @@ -54,6 +54,7 @@ class Module { bool ReadMap(const char* file_name); + virtual void Precompile() {} protected: virtual std::unique_ptr CreateFunction(uint32_t address) = 0; diff --git a/src/xenia/cpu/ppc/ppc_context.h b/src/xenia/cpu/ppc/ppc_context.h index 1528d3378..799cef860 100644 --- a/src/xenia/cpu/ppc/ppc_context.h +++ b/src/xenia/cpu/ppc/ppc_context.h @@ -425,6 +425,27 @@ typedef struct alignas(64) PPCContext_s { uint64_t reserved_val; ThreadState* thread_state; uint8_t* virtual_membase; + + template + inline T TranslateVirtual(uint32_t guest_address) XE_RESTRICT const { +#if XE_PLATFORM_WIN32 == 1 + uint8_t* host_address = virtual_membase + guest_address; + if (guest_address >= static_cast(reinterpret_cast(this))) { + host_address += 0x1000; + } + return reinterpret_cast(host_address); +#else + return processor->memory()->TranslateVirtual(guest_address); + +#endif + } + //for convenience in kernel functions, version that auto narrows to uint32 + template + inline T TranslateVirtualGPR(uint64_t guest_address) XE_RESTRICT const { + return TranslateVirtual(static_cast(guest_address)); + + } + static std::string GetRegisterName(PPCRegister reg); std::string GetStringFromValue(PPCRegister reg) const; void SetValueFromString(PPCRegister reg, std::string value); diff --git a/src/xenia/cpu/ppc/ppc_decode_data.h b/src/xenia/cpu/ppc/ppc_decode_data.h index 61ae57610..1d6d95e69 100644 --- a/src/xenia/cpu/ppc/ppc_decode_data.h +++ b/src/xenia/cpu/ppc/ppc_decode_data.h @@ -46,6 +46,7 @@ struct PPCDecodeData { uint32_t LEV() const { return bits_.LEV; } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -74,6 +75,7 @@ struct PPCDecodeData { uint32_t L() const { return bits_.RT & 0x1; } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -95,6 +97,7 @@ struct PPCDecodeData { int32_t ds() const { return static_cast(XEEXTS16(DS() << 2)); } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -174,6 +177,7 @@ struct PPCDecodeData { uint32_t CRFS() const { return bits_.RA >> 2; } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -200,6 +204,7 @@ struct PPCDecodeData { uint32_t CRFS() const { return CRBA() >> 2; } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -223,6 +228,7 @@ struct PPCDecodeData { } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -244,6 +250,7 @@ struct PPCDecodeData { bool Rc() const { return bits_.Rc ? true : false; } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -266,6 +273,7 @@ struct PPCDecodeData { bool Rc() const { return bits_.Rc ? true : false; } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -289,6 +297,7 @@ struct PPCDecodeData { bool Rc() const { return bits_.Rc ? true : false; } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -314,6 +323,7 @@ struct PPCDecodeData { bool Rc() const { return bits_.Rc ? true : false; } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -339,6 +349,7 @@ struct PPCDecodeData { bool Rc() const { return bits_.Rc ? 
true : false; } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -363,6 +374,7 @@ struct PPCDecodeData { bool Rc() const { return bits_.Rc ? true : false; } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -389,6 +401,7 @@ struct PPCDecodeData { bool Rc() const { return bits_.Rc ? true : false; } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -412,6 +425,7 @@ struct PPCDecodeData { int32_t SIMM() const { return static_cast(XEEXTS16(VA())); } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -431,6 +445,7 @@ struct PPCDecodeData { bool Rc() const { return bits_.Rc ? true : false; } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -452,6 +467,7 @@ struct PPCDecodeData { uint32_t SHB() const { return VC() & 0xF; } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -473,6 +489,7 @@ struct PPCDecodeData { uint32_t VB() const { return bits_.VB128l | (bits_.VB128h << 5); } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -498,6 +515,7 @@ struct PPCDecodeData { uint32_t RB() const { return bits_.RB; } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -521,6 +539,7 @@ struct PPCDecodeData { uint32_t VC() const { return bits_.VC; } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -546,6 +565,7 @@ struct PPCDecodeData { int32_t SIMM() const { return static_cast(XEEXTS16(bits_.UIMM)); } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -567,6 +587,7 @@ struct PPCDecodeData { uint32_t z() const { return bits_.z; } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -592,6 +613,7 @@ struct PPCDecodeData { uint32_t SH() const { return bits_.SH; } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -618,6 +640,7 @@ struct PPCDecodeData { bool Rc() const { return bits_.Rc ? 
true : false; } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; @@ -642,6 +665,7 @@ struct PPCDecodeData { uint32_t UIMM() const { return bits_.PERMl | (bits_.PERMh << 5); } private: + XE_MAYBE_UNUSED uint32_t address_; union { uint32_t value_; diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc index 9ff0f45c6..386abf4bd 100644 --- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc +++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc @@ -2014,8 +2014,7 @@ int InstrEmit_vupkhsh(PPCHIRBuilder& f, const InstrData& i) { return InstrEmit_vupkhsh_(f, i.VX.VD, i.VX.VB); } int InstrEmit_vupkhsh128(PPCHIRBuilder& f, const InstrData& i) { - uint32_t va = VX128_VA128; - assert_zero(va); + assert_zero(VX128_VA128); return InstrEmit_vupkhsh_(f, VX128_VD128, VX128_VB128); } @@ -2032,8 +2031,7 @@ int InstrEmit_vupklsh(PPCHIRBuilder& f, const InstrData& i) { return InstrEmit_vupklsh_(f, i.VX.VD, i.VX.VB); } int InstrEmit_vupklsh128(PPCHIRBuilder& f, const InstrData& i) { - uint32_t va = VX128_VA128; - assert_zero(va); + assert_zero(VX128_VA128); return InstrEmit_vupklsh_(f, VX128_VD128, VX128_VB128); } diff --git a/src/xenia/cpu/ppc/ppc_emit_memory.cc b/src/xenia/cpu/ppc/ppc_emit_memory.cc index 9bb7d5593..69c7fdf9e 100644 --- a/src/xenia/cpu/ppc/ppc_emit_memory.cc +++ b/src/xenia/cpu/ppc/ppc_emit_memory.cc @@ -16,7 +16,7 @@ #include "xenia/cpu/ppc/ppc_hir_builder.h" DEFINE_bool( - disable_prefetch_and_cachecontrol, false, + disable_prefetch_and_cachecontrol, true, "Disables translating ppc prefetch/cache flush instructions to host " "prefetch/cacheflush instructions. This may improve performance as these " "instructions were written with the Xbox 360's cache in mind, and modern " diff --git a/src/xenia/cpu/ppc/ppc_frontend.cc b/src/xenia/cpu/ppc/ppc_frontend.cc index 7b7617368..bd65919dd 100644 --- a/src/xenia/cpu/ppc/ppc_frontend.cc +++ b/src/xenia/cpu/ppc/ppc_frontend.cc @@ -105,6 +105,11 @@ bool PPCFrontend::Initialize() { } bool PPCFrontend::DeclareFunction(GuestFunction* function) { + + //chrispy: make sure we aren't declaring a function that is actually padding data, this will mess up PPCScanner and is hard to debug + //wow, this halo reach actually has branches into 0 opcodes, look into further + //xenia_assert(*reinterpret_cast( + // this->memory()->TranslateVirtual(function->address())) != 0); // Could scan or something here. // Could also check to see if it's a well-known function type and classify // for later. diff --git a/src/xenia/cpu/ppc/ppc_hir_builder.cc b/src/xenia/cpu/ppc/ppc_hir_builder.cc index 263d3675a..867651c32 100644 --- a/src/xenia/cpu/ppc/ppc_hir_builder.cc +++ b/src/xenia/cpu/ppc/ppc_hir_builder.cc @@ -34,6 +34,11 @@ DEFINE_bool( "unimplemented PowerPC instruction is encountered.", "CPU"); +DEFINE_bool( + emit_useless_fpscr_updates, false, + "Emit useless fpscr update instructions (pre-10/30/2022 behavior). ", + "CPU"); + namespace xe { namespace cpu { namespace ppc { @@ -89,6 +94,9 @@ bool PPCHIRBuilder::Emit(GuestFunction* function, uint32_t flags) { function_ = function; start_address_ = function_->address(); + //chrispy: i've seen this one happen, not sure why but i think from trying to precompile twice + //i've also seen ones with a start and end address that are the same... 
+ assert_true(function_->address() <= function_->end_address()); instr_count_ = (function_->end_address() - function_->address()) / 4 + 1; with_debug_info_ = (flags & EMIT_DEBUG_COMMENTS) == EMIT_DEBUG_COMMENTS; @@ -242,6 +250,7 @@ void PPCHIRBuilder::MaybeBreakOnInstruction(uint32_t address) { } void PPCHIRBuilder::AnnotateLabel(uint32_t address, Label* label) { + //chrispy: label->name is unused, it would be nice to be able to remove the field and this code char name_buffer[13]; auto format_result = fmt::format_to_n(name_buffer, 12, "loc_{:08X}", address); name_buffer[format_result.size] = '\0'; @@ -447,31 +456,38 @@ void PPCHIRBuilder::StoreFPSCR(Value* value) { void PPCHIRBuilder::UpdateFPSCR(Value* result, bool update_cr1) { // TODO(benvanik): detect overflow and nan cases. // fx and vx are the most important. - Value* fx = LoadConstantInt8(0); - Value* fex = LoadConstantInt8(0); - Value* vx = LoadConstantInt8(0); - Value* ox = LoadConstantInt8(0); + /* + chrispy: stubbed this out because right now all it does is waste + memory and CPU time + */ + if (cvars::emit_useless_fpscr_updates) { + Value* fx = LoadConstantInt8(0); + Value* fex = LoadConstantInt8(0); + Value* vx = LoadConstantInt8(0); + Value* ox = LoadConstantInt8(0); - if (update_cr1) { - // Store into the CR1 field. - // We do this instead of just calling CopyFPSCRToCR1 so that we don't - // have to read back the bits and do shifting work. - StoreContext(offsetof(PPCContext, cr1.cr1_fx), fx); - StoreContext(offsetof(PPCContext, cr1.cr1_fex), fex); - StoreContext(offsetof(PPCContext, cr1.cr1_vx), vx); - StoreContext(offsetof(PPCContext, cr1.cr1_ox), ox); + if (update_cr1) { + // Store into the CR1 field. + // We do this instead of just calling CopyFPSCRToCR1 so that we don't + // have to read back the bits and do shifting work. + StoreContext(offsetof(PPCContext, cr1.cr1_fx), fx); + StoreContext(offsetof(PPCContext, cr1.cr1_fex), fex); + StoreContext(offsetof(PPCContext, cr1.cr1_vx), vx); + StoreContext(offsetof(PPCContext, cr1.cr1_ox), ox); + } + + // Generate our new bits. + Value* new_bits = Shl(ZeroExtend(fx, INT32_TYPE), 31); + new_bits = Or(new_bits, Shl(ZeroExtend(fex, INT32_TYPE), 30)); + new_bits = Or(new_bits, Shl(ZeroExtend(vx, INT32_TYPE), 29)); + new_bits = Or(new_bits, Shl(ZeroExtend(ox, INT32_TYPE), 28)); + + // Mix into fpscr while preserving sticky bits (FX and OX). + Value* bits = LoadFPSCR(); + bits = Or(And(bits, LoadConstantUint32(0x9FFFFFFF)), new_bits); + StoreFPSCR(bits); } - // Generate our new bits. - Value* new_bits = Shl(ZeroExtend(fx, INT32_TYPE), 31); - new_bits = Or(new_bits, Shl(ZeroExtend(fex, INT32_TYPE), 30)); - new_bits = Or(new_bits, Shl(ZeroExtend(vx, INT32_TYPE), 29)); - new_bits = Or(new_bits, Shl(ZeroExtend(ox, INT32_TYPE), 28)); - - // Mix into fpscr while preserving sticky bits (FX and OX). - Value* bits = LoadFPSCR(); - bits = Or(And(bits, LoadConstantUint32(0x9FFFFFFF)), new_bits); - StoreFPSCR(bits); } void PPCHIRBuilder::CopyFPSCRToCR1() { diff --git a/src/xenia/cpu/ppc/ppc_instr.h b/src/xenia/cpu/ppc/ppc_instr.h index 7f2b69bba..a65f1b638 100644 --- a/src/xenia/cpu/ppc/ppc_instr.h +++ b/src/xenia/cpu/ppc/ppc_instr.h @@ -21,13 +21,7 @@ namespace xe { namespace cpu { namespace ppc { -// DEPRECATED -// TODO(benvanik): move code to PPCDecodeData. 
-struct InstrData { - PPCOpcode opcode; - const PPCOpcodeInfo* opcode_info; - uint32_t address; - +struct PPCOpcodeBits { union { uint32_t code; @@ -329,6 +323,14 @@ struct InstrData { }; }; +// DEPRECATED +// TODO(benvanik): move code to PPCDecodeData. +struct InstrData : public PPCOpcodeBits { + PPCOpcode opcode; + const PPCOpcodeInfo* opcode_info; + uint32_t address; +}; + } // namespace ppc } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/xex_module.cc b/src/xenia/cpu/xex_module.cc index d7325ea91..2f8afc853 100644 --- a/src/xenia/cpu/xex_module.cc +++ b/src/xenia/cpu/xex_module.cc @@ -31,14 +31,17 @@ #include "third_party/crypto/rijndael-alg-fst.c" #include "third_party/crypto/rijndael-alg-fst.h" #include "third_party/pe/pe_image.h" - +#include "xenia/cpu/ppc/ppc_decode_data.h" +#include "xenia/cpu/ppc/ppc_instr.h" DEFINE_bool(disable_instruction_infocache, false, "Disables caching records of called instructions/mmio accesses.", "CPU"); -DEFINE_bool(disable_function_precompilation, true, - "Disables pre-compiling guest functions that we know we've called " - "on previous runs", - "CPU"); + +DEFINE_bool( + disable_early_precompilation, false, + "Disables pre-compiling guest functions that we know we've called/that " + "we've recognized as being functions via simple heuristics.", + "CPU"); static const uint8_t xe_xex2_retail_key[16] = { 0x20, 0xB1, 0x85, 0xA5, 0x9D, 0x28, 0xFD, 0xC3, @@ -1057,29 +1060,6 @@ bool XexModule::LoadContinue() { library_offset += library->size; } } - sha1::SHA1 final_image_sha_; - - final_image_sha_.reset(); - - unsigned high_code = this->high_address_ - this->low_address_; - - final_image_sha_.processBytes(memory()->TranslateVirtual(this->low_address_), - high_code); - final_image_sha_.finalize(image_sha_bytes_); - - char fmtbuf[16]; - - for (unsigned i = 0; i < 16; ++i) { - sprintf_s(fmtbuf, "%X", image_sha_bytes_[i]); - image_sha_str_ += &fmtbuf[0]; - } - - info_cache_.Init(this); - // Find __savegprlr_* and __restgprlr_* and the others. - // We can flag these for special handling (inlining/etc). - if (!FindSaveRest()) { - return false; - } // Load a specified module map and diff. if (cvars::load_module_map.size()) { @@ -1112,6 +1092,32 @@ bool XexModule::LoadContinue() { return true; } +void XexModule::Precompile() { + sha1::SHA1 final_image_sha_; + + final_image_sha_.reset(); + + unsigned high_code = this->high_address_ - this->low_address_; + + final_image_sha_.processBytes(memory()->TranslateVirtual(this->low_address_), + high_code); + final_image_sha_.finalize(image_sha_bytes_); + + char fmtbuf[16]; + + for (unsigned i = 0; i < 16; ++i) { + sprintf_s(fmtbuf, "%X", image_sha_bytes_[i]); + image_sha_str_ += &fmtbuf[0]; + } + + // Find __savegprlr_* and __restgprlr_* and the others. + // We can flag these for special handling (inlining/etc). 
+ if (!FindSaveRest()) { + return; + } + info_cache_.Init(this); + PrecompileDiscoveredFunctions(); +} bool XexModule::Unload() { if (!loaded_) { return true; @@ -1363,9 +1369,25 @@ InfoCacheFlags* XexModule::GetInstructionAddressFlags(uint32_t guest_addr) { return info_cache_.LookupFlags(guest_addr); } +void XexModule::PrecompileDiscoveredFunctions() { + if (cvars::disable_early_precompilation) { + return; + } + auto others = PreanalyzeCode(); + for (auto&& other : others) { + if (other < low_address_ || other >= high_address_) { + continue; + } + auto sym = processor_->LookupFunction(other); + + if (!sym || sym->status() != Symbol::Status::kDefined) { + processor_->ResolveFunction(other); + } + } +} void XexModule::PrecompileKnownFunctions() { - if (cvars::disable_function_precompilation) { + if (cvars::disable_early_precompilation) { return; } uint32_t start = 0; @@ -1374,12 +1396,160 @@ void XexModule::PrecompileKnownFunctions() { if (!flags) { return; } + //maybe should pre-acquire global crit? for (uint32_t i = 0; i < end; i++) { if (flags[i].was_resolved) { - processor_->ResolveFunction(low_address_ + (i * 4)); + uint32_t addr = low_address_ + (i * 4); + auto sym = processor_->LookupFunction(addr); + + if (!sym || sym->status() != Symbol::Status::kDefined) { + processor_->ResolveFunction(addr); + } } } } + +static uint32_t GetBLCalledFunction(XexModule* xexmod, uint32_t current_base, + ppc::PPCOpcodeBits wrd) { + int32_t displ = static_cast(ppc::XEEXTS26(wrd.I.LI << 2)); + + if (wrd.I.AA) { + return static_cast(displ); + } else { + return static_cast(static_cast(current_base) + displ); + } +} +static bool IsOpcodeBL(unsigned w) { + return (w >> (32 - 6)) == 18 && ppc::PPCOpcodeBits{w}.I.LK; +} + +std::vector XexModule::PreanalyzeCode() { + uint32_t low_8_aligned = xe::align(low_address_, 8); + uint32_t high_8_aligned = high_address_ & ~(8U - 1); + + uint32_t n_possible_8byte_addresses = (high_8_aligned - low_8_aligned) / 8; + uint32_t* funcstart_candidate_stack = + new uint32_t[n_possible_8byte_addresses]; + uint32_t* funcstart_candstack2 = new uint32_t[n_possible_8byte_addresses]; + + uint32_t stack_pos = 0; + { + // all functions seem to start on 8 byte boundaries, except for obvious ones + // like the save/rest funcs + uint32_t* range_start = + (uint32_t*)memory()->TranslateVirtual(low_8_aligned); + uint32_t* range_end = (uint32_t*)memory()->TranslateVirtual( + high_8_aligned); // align down to multiple of 8 + + const uint8_t mfspr_r12_lr[4] = {0x7D, 0x88, 0x02, 0xA6}; + + // a blr instruction, with 4 zero bytes afterwards to pad the next address + // to 8 byte alignment + // if we see this prior to our address, we can assume we are a function + // start + const uint8_t blr[4] = {0x4E, 0x80, 0x0, 0x20}; + + uint32_t blr32 = *reinterpret_cast(&blr[0]); + + uint32_t mfspr_r12_lr32 = + *reinterpret_cast(&mfspr_r12_lr[0]); + /* + First pass: detect save of the link register at an eight byte + aligned address + */ + for (uint32_t* first_pass = range_start; first_pass < range_end; + first_pass += 2) { + if (*first_pass == mfspr_r12_lr32) { + // Push our newly discovered function start into our list + // All addresses in the list are sorted until the second pass + funcstart_candidate_stack[stack_pos++] = + static_cast(reinterpret_cast(first_pass) - + reinterpret_cast(range_start)) + + low_8_aligned; + } else if (first_pass[-1] == 0 && *first_pass != 0) { + // originally i checked for blr followed by 0, but some functions are + // actually aligned to greater boundaries. 
something that appears to be + // longjmp (it occurs in most games, so standard library, and loads ctx, + // so longjmp) is aligned to 16 bytes in most games + uint32_t* check_iter = &first_pass[-2]; + + while (!*check_iter) { + --check_iter; + } + + XE_LIKELY_IF(*check_iter == blr32) { + funcstart_candidate_stack[stack_pos++] = + static_cast(reinterpret_cast(first_pass) - + reinterpret_cast(range_start)) + + low_8_aligned; + } + } + } + uint32_t current_guestaddr = low_8_aligned; + // Second pass: detect branch with link instructions and decode the target + // address. We can safely assume that if bl is to address, that address is + // the start of the function + for (uint32_t* second_pass = range_start; second_pass < range_end; + second_pass++, current_guestaddr += 4) { + uint32_t current_call = xe::byte_swap(*second_pass); + + if (IsOpcodeBL(current_call)) { + funcstart_candidate_stack[stack_pos++] = GetBLCalledFunction( + this, current_guestaddr, ppc::PPCOpcodeBits{current_call}); + } + } + + auto pdata = this->GetPESection(".pdata"); + + if (pdata) { + uint32_t* pdata_base = + (uint32_t*)this->memory()->TranslateVirtual(pdata->address); + + uint32_t n_pdata_entries = pdata->raw_size / 8; + + for (uint32_t i = 0; i < n_pdata_entries; ++i) { + uint32_t funcaddr = xe::load_and_swap(&pdata_base[i * 2]); + if (funcaddr >= low_address_ && funcaddr <= high_address_) { + funcstart_candidate_stack[stack_pos++] = funcaddr; + } else { + // we hit 0 for func addr, that means we're done + break; + } + } + } + } + + // Sort the list of function starts and then ensure that all addresses are + // unique + uint32_t n_known_funcaddrs = 0; + { + // make addresses unique + + std::sort(funcstart_candidate_stack, funcstart_candidate_stack + stack_pos); + + uint32_t read_pos = 0; + uint32_t write_pos = 0; + uint32_t previous_addr = ~0u; + while (read_pos < stack_pos) { + uint32_t current_addr = funcstart_candidate_stack[read_pos++]; + + if (current_addr != previous_addr) { + previous_addr = current_addr; + funcstart_candstack2[write_pos++] = current_addr; + } + } + n_known_funcaddrs = write_pos; + } + + delete[] funcstart_candidate_stack; + + std::vector result; + result.resize(n_known_funcaddrs); + memcpy(&result[0], funcstart_candstack2, + sizeof(uint32_t) * n_known_funcaddrs); + delete[] funcstart_candstack2; + return result; +} bool XexModule::FindSaveRest() { // Special stack save/restore functions. // http://research.microsoft.com/en-us/um/redmond/projects/invisible/src/crt/md/ppc/xxx.s.htm @@ -1552,6 +1722,8 @@ bool XexModule::FindSaveRest() { auto page_size = base_address_ <= 0x90000000 ? 64 * 1024 : 4 * 1024; auto sec_header = xex_security_info(); + std::vector resolve_on_exit{}; + resolve_on_exit.reserve(256); for (uint32_t i = 0, page = 0; i < sec_header->page_descriptor_count; i++) { // Byteswap the bitfield manually. xex2_page_descriptor desc; @@ -1586,13 +1758,20 @@ bool XexModule::FindSaveRest() { // Add function stubs. 
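To make the bl handling in PreanalyzeCode above concrete: a bl is an I-form branch (primary opcode 18) with the link bit set, and its target is the sign-extended 26-bit displacement (LI followed by two zero bits) added to the instruction's own address, or taken as an absolute address when AA is set. A self-contained sketch of that decode, independent of PPCOpcodeBits, with illustrative names:

#include <cstdint>
#include <optional>

// Decode the target of a (byte-swapped, host-order) bl instruction word.
// Example: word 0x48000009 at address 0x82000000 is `bl 0x82000008`
// (LI||00 = 8, AA = 0, LK = 1).
static std::optional<uint32_t> BranchAndLinkTarget(uint32_t word,
                                                   uint32_t address) {
  if ((word >> 26) != 18 || !(word & 1)) {
    return std::nullopt;  // not an I-form branch with LK set
  }
  uint32_t li00 = word & 0x03FFFFFC;  // 26-bit displacement, low two bits 0
  int32_t displ = int32_t(li00 ^ 0x02000000) - 0x02000000;  // sign-extend
  return (word & 2) ? uint32_t(displ)             // AA: absolute target
                    : address + uint32_t(displ);  // relative to the bl itself
}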
char name[32]; + + auto AddXexFunction = [this, &resolve_on_exit](uint32_t address, + Function** function) { + DeclareFunction(address, function); + resolve_on_exit.push_back(address); + }; if (gplr_start) { uint32_t address = gplr_start; for (int n = 14; n <= 31; n++) { auto format_result = fmt::format_to_n(name, xe::countof(name), "__savegprlr_{}", n); Function* function; - DeclareFunction(address, &function); + + AddXexFunction(address, &function); function->set_end_address(address + (31 - n) * 4 + 2 * 4); function->set_name(std::string_view(name, format_result.size)); // TODO(benvanik): set type fn->type = FunctionSymbol::User; @@ -1608,7 +1787,7 @@ bool XexModule::FindSaveRest() { auto format_result = fmt::format_to_n(name, xe::countof(name), "__restgprlr_{}", n); Function* function; - DeclareFunction(address, &function); + AddXexFunction(address, &function); function->set_end_address(address + (31 - n) * 4 + 3 * 4); function->set_name(std::string_view(name, format_result.size)); // TODO(benvanik): set type fn->type = FunctionSymbol::User; @@ -1625,7 +1804,7 @@ bool XexModule::FindSaveRest() { auto format_result = fmt::format_to_n(name, xe::countof(name), "__savefpr_{}", n); Function* function; - DeclareFunction(address, &function); + AddXexFunction(address, &function); function->set_end_address(address + (31 - n) * 4 + 1 * 4); function->set_name(std::string_view(name, format_result.size)); // TODO(benvanik): set type fn->type = FunctionSymbol::User; @@ -1641,7 +1820,7 @@ bool XexModule::FindSaveRest() { auto format_result = fmt::format_to_n(name, xe::countof(name), "__restfpr_{}", n); Function* function; - DeclareFunction(address, &function); + AddXexFunction(address, &function); function->set_end_address(address + (31 - n) * 4 + 1 * 4); function->set_name(std::string_view(name, format_result.size)); // TODO(benvanik): set type fn->type = FunctionSymbol::User; @@ -1663,7 +1842,7 @@ bool XexModule::FindSaveRest() { auto format_result = fmt::format_to_n(name, xe::countof(name), "__savevmx_{}", n); Function* function; - DeclareFunction(address, &function); + AddXexFunction(address, &function); function->set_name(std::string_view(name, format_result.size)); // TODO(benvanik): set type fn->type = FunctionSymbol::User; // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveVmx; @@ -1677,7 +1856,7 @@ bool XexModule::FindSaveRest() { auto format_result = fmt::format_to_n(name, xe::countof(name), "__savevmx_{}", n); Function* function; - DeclareFunction(address, &function); + AddXexFunction(address, &function); function->set_name(std::string_view(name, format_result.size)); // TODO(benvanik): set type fn->type = FunctionSymbol::User; // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveVmx; @@ -1691,7 +1870,7 @@ bool XexModule::FindSaveRest() { auto format_result = fmt::format_to_n(name, xe::countof(name), "__restvmx_{}", n); Function* function; - DeclareFunction(address, &function); + AddXexFunction(address, &function); function->set_name(std::string_view(name, format_result.size)); // TODO(benvanik): set type fn->type = FunctionSymbol::User; // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestVmx; @@ -1705,7 +1884,7 @@ bool XexModule::FindSaveRest() { auto format_result = fmt::format_to_n(name, xe::countof(name), "__restvmx_{}", n); Function* function; - DeclareFunction(address, &function); + AddXexFunction(address, &function); function->set_name(std::string_view(name, format_result.size)); // TODO(benvanik): set type fn->type = FunctionSymbol::User; 
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestVmx; @@ -1715,7 +1894,15 @@ bool XexModule::FindSaveRest() { address += 2 * 4; } } - + if (!cvars::disable_early_precompilation) { + for (auto&& to_ensure_precompiled : resolve_on_exit) { + // we want to make sure an address for these functions is available before + // any other functions are compiled for code generation purposes but we do + // it outside of our loops, because we also want to make sure we've marked + // up the symbol with info about it being save/rest and whatnot + processor_->ResolveFunction(to_ensure_precompiled); + } + } return true; } diff --git a/src/xenia/cpu/xex_module.h b/src/xenia/cpu/xex_module.h index 06045ff92..d5981ad41 100644 --- a/src/xenia/cpu/xex_module.h +++ b/src/xenia/cpu/xex_module.h @@ -34,7 +34,8 @@ struct InfoCacheFlags { uint32_t was_resolved : 1; // has this address ever been called/requested // via resolvefunction? uint32_t accessed_mmio : 1; - uint32_t reserved : 30; + uint32_t is_syscall_func : 1; + uint32_t reserved : 29; }; struct XexInfoCache { struct InfoCacheFlagsHeader { @@ -208,12 +209,15 @@ class XexModule : public xe::cpu::Module { } InfoCacheFlags* GetInstructionAddressFlags(uint32_t guest_addr); - void PrecompileKnownFunctions(); + virtual void Precompile() override; protected: std::unique_ptr CreateFunction(uint32_t address) override; private: + void PrecompileKnownFunctions(); + void PrecompileDiscoveredFunctions(); + std::vector PreanalyzeCode(); friend struct XexInfoCache; void ReadSecurityInfo(); diff --git a/src/xenia/debug/ui/debug_window.h b/src/xenia/debug/ui/debug_window.h index e3c01c54d..56edec903 100644 --- a/src/xenia/debug/ui/debug_window.h +++ b/src/xenia/debug/ui/debug_window.h @@ -33,7 +33,7 @@ namespace ui { class DebugWindow : public cpu::DebugListener { public: - ~DebugWindow(); + virtual ~DebugWindow(); static std::unique_ptr Create( Emulator* emulator, xe::ui::WindowedAppContext& app_context); diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 9f00648b0..62191477e 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -2042,10 +2042,6 @@ D3D12CommandProcessor::WriteRegisterRangeFromRing_WithKnownBound( RingBuffer::ReadRange range = ring->BeginRead(num_registers * sizeof(uint32_t)); - constexpr auto bounds_has_reg = - bounds_may_have_reg; - constexpr auto bounds_has_bounds = - bounds_may_have_bounds; XE_LIKELY_IF(!range.second) { WriteRegisterRangeFromMem_WithKnownBound(0x26322BC3); - return; -} - } // namespace lightweight_nvapi \ No newline at end of file diff --git a/src/xenia/gpu/d3d12/d3d12_texture_cache.h b/src/xenia/gpu/d3d12/d3d12_texture_cache.h index e70954fb9..347772bb6 100644 --- a/src/xenia/gpu/d3d12/d3d12_texture_cache.h +++ b/src/xenia/gpu/d3d12/d3d12_texture_cache.h @@ -87,7 +87,7 @@ class D3D12TextureCache final : public TextureCache { ~D3D12TextureCache(); - void ClearCache(); + void ClearCache() override; void BeginSubmission(uint64_t new_submission_index) override; void BeginFrame() override; diff --git a/src/xenia/gpu/pm4_command_processor_declare.h b/src/xenia/gpu/pm4_command_processor_declare.h index 8db802d13..da0888f21 100644 --- a/src/xenia/gpu/pm4_command_processor_declare.h +++ b/src/xenia/gpu/pm4_command_processor_declare.h @@ -1,8 +1,15 @@ -void ExecuteIndirectBuffer(uint32_t ptr, uint32_t count) XE_RESTRICT; -virtual uint32_t ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index) 
XE_RESTRICT; -virtual bool ExecutePacket(); +#if defined(OVERRIDING_BASE_CMDPROCESSOR) +#define PM4_OVERRIDE override +#else +#define PM4_OVERRIDE +#endif +void ExecuteIndirectBuffer(uint32_t ptr, + uint32_t count) XE_RESTRICT; +virtual uint32_t ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index) + XE_RESTRICT PM4_OVERRIDE; +virtual bool ExecutePacket() PM4_OVERRIDE; public: void ExecutePacket(uint32_t ptr, uint32_t count); @@ -111,4 +118,6 @@ XE_COLD bool ExecutePacketType3_CountOverflow(uint32_t count); XE_NOINLINE XE_COLD -bool ExecutePacketType0_CountOverflow(uint32_t count); \ No newline at end of file +bool ExecutePacketType0_CountOverflow(uint32_t count); + +#undef PM4_OVERRIDE \ No newline at end of file diff --git a/src/xenia/gpu/pm4_command_processor_implement.h b/src/xenia/gpu/pm4_command_processor_implement.h index bb6b94051..2fc63cef2 100644 --- a/src/xenia/gpu/pm4_command_processor_implement.h +++ b/src/xenia/gpu/pm4_command_processor_implement.h @@ -4,32 +4,38 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr, uint32_t count) XE_RESTRICT { SCOPE_profile_cpu_f("gpu"); + trace_writer_.WriteIndirectBufferStart(ptr, count * sizeof(uint32_t)); + if (count != 0) { + RingBuffer old_reader = reader_; - RingBuffer old_reader = reader_; + // Execute commands! + new (&reader_) + RingBuffer(memory_->TranslatePhysical(ptr), count * sizeof(uint32_t)); + reader_.set_write_offset(count * sizeof(uint32_t)); + // prefetch the wraparound range + // it likely is already in L3 cache, but in a zen system it may be another + // chiplets l3 + reader_.BeginPrefetchedRead( + COMMAND_PROCESSOR::GetCurrentRingReadCount()); + do { + if (COMMAND_PROCESSOR::ExecutePacket()) { + continue; + } else { + // Return up a level if we encounter a bad packet. + XELOGE("**** INDIRECT RINGBUFFER: Failed to execute packet."); + assert_always(); + // break; + } + } while (reader_.read_count()); - // Execute commands! - new (&reader_) - RingBuffer(memory_->TranslatePhysical(ptr), count * sizeof(uint32_t)); - reader_.set_write_offset(count * sizeof(uint32_t)); - // prefetch the wraparound range - // it likely is already in L3 cache, but in a zen system it may be another - // chiplets l3 - reader_.BeginPrefetchedRead( - COMMAND_PROCESSOR::GetCurrentRingReadCount()); - do { - if (COMMAND_PROCESSOR::ExecutePacket()) { - continue; - } else { - // Return up a level if we encounter a bad packet. - XELOGE("**** INDIRECT RINGBUFFER: Failed to execute packet."); - assert_always(); - // break; - } - } while (reader_.read_count()); + trace_writer_.WriteIndirectBufferEnd(); + reader_ = old_reader; + } else { + //rare, but i've seen it happen! 
(and then a division by 0 occurs) + return; + } - trace_writer_.WriteIndirectBufferEnd(); - reader_ = old_reader; } bool COMMAND_PROCESSOR::ExecutePacket() { diff --git a/src/xenia/gpu/render_target_cache.h b/src/xenia/gpu/render_target_cache.h index 5353176ed..8ac449208 100644 --- a/src/xenia/gpu/render_target_cache.h +++ b/src/xenia/gpu/render_target_cache.h @@ -229,9 +229,10 @@ class RenderTargetCache { TraceWriter* trace_writer, uint32_t draw_resolution_scale_x, uint32_t draw_resolution_scale_y) : register_file_(register_file), - draw_extent_estimator_(register_file, memory, trace_writer), draw_resolution_scale_x_(draw_resolution_scale_x), - draw_resolution_scale_y_(draw_resolution_scale_y) { + draw_resolution_scale_y_(draw_resolution_scale_y), + draw_extent_estimator_(register_file, memory, trace_writer) + { assert_not_zero(draw_resolution_scale_x); assert_not_zero(draw_resolution_scale_y); } diff --git a/src/xenia/gpu/trace_viewer.cc b/src/xenia/gpu/trace_viewer.cc index 58df2b8ea..3e051a13d 100644 --- a/src/xenia/gpu/trace_viewer.cc +++ b/src/xenia/gpu/trace_viewer.cc @@ -941,7 +941,6 @@ void TraceViewer::DrawVertexFetcher(Shader* shader, ImGui::NextColumn(); break; case xenos::VertexFormat::k_2_10_10_10: { - auto e0 = LOADEL(uint32_t, 0); ImGui::Text("??"); ImGui::NextColumn(); ImGui::Text("??"); @@ -1066,8 +1065,6 @@ void ProgressBar(float frac, float width, float height = 0, } frac = xe::saturate_unsigned(frac); - const auto fontAtlas = ImGui::GetIO().Fonts; - auto pos = ImGui::GetCursorScreenPos(); auto col = ImGui::ColorConvertFloat4ToU32(color); auto border_col = ImGui::ColorConvertFloat4ToU32(border_color); @@ -1137,7 +1134,6 @@ void TraceViewer::DrawStateUI() { std::memset(&draw_info, 0, sizeof(draw_info)); switch (opcode) { case PM4_DRAW_INDX: { - uint32_t dword0 = xe::load_and_swap(packet_head + 4); uint32_t dword1 = xe::load_and_swap(packet_head + 8); draw_info.index_count = dword1 >> 16; draw_info.prim_type = static_cast(dword1 & 0x3F); @@ -1187,7 +1183,6 @@ void TraceViewer::DrawStateUI() { auto enable_mode = static_cast(regs[XE_GPU_REG_RB_MODECONTROL].u32 & 0x7); - const char* mode_name = "Unknown"; switch (enable_mode) { case ModeControl::kIgnore: ImGui::Text("Ignored Command %d", player_->current_command_index()); diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h index 619ae270d..5ebddf604 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.h +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h @@ -48,8 +48,9 @@ namespace vulkan { class VulkanCommandProcessor final : public CommandProcessor { protected: +#define OVERRIDING_BASE_CMDPROCESSOR #include "../pm4_command_processor_declare.h" - +#undef OVERRIDING_BASE_CMDPROCESSOR public: // Single-descriptor layouts for use within a single frame. 
enum class SingleTransientDescriptorLayout { diff --git a/src/xenia/kernel/kernel_state.cc b/src/xenia/kernel/kernel_state.cc index 958489d48..fbd9da668 100644 --- a/src/xenia/kernel/kernel_state.cc +++ b/src/xenia/kernel/kernel_state.cc @@ -421,6 +421,9 @@ X_RESULT KernelState::FinishLoadingUserModule( emulator_->patcher()->ApplyPatchesForTitle(memory_, module->title_id(), module->hash()); emulator_->on_patch_apply(); + if (module->xex_module()) { + module->xex_module()->Precompile(); + } if (module->is_dll_module() && module->entry_point() && call_entry) { // Call DllMain(DLL_PROCESS_ATTACH): diff --git a/src/xenia/kernel/util/shim_utils.h b/src/xenia/kernel/util/shim_utils.h index 7ead28998..fe49fd05f 100644 --- a/src/xenia/kernel/util/shim_utils.h +++ b/src/xenia/kernel/util/shim_utils.h @@ -36,7 +36,7 @@ using PPCContext = xe::cpu::ppc::PPCContext; (xe::cpu::xe_kernel_export_shim_fn)export_name##_entry); #define SHIM_MEM_ADDR(a) \ - ((a) ? ppc_context->kernel_state->memory()->TranslateVirtual(a) : nullptr) + ((a) ? ppc_context->TranslateVirtual(a) : nullptr) #define SHIM_MEM_8(a) xe::load_and_swap(SHIM_MEM_ADDR(a)) #define SHIM_MEM_16(a) xe::load_and_swap(SHIM_MEM_ADDR(a)) @@ -159,7 +159,7 @@ class Param { uint32_t stack_ptr = uint32_t(init.ppc_context->r[1]) + 0x54 + (ordinal_ - 8) * 8; *out_value = xe::load_and_swap( - init.ppc_context->kernel_state->memory()->TranslateVirtual( + init.ppc_context->TranslateVirtual( stack_ptr)); } } @@ -212,7 +212,7 @@ class PointerParam : public ParamBase { PointerParam(Init& init) : ParamBase(init) { host_ptr_ = value_ - ? init.ppc_context->kernel_state->memory()->TranslateVirtual(value_) + ? init.ppc_context->TranslateVirtual(value_) : nullptr; } PointerParam(void* host_ptr) : ParamBase(), host_ptr_(host_ptr) {} @@ -251,8 +251,7 @@ template class PrimitivePointerParam : public ParamBase { public: PrimitivePointerParam(Init& init) : ParamBase(init) { - host_ptr_ = value_ ? init.ppc_context->kernel_state->memory() - ->TranslateVirtual*>(value_) + host_ptr_ = value_ ? init.ppc_context->TranslateVirtual*>(value_) : nullptr; } PrimitivePointerParam(T* host_ptr) : ParamBase() { @@ -285,7 +284,7 @@ class StringPointerParam : public ParamBase { StringPointerParam(Init& init) : ParamBase(init) { host_ptr_ = value_ - ? init.ppc_context->kernel_state->memory()->TranslateVirtual( + ? init.ppc_context->TranslateVirtual( value_) : nullptr; } @@ -311,7 +310,7 @@ class TypedPointerParam : public ParamBase { public: TypedPointerParam(Init& init) : ParamBase(init) { host_ptr_ = - value_ ? init.ppc_context->kernel_state->memory()->TranslateVirtual( + value_ ? init.ppc_context->TranslateVirtual( value_) : nullptr; } diff --git a/src/xenia/kernel/xam/xam_info.cc b/src/xenia/kernel/xam/xam_info.cc index b8b25d257..9c6dd0611 100644 --- a/src/xenia/kernel/xam/xam_info.cc +++ b/src/xenia/kernel/xam/xam_info.cc @@ -195,7 +195,8 @@ void XCustomRegisterDynamicActions_entry() { DECLARE_XAM_EXPORT1(XCustomRegisterDynamicActions, kNone, kStub); dword_result_t XGetAVPack_entry() { - // Value from https://github.com/Free60Project/libxenon/blob/920146f/libxenon/drivers/xenos/xenos_videomodes.h + // Value from + // https://github.com/Free60Project/libxenon/blob/920146f/libxenon/drivers/xenos/xenos_videomodes.h // DWORD // Not sure what the values are for this, but 6 is VGA. // Other likely values are 3/4/8 for HDMI or something. 
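The shim_utils.h hunk above replaces ppc_context->kernel_state->memory()->TranslateVirtual(...) with ppc_context->TranslateVirtual(...), so every kernel-call argument translation saves two dependent pointer loads. A rough sketch of the idea, with hypothetical names (GuestContext, virtual_membase), and ignoring the per-heap host offset adjustment that the real Memory::TranslateVirtual applies:

#include <cstdint>

struct GuestContext {
  uint8_t* virtual_membase;  // host mapping of guest virtual address 0

  template <typename T = uint8_t*>
  T TranslateVirtual(uint32_t guest_address) const {
    // One add off data already held in the context, instead of chasing
    // context -> kernel_state -> memory before translating.
    return reinterpret_cast<T>(virtual_membase + guest_address);
  }
};

// Guest memory is big-endian, so shims still byte-swap after translating,
// as the SHIM_MEM_* macros do with xe::load_and_swap.
inline uint32_t LoadGuestU32(const GuestContext& ctx, uint32_t guest_address) {
  const uint8_t* p = ctx.TranslateVirtual<const uint8_t*>(guest_address);
  return (uint32_t(p[0]) << 24) | (uint32_t(p[1]) << 16) |
         (uint32_t(p[2]) << 8) | uint32_t(p[3]);
}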
@@ -321,11 +322,16 @@ void XamLoaderTerminateTitle_entry() { } DECLARE_XAM_EXPORT1(XamLoaderTerminateTitle, kNone, kSketchy); -dword_result_t XamAlloc_entry(dword_t unk, dword_t size, lpdword_t out_ptr) { - assert_true(unk == 0); +dword_result_t XamAlloc_entry(dword_t flags, dword_t size, lpdword_t out_ptr) { + if (flags & 0x00100000) { // HEAP_ZERO_memory used unless this flag + // do nothing! + // maybe we ought to fill it with nonzero garbage, but otherwise this is a + // flag we can safely ignore + } // Allocate from the heap. Not sure why XAM does this specially, perhaps // it keeps stuff in a separate heap? + //chrispy: there is a set of different heaps it uses, an array of them. the top 4 bits of the 32 bit flags seems to select the heap uint32_t ptr = kernel_state()->memory()->SystemHeapAlloc(size); *out_ptr = ptr; diff --git a/src/xenia/kernel/xam/xam_ui.cc b/src/xenia/kernel/xam/xam_ui.cc index be8e2c892..11cd8faa7 100644 --- a/src/xenia/kernel/xam/xam_ui.cc +++ b/src/xenia/kernel/xam/xam_ui.cc @@ -55,6 +55,7 @@ class XamDialog : public xe::ui::ImGuiDialog { XamDialog(xe::ui::ImGuiDrawer* imgui_drawer) : xe::ui::ImGuiDialog(imgui_drawer) {} + virtual ~XamDialog() {} void OnClose() override { if (close_callback_) { close_callback_(); @@ -254,6 +255,7 @@ class MessageBoxDialog : public XamDialog { Close(); } } + virtual ~MessageBoxDialog() {} private: bool has_opened_ = false; @@ -264,8 +266,7 @@ class MessageBoxDialog : public XamDialog { uint32_t chosen_button_ = 0; }; -// https://www.se7ensins.com/forums/threads/working-xshowmessageboxui.844116/ -dword_result_t XamShowMessageBoxUI_entry( +static dword_result_t XamShowMessageBoxUi( dword_t user_index, lpu16string_t title_ptr, lpu16string_t text_ptr, dword_t button_count, lpdword_t button_ptrs, dword_t active_button, dword_t flags, lpdword_t result_ptr, pointer_t overlapped) { @@ -321,8 +322,28 @@ dword_result_t XamShowMessageBoxUI_entry( } return result; } + +// https://www.se7ensins.com/forums/threads/working-xshowmessageboxui.844116/ +dword_result_t XamShowMessageBoxUI_entry( + dword_t user_index, lpu16string_t title_ptr, lpu16string_t text_ptr, + dword_t button_count, lpdword_t button_ptrs, dword_t active_button, + dword_t flags, lpdword_t result_ptr, pointer_t overlapped) { + return XamShowMessageBoxUi(user_index, title_ptr, text_ptr, button_count, + button_ptrs, active_button, flags, result_ptr, + overlapped); +} DECLARE_XAM_EXPORT1(XamShowMessageBoxUI, kUI, kImplemented); +dword_result_t XamShowMessageBoxUIEx_entry( + dword_t user_index, lpu16string_t title_ptr, lpu16string_t text_ptr, + dword_t button_count, lpdword_t button_ptrs, dword_t active_button, + dword_t flags, dword_t unknown_unused, lpdword_t result_ptr, + pointer_t overlapped) { + return XamShowMessageBoxUi(user_index, title_ptr, text_ptr, button_count, + button_ptrs, active_button, flags, result_ptr, + overlapped); +} +DECLARE_XAM_EXPORT1(XamShowMessageBoxUIEx, kUI, kImplemented); class KeyboardInputDialog : public XamDialog { public: KeyboardInputDialog(xe::ui::ImGuiDrawer* imgui_drawer, std::string title, @@ -347,6 +368,7 @@ class KeyboardInputDialog : public XamDialog { xe::string_util::copy_truncating(text_buffer_.data(), default_text_, text_buffer_.size()); } + virtual ~KeyboardInputDialog() {} const std::string& text() const { return text_; } bool cancelled() const { return cancelled_; } diff --git a/src/xenia/kernel/xbdm/xbdm_misc.cc b/src/xenia/kernel/xbdm/xbdm_misc.cc index d66ff3c5b..0fef2d6a3 100644 --- a/src/xenia/kernel/xbdm/xbdm_misc.cc +++ 
b/src/xenia/kernel/xbdm/xbdm_misc.cc @@ -13,11 +13,12 @@ #include "xenia/kernel/xbdm/xbdm_private.h" #include "xenia/kernel/xthread.h" #include "xenia/xbox.h" - +//chrispy: no idea what a real valid value is for this +static constexpr const char DmXboxName[] = "Xbox360Name"; namespace xe { -namespace kernel { + namespace kernel { namespace xbdm { - +#define XBDM_SUCCESSFULL 0x02DA0000 #define MAKE_DUMMY_STUB_PTR(x) \ dword_result_t x##_entry() { return 0; } \ DECLARE_XBDM_EXPORT1(x, kDebug, kStub) @@ -36,11 +37,27 @@ MAKE_DUMMY_STUB_STATUS(DmFreePool); dword_result_t DmGetXbeInfo_entry() { // TODO(gibbed): 4D5307DC appears to expect this as success? // Unknown arguments -- let's hope things don't explode. - return 0x02DA0000; + return XBDM_SUCCESSFULL; } DECLARE_XBDM_EXPORT1(DmGetXbeInfo, kDebug, kStub); -MAKE_DUMMY_STUB_STATUS(DmGetXboxName); +dword_result_t DmGetXboxName_entry(const ppc_context_t& ctx) { + uint64_t arg1 = ctx->r[3]; + uint64_t arg2 = ctx->r[4]; + if (!arg1 || !arg2) { + return 0x80070057; + } + char* name_out = ctx->TranslateVirtualGPR(arg1); + + uint32_t* max_name_chars_ptr = ctx->TranslateVirtualGPR(arg2); + + uint32_t max_name_chars = xe::load_and_swap(max_name_chars_ptr); + strncpy(name_out, DmXboxName, sizeof(DmXboxName)); + + + return XBDM_SUCCESSFULL; +} +DECLARE_XBDM_EXPORT1(DmGetXboxName, kDebug, kImplemented) dword_result_t DmIsDebuggerPresent_entry() { return 0; } DECLARE_XBDM_EXPORT1(DmIsDebuggerPresent, kDebug, kStub); @@ -49,15 +66,15 @@ void DmSendNotificationString_entry(lpdword_t unk0_ptr) {} DECLARE_XBDM_EXPORT1(DmSendNotificationString, kDebug, kStub); dword_result_t DmRegisterCommandProcessor_entry(lpdword_t name_ptr, - lpdword_t handler_fn) { + lpdword_t handler_fn) { // Return success to prevent some games from crashing return X_STATUS_SUCCESS; } DECLARE_XBDM_EXPORT1(DmRegisterCommandProcessor, kDebug, kStub); dword_result_t DmRegisterCommandProcessorEx_entry(lpdword_t name_ptr, - lpdword_t handler_fn, - dword_t unk3) { + lpdword_t handler_fn, + dword_t unk3) { // Return success to prevent some games from stalling return X_STATUS_SUCCESS; } @@ -65,9 +82,12 @@ DECLARE_XBDM_EXPORT1(DmRegisterCommandProcessorEx, kDebug, kStub); MAKE_DUMMY_STUB_STATUS(DmStartProfiling); MAKE_DUMMY_STUB_STATUS(DmStopProfiling); - -dword_result_t DmCaptureStackBackTrace_entry(lpdword_t unk0_ptr, - lpdword_t unk1_ptr) { +// two arguments, first is num frames i think, second is some kind of pointer to +// where to capture +dword_result_t DmCaptureStackBackTrace_entry(const ppc_context_t& ctx) { + uint32_t nframes = static_cast(ctx->r[3]); + uint8_t* unknown_addr = + ctx->TranslateVirtual(static_cast(ctx->r[4])); return X_STATUS_INVALID_PARAMETER; } DECLARE_XBDM_EXPORT1(DmCaptureStackBackTrace, kDebug, kStub); @@ -82,7 +102,10 @@ dword_result_t DmWalkLoadedModules_entry(lpdword_t unk0_ptr, } DECLARE_XBDM_EXPORT1(DmWalkLoadedModules, kDebug, kStub); -void DmMapDevkitDrive_entry() {} +void DmMapDevkitDrive_entry(const ppc_context_t& ctx) { + // games check for nonzero result, failure if nz + ctx->r[3] = 0ULL; +} DECLARE_XBDM_EXPORT1(DmMapDevkitDrive, kDebug, kStub); dword_result_t DmFindPdbSignature_entry(lpdword_t unk0_ptr, diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_rtl.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_rtl.cc index 3a0d88ea4..fefe2df4e 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_rtl.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_rtl.cc @@ -28,7 +28,11 @@ namespace xe { namespace kernel { namespace xboxkrnl { - +struct X_STRING { + unsigned short length; + 
unsigned short pad; + uint32_t ptr; +}; // https://msdn.microsoft.com/en-us/library/ff561778 dword_result_t RtlCompareMemory_entry(lpvoid_t source1, lpvoid_t source2, dword_t length) { @@ -142,38 +146,80 @@ dword_result_t RtlLowerChar_entry(dword_t in) { } DECLARE_XBOXKRNL_EXPORT1(RtlLowerChar, kNone, kImplemented); -dword_result_t RtlCompareString_entry(lpstring_t string_1, lpstring_t string_2, - dword_t case_insensitive) { - int ret = case_insensitive ? xe_strcasecmp(string_1, string_2) - : std::strcmp(string_1, string_2); - - return ret; +static int RtlCompareStringN_impl(uint8_t* string_1, unsigned int string_1_len, + uint8_t* string_2, unsigned int string_2_len, + int case_insensitive) { + if (string_1_len == 0xFFFFFFFF) { + uint8_t* string1_strlen_iter = string_1; + while (*string1_strlen_iter++) + ; + string_1_len = + static_cast(string1_strlen_iter - string_1 - 1); + } + if (string_2_len == 0xFFFFFFFF) { + uint8_t* string2_strlen_iter = string_2; + while (*string2_strlen_iter++) + ; + string_2_len = + static_cast(string2_strlen_iter - string_2 - 1); + } + uint8_t* string1_end = &string_1[std::min(string_2_len, string_1_len)]; + if (case_insensitive) { + while (string_1 < string1_end) { + unsigned c1 = *string_1++; + unsigned c2 = *string_2++; + if (c1 != c2) { + unsigned cu1 = rtl_upper_table[c1]; + unsigned cu2 = rtl_upper_table[c2]; + if (cu1 != cu2) { + return cu1 - cu2; + } + } + } + } else { + while (string_1 < string1_end) { + unsigned c1 = *string_1++; + unsigned c2 = *string_2++; + if (c1 != c2) { + return c1 - c2; + } + } + } + // why? not sure, but its the original logic + return string_1_len - string_2_len; } -DECLARE_XBOXKRNL_EXPORT1(RtlCompareString, kNone, kImplemented); - dword_result_t RtlCompareStringN_entry(lpstring_t string_1, dword_t string_1_len, lpstring_t string_2, dword_t string_2_len, dword_t case_insensitive) { - uint32_t len1 = string_1_len; - uint32_t len2 = string_2_len; - - if (string_1_len == 0xFFFF) { - len1 = uint32_t(std::strlen(string_1)); - } - if (string_2_len == 0xFFFF) { - len2 = uint32_t(std::strlen(string_2)); - } - auto len = std::min(string_1_len, string_2_len); - - int ret = case_insensitive ? 
xe_strncasecmp(string_1, string_2, len) - : std::strncmp(string_1, string_2, len); - - return ret; + return RtlCompareStringN_impl( + reinterpret_cast(string_1.host_address()), string_1_len, + reinterpret_cast(string_2.host_address()), string_2_len, + case_insensitive); } + DECLARE_XBOXKRNL_EXPORT1(RtlCompareStringN, kNone, kImplemented); +dword_result_t RtlCompareString_entry(lpvoid_t string_1, lpvoid_t string_2, + dword_t case_insensitive) { + X_STRING* xs1 = string_1.as(); + X_STRING* xs2 = string_2.as(); + + unsigned length_1 = xe::load_and_swap(&xs1->length); + unsigned length_2 = xe::load_and_swap(&xs2->length); + + uint32_t ptr_1 = xe::load_and_swap(&xs1->ptr); + + uint32_t ptr_2 = xe::load_and_swap(&xs2->ptr); + + auto kmem = kernel_memory(); + + return RtlCompareStringN_impl( + kmem->TranslateVirtual(ptr_1), length_1, + kmem->TranslateVirtual(ptr_2), length_2, case_insensitive); +} +DECLARE_XBOXKRNL_EXPORT1(RtlCompareString, kNone, kImplemented); // https://msdn.microsoft.com/en-us/library/ff561918 void RtlInitAnsiString_entry(pointer_t destination, lpstring_t source) { @@ -188,13 +234,13 @@ void RtlInitAnsiString_entry(pointer_t destination, destination->pointer = source.guest_address(); } DECLARE_XBOXKRNL_EXPORT1(RtlInitAnsiString, kNone, kImplemented); -//https://learn.microsoft.com/en-us/windows-hardware/drivers/ddi/wdm/nf-wdm-rtlupcaseunicodechar +// https://learn.microsoft.com/en-us/windows-hardware/drivers/ddi/wdm/nf-wdm-rtlupcaseunicodechar dword_result_t RtlUpcaseUnicodeChar_entry(dword_t SourceCharacter) { - return std::use_facet>(std::locale()).toupper(SourceCharacter); + return std::use_facet>(std::locale()) + .toupper(SourceCharacter); } DECLARE_XBOXKRNL_EXPORT1(RtlUpcaseUnicodeChar, kNone, kImplemented); - // https://msdn.microsoft.com/en-us/library/ff561899 void RtlFreeAnsiString_entry(pointer_t string) { if (string->pointer) { @@ -206,8 +252,8 @@ void RtlFreeAnsiString_entry(pointer_t string) { DECLARE_XBOXKRNL_EXPORT1(RtlFreeAnsiString, kNone, kImplemented); // https://msdn.microsoft.com/en-us/library/ff561934 -void RtlInitUnicodeString_entry(pointer_t destination, - lpu16string_t source) { +pointer_result_t RtlInitUnicodeString_entry( + pointer_t destination, lpu16string_t source) { if (source) { destination->length = (uint16_t)source.value().size() * 2; destination->maximum_length = (uint16_t)(source.value().size() + 1) * 2; @@ -215,6 +261,7 @@ void RtlInitUnicodeString_entry(pointer_t destination, } else { destination->reset(); } + return destination.guest_address(); } DECLARE_XBOXKRNL_EXPORT1(RtlInitUnicodeString, kNone, kImplemented); @@ -671,6 +718,26 @@ dword_result_t RtlComputeCrc32_entry(dword_t seed, lpvoid_t buffer, } DECLARE_XBOXKRNL_EXPORT1(RtlComputeCrc32, kNone, kImplemented); +static void RtlRip_entry(const ppc_context_t& ctx) { + uint32_t arg1 = static_cast(ctx->r[3]); + uint32_t arg2 = static_cast(ctx->r[4]); + const char* msg_str1 = ""; + + const char* msg_str2 = ""; + + if (arg1) { + msg_str1 = ctx->TranslateVirtual(arg1); + } + + if (arg2) { + msg_str2 = ctx->TranslateVirtual(arg2); + } + + XELOGE("RtlRip called, arg1 = {}, arg2 = {}\n", msg_str1, msg_str2); + + //we should break here... 
not sure what to do exactly +} +DECLARE_XBOXKRNL_EXPORT1(RtlRip, kNone, kImportant); } // namespace xboxkrnl } // namespace kernel } // namespace xe diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc index 8d0283744..574a91585 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc @@ -9,7 +9,6 @@ #include #include - #include "xenia/base/atomic.h" #include "xenia/base/clock.h" #include "xenia/base/logging.h" @@ -913,7 +912,7 @@ dword_result_t NtWaitForMultipleObjectsEx_entry( dword_t count, lpdword_t handles, dword_t wait_type, dword_t wait_mode, dword_t alertable, lpqword_t timeout_ptr) { uint64_t timeout = timeout_ptr ? static_cast(*timeout_ptr) : 0u; - if (!count || count > 64 || wait_type != 1 && wait_type) { + if (!count || count > 64 || (wait_type != 1 && wait_type)) { return X_STATUS_INVALID_PARAMETER; } return xeNtWaitForMultipleObjectsEx(count, handles, wait_type, wait_mode, @@ -964,7 +963,7 @@ uint32_t xeKeKfAcquireSpinLock(uint32_t* lock, uint64_t r13 = 1) { PrefetchForCAS(lock); assert_true(*lock != static_cast(r13)); // Lock. - while (!xe::atomic_cas(0, static_cast(r13), lock)) { + while (!xe::atomic_cas(0, xe::byte_swap(static_cast(r13)), lock)) { // Spin! // TODO(benvanik): error on deadlock? xe::threading::MaybeYield(); @@ -978,7 +977,7 @@ uint32_t xeKeKfAcquireSpinLock(uint32_t* lock, uint64_t r13 = 1) { } dword_result_t KfAcquireSpinLock_entry(lpdword_t lock_ptr, - ppc_context_t& ppc_context) { + const ppc_context_t& ppc_context) { auto lock = reinterpret_cast(lock_ptr.host_address()); return xeKeKfAcquireSpinLock(lock, ppc_context->r[13]); } @@ -997,9 +996,7 @@ void xeKeKfReleaseSpinLock(uint32_t* lock, dword_t old_irql) { } void KfReleaseSpinLock_entry(lpdword_t lock_ptr, dword_t old_irql, - ppc_context_t& ppc_ctx) { - auto lock = reinterpret_cast(lock_ptr.host_address()); - + const ppc_context_t& ppc_ctx) { assert_true(*lock_ptr == static_cast(ppc_ctx->r[13])); *lock_ptr = 0; @@ -1014,14 +1011,14 @@ DECLARE_XBOXKRNL_EXPORT2(KfReleaseSpinLock, kThreading, kImplemented, kHighFrequency); // todo: this is not accurate void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr, - ppc_context_t& ppc_ctx) { + const ppc_context_t& ppc_ctx) { // Lock. auto lock = reinterpret_cast(lock_ptr.host_address()); // must not be our own thread assert_true(*lock_ptr != static_cast(ppc_ctx->r[13])); PrefetchForCAS(lock); - while (!xe::atomic_cas(0, static_cast(ppc_ctx->r[13]), lock)) { + while (!xe::atomic_cas(0, xe::byte_swap(static_cast(ppc_ctx->r[13])), lock)) { #if XE_ARCH_AMD64 == 1 // todo: this is just a nop if they don't have SMT, which is not great // either... @@ -1036,12 +1033,12 @@ DECLARE_XBOXKRNL_EXPORT3(KeAcquireSpinLockAtRaisedIrql, kThreading, kImplemented, kBlocking, kHighFrequency); dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry( - lpdword_t lock_ptr, ppc_context_t& ppc_ctx) { + lpdword_t lock_ptr, const ppc_context_t& ppc_ctx) { // Lock. 
auto lock = reinterpret_cast(lock_ptr.host_address()); assert_true(*lock_ptr != static_cast(ppc_ctx->r[13])); PrefetchForCAS(lock); - if (!xe::atomic_cas(0, static_cast(ppc_ctx->r[13]), lock)) { + if (!xe::atomic_cas(0, xe::byte_swap(static_cast(ppc_ctx->r[13])), lock)) { return 0; } return 1; @@ -1050,10 +1047,9 @@ DECLARE_XBOXKRNL_EXPORT4(KeTryToAcquireSpinLockAtRaisedIrql, kThreading, kImplemented, kBlocking, kHighFrequency, kSketchy); void KeReleaseSpinLockFromRaisedIrql_entry(lpdword_t lock_ptr, - ppc_context_t& ppc_ctx) { + const ppc_context_t& ppc_ctx) { // Unlock. assert_true(*lock_ptr == static_cast(ppc_ctx->r[13])); - auto lock = reinterpret_cast(lock_ptr.host_address()); *lock_ptr = 0; } DECLARE_XBOXKRNL_EXPORT2(KeReleaseSpinLockFromRaisedIrql, kThreading, @@ -1283,7 +1279,8 @@ void ExInitializeReadWriteLock_entry(pointer_t lock_ptr) { } DECLARE_XBOXKRNL_EXPORT1(ExInitializeReadWriteLock, kThreading, kImplemented); -void ExAcquireReadWriteLockExclusive_entry(pointer_t lock_ptr, ppc_context_t& ppc_context) { +void ExAcquireReadWriteLockExclusive_entry(pointer_t lock_ptr, + const ppc_context_t& ppc_context) { auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); int32_t lock_count = ++lock_ptr->lock_count; @@ -1301,7 +1298,7 @@ DECLARE_XBOXKRNL_EXPORT2(ExAcquireReadWriteLockExclusive, kThreading, kImplemented, kBlocking); dword_result_t ExTryToAcquireReadWriteLockExclusive_entry( - pointer_t lock_ptr, ppc_context_t& ppc_context) { + pointer_t lock_ptr, const ppc_context_t& ppc_context) { auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); @@ -1320,7 +1317,7 @@ DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockExclusive, kThreading, kImplemented); void ExAcquireReadWriteLockShared_entry(pointer_t lock_ptr, - ppc_context_t& ppc_context) { + const ppc_context_t& ppc_context) { auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); int32_t lock_count = ++lock_ptr->lock_count; @@ -1340,7 +1337,7 @@ DECLARE_XBOXKRNL_EXPORT2(ExAcquireReadWriteLockShared, kThreading, kImplemented, kBlocking); dword_result_t ExTryToAcquireReadWriteLockShared_entry( - pointer_t lock_ptr, ppc_context_t& ppc_context) { + pointer_t lock_ptr, const ppc_context_t& ppc_context) { auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); @@ -1361,7 +1358,7 @@ DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockShared, kThreading, kImplemented); void ExReleaseReadWriteLock_entry(pointer_t lock_ptr, - ppc_context_t& ppc_context) { + const ppc_context_t& ppc_context) { auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); @@ -1404,7 +1401,7 @@ pointer_result_t InterlockedPushEntrySList_entry( assert_not_null(entry); alignas(8) X_SLIST_HEADER old_hdr = *plist_ptr; - alignas(8) X_SLIST_HEADER new_hdr = {0}; + alignas(8) X_SLIST_HEADER new_hdr = {{0}, 0, 0}; uint32_t old_head = 0; do { old_hdr = *plist_ptr; @@ -1428,8 +1425,8 @@ pointer_result_t InterlockedPopEntrySList_entry( assert_not_null(plist_ptr); uint32_t popped = 0; - alignas(8) X_SLIST_HEADER old_hdr = {0}; - alignas(8) X_SLIST_HEADER new_hdr = {0}; + alignas(8) X_SLIST_HEADER old_hdr = {{0}, 0, 0}; + alignas(8) X_SLIST_HEADER new_hdr = {{0}, 0, 0}; do { old_hdr = *plist_ptr; auto next = kernel_memory()->TranslateVirtual( @@ -1456,7 +1453,7 @@ pointer_result_t InterlockedFlushSList_entry( assert_not_null(plist_ptr); alignas(8) X_SLIST_HEADER old_hdr = *plist_ptr; - alignas(8) X_SLIST_HEADER new_hdr = {0}; + alignas(8) 
X_SLIST_HEADER new_hdr = {{0}, 0, 0}; uint32_t first = 0; do { old_hdr = *plist_ptr; diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc index 531bd49de..d8e225924 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc @@ -433,7 +433,7 @@ void VdSwap_entry( return; } gpu_fetch.base_address = frontbuffer_physical_address >> 12; - + XE_MAYBE_UNUSED auto texture_format = gpu::xenos::TextureFormat(texture_format_ptr.value()); auto color_space = *color_space_ptr; assert_true(texture_format == gpu::xenos::TextureFormat::k_8_8_8_8 || diff --git a/src/xenia/kernel/xthread.h b/src/xenia/kernel/xthread.h index 35af2bc12..3e0ff19c4 100644 --- a/src/xenia/kernel/xthread.h +++ b/src/xenia/kernel/xthread.h @@ -41,8 +41,7 @@ struct XAPC { // KAPC is 0x28(40) bytes? (what's passed to ExAllocatePoolWithTag) // This is 4b shorter than NT - looks like the reserved dword at +4 is gone. // NOTE: stored in guest memory. - uint8_t type; // +0 - uint8_t unk1; // +1 + uint16_t type; // +0 uint8_t processor_mode; // +2 uint8_t enqueued; // +3 xe::be thread_ptr; // +4 @@ -57,7 +56,6 @@ struct XAPC { void Initialize() { type = 18; // ApcObject - unk1 = 0; processor_mode = 0; enqueued = 0; thread_ptr = 0; diff --git a/src/xenia/memory.cc b/src/xenia/memory.cc index f29eb21dc..b160696f4 100644 --- a/src/xenia/memory.cc +++ b/src/xenia/memory.cc @@ -316,9 +316,10 @@ void Memory::Reset() { heaps_.v90000000.Reset(); heaps_.physical.Reset(); } +//clang does not like non-standard layout offsetof +#if XE_COMPILER_MSVC == 1 && XE_COMPILER_CLANG_CL==0 XE_NOALIAS const BaseHeap* Memory::LookupHeap(uint32_t address) const { -#if 1 #define HEAP_INDEX(name) \ offsetof(Memory, heaps_.name) - offsetof(Memory, heaps_) @@ -354,8 +355,11 @@ const BaseHeap* Memory::LookupHeap(uint32_t address) const { heap_select = nullptr; } return reinterpret_cast(selected_heap_offset + heap_select); - +} #else +XE_NOALIAS +const BaseHeap* Memory::LookupHeap(uint32_t address) const { + if (address < 0x40000000) { return &heaps_.v00000000; } else if (address < 0x7F000000) { @@ -375,9 +379,8 @@ const BaseHeap* Memory::LookupHeap(uint32_t address) const { } else { return nullptr; } -#endif } - +#endif BaseHeap* Memory::LookupHeapByType(bool physical, uint32_t page_size) { if (physical) { if (page_size <= 4096) { @@ -1069,7 +1072,7 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address, if (start_page_number == UINT_MAX || end_page_number == UINT_MAX) { // Out of memory. XELOGE("BaseHeap::Alloc failed to find contiguous range"); - assert_always("Heap exhausted!"); + //assert_always("Heap exhausted!"); return false; } diff --git a/src/xenia/memory.h b/src/xenia/memory.h index 672115d5c..77b8ff44f 100644 --- a/src/xenia/memory.h +++ b/src/xenia/memory.h @@ -285,7 +285,8 @@ class PhysicalHeap : public BaseHeap { uint32_t GetPhysicalAddress(uint32_t address) const; uint32_t SystemPagenumToGuestPagenum(uint32_t num) const { - return ((num << system_page_shift_) - host_address_offset()) >> page_size_shift_; + return ((num << system_page_shift_) - host_address_offset()) >> + page_size_shift_; } uint32_t GuestPagenumToSystemPagenum(uint32_t num) { @@ -294,6 +295,7 @@ class PhysicalHeap : public BaseHeap { num >>= system_page_shift_; return num; } + protected: VirtualHeap* parent_heap_; @@ -351,12 +353,21 @@ class Memory { // Note that the contents at the specified host address are big-endian. 
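The spinlock hunks above (KfAcquireSpinLock, KeAcquireSpinLockAtRaisedIrql, KeTryToAcquireSpinLockAtRaisedIrql) now byte-swap the r13 owner value before the compare-exchange: the lock word lives in guest memory and is therefore big-endian, so a host-endian compare value would never match what guest code stores there. A simplified sketch of that idea, assuming std::atomic in place of xe::atomic_cas and a stand-in byte_swap32 helper:

#include <atomic>
#include <cstdint>

// Stand-in for xe::byte_swap.
static inline uint32_t byte_swap32(uint32_t v) {
  return (v >> 24) | ((v >> 8) & 0x0000FF00u) | ((v << 8) & 0x00FF0000u) |
         (v << 24);
}

// Acquire a guest spinlock whose 32-bit lock word is stored big-endian in
// guest memory; owner_r13 is the host-endian per-thread value (r13 / PCR).
inline void AcquireGuestSpinLock(std::atomic<uint32_t>* lock,
                                 uint32_t owner_r13) {
  const uint32_t guest_owner = byte_swap32(owner_r13);  // guest byte order
  uint32_t expected = 0;  // 0 means "unlocked" in either byte order
  while (!lock->compare_exchange_weak(expected, guest_owner,
                                      std::memory_order_acquire,
                                      std::memory_order_relaxed)) {
    expected = 0;  // compare_exchange_weak overwrote it with the seen value
    // The real implementation yields or pauses here instead of hard-spinning.
  }
}

inline void ReleaseGuestSpinLock(std::atomic<uint32_t>* lock) {
  lock->store(0, std::memory_order_release);
}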
template inline T TranslateVirtual(uint32_t guest_address) const { +#if XE_PLATFORM_WIN32 == 1 + uint8_t* host_address = virtual_membase_ + guest_address; + if (guest_address >= 0xE0000000) { + host_address += 0x1000; + } + return reinterpret_cast(host_address); +#else uint8_t* host_address = virtual_membase_ + guest_address; const auto heap = LookupHeap(guest_address); if (heap) { host_address += heap->host_address_offset(); } return reinterpret_cast(host_address); + +#endif } // Base address of physical memory in the host address space. diff --git a/src/xenia/vfs/devices/null_device.cc b/src/xenia/vfs/devices/null_device.cc index ef34fd833..79490376b 100644 --- a/src/xenia/vfs/devices/null_device.cc +++ b/src/xenia/vfs/devices/null_device.cc @@ -21,7 +21,7 @@ namespace vfs { NullDevice::NullDevice(const std::string& mount_path, const std::initializer_list& null_paths) - : Device(mount_path), null_paths_(null_paths), name_("NullDevice") {} + : Device(mount_path), name_("NullDevice"), null_paths_(null_paths) {} NullDevice::~NullDevice() = default; diff --git a/third_party/FFmpeg b/third_party/FFmpeg index a437fe6d8..a14f5c038 160000 --- a/third_party/FFmpeg +++ b/third_party/FFmpeg @@ -1 +1 @@ -Subproject commit a437fe6d8efef17c8ad33d39f5815032e7adf5d7 +Subproject commit a14f5c03834a79fc401626a4dad7a58a2da0c445 From c70ae76a6972214530f66f526be0d42a5dda69a6 Mon Sep 17 00:00:00 2001 From: "chss95cs@gmail.com" Date: Sat, 5 Nov 2022 11:08:04 -0700 Subject: [PATCH 2/2] hopefully switched cxxopts to the main master branch now that the selectany changes are accepted --- third_party/cxxopts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/cxxopts b/third_party/cxxopts index b2b8cf2f5..f087dc8fc 160000 --- a/third_party/cxxopts +++ b/third_party/cxxopts @@ -1 +1 @@ -Subproject commit b2b8cf2f50a449720874f43445e23d75b77dcc43 +Subproject commit f087dc8fcdcd6aabba68e671ae17ff3e975134f4
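Several constructors in this patch (RenderTargetCache earlier, NullDevice just above) reorder their member-initializer lists to match the order in which the members are declared. Initializers always execute in declaration order no matter how the list is written, so a mismatched list at best trips -Wreorder and at worst reads a member that has not been initialized yet. A small illustration with made-up types, not the actual Xenia classes:

#include <initializer_list>
#include <string>
#include <vector>

class Device {
 public:
  explicit Device(const std::string& mount_path) : mount_path_(mount_path) {}
  virtual ~Device() = default;

 private:
  std::string mount_path_;
};

class NullishDevice : public Device {
 public:
  NullishDevice(const std::string& mount_path,
                const std::initializer_list<std::string>& null_paths)
      // Listed in declaration order: name_ first, then null_paths_. Writing
      // null_paths_ first would not change the construction order, only
      // hide the fact that name_ is still initialized before it.
      : Device(mount_path), name_("NullDevice"), null_paths_(null_paths) {}

 private:
  std::string name_;
  std::vector<std::string> null_paths_;
};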