Merge pull request #87 from chrisps/canary_experimental

Minor decoder optimizations, kernel fixes, cpu backend fixes
This commit is contained in:
chrisps 2022-11-01 11:49:10 -07:00 committed by GitHub
commit 781871e2d5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
41 changed files with 1011 additions and 598 deletions

View File

@ -614,7 +614,7 @@ bool EmulatorWindow::Initialize() {
MenuItem::Type::kString, "Build commit on GitHub...", "F2", MenuItem::Type::kString, "Build commit on GitHub...", "F2",
std::bind(&EmulatorWindow::ShowBuildCommit, this))); std::bind(&EmulatorWindow::ShowBuildCommit, this)));
help_menu->AddChild(MenuItem::Create( help_menu->AddChild(MenuItem::Create(
MenuItem::Type::kString, "Recent changes on GitHub...", [this]() { MenuItem::Type::kString, "Recent changes on GitHub...", []() {
LaunchWebBrowser( LaunchWebBrowser(
"https://github.com/xenia-project/xenia/compare/" XE_BUILD_COMMIT "https://github.com/xenia-project/xenia/compare/" XE_BUILD_COMMIT
"..." XE_BUILD_BRANCH); "..." XE_BUILD_BRANCH);
@ -622,7 +622,7 @@ bool EmulatorWindow::Initialize() {
help_menu->AddChild(MenuItem::Create(MenuItem::Type::kSeparator)); help_menu->AddChild(MenuItem::Create(MenuItem::Type::kSeparator));
help_menu->AddChild(MenuItem::Create( help_menu->AddChild(MenuItem::Create(
MenuItem::Type::kString, "&About...", MenuItem::Type::kString, "&About...",
[this]() { LaunchWebBrowser("https://xenia.jp/about/"); })); []() { LaunchWebBrowser("https://xenia.jp/about/"); }));
} }
main_menu->AddChild(std::move(help_menu)); main_menu->AddChild(std::move(help_menu));

View File

@ -170,8 +170,10 @@ CommandVar<T>::CommandVar(const char* name, T* default_value,
const char* description) const char* description)
: name_(name), : name_(name),
default_value_(*default_value), default_value_(*default_value),
description_(description), current_value_(default_value),
current_value_(default_value) {} commandline_value_(),
description_(description)
{}
template <class T> template <class T>
ConfigVar<T>::ConfigVar(const char* name, T* default_value, ConfigVar<T>::ConfigVar(const char* name, T* default_value,

View File

@ -149,7 +149,7 @@ class Win32FileHandle : public FileHandle {
return false; return false;
} }
} }
bool SetLength(size_t length) { bool SetLength(size_t length) override {
LARGE_INTEGER position; LARGE_INTEGER position;
position.QuadPart = length; position.QuadPart = length;
if (!SetFilePointerEx(handle_, position, nullptr, SEEK_SET)) { if (!SetFilePointerEx(handle_, position, nullptr, SEEK_SET)) {

View File

@ -59,7 +59,7 @@ static void XeCopy16384StreamingAVX(CacheLine* XE_RESTRICT to,
CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3); CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3); CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
#pragma loop(no_vector)
for (uint32_t i = 0; i < num_lines_for_8k; ++i) { for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
xe::swcache::CacheLine line0, line1, line2, line3; xe::swcache::CacheLine line0, line1, line2, line3;
@ -92,7 +92,6 @@ static void XeCopy16384Movdir64M(CacheLine* XE_RESTRICT to,
CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3); CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3); CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
#pragma loop(no_vector)
for (uint32_t i = 0; i < num_lines_for_8k; ++i) { for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
_movdir64b(dest1 + i, src1 + i); _movdir64b(dest1 + i, src1 + i);
_movdir64b(dest2 + i, src2 + i); _movdir64b(dest2 + i, src2 + i);

View File

@ -620,23 +620,23 @@ static void Prefetch(const void* addr) {
} }
template <> template <>
void Prefetch<PrefetchTag::Write>(const void* addr) { XE_MAYBE_UNUSED void Prefetch<PrefetchTag::Write>(const void* addr) {
PrefetchW(addr); PrefetchW(addr);
} }
template <> template <>
void Prefetch<PrefetchTag::Nontemporal>(const void* addr) { XE_MAYBE_UNUSED void Prefetch<PrefetchTag::Nontemporal>(const void* addr) {
PrefetchNTA(addr); PrefetchNTA(addr);
} }
template <> template <>
void Prefetch<PrefetchTag::Level3>(const void* addr) { XE_MAYBE_UNUSED void Prefetch<PrefetchTag::Level3>(const void* addr) {
PrefetchL3(addr); PrefetchL3(addr);
} }
template <> template <>
void Prefetch<PrefetchTag::Level2>(const void* addr) { XE_MAYBE_UNUSED void Prefetch<PrefetchTag::Level2>(const void* addr) {
PrefetchL2(addr); PrefetchL2(addr);
} }
template <> template <>
void Prefetch<PrefetchTag::Level1>(const void* addr) { XE_MAYBE_UNUSED void Prefetch<PrefetchTag::Level1>(const void* addr) {
PrefetchL1(addr); PrefetchL1(addr);
} }
// todo: does aarch64 have streaming stores/loads? // todo: does aarch64 have streaming stores/loads?

View File

@ -25,6 +25,7 @@ namespace xe {
*/ */
class alignas(4096) xe_global_mutex { class alignas(4096) xe_global_mutex {
XE_MAYBE_UNUSED
char detail[64]; char detail[64];
public: public:
@ -38,6 +39,7 @@ class alignas(4096) xe_global_mutex {
using global_mutex_type = xe_global_mutex; using global_mutex_type = xe_global_mutex;
class alignas(64) xe_fast_mutex { class alignas(64) xe_fast_mutex {
XE_MAYBE_UNUSED
char detail[64]; char detail[64];
public: public:
@ -62,8 +64,6 @@ class xe_unlikely_mutex {
~xe_unlikely_mutex() { mut = 0; } ~xe_unlikely_mutex() { mut = 0; }
void lock() { void lock() {
uint32_t lock_expected = 0;
if (XE_LIKELY(_tryget())) { if (XE_LIKELY(_tryget())) {
return; return;
} else { } else {

View File

@ -144,9 +144,11 @@
#define XE_MSVC_OPTIMIZE_SMALL() #define XE_MSVC_OPTIMIZE_SMALL()
#define XE_MSVC_OPTIMIZE_REVERT() #define XE_MSVC_OPTIMIZE_REVERT()
#endif #endif
#if XE_COMPILER_HAS_GNU_EXTENSIONS == 1 #if XE_COMPILER_HAS_GNU_EXTENSIONS == 1
#define XE_LIKELY_IF(...) if (XE_LIKELY(__VA_ARGS__)) #define XE_LIKELY_IF(...) if (XE_LIKELY(__VA_ARGS__))
#define XE_UNLIKELY_IF(...) if (XE_UNLIKELY(__VA_ARGS__)) #define XE_UNLIKELY_IF(...) if (XE_UNLIKELY(__VA_ARGS__))
#define XE_MAYBE_UNUSED __attribute__((unused))
#else #else
#if __cplusplus >= 202002 #if __cplusplus >= 202002
#define XE_LIKELY_IF(...) if (!!(__VA_ARGS__)) [[likely]] #define XE_LIKELY_IF(...) if (!!(__VA_ARGS__)) [[likely]]
@ -155,6 +157,7 @@
#define XE_LIKELY_IF(...) if (!!(__VA_ARGS__)) #define XE_LIKELY_IF(...) if (!!(__VA_ARGS__))
#define XE_UNLIKELY_IF(...) if (!!(__VA_ARGS__)) #define XE_UNLIKELY_IF(...) if (!!(__VA_ARGS__))
#endif #endif
#define XE_MAYBE_UNUSED
#endif #endif
// only use __restrict if MSVC, for clang/gcc we can use -fstrict-aliasing which // only use __restrict if MSVC, for clang/gcc we can use -fstrict-aliasing which
// acts as __restrict across the board todo: __restrict is part of the type // acts as __restrict across the board todo: __restrict is part of the type

View File

@ -78,7 +78,9 @@ size_t RingBuffer::Read(uint8_t* buffer, size_t _count) {
if (read_offset_ < write_offset_) { if (read_offset_ < write_offset_) {
assert_true(read_offset_ + count <= write_offset_); assert_true(read_offset_ + count <= write_offset_);
} else if (read_offset_ + count >= capacity_) { } else if (read_offset_ + count >= capacity_) {
XE_MAYBE_UNUSED
ring_size_t left_half = capacity_ - read_offset_; ring_size_t left_half = capacity_ - read_offset_;
assert_true(count - left_half <= write_offset_); assert_true(count - left_half <= write_offset_);
} }
@ -107,6 +109,7 @@ size_t RingBuffer::Write(const uint8_t* buffer, size_t _count) {
if (write_offset_ < read_offset_) { if (write_offset_ < read_offset_) {
assert_true(write_offset_ + count <= read_offset_); assert_true(write_offset_ + count <= read_offset_);
} else if (write_offset_ + count >= capacity_) { } else if (write_offset_ + count >= capacity_) {
XE_MAYBE_UNUSED
size_t left_half = capacity_ - write_offset_; size_t left_half = capacity_ - write_offset_;
assert_true(count - left_half <= read_offset_); assert_true(count - left_half <= read_offset_);
} }

View File

@ -68,7 +68,6 @@ class RingBuffer {
ring_size_t offset_delta = write_offs - read_offs; ring_size_t offset_delta = write_offs - read_offs;
ring_size_t wrap_read_count = (cap - read_offs) + write_offs; ring_size_t wrap_read_count = (cap - read_offs) + write_offs;
ring_size_t comparison_value = read_offs <= write_offs;
if (XE_LIKELY(read_offs <= write_offs)) { if (XE_LIKELY(read_offs <= write_offs)) {
return offset_delta; // will be 0 if they are equal, semantically return offset_delta; // will be 0 if they are equal, semantically

View File

@ -67,8 +67,6 @@ class split_map {
void InsertAt(TKey k, TValue v, uint32_t idx) { void InsertAt(TKey k, TValue v, uint32_t idx) {
uint32_t old_size = size(); uint32_t old_size = size();
bool needs_shiftup = idx != old_size;
values_.insert(values_.begin() + idx, v); values_.insert(values_.begin() + idx, v);
keys_.insert(keys_.begin() + idx, k); keys_.insert(keys_.begin() + idx, k);
} }

View File

@ -117,7 +117,7 @@ void set_name(const std::string_view name) {
// checked ntoskrnl, it does not modify delay, so we can place this as a // checked ntoskrnl, it does not modify delay, so we can place this as a
// constant and avoid creating a stack variable // constant and avoid creating a stack variable
static const LARGE_INTEGER sleepdelay0_for_maybeyield{0LL}; static const LARGE_INTEGER sleepdelay0_for_maybeyield{{0LL}};
void MaybeYield() { void MaybeYield() {
#if 0 #if 0
@ -314,7 +314,8 @@ class Win32Event : public Win32Handle<Event> {
} }
#endif #endif
EventInfo Query() { EventInfo result{}; EventInfo Query() override {
EventInfo result{};
NtQueryEventPointer.invoke(handle_, 0, &result, sizeof(EventInfo), nullptr); NtQueryEventPointer.invoke(handle_, 0, &result, sizeof(EventInfo), nullptr);
return result; return result;
} }
@ -429,7 +430,7 @@ class Win32Timer : public Win32Handle<Timer> {
} }
bool SetRepeatingAt(GClock_::time_point due_time, bool SetRepeatingAt(GClock_::time_point due_time,
std::chrono::milliseconds period, std::chrono::milliseconds period,
std::function<void()> opt_callback = nullptr) { std::function<void()> opt_callback = nullptr) override {
return SetRepeatingAt(date::clock_cast<WClock_>(due_time), period, return SetRepeatingAt(date::clock_cast<WClock_>(due_time), period,
std::move(opt_callback)); std::move(opt_callback));
} }

View File

@ -46,10 +46,6 @@ DEFINE_bool(ignore_undefined_externs, true,
DEFINE_bool(emit_source_annotations, false, DEFINE_bool(emit_source_annotations, false,
"Add extra movs and nops to make disassembly easier to read.", "Add extra movs and nops to make disassembly easier to read.",
"CPU"); "CPU");
DEFINE_bool(resolve_rel32_guest_calls, true,
"Experimental optimization, directly call already resolved "
"functions via x86 rel32 call/jmp",
"CPU");
DEFINE_bool(enable_incorrect_roundingmode_behavior, false, DEFINE_bool(enable_incorrect_roundingmode_behavior, false,
"Disables the FPU/VMX MXCSR sharing workaround, potentially " "Disables the FPU/VMX MXCSR sharing workaround, potentially "
@ -78,7 +74,6 @@ using namespace xe::literals;
static const size_t kMaxCodeSize = 1_MiB; static const size_t kMaxCodeSize = 1_MiB;
static const size_t kStashOffset = 32;
// static const size_t kStashOffsetHigh = 32 + 32; // static const size_t kStashOffsetHigh = 32 + 32;
const uint32_t X64Emitter::gpr_reg_map_[X64Emitter::GPR_COUNT] = { const uint32_t X64Emitter::gpr_reg_map_[X64Emitter::GPR_COUNT] = {
@ -141,55 +136,6 @@ bool X64Emitter::Emit(GuestFunction* function, HIRBuilder* builder,
return true; return true;
} }
#pragma pack(push, 1)
struct RGCEmitted {
uint8_t ff_;
uint32_t rgcid_;
};
#pragma pack(pop)
#if 0
void X64Emitter::InjectCallAddresses(void* new_execute_address) {
for (auto&& callsite : call_sites_) {
RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) {
hunter =
reinterpret_cast<RGCEmitted*>(reinterpret_cast<char*>(hunter) + 1);
}
hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8;
hunter->rgcid_ =
static_cast<uint32_t>(static_cast<intptr_t>(callsite.destination_) -
reinterpret_cast<intptr_t>(hunter + 1));
}
}
#else
void X64Emitter::InjectCallAddresses(void* new_execute_address) {
#if 0
RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
std::map<uint32_t, ResolvableGuestCall*> id_to_rgc{};
for (auto&& callsite : call_sites_) {
id_to_rgc[callsite.offset_] = &callsite;
}
#else
RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
for (auto&& callsite : call_sites_) {
while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) {
hunter =
reinterpret_cast<RGCEmitted*>(reinterpret_cast<char*>(hunter) + 1);
}
hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8;
hunter->rgcid_ =
static_cast<uint32_t>(static_cast<intptr_t>(callsite.destination_) -
reinterpret_cast<intptr_t>(hunter + 1));
}
#endif
}
#endif
void* X64Emitter::Emplace(const EmitFunctionInfo& func_info, void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
GuestFunction* function) { GuestFunction* function) {
// To avoid changing xbyak, we do a switcharoo here. // To avoid changing xbyak, we do a switcharoo here.
@ -207,10 +153,6 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
if (function) { if (function) {
code_cache_->PlaceGuestCode(function->address(), top_, func_info, function, code_cache_->PlaceGuestCode(function->address(), top_, func_info, function,
new_execute_address, new_write_address); new_execute_address, new_write_address);
if (cvars::resolve_rel32_guest_calls) {
InjectCallAddresses(new_execute_address);
}
} else { } else {
code_cache_->PlaceHostCode(0, top_, func_info, new_execute_address, code_cache_->PlaceHostCode(0, top_, func_info, new_execute_address,
new_write_address); new_write_address);
@ -219,7 +161,6 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
ready(); ready();
top_ = old_address; top_ = old_address;
reset(); reset();
call_sites_.clear();
tail_code_.clear(); tail_code_.clear();
for (auto&& cached_label : label_cache_) { for (auto&& cached_label : label_cache_) {
delete cached_label; delete cached_label;
@ -336,7 +277,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
// Mark block labels. // Mark block labels.
auto label = block->label_head; auto label = block->label_head;
while (label) { while (label) {
L(label->name); L(std::to_string(label->id));
label = label->next; label = label->next;
} }
@ -418,7 +359,6 @@ void X64Emitter::EmitProfilerEpilogue() {
// actually... lets just try without atomics lol // actually... lets just try without atomics lol
// lock(); // lock();
add(qword[r10], rdx); add(qword[r10], rdx);
} }
#endif #endif
} }
@ -534,44 +474,23 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
auto fn = static_cast<X64Function*>(function); auto fn = static_cast<X64Function*>(function);
// Resolve address to the function to call and store in rax. // Resolve address to the function to call and store in rax.
if (cvars::resolve_rel32_guest_calls && fn->machine_code()) { if (fn->machine_code()) {
ResolvableGuestCall rgc;
rgc.destination_ = uint32_t(uint64_t(fn->machine_code()));
rgc.offset_ = current_rgc_id_;
current_rgc_id_++;
if (!(instr->flags & hir::CALL_TAIL)) { if (!(instr->flags & hir::CALL_TAIL)) {
mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]); mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
db(0xFF); call((void*)fn->machine_code());
rgc.is_jump_ = false;
dd(rgc.offset_);
} else { } else {
// tail call // tail call
EmitTraceUserCallReturn(); EmitTraceUserCallReturn();
EmitProfilerEpilogue();
rgc.is_jump_ = true;
// Pass the callers return address over. // Pass the callers return address over.
mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]); mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
add(rsp, static_cast<uint32_t>(stack_size())); add(rsp, static_cast<uint32_t>(stack_size()));
db(0xFF); jmp((void*)fn->machine_code(), T_NEAR);
dd(rgc.offset_);
} }
call_sites_.push_back(rgc);
return; return;
}
if (fn->machine_code()) {
// TODO(benvanik): is it worth it to do this? It removes the need for
// a ResolveFunction call, but makes the table less useful.
assert_zero(uint64_t(fn->machine_code()) & 0xFFFFFFFF00000000);
// todo: this should be changed so that we can actually do a call to
// fn->machine_code. the code will be emitted near us, so 32 bit rel jmp
// should be possible
mov(eax, uint32_t(uint64_t(fn->machine_code())));
} else if (code_cache_->has_indirection_table()) { } else if (code_cache_->has_indirection_table()) {
// Load the pointer to the indirection table maintained in X64CodeCache. // Load the pointer to the indirection table maintained in X64CodeCache.
// The target dword will either contain the address of the generated code // The target dword will either contain the address of the generated code
@ -1017,7 +936,10 @@ static const vec128_t xmm_consts[] = {
/*XMMSTVLShuffle*/ /*XMMSTVLShuffle*/
v128_setr_bytes(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), v128_setr_bytes(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
/* XMMSTVRSwapMask*/ /* XMMSTVRSwapMask*/
vec128b((uint8_t)0x83)}; vec128b((uint8_t)0x83), /*XMMVSRShlByteshuf*/
v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80),
// XMMVSRMask
vec128b(1)};
void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) { void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
for (auto& vec : xmm_consts) { for (auto& vec : xmm_consts) {

View File

@ -172,7 +172,9 @@ enum XmmConst {
XMMLVLShuffle, XMMLVLShuffle,
XMMLVRCmp16, XMMLVRCmp16,
XMMSTVLShuffle, XMMSTVLShuffle,
XMMSTVRSwapMask // swapwordmask with bit 7 set XMMSTVRSwapMask, // swapwordmask with bit 7 set
XMMVSRShlByteshuf,
XMMVSRMask
}; };
using amdfx::xopcompare_e; using amdfx::xopcompare_e;
@ -190,13 +192,6 @@ class XbyakAllocator : public Xbyak::Allocator {
virtual bool useProtect() const { return false; } virtual bool useProtect() const { return false; }
}; };
class ResolvableGuestCall {
public:
bool is_jump_;
uintptr_t destination_;
// rgcid
unsigned offset_;
};
class X64Emitter; class X64Emitter;
using TailEmitCallback = std::function<void(X64Emitter& e, Xbyak::Label& lbl)>; using TailEmitCallback = std::function<void(X64Emitter& e, Xbyak::Label& lbl)>;
struct TailEmitter { struct TailEmitter {
@ -220,7 +215,6 @@ class X64Emitter : public Xbyak::CodeGenerator {
uint32_t debug_info_flags, FunctionDebugInfo* debug_info, uint32_t debug_info_flags, FunctionDebugInfo* debug_info,
void** out_code_address, size_t* out_code_size, void** out_code_address, size_t* out_code_size,
std::vector<SourceMapEntry>* out_source_map); std::vector<SourceMapEntry>* out_source_map);
void InjectCallAddresses(void* new_execute_addr);
public: public:
// Reserved: rsp, rsi, rdi // Reserved: rsp, rsi, rdi
@ -230,7 +224,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
// xmm4-xmm15 (save to get xmm3) // xmm4-xmm15 (save to get xmm3)
static const int GPR_COUNT = 7; static const int GPR_COUNT = 7;
static const int XMM_COUNT = 12; static const int XMM_COUNT = 12;
static constexpr size_t kStashOffset = 32;
static void SetupReg(const hir::Value* v, Xbyak::Reg8& r) { static void SetupReg(const hir::Value* v, Xbyak::Reg8& r) {
auto idx = gpr_reg_map_[v->reg.index]; auto idx = gpr_reg_map_[v->reg.index];
r = Xbyak::Reg8(idx); r = Xbyak::Reg8(idx);
@ -410,8 +404,6 @@ class X64Emitter : public Xbyak::CodeGenerator {
static const uint32_t gpr_reg_map_[GPR_COUNT]; static const uint32_t gpr_reg_map_[GPR_COUNT];
static const uint32_t xmm_reg_map_[XMM_COUNT]; static const uint32_t xmm_reg_map_[XMM_COUNT];
uint32_t current_rgc_id_ = 0xEEDDF00F;
std::vector<ResolvableGuestCall> call_sites_;
/* /*
set to true if the low 32 bits of membase == 0. set to true if the low 32 bits of membase == 0.
only really advantageous if you are storing 32 bit 0 to a displaced address, only really advantageous if you are storing 32 bit 0 to a displaced address,

View File

@ -25,46 +25,46 @@ static void EmitFusedBranch(X64Emitter& e, const T& i) {
bool valid = i.instr->prev && i.instr->prev->dest == i.src1.value; bool valid = i.instr->prev && i.instr->prev->dest == i.src1.value;
auto opcode = valid ? i.instr->prev->opcode->num : -1; auto opcode = valid ? i.instr->prev->opcode->num : -1;
if (valid) { if (valid) {
auto name = i.src2.value->name; std::string name = i.src2.value->GetIdString();
switch (opcode) { switch (opcode) {
case OPCODE_COMPARE_EQ: case OPCODE_COMPARE_EQ:
e.je(name, e.T_NEAR); e.je(std::move(name), e.T_NEAR);
break; break;
case OPCODE_COMPARE_NE: case OPCODE_COMPARE_NE:
e.jne(name, e.T_NEAR); e.jne(std::move(name), e.T_NEAR);
break; break;
case OPCODE_COMPARE_SLT: case OPCODE_COMPARE_SLT:
e.jl(name, e.T_NEAR); e.jl(std::move(name), e.T_NEAR);
break; break;
case OPCODE_COMPARE_SLE: case OPCODE_COMPARE_SLE:
e.jle(name, e.T_NEAR); e.jle(std::move(name), e.T_NEAR);
break; break;
case OPCODE_COMPARE_SGT: case OPCODE_COMPARE_SGT:
e.jg(name, e.T_NEAR); e.jg(std::move(name), e.T_NEAR);
break; break;
case OPCODE_COMPARE_SGE: case OPCODE_COMPARE_SGE:
e.jge(name, e.T_NEAR); e.jge(std::move(name), e.T_NEAR);
break; break;
case OPCODE_COMPARE_ULT: case OPCODE_COMPARE_ULT:
e.jb(name, e.T_NEAR); e.jb(std::move(name), e.T_NEAR);
break; break;
case OPCODE_COMPARE_ULE: case OPCODE_COMPARE_ULE:
e.jbe(name, e.T_NEAR); e.jbe(std::move(name), e.T_NEAR);
break; break;
case OPCODE_COMPARE_UGT: case OPCODE_COMPARE_UGT:
e.ja(name, e.T_NEAR); e.ja(std::move(name), e.T_NEAR);
break; break;
case OPCODE_COMPARE_UGE: case OPCODE_COMPARE_UGE:
e.jae(name, e.T_NEAR); e.jae(std::move(name), e.T_NEAR);
break; break;
default: default:
e.test(i.src1, i.src1); e.test(i.src1, i.src1);
e.jnz(name, e.T_NEAR); e.jnz(std::move(name), e.T_NEAR);
break; break;
} }
} else { } else {
e.test(i.src1, i.src1); e.test(i.src1, i.src1);
e.jnz(i.src2.value->name, e.T_NEAR); e.jnz(i.src2.value->GetIdString(), e.T_NEAR);
} }
} }
// ============================================================================ // ============================================================================
@ -490,7 +490,7 @@ EMITTER_OPCODE_TABLE(OPCODE_SET_RETURN_ADDRESS, SET_RETURN_ADDRESS);
// ============================================================================ // ============================================================================
struct BRANCH : Sequence<BRANCH, I<OPCODE_BRANCH, VoidOp, LabelOp>> { struct BRANCH : Sequence<BRANCH, I<OPCODE_BRANCH, VoidOp, LabelOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.jmp(i.src1.value->name, e.T_NEAR); e.jmp(i.src1.value->GetIdString(), e.T_NEAR);
} }
}; };
EMITTER_OPCODE_TABLE(OPCODE_BRANCH, BRANCH); EMITTER_OPCODE_TABLE(OPCODE_BRANCH, BRANCH);
@ -534,7 +534,7 @@ struct BRANCH_TRUE_F32
Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
e.vmovd(e.eax, input); e.vmovd(e.eax, input);
e.test(e.eax, e.eax); e.test(e.eax, e.eax);
e.jnz(i.src2.value->name, e.T_NEAR); e.jnz(i.src2.value->GetIdString(), e.T_NEAR);
} }
}; };
struct BRANCH_TRUE_F64 struct BRANCH_TRUE_F64
@ -543,7 +543,7 @@ struct BRANCH_TRUE_F64
Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
e.vmovq(e.rax, input); e.vmovq(e.rax, input);
e.test(e.rax, e.rax); e.test(e.rax, e.rax);
e.jnz(i.src2.value->name, e.T_NEAR); e.jnz(i.src2.value->GetIdString(), e.T_NEAR);
} }
}; };
EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE, BRANCH_TRUE_I8, BRANCH_TRUE_I16, EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE, BRANCH_TRUE_I8, BRANCH_TRUE_I16,
@ -557,7 +557,7 @@ struct BRANCH_FALSE_I8
: Sequence<BRANCH_FALSE_I8, I<OPCODE_BRANCH_FALSE, VoidOp, I8Op, LabelOp>> { : Sequence<BRANCH_FALSE_I8, I<OPCODE_BRANCH_FALSE, VoidOp, I8Op, LabelOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.test(i.src1, i.src1); e.test(i.src1, i.src1);
e.jz(i.src2.value->name, e.T_NEAR); e.jz(i.src2.value->GetIdString(), e.T_NEAR);
} }
}; };
struct BRANCH_FALSE_I16 struct BRANCH_FALSE_I16
@ -565,7 +565,7 @@ struct BRANCH_FALSE_I16
I<OPCODE_BRANCH_FALSE, VoidOp, I16Op, LabelOp>> { I<OPCODE_BRANCH_FALSE, VoidOp, I16Op, LabelOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.test(i.src1, i.src1); e.test(i.src1, i.src1);
e.jz(i.src2.value->name, e.T_NEAR); e.jz(i.src2.value->GetIdString(), e.T_NEAR);
} }
}; };
struct BRANCH_FALSE_I32 struct BRANCH_FALSE_I32
@ -573,7 +573,7 @@ struct BRANCH_FALSE_I32
I<OPCODE_BRANCH_FALSE, VoidOp, I32Op, LabelOp>> { I<OPCODE_BRANCH_FALSE, VoidOp, I32Op, LabelOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.test(i.src1, i.src1); e.test(i.src1, i.src1);
e.jz(i.src2.value->name, e.T_NEAR); e.jz(i.src2.value->GetIdString(), e.T_NEAR);
} }
}; };
struct BRANCH_FALSE_I64 struct BRANCH_FALSE_I64
@ -581,7 +581,7 @@ struct BRANCH_FALSE_I64
I<OPCODE_BRANCH_FALSE, VoidOp, I64Op, LabelOp>> { I<OPCODE_BRANCH_FALSE, VoidOp, I64Op, LabelOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.test(i.src1, i.src1); e.test(i.src1, i.src1);
e.jz(i.src2.value->name, e.T_NEAR); e.jz(i.src2.value->GetIdString(), e.T_NEAR);
} }
}; };
struct BRANCH_FALSE_F32 struct BRANCH_FALSE_F32
@ -591,7 +591,7 @@ struct BRANCH_FALSE_F32
Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
e.vmovd(e.eax, input); e.vmovd(e.eax, input);
e.test(e.eax, e.eax); e.test(e.eax, e.eax);
e.jz(i.src2.value->name, e.T_NEAR); e.jz(i.src2.value->GetIdString(), e.T_NEAR);
} }
}; };
struct BRANCH_FALSE_F64 struct BRANCH_FALSE_F64
@ -601,7 +601,7 @@ struct BRANCH_FALSE_F64
Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
e.vmovq(e.rax, input); e.vmovq(e.rax, input);
e.test(e.rax, e.rax); e.test(e.rax, e.rax);
e.jz(i.src2.value->name, e.T_NEAR); e.jz(i.src2.value->GetIdString(), e.T_NEAR);
} }
}; };
EMITTER_OPCODE_TABLE(OPCODE_BRANCH_FALSE, BRANCH_FALSE_I8, BRANCH_FALSE_I16, EMITTER_OPCODE_TABLE(OPCODE_BRANCH_FALSE, BRANCH_FALSE_I8, BRANCH_FALSE_I16,

View File

@ -805,22 +805,7 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SUB, VECTOR_SUB);
// ============================================================================ // ============================================================================
// OPCODE_VECTOR_SHL // OPCODE_VECTOR_SHL
// ============================================================================ // ============================================================================
template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
static __m128i EmulateVectorShl(void*, __m128i src1, __m128i src2) {
alignas(16) T value[16 / sizeof(T)];
alignas(16) T shamt[16 / sizeof(T)];
// Load SSE registers into a C array.
_mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
_mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
for (size_t i = 0; i < (16 / sizeof(T)); ++i) {
value[i] = value[i] << (shamt[i] & ((sizeof(T) * 8) - 1));
}
// Store result and return it.
return _mm_load_si128(reinterpret_cast<__m128i*>(value));
}
static XmmConst GetShiftmaskForType(unsigned typ) { static XmmConst GetShiftmaskForType(unsigned typ) {
if (typ == INT8_TYPE) { if (typ == INT8_TYPE) {
return XMMXOPByteShiftMask; return XMMXOPByteShiftMask;
@ -914,28 +899,14 @@ struct VECTOR_SHL_V128
} }
} }
if (all_same) { if (all_same) {
// mul by two e.vpmovzxbw(e.ymm0, i.src1);
/*if (seenvalue == 1) { e.vpsllw(e.ymm0, e.ymm0, seenvalue);
e.vpaddb(i.dest, i.src1, i.src1); e.vextracti128(e.xmm1, e.ymm0, 1);
} else if (seenvalue == 2) {
e.vpaddb(i.dest, i.src1, i.src1);
e.vpaddb(i.dest, i.dest, i.dest);
} else if (seenvalue == 3) {
// mul by 8
e.vpaddb(i.dest, i.src1, i.src1);
e.vpaddb(i.dest, i.dest, i.dest);
e.vpaddb(i.dest, i.dest, i.dest);
} else*/
{
e.vpmovzxbw(e.ymm0, i.src1);
e.vpsllw(e.ymm0, e.ymm0, seenvalue);
e.vextracti128(e.xmm1, e.ymm0, 1);
e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMShortsToBytes)); e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMShortsToBytes));
e.vpshufb(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMShortsToBytes)); e.vpshufb(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMShortsToBytes));
e.vpunpcklqdq(i.dest, e.xmm0, e.xmm1); e.vpunpcklqdq(i.dest, e.xmm0, e.xmm1);
return; return;
}
} else { } else {
e.LoadConstantXmm(e.xmm2, constmask); e.LoadConstantXmm(e.xmm2, constmask);
@ -966,14 +937,41 @@ struct VECTOR_SHL_V128
} }
} }
} }
if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
if (i.src1.is_constant) {
e.StashConstantXmm(0, i.src1.constant());
stack_offset_src1 = X64Emitter::kStashOffset;
} else { } else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); if (i.src2.is_constant) {
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint8_t>)); e.StashConstantXmm(1, i.src2.constant());
e.vmovaps(i.dest, e.xmm0); stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else {
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
}
Xbyak::Label looper;
e.xor_(e.edx, e.edx);
e.L(looper);
e.movzx(e.ecx, e.byte[e.rsp + stack_offset_src2 + e.rdx]);
e.shl(e.byte[e.rsp + stack_offset_src1 + e.rdx], e.cl);
if (e.IsFeatureEnabled(kX64FlagsIndependentVars)) {
e.inc(e.edx);
} else {
e.add(e.edx, 1);
}
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
} }
static void EmitInt16(X64Emitter& e, const EmitArgType& i) { static void EmitInt16(X64Emitter& e, const EmitArgType& i) {
Xmm src1; Xmm src1;
@ -1022,14 +1020,32 @@ struct VECTOR_SHL_V128
// TODO(benvanik): native version (with shift magic). // TODO(benvanik): native version (with shift magic).
e.L(emu); e.L(emu);
unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], src1);
if (i.src2.is_constant) { if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); e.StashConstantXmm(1, i.src2.constant());
stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else { } else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, src1));
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint16_t>)); Xbyak::Label looper;
e.vmovaps(i.dest, e.xmm0);
e.xor_(e.edx, e.edx);
e.L(looper);
e.movzx(e.ecx, e.word[e.rsp + stack_offset_src2 + e.rdx]);
e.shl(e.word[e.rsp + stack_offset_src1 + e.rdx], e.cl);
e.add(e.edx, 2);
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
e.L(end); e.L(end);
} }
@ -1098,14 +1114,32 @@ struct VECTOR_SHL_V128
// TODO(benvanik): native version (with shift magic). // TODO(benvanik): native version (with shift magic).
e.L(emu); e.L(emu);
unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], src1);
if (i.src2.is_constant) { if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); e.StashConstantXmm(1, i.src2.constant());
stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else { } else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, src1));
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint32_t>)); Xbyak::Label looper;
e.vmovaps(i.dest, e.xmm0);
e.xor_(e.edx, e.edx);
e.L(looper);
e.mov(e.ecx, e.dword[e.rsp + stack_offset_src2 + e.rdx]);
e.shl(e.dword[e.rsp + stack_offset_src1 + e.rdx], e.cl);
e.add(e.edx, 4);
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
e.L(end); e.L(end);
} }
@ -1116,22 +1150,6 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHL, VECTOR_SHL_V128);
// ============================================================================ // ============================================================================
// OPCODE_VECTOR_SHR // OPCODE_VECTOR_SHR
// ============================================================================ // ============================================================================
// Scalar fallback for a per-lane right shift of a 128-bit vector.
// The vector is treated as 16 / sizeof(T) lanes of type T; every lane of src1
// is shifted right by the matching lane of src2, with the count masked to the
// lane's bit width minus one. The built-in >> is applied to T, so T's
// signedness determines how the vacated high bits are filled.
// The leading void* argument is unnamed and unused here.
// NOTE(review): presumably it is the context pointer supplied by the native
// call thunk - confirm against CallNativeSafe.
template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
static __m128i EmulateVectorShr(void*, __m128i src1, __m128i src2) {
  constexpr size_t kLaneCount = 16 / sizeof(T);
  constexpr size_t kCountMask = (sizeof(T) * 8) - 1;
  alignas(16) T lanes[kLaneCount];
  alignas(16) T counts[kLaneCount];
  // Spill both vectors to aligned scratch arrays so lanes can be indexed.
  _mm_store_si128(reinterpret_cast<__m128i*>(lanes), src1);
  _mm_store_si128(reinterpret_cast<__m128i*>(counts), src2);
  for (size_t lane = 0; lane < kLaneCount; ++lane) {
    lanes[lane] = lanes[lane] >> (counts[lane] & kCountMask);
  }
  // Reload the shifted lanes as the vector result.
  return _mm_load_si128(reinterpret_cast<__m128i*>(lanes));
}
struct VECTOR_SHR_V128 struct VECTOR_SHR_V128
: Sequence<VECTOR_SHR_V128, I<OPCODE_VECTOR_SHR, V128Op, V128Op, V128Op>> { : Sequence<VECTOR_SHR_V128, I<OPCODE_VECTOR_SHR, V128Op, V128Op, V128Op>> {
@ -1179,34 +1197,63 @@ struct VECTOR_SHR_V128
} }
static void EmitInt8(X64Emitter& e, const EmitArgType& i) { static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
// TODO(benvanik): native version (with shift magic). if (i.src2.is_constant && e.IsFeatureEnabled(kX64EmitGFNI)) {
if (i.src2.is_constant) { const auto& shamt = i.src2.constant();
if (e.IsFeatureEnabled(kX64EmitGFNI)) { bool all_same = true;
const auto& shamt = i.src2.constant(); for (size_t n = 0; n < 16 - n; ++n) {
bool all_same = true; if (shamt.u8[n] != shamt.u8[n + 1]) {
for (size_t n = 0; n < 16 - n; ++n) { all_same = false;
if (shamt.u8[n] != shamt.u8[n + 1]) { break;
all_same = false;
break;
}
}
if (all_same) {
// Every count is the same, so we can use gf2p8affineqb.
const uint8_t shift_amount = shamt.u8[0] & 0b111;
const uint64_t shift_matrix = UINT64_C(0x0102040810204080)
<< (shift_amount * 8);
e.vgf2p8affineqb(i.dest, i.src1,
e.StashConstantXmm(0, vec128q(shift_matrix)), 0);
return;
} }
} }
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); if (all_same) {
} else { // Every count is the same, so we can use gf2p8affineqb.
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); const uint8_t shift_amount = shamt.u8[0] & 0b111;
const uint64_t shift_matrix = UINT64_C(0x0102040810204080)
<< (shift_amount * 8);
e.vgf2p8affineqb(i.dest, i.src1,
e.StashConstantXmm(0, vec128q(shift_matrix)), 0);
return;
}
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint8_t>)); unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
e.vmovaps(i.dest, e.xmm0);
if (i.src1.is_constant) {
e.StashConstantXmm(0, i.src1.constant());
stack_offset_src1 = X64Emitter::kStashOffset;
} else {
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
}
if (i.src2.is_constant) {
e.StashConstantXmm(1, i.src2.constant());
stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else {
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
}
Xbyak::Label looper;
e.xor_(e.edx, e.edx);
e.L(looper);
// movzx is to eliminate any possible dep on previous value of rcx at start
// of loop
e.movzx(e.ecx, e.byte[e.rsp + stack_offset_src2 + e.rdx]);
// Shifting a memory operand directly may not be the best idea, but it is
// still better than CallNativeSafe. Agner Fog's docs have no timing info
// for shx [m], cl, so the actual cost is unknown.
e.shr(e.byte[e.rsp + stack_offset_src1 + e.rdx], e.cl);
if (e.IsFeatureEnabled(kX64FlagsIndependentVars)) {
e.inc(e.edx);
} else {
e.add(e.edx, 1);
}
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
} }
static void EmitInt16(X64Emitter& e, const EmitArgType& i) { static void EmitInt16(X64Emitter& e, const EmitArgType& i) {
@ -1248,14 +1295,38 @@ struct VECTOR_SHR_V128
// TODO(benvanik): native version (with shift magic). // TODO(benvanik): native version (with shift magic).
e.L(emu); e.L(emu);
if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
if (i.src1.is_constant) {
e.StashConstantXmm(0, i.src1.constant());
stack_offset_src1 = X64Emitter::kStashOffset;
} else { } else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint16_t>)); if (i.src2.is_constant) {
e.vmovaps(i.dest, e.xmm0); e.StashConstantXmm(1, i.src2.constant());
stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else {
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
}
Xbyak::Label looper;
e.xor_(e.edx, e.edx);
e.L(looper);
e.movzx(e.ecx, e.word[e.rsp + stack_offset_src2 + e.rdx]);
e.shr(e.word[e.rsp + stack_offset_src1 + e.rdx], e.cl);
e.add(e.edx, 2);
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
e.L(end); e.L(end);
} }
@ -1324,14 +1395,37 @@ struct VECTOR_SHR_V128
// TODO(benvanik): native version. // TODO(benvanik): native version.
e.L(emu); e.L(emu);
if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
if (i.src1.is_constant) {
e.StashConstantXmm(0, i.src1.constant());
stack_offset_src1 = X64Emitter::kStashOffset;
} else { } else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, src1));
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint32_t>)); if (i.src2.is_constant) {
e.vmovaps(i.dest, e.xmm0); e.StashConstantXmm(1, i.src2.constant());
stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else {
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
}
Xbyak::Label looper;
e.xor_(e.edx, e.edx);
e.L(looper);
e.mov(e.ecx, e.dword[e.rsp + stack_offset_src2 + e.rdx]);
e.shr(e.dword[e.rsp + stack_offset_src1 + e.rdx], e.cl);
e.add(e.edx, 4);
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
e.L(end); e.L(end);
} }
@ -1388,7 +1482,8 @@ struct VECTOR_SHA_V128
} }
static void EmitInt8(X64Emitter& e, const EmitArgType& i) { static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
// TODO(benvanik): native version (with shift magic). unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
if (i.src2.is_constant) { if (i.src2.is_constant) {
const auto& shamt = i.src2.constant(); const auto& shamt = i.src2.constant();
bool all_same = true; bool all_same = true;
@ -1399,7 +1494,6 @@ struct VECTOR_SHA_V128
} }
} }
if (e.IsFeatureEnabled(kX64EmitGFNI)) { if (e.IsFeatureEnabled(kX64EmitGFNI)) {
if (all_same) { if (all_same) {
// Every count is the same, so we can use gf2p8affineqb. // Every count is the same, so we can use gf2p8affineqb.
@ -1412,8 +1506,7 @@ struct VECTOR_SHA_V128
e.StashConstantXmm(0, vec128q(shift_matrix)), 0); e.StashConstantXmm(0, vec128q(shift_matrix)), 0);
return; return;
} }
} } else if (all_same) {
else if (all_same) {
Xmm to_be_shifted = GetInputRegOrConstant(e, i.src1, e.xmm1); Xmm to_be_shifted = GetInputRegOrConstant(e, i.src1, e.xmm1);
e.vpmovsxbw(e.xmm0, to_be_shifted); //_mm_srai_epi16 / psraw e.vpmovsxbw(e.xmm0, to_be_shifted); //_mm_srai_epi16 / psraw
@ -1425,14 +1518,41 @@ struct VECTOR_SHA_V128
return; return;
} }
e.StashConstantXmm(1, i.src2.constant());
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else { } else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int8_t>)); if (i.src1.is_constant) {
e.vmovaps(i.dest, e.xmm0); e.StashConstantXmm(0, i.src1.constant());
stack_offset_src1 = X64Emitter::kStashOffset;
} else {
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
}
Xbyak::Label looper;
e.xor_(e.edx, e.edx);
e.L(looper);
// movzx is to eliminate any possible dep on previous value of rcx at start
// of loop
e.movzx(e.ecx, e.byte[e.rsp + stack_offset_src2 + e.rdx]);
// Shifting a memory operand directly may not be the best idea, but it is
// still better than CallNativeSafe. Agner Fog's docs have no timing info
// for shx [m], cl, so the actual cost is unknown.
e.sar(e.byte[e.rsp + stack_offset_src1 + e.rdx], e.cl);
if (e.IsFeatureEnabled(kX64FlagsIndependentVars)) {
e.inc(e.edx);
} else {
e.add(e.edx, 1);
}
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
} }
static void EmitInt16(X64Emitter& e, const EmitArgType& i) { static void EmitInt16(X64Emitter& e, const EmitArgType& i) {
@ -1474,14 +1594,38 @@ struct VECTOR_SHA_V128
// TODO(benvanik): native version (with shift magic). // TODO(benvanik): native version (with shift magic).
e.L(emu); e.L(emu);
if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
if (i.src1.is_constant) {
e.StashConstantXmm(0, i.src1.constant());
stack_offset_src1 = X64Emitter::kStashOffset;
} else { } else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int16_t>)); if (i.src2.is_constant) {
e.vmovaps(i.dest, e.xmm0); e.StashConstantXmm(1, i.src2.constant());
stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else {
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
}
Xbyak::Label looper;
e.xor_(e.edx, e.edx);
e.L(looper);
e.movzx(e.ecx, e.word[e.rsp + stack_offset_src2 + e.rdx]);
e.sar(e.word[e.rsp + stack_offset_src1 + e.rdx], e.cl);
e.add(e.edx, 2);
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
e.L(end); e.L(end);
} }
@ -1508,9 +1652,9 @@ struct VECTOR_SHA_V128
// that happens so we mask. // that happens so we mask.
if (i.src2.is_constant) { if (i.src2.is_constant) {
e.LoadConstantXmm(e.xmm0, i.src2.constant()); e.LoadConstantXmm(e.xmm0, i.src2.constant());
e.vandps(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskPS)); e.vpand(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskPS));
} else { } else {
e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
} }
e.vpsravd(i.dest, i.src1, e.xmm0); e.vpsravd(i.dest, i.src1, e.xmm0);
} else { } else {
@ -1535,14 +1679,36 @@ struct VECTOR_SHA_V128
// TODO(benvanik): native version. // TODO(benvanik): native version.
e.L(emu); e.L(emu);
if (i.src2.is_constant) { unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
if (i.src1.is_constant) {
e.StashConstantXmm(0, i.src1.constant());
stack_offset_src1 = X64Emitter::kStashOffset;
} else { } else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int32_t>)); if (i.src2.is_constant) {
e.vmovaps(i.dest, e.xmm0); e.StashConstantXmm(1, i.src2.constant());
stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else {
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
}
Xbyak::Label looper;
e.xor_(e.edx, e.edx);
e.L(looper);
e.mov(e.ecx, e.dword[e.rsp + stack_offset_src2 + e.rdx]);
e.sar(e.dword[e.rsp + stack_offset_src1 + e.rdx], e.cl);
e.add(e.edx, 4);
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
e.L(end); e.L(end);
} }
@ -1550,26 +1716,6 @@ struct VECTOR_SHA_V128
}; };
EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA, VECTOR_SHA_V128); EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA, VECTOR_SHA_V128);
// ============================================================================
// OPCODE_VECTOR_ROTATE_LEFT
// ============================================================================
// Scalar fallback for a per-lane left rotate of a 128-bit vector.
// The vector is treated as 16 / sizeof(T) lanes of type T; every lane of src1
// is rotated left via xe::rotate_left<T> by the matching lane of src2, with
// the count masked to the lane's bit width minus one.
// The leading void* argument is unnamed and unused here.
template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
static __m128i EmulateVectorRotateLeft(void*, __m128i src1, __m128i src2) {
  constexpr size_t kLaneCount = 16 / sizeof(T);
  constexpr size_t kCountMask = (sizeof(T) * 8) - 1;
  alignas(16) T lanes[kLaneCount];
  alignas(16) T counts[kLaneCount];
  // Spill both vectors to aligned scratch arrays so lanes can be indexed.
  _mm_store_si128(reinterpret_cast<__m128i*>(lanes), src1);
  _mm_store_si128(reinterpret_cast<__m128i*>(counts), src2);
  for (size_t lane = 0; lane < kLaneCount; ++lane) {
    lanes[lane] = xe::rotate_left<T>(lanes[lane], counts[lane] & kCountMask);
  }
  // Reload the rotated lanes as the vector result.
  return _mm_load_si128(reinterpret_cast<__m128i*>(lanes));
}
struct VECTOR_ROTATE_LEFT_V128 struct VECTOR_ROTATE_LEFT_V128
: Sequence<VECTOR_ROTATE_LEFT_V128, : Sequence<VECTOR_ROTATE_LEFT_V128,
I<OPCODE_VECTOR_ROTATE_LEFT, V128Op, V128Op, V128Op>> { I<OPCODE_VECTOR_ROTATE_LEFT, V128Op, V128Op, V128Op>> {
@ -1594,33 +1740,72 @@ struct VECTOR_ROTATE_LEFT_V128
} }
} else { } else {
unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
switch (i.instr->flags) { switch (i.instr->flags) {
case INT8_TYPE: case INT8_TYPE: {
// TODO(benvanik): native version (with shift magic). if (i.src1.is_constant) {
if (i.src2.is_constant) { e.StashConstantXmm(0, i.src1.constant());
e.lea(e.GetNativeParam(1), stack_offset_src1 = X64Emitter::kStashOffset;
e.StashConstantXmm(1, i.src2.constant()));
} else { } else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint8_t>));
e.vmovaps(i.dest, e.xmm0);
break;
case INT16_TYPE:
// TODO(benvanik): native version (with shift magic).
if (i.src2.is_constant) { if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant());
e.StashConstantXmm(1, i.src2.constant())); stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else { } else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe( Xbyak::Label rotate_iter;
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint16_t>));
e.vmovaps(i.dest, e.xmm0); e.xor_(e.edx, e.edx);
break;
e.L(rotate_iter);
e.movzx(e.ecx, e.byte[e.rsp + stack_offset_src2 + e.rdx]);
e.rol(e.byte[e.rsp + stack_offset_src1 + e.rdx], e.cl);
e.add(e.edx, 1);
e.cmp(e.edx, 16);
e.jnz(rotate_iter);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
} break;
case INT16_TYPE: {
if (i.src1.is_constant) {
e.StashConstantXmm(0, i.src1.constant());
stack_offset_src1 = X64Emitter::kStashOffset;
} else {
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
}
if (i.src2.is_constant) {
e.StashConstantXmm(1, i.src2.constant());
stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else {
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
}
Xbyak::Label rotate_iter;
e.xor_(e.edx, e.edx);
e.L(rotate_iter);
e.movzx(e.ecx, e.word[e.rsp + stack_offset_src2 + e.rdx]);
e.rol(e.word[e.rsp + stack_offset_src1 + e.rdx], e.cl);
e.add(e.edx, 2);
e.cmp(e.edx, 16);
e.jnz(rotate_iter);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
} break;
case INT32_TYPE: { case INT32_TYPE: {
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) { if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
e.vprolvd(i.dest, i.src1, i.src2); e.vprolvd(i.dest, i.src1, i.src2);
@ -1638,23 +1823,40 @@ struct VECTOR_ROTATE_LEFT_V128
} }
e.vpsllvd(e.xmm1, i.src1, e.xmm0); e.vpsllvd(e.xmm1, i.src1, e.xmm0);
// Shift right (to get low bits): // Shift right (to get low bits):
e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32)); e.vmovdqa(temp, e.GetXmmConstPtr(XMMPI32));
e.vpsubd(temp, e.xmm0); e.vpsubd(temp, e.xmm0);
e.vpsrlvd(i.dest, i.src1, temp); e.vpsrlvd(i.dest, i.src1, temp);
// Merge: // Merge:
e.vpor(i.dest, e.xmm1); e.vpor(i.dest, e.xmm1);
} else { } else {
// TODO(benvanik): non-AVX2 native version. if (i.src1.is_constant) {
if (i.src2.is_constant) { e.StashConstantXmm(0, i.src1.constant());
e.lea(e.GetNativeParam(1), stack_offset_src1 = X64Emitter::kStashOffset;
e.StashConstantXmm(1, i.src2.constant()));
} else { } else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe( if (i.src2.is_constant) {
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint32_t>)); e.StashConstantXmm(1, i.src2.constant());
e.vmovaps(i.dest, e.xmm0); stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else {
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
}
Xbyak::Label rotate_iter;
e.xor_(e.edx, e.edx);
e.L(rotate_iter);
e.mov(e.ecx, e.dword[e.rsp + stack_offset_src2 + e.rdx]);
e.rol(e.dword[e.rsp + stack_offset_src1 + e.rdx], e.cl);
e.add(e.edx, 4);
e.cmp(e.edx, 16);
e.jnz(rotate_iter);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
} }
break; break;
} }
@ -1667,80 +1869,120 @@ struct VECTOR_ROTATE_LEFT_V128
}; };
EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ROTATE_LEFT, VECTOR_ROTATE_LEFT_V128); EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ROTATE_LEFT, VECTOR_ROTATE_LEFT_V128);
// ============================================================================
// OPCODE_VECTOR_AVERAGE
// ============================================================================
// Scalar fallback for a per-lane rounding average of two 128-bit vectors.
// The vectors are treated as 16 / sizeof(T) lanes of type T. Each pair of
// lanes is converted to uint64_t, summed with a +1 rounding bias, halved, and
// converted back to T.
// The leading void* argument is unnamed and unused here.
template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
static __m128i EmulateVectorAverage(void*, __m128i src1, __m128i src2) {
  constexpr size_t kLaneCount = 16 / sizeof(T);
  alignas(16) T lhs[kLaneCount];
  alignas(16) T rhs[kLaneCount];
  alignas(16) T averaged[kLaneCount];
  // Spill both inputs to aligned scratch arrays so lanes can be indexed.
  _mm_store_si128(reinterpret_cast<__m128i*>(lhs), src1);
  _mm_store_si128(reinterpret_cast<__m128i*>(rhs), src2);
  for (size_t lane = 0; lane < kLaneCount; ++lane) {
    const uint64_t biased_sum = static_cast<uint64_t>(lhs[lane]) +
                                static_cast<uint64_t>(rhs[lane]) + 1;
    averaged[lane] = static_cast<T>(biased_sum / 2);
  }
  // Reload the averaged lanes as the vector result.
  return _mm_load_si128(reinterpret_cast<__m128i*>(averaged));
}
struct VECTOR_AVERAGE struct VECTOR_AVERAGE
: Sequence<VECTOR_AVERAGE, : Sequence<VECTOR_AVERAGE,
I<OPCODE_VECTOR_AVERAGE, V128Op, V128Op, V128Op>> { I<OPCODE_VECTOR_AVERAGE, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
auto i_flags = i.instr->flags;
EmitCommutativeBinaryXmmOp( EmitCommutativeBinaryXmmOp(
e, i, e, i,
[&i](X64Emitter& e, const Xmm& dest, const Xmm& src1, const Xmm& src2) { [i_flags](X64Emitter& e, const Xmm& dest, const Xmm& src1,
const TypeName part_type = const Xmm& src2) {
static_cast<TypeName>(i.instr->flags & 0xFF); const TypeName part_type = static_cast<TypeName>(i_flags & 0xFF);
const uint32_t arithmetic_flags = i.instr->flags >> 8; const uint32_t arithmetic_flags = i_flags >> 8;
bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED);
unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
switch (part_type) { switch (part_type) {
case INT8_TYPE: case INT8_TYPE:
if (is_unsigned) { if (is_unsigned) {
e.vpavgb(dest, src1, src2); e.vpavgb(dest, src1, src2);
} else { } else {
assert_always(); // todo: avx2 version or version that sign extends to two __m128
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], src1);
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], src2);
Xbyak::Label looper;
e.xor_(e.edx, e.edx);
e.L(looper);
e.movsx(e.ecx, e.byte[e.rsp + stack_offset_src2 + e.rdx]);
e.movsx(e.eax, e.byte[e.rsp + stack_offset_src1 + e.rdx]);
e.lea(e.ecx, e.ptr[e.ecx + e.eax + 1]);
e.sar(e.ecx, 1);
e.mov(e.byte[e.rsp + stack_offset_src1 + e.rdx], e.cl);
if (e.IsFeatureEnabled(kX64FlagsIndependentVars)) {
e.inc(e.edx);
} else {
e.add(e.edx, 1);
}
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(dest, e.ptr[e.rsp + stack_offset_src1]);
} }
break; break;
case INT16_TYPE: case INT16_TYPE:
if (is_unsigned) { if (is_unsigned) {
e.vpavgw(dest, src1, src2); e.vpavgw(dest, src1, src2);
} else { } else {
assert_always(); e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], src1);
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], src2);
Xbyak::Label looper;
e.xor_(e.edx, e.edx);
e.L(looper);
e.movsx(e.ecx, e.word[e.rsp + stack_offset_src2 + e.rdx]);
e.movsx(e.eax, e.word[e.rsp + stack_offset_src1 + e.rdx]);
e.lea(e.ecx, e.ptr[e.ecx + e.eax + 1]);
e.sar(e.ecx, 1);
e.mov(e.word[e.rsp + stack_offset_src1 + e.rdx], e.cx);
e.add(e.edx, 2);
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(dest, e.ptr[e.rsp + stack_offset_src1]);
} }
break; break;
case INT32_TYPE: case INT32_TYPE: {
// No 32bit averages in AVX. // No 32bit averages in AVX.
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], src1);
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], src2);
Xbyak::Label looper;
e.xor_(e.edx, e.edx);
e.L(looper);
auto src2_current_ptr =
e.dword[e.rsp + stack_offset_src2 + e.rdx];
auto src1_current_ptr =
e.dword[e.rsp + stack_offset_src1 + e.rdx];
if (is_unsigned) { if (is_unsigned) {
if (i.src2.is_constant) { // implicit zero-ext
e.lea(e.GetNativeParam(1), e.mov(e.ecx, src2_current_ptr);
e.StashConstantXmm(1, i.src2.constant())); e.mov(e.eax, src1_current_ptr);
} else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
}
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(
reinterpret_cast<void*>(EmulateVectorAverage<uint32_t>));
e.vmovaps(i.dest, e.xmm0);
} else { } else {
if (i.src2.is_constant) { e.movsxd(e.rcx, src2_current_ptr);
e.lea(e.GetNativeParam(1), e.movsxd(e.rax, src1_current_ptr);
e.StashConstantXmm(1, i.src2.constant()));
} else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
}
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(
reinterpret_cast<void*>(EmulateVectorAverage<int32_t>));
e.vmovaps(i.dest, e.xmm0);
} }
break;
e.lea(e.rcx, e.ptr[e.rcx + e.rax + 1]);
if (is_unsigned) {
e.shr(e.rcx, 1);
} else {
e.sar(e.rcx, 1);
}
e.mov(e.dword[e.rsp + stack_offset_src1 + e.rdx], e.ecx);
e.add(e.edx, 4);
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(dest, e.ptr[e.rsp + stack_offset_src1]);
} break;
default: default:
assert_unhandled_case(part_type); assert_unhandled_case(part_type);
break; break;

File diff suppressed because one or more lines are too long

View File

@ -48,9 +48,7 @@ bool ConditionalGroupPass::Initialize(Compiler* compiler) {
bool ConditionalGroupPass::Run(HIRBuilder* builder) { bool ConditionalGroupPass::Run(HIRBuilder* builder) {
bool dirty; bool dirty;
int loops = 0;
do { do {
assert_true(loops < 20); // arbitrary number
dirty = false; dirty = false;
for (size_t i = 0; i < passes_.size(); ++i) { for (size_t i = 0; i < passes_.size(); ++i) {
scratch_arena()->Reset(); scratch_arena()->Reset();
@ -68,7 +66,6 @@ bool ConditionalGroupPass::Run(HIRBuilder* builder) {
dirty |= result; dirty |= result;
} }
} }
loops++;
} while (dirty); } while (dirty);
return true; return true;
} }

View File

@ -41,18 +41,6 @@ bool FinalizationPass::Run(HIRBuilder* builder) {
block->ordinal = block_ordinal++; block->ordinal = block_ordinal++;
// Ensure all labels have names. // Ensure all labels have names.
auto label = block->label_head;
while (label) {
if (!label->name) {
const size_t label_len = 6 + 4;
char* name = reinterpret_cast<char*>(arena->Alloc(label_len + 1, 1));
assert_true(label->id <= 9999);
auto end = fmt::format_to_n(name, label_len, "_label{}", label->id);
name[end.size] = '\0';
label->name = name;
}
label = label->next;
}
// Remove unneeded jumps. // Remove unneeded jumps.
auto tail = block->instr_tail; auto tail = block->instr_tail;

View File

@ -23,52 +23,6 @@ using namespace xe::cpu::hir;
using xe::cpu::hir::HIRBuilder; using xe::cpu::hir::HIRBuilder;
using xe::cpu::hir::Instr; using xe::cpu::hir::Instr;
using xe::cpu::hir::Value; using xe::cpu::hir::Value;
// One machine word of a value mask.
using vmask_portion_t = uint64_t;

// Fixed-size bitmask made of Ndwords words of type vmask_portion_t.
// Note that each word is 64 bits wide, despite the "dwords" in the name.
// All operators are element-wise and return a new mask; the operands are
// never modified.
template <uint32_t Ndwords>
struct Valuemask_t {
  vmask_portion_t bits[Ndwords];

  // Returns a mask in which every word equals `fill` (all zero by default).
  static Valuemask_t create_empty(vmask_portion_t fill = 0) {
    Valuemask_t result;
    for (uint32_t i = 0; i < Ndwords; ++i) {
      result.bits[i] = fill;
    }
    return result;
  }

  // Applies the unary callable `oper` to each word and returns the results
  // as a new mask.
  template <typename TCallable>
  Valuemask_t operate(TCallable&& oper) const {
    Valuemask_t result = create_empty();
    for (uint32_t i = 0; i < Ndwords; ++i) {
      result.bits[i] = oper(bits[i]);
    }
    return result;
  }

  // Applies the binary callable `oper` to each pair of corresponding words of
  // this mask and `other`, returning the results as a new mask.
  template <typename TCallable>
  Valuemask_t operate(TCallable&& oper, Valuemask_t other) const {
    Valuemask_t result = create_empty();
    for (uint32_t i = 0; i < Ndwords; ++i) {
      result.bits[i] = oper(bits[i], other.bits[i]);
    }
    return result;
  }

  // Word-wise AND. The parameter type was previously spelled `ValueMask`,
  // which names no declared type; it must be this struct.
  Valuemask_t operator&(Valuemask_t other) const {
    return operate([](vmask_portion_t x, vmask_portion_t y) { return x & y; },
                   other);
  }

  // Word-wise OR.
  Valuemask_t operator|(Valuemask_t other) const {
    return operate([](vmask_portion_t x, vmask_portion_t y) { return x | y; },
                   other);
  }

  // Word-wise XOR.
  Valuemask_t operator^(Valuemask_t other) const {
    return operate([](vmask_portion_t x, vmask_portion_t y) { return x ^ y; },
                   other);
  }

  // Word-wise complement. This is a unary operation, so it uses the
  // single-argument operate() overload (there is no second operand).
  Valuemask_t operator~() const {
    return operate([](vmask_portion_t x) { return ~x; });
  }
};
SimplificationPass::SimplificationPass() : ConditionalGroupSubpass() {} SimplificationPass::SimplificationPass() : ConditionalGroupSubpass() {}
@ -76,17 +30,13 @@ SimplificationPass::~SimplificationPass() {}
bool SimplificationPass::Run(HIRBuilder* builder, bool& result) { bool SimplificationPass::Run(HIRBuilder* builder, bool& result) {
result = false; result = false;
bool iter_result = false;
do { result |= SimplifyBitArith(builder);
iter_result = false; result |= EliminateConversions(builder);
iter_result |= SimplifyBitArith(builder); result |= SimplifyAssignments(builder);
iter_result |= EliminateConversions(builder); result |= SimplifyBasicArith(builder);
iter_result |= SimplifyAssignments(builder); result |= SimplifyVectorOps(builder);
iter_result |= SimplifyBasicArith(builder);
iter_result |= SimplifyVectorOps(builder);
result |= iter_result;
} while (iter_result);
return true; return true;
} }
// simplifications that apply to both or and xor // simplifications that apply to both or and xor
@ -735,7 +685,9 @@ bool SimplificationPass::CheckAdd(hir::Instr* i, hir::HIRBuilder* builder) {
auto [added_constant_neg, added_var_neg] = auto [added_constant_neg, added_var_neg] =
i->BinaryValueArrangeAsConstAndVar(); i->BinaryValueArrangeAsConstAndVar();
if (!added_constant_neg) return false; if (!added_constant_neg) {
return false;
}
if (added_constant_neg->AsUint64() & if (added_constant_neg->AsUint64() &
GetScalarSignbitMask(added_constant_neg->type)) { GetScalarSignbitMask(added_constant_neg->type)) {
// adding a value that has its signbit set! // adding a value that has its signbit set!
@ -882,11 +834,6 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
} else if (cmpop == OPCODE_COMPARE_UGT) { } else if (cmpop == OPCODE_COMPARE_UGT) {
// impossible, cannot be greater than mask // impossible, cannot be greater than mask
/* i->Replace(&OPCODE_ASSIGN_info, 0);
i->set_src1(builder->LoadZeroInt8());
return true;
*/
constant_replacement = builder->LoadZeroInt8(); constant_replacement = builder->LoadZeroInt8();
} else if (cmpop == OPCODE_COMPARE_ULE) { // less than or equal to mask = } else if (cmpop == OPCODE_COMPARE_ULE) { // less than or equal to mask =
@ -914,9 +861,9 @@ bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i,
bool istrue = i->opcode == &OPCODE_COMPARE_NE_info; bool istrue = i->opcode == &OPCODE_COMPARE_NE_info;
bool isfalse = i->opcode == &OPCODE_COMPARE_EQ_info; bool isfalse = i->opcode == &OPCODE_COMPARE_EQ_info;
auto [input_cosntant, input] = i->BinaryValueArrangeAsConstAndVar(); auto [input_constant, input] = i->BinaryValueArrangeAsConstAndVar();
if (!input_cosntant || input_cosntant->AsUint64() != 0) { if (!input_constant || input_constant->AsUint64() != 0) {
return false; return false;
} }
@ -957,12 +904,6 @@ bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i,
} }
} }
/* Instr* input_def = input->def;
if (!input_def) {
return false;
}
input_def = input_def->GetDestDefSkipAssigns();*/
return false; return false;
} }
bool SimplificationPass::CheckSHRByConst(hir::Instr* i, bool SimplificationPass::CheckSHRByConst(hir::Instr* i,

View File

@ -26,6 +26,13 @@ class Label {
char* name; char* name;
void* tag; void* tag;
// Returns this label's id rendered as decimal text.
// Per the original author's note, the text is later handed to xbyak, which
// takes std::string by value rather than by reference, so computing the
// string ahead of time would not save anything.
std::string GetIdString() {
  std::string id_text = std::to_string(id);
  return id_text;
}
}; };
} // namespace hir } // namespace hir

View File

@ -11,7 +11,7 @@
#define XENIA_CPU_HIR_OPCODES_H_ #define XENIA_CPU_HIR_OPCODES_H_
#include <cstdint> #include <cstdint>
#include "xenia/base/platform.h"
namespace xe { namespace xe {
namespace cpu { namespace cpu {
namespace hir { namespace hir {
@ -361,13 +361,16 @@ enum OpcodeSignature {
#define GET_OPCODE_SIG_TYPE_SRC1(sig) (OpcodeSignatureType)((sig >> 3) & 0x7) #define GET_OPCODE_SIG_TYPE_SRC1(sig) (OpcodeSignatureType)((sig >> 3) & 0x7)
#define GET_OPCODE_SIG_TYPE_SRC2(sig) (OpcodeSignatureType)((sig >> 6) & 0x7) #define GET_OPCODE_SIG_TYPE_SRC2(sig) (OpcodeSignatureType)((sig >> 6) & 0x7)
#define GET_OPCODE_SIG_TYPE_SRC3(sig) (OpcodeSignatureType)((sig >> 9) & 0x7) #define GET_OPCODE_SIG_TYPE_SRC3(sig) (OpcodeSignatureType)((sig >> 9) & 0x7)
XE_MAYBE_UNUSED
static bool IsOpcodeBinaryValue(uint32_t signature) { static bool IsOpcodeBinaryValue(uint32_t signature) {
return (signature & ~(0x7)) == return (signature & ~(0x7)) ==
((OPCODE_SIG_TYPE_V << 3) | (OPCODE_SIG_TYPE_V << 6)); ((OPCODE_SIG_TYPE_V << 3) | (OPCODE_SIG_TYPE_V << 6));
} }
XE_MAYBE_UNUSED
static bool IsOpcodeUnaryValue(uint32_t signature) { static bool IsOpcodeUnaryValue(uint32_t signature) {
return (signature & ~(0x7)) == ((OPCODE_SIG_TYPE_V << 3)); return (signature & ~(0x7)) == ((OPCODE_SIG_TYPE_V << 3));
} }
XE_MAYBE_UNUSED
static void UnpackOpcodeSig(uint32_t sig, OpcodeSignatureType& dest, static void UnpackOpcodeSig(uint32_t sig, OpcodeSignatureType& dest,
OpcodeSignatureType& src1, OpcodeSignatureType& src1,
OpcodeSignatureType& src2, OpcodeSignatureType& src2,

View File

@ -185,7 +185,7 @@ bool MMIOHandler::TryDecodeLoadStore(const uint8_t* p,
uint8_t rex_b = rex & 0b0001; uint8_t rex_b = rex & 0b0001;
uint8_t rex_x = rex & 0b0010; uint8_t rex_x = rex & 0b0010;
uint8_t rex_r = rex & 0b0100; uint8_t rex_r = rex & 0b0100;
uint8_t rex_w = rex & 0b1000; //uint8_t rex_w = rex & 0b1000;
// http://www.sandpile.org/x86/opc_rm.htm // http://www.sandpile.org/x86/opc_rm.htm
// http://www.sandpile.org/x86/opc_sib.htm // http://www.sandpile.org/x86/opc_sib.htm
@ -418,7 +418,6 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) {
// Quick kill anything outside our mapping. // Quick kill anything outside our mapping.
return false; return false;
} }
uint64_t hostip = ex->pc();
void* fault_host_address = reinterpret_cast<void*>(ex->fault_address()); void* fault_host_address = reinterpret_cast<void*>(ex->fault_address());

View File

@ -46,6 +46,7 @@ struct PPCDecodeData {
uint32_t LEV() const { return bits_.LEV; } uint32_t LEV() const { return bits_.LEV; }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -74,6 +75,7 @@ struct PPCDecodeData {
uint32_t L() const { return bits_.RT & 0x1; } uint32_t L() const { return bits_.RT & 0x1; }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -95,6 +97,7 @@ struct PPCDecodeData {
int32_t ds() const { return static_cast<int32_t>(XEEXTS16(DS() << 2)); } int32_t ds() const { return static_cast<int32_t>(XEEXTS16(DS() << 2)); }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -174,6 +177,7 @@ struct PPCDecodeData {
uint32_t CRFS() const { return bits_.RA >> 2; } uint32_t CRFS() const { return bits_.RA >> 2; }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -200,6 +204,7 @@ struct PPCDecodeData {
uint32_t CRFS() const { return CRBA() >> 2; } uint32_t CRFS() const { return CRBA() >> 2; }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -223,6 +228,7 @@ struct PPCDecodeData {
} }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -244,6 +250,7 @@ struct PPCDecodeData {
bool Rc() const { return bits_.Rc ? true : false; } bool Rc() const { return bits_.Rc ? true : false; }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -266,6 +273,7 @@ struct PPCDecodeData {
bool Rc() const { return bits_.Rc ? true : false; } bool Rc() const { return bits_.Rc ? true : false; }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -289,6 +297,7 @@ struct PPCDecodeData {
bool Rc() const { return bits_.Rc ? true : false; } bool Rc() const { return bits_.Rc ? true : false; }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -314,6 +323,7 @@ struct PPCDecodeData {
bool Rc() const { return bits_.Rc ? true : false; } bool Rc() const { return bits_.Rc ? true : false; }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -339,6 +349,7 @@ struct PPCDecodeData {
bool Rc() const { return bits_.Rc ? true : false; } bool Rc() const { return bits_.Rc ? true : false; }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -363,6 +374,7 @@ struct PPCDecodeData {
bool Rc() const { return bits_.Rc ? true : false; } bool Rc() const { return bits_.Rc ? true : false; }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -389,6 +401,7 @@ struct PPCDecodeData {
bool Rc() const { return bits_.Rc ? true : false; } bool Rc() const { return bits_.Rc ? true : false; }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -412,6 +425,7 @@ struct PPCDecodeData {
int32_t SIMM() const { return static_cast<int32_t>(XEEXTS16(VA())); } int32_t SIMM() const { return static_cast<int32_t>(XEEXTS16(VA())); }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -431,6 +445,7 @@ struct PPCDecodeData {
bool Rc() const { return bits_.Rc ? true : false; } bool Rc() const { return bits_.Rc ? true : false; }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -452,6 +467,7 @@ struct PPCDecodeData {
uint32_t SHB() const { return VC() & 0xF; } uint32_t SHB() const { return VC() & 0xF; }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -473,6 +489,7 @@ struct PPCDecodeData {
uint32_t VB() const { return bits_.VB128l | (bits_.VB128h << 5); } uint32_t VB() const { return bits_.VB128l | (bits_.VB128h << 5); }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -498,6 +515,7 @@ struct PPCDecodeData {
uint32_t RB() const { return bits_.RB; } uint32_t RB() const { return bits_.RB; }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -521,6 +539,7 @@ struct PPCDecodeData {
uint32_t VC() const { return bits_.VC; } uint32_t VC() const { return bits_.VC; }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -546,6 +565,7 @@ struct PPCDecodeData {
int32_t SIMM() const { return static_cast<int32_t>(XEEXTS16(bits_.UIMM)); } int32_t SIMM() const { return static_cast<int32_t>(XEEXTS16(bits_.UIMM)); }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -567,6 +587,7 @@ struct PPCDecodeData {
uint32_t z() const { return bits_.z; } uint32_t z() const { return bits_.z; }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -592,6 +613,7 @@ struct PPCDecodeData {
uint32_t SH() const { return bits_.SH; } uint32_t SH() const { return bits_.SH; }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -618,6 +640,7 @@ struct PPCDecodeData {
bool Rc() const { return bits_.Rc ? true : false; } bool Rc() const { return bits_.Rc ? true : false; }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;
@ -642,6 +665,7 @@ struct PPCDecodeData {
uint32_t UIMM() const { return bits_.PERMl | (bits_.PERMh << 5); } uint32_t UIMM() const { return bits_.PERMl | (bits_.PERMh << 5); }
private: private:
XE_MAYBE_UNUSED
uint32_t address_; uint32_t address_;
union { union {
uint32_t value_; uint32_t value_;

View File

@ -2014,8 +2014,7 @@ int InstrEmit_vupkhsh(PPCHIRBuilder& f, const InstrData& i) {
return InstrEmit_vupkhsh_(f, i.VX.VD, i.VX.VB); return InstrEmit_vupkhsh_(f, i.VX.VD, i.VX.VB);
} }
int InstrEmit_vupkhsh128(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_vupkhsh128(PPCHIRBuilder& f, const InstrData& i) {
uint32_t va = VX128_VA128; assert_zero(VX128_VA128);
assert_zero(va);
return InstrEmit_vupkhsh_(f, VX128_VD128, VX128_VB128); return InstrEmit_vupkhsh_(f, VX128_VD128, VX128_VB128);
} }
@ -2032,8 +2031,7 @@ int InstrEmit_vupklsh(PPCHIRBuilder& f, const InstrData& i) {
return InstrEmit_vupklsh_(f, i.VX.VD, i.VX.VB); return InstrEmit_vupklsh_(f, i.VX.VD, i.VX.VB);
} }
int InstrEmit_vupklsh128(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_vupklsh128(PPCHIRBuilder& f, const InstrData& i) {
uint32_t va = VX128_VA128; assert_zero(VX128_VA128);
assert_zero(va);
return InstrEmit_vupklsh_(f, VX128_VD128, VX128_VB128); return InstrEmit_vupklsh_(f, VX128_VD128, VX128_VB128);
} }

View File

@ -16,7 +16,7 @@
#include "xenia/cpu/ppc/ppc_hir_builder.h" #include "xenia/cpu/ppc/ppc_hir_builder.h"
DEFINE_bool( DEFINE_bool(
disable_prefetch_and_cachecontrol, false, disable_prefetch_and_cachecontrol, true,
"Disables translating ppc prefetch/cache flush instructions to host " "Disables translating ppc prefetch/cache flush instructions to host "
"prefetch/cacheflush instructions. This may improve performance as these " "prefetch/cacheflush instructions. This may improve performance as these "
"instructions were written with the Xbox 360's cache in mind, and modern " "instructions were written with the Xbox 360's cache in mind, and modern "

View File

@ -105,6 +105,11 @@ bool PPCFrontend::Initialize() {
} }
bool PPCFrontend::DeclareFunction(GuestFunction* function) { bool PPCFrontend::DeclareFunction(GuestFunction* function) {
// chrispy: make sure we aren't declaring a function that is actually padding
// data; this will mess up PPCScanner and is hard to debug.
// Halo Reach actually has branches into 0 opcodes; look into this further.
//xenia_assert(*reinterpret_cast<const uint32_t*>(
// this->memory()->TranslateVirtual(function->address())) != 0);
// Could scan or something here. // Could scan or something here.
// Could also check to see if it's a well-known function type and classify // Could also check to see if it's a well-known function type and classify
// for later. // for later.

View File

@ -34,6 +34,11 @@ DEFINE_bool(
"unimplemented PowerPC instruction is encountered.", "unimplemented PowerPC instruction is encountered.",
"CPU"); "CPU");
// Boolean cvar (default false, category "CPU"). PPCHIRBuilder::UpdateFPSCR
// reads it as cvars::emit_useless_fpscr_updates and only emits its constant
// FPSCR/CR1 stores when it is true.
DEFINE_bool(
emit_useless_fpscr_updates, false,
"Emit useless fpscr update instructions (pre-10/30/2022 behavior). ",
"CPU");
namespace xe { namespace xe {
namespace cpu { namespace cpu {
namespace ppc { namespace ppc {
@ -89,6 +94,9 @@ bool PPCHIRBuilder::Emit(GuestFunction* function, uint32_t flags) {
function_ = function; function_ = function;
start_address_ = function_->address(); start_address_ = function_->address();
// chrispy: I've seen this one happen; not sure why, but I think it comes from
// trying to precompile twice. I've also seen functions whose start and end
// addresses are the same...
assert_true(function_->address() <= function_->end_address());
instr_count_ = (function_->end_address() - function_->address()) / 4 + 1; instr_count_ = (function_->end_address() - function_->address()) / 4 + 1;
with_debug_info_ = (flags & EMIT_DEBUG_COMMENTS) == EMIT_DEBUG_COMMENTS; with_debug_info_ = (flags & EMIT_DEBUG_COMMENTS) == EMIT_DEBUG_COMMENTS;
@ -242,6 +250,7 @@ void PPCHIRBuilder::MaybeBreakOnInstruction(uint32_t address) {
} }
void PPCHIRBuilder::AnnotateLabel(uint32_t address, Label* label) { void PPCHIRBuilder::AnnotateLabel(uint32_t address, Label* label) {
//chrispy: label->name is unused, it would be nice to be able to remove the field and this code
char name_buffer[13]; char name_buffer[13];
auto format_result = fmt::format_to_n(name_buffer, 12, "loc_{:08X}", address); auto format_result = fmt::format_to_n(name_buffer, 12, "loc_{:08X}", address);
name_buffer[format_result.size] = '\0'; name_buffer[format_result.size] = '\0';
@ -447,31 +456,38 @@ void PPCHIRBuilder::StoreFPSCR(Value* value) {
void PPCHIRBuilder::UpdateFPSCR(Value* result, bool update_cr1) { void PPCHIRBuilder::UpdateFPSCR(Value* result, bool update_cr1) {
// TODO(benvanik): detect overflow and nan cases. // TODO(benvanik): detect overflow and nan cases.
// fx and vx are the most important. // fx and vx are the most important.
Value* fx = LoadConstantInt8(0); /*
Value* fex = LoadConstantInt8(0); chrispy: stubbed this out because right now all it does is waste
Value* vx = LoadConstantInt8(0); memory and CPU time
Value* ox = LoadConstantInt8(0); */
if (cvars::emit_useless_fpscr_updates) {
Value* fx = LoadConstantInt8(0);
Value* fex = LoadConstantInt8(0);
Value* vx = LoadConstantInt8(0);
Value* ox = LoadConstantInt8(0);
if (update_cr1) { if (update_cr1) {
// Store into the CR1 field. // Store into the CR1 field.
// We do this instead of just calling CopyFPSCRToCR1 so that we don't // We do this instead of just calling CopyFPSCRToCR1 so that we don't
// have to read back the bits and do shifting work. // have to read back the bits and do shifting work.
StoreContext(offsetof(PPCContext, cr1.cr1_fx), fx); StoreContext(offsetof(PPCContext, cr1.cr1_fx), fx);
StoreContext(offsetof(PPCContext, cr1.cr1_fex), fex); StoreContext(offsetof(PPCContext, cr1.cr1_fex), fex);
StoreContext(offsetof(PPCContext, cr1.cr1_vx), vx); StoreContext(offsetof(PPCContext, cr1.cr1_vx), vx);
StoreContext(offsetof(PPCContext, cr1.cr1_ox), ox); StoreContext(offsetof(PPCContext, cr1.cr1_ox), ox);
}
// Generate our new bits.
Value* new_bits = Shl(ZeroExtend(fx, INT32_TYPE), 31);
new_bits = Or(new_bits, Shl(ZeroExtend(fex, INT32_TYPE), 30));
new_bits = Or(new_bits, Shl(ZeroExtend(vx, INT32_TYPE), 29));
new_bits = Or(new_bits, Shl(ZeroExtend(ox, INT32_TYPE), 28));
// Mix into fpscr while preserving sticky bits (FX and OX).
Value* bits = LoadFPSCR();
bits = Or(And(bits, LoadConstantUint32(0x9FFFFFFF)), new_bits);
StoreFPSCR(bits);
} }
// Generate our new bits.
Value* new_bits = Shl(ZeroExtend(fx, INT32_TYPE), 31);
new_bits = Or(new_bits, Shl(ZeroExtend(fex, INT32_TYPE), 30));
new_bits = Or(new_bits, Shl(ZeroExtend(vx, INT32_TYPE), 29));
new_bits = Or(new_bits, Shl(ZeroExtend(ox, INT32_TYPE), 28));
// Mix into fpscr while preserving sticky bits (FX and OX).
Value* bits = LoadFPSCR();
bits = Or(And(bits, LoadConstantUint32(0x9FFFFFFF)), new_bits);
StoreFPSCR(bits);
} }
void PPCHIRBuilder::CopyFPSCRToCR1() { void PPCHIRBuilder::CopyFPSCRToCR1() {

View File

@ -21,13 +21,7 @@ namespace xe {
namespace cpu { namespace cpu {
namespace ppc { namespace ppc {
// DEPRECATED struct PPCOpcodeBits {
// TODO(benvanik): move code to PPCDecodeData.
struct InstrData {
PPCOpcode opcode;
const PPCOpcodeInfo* opcode_info;
uint32_t address;
union { union {
uint32_t code; uint32_t code;
@ -329,6 +323,14 @@ struct InstrData {
}; };
}; };
// DEPRECATED
// TODO(benvanik): move code to PPCDecodeData.
// Per-instruction record: inherits the raw instruction word and its bitfield
// views from PPCOpcodeBits and adds decode metadata on top.
struct InstrData : public PPCOpcodeBits {
// Opcode identifier for this instruction (presumably filled in by the
// decoder -- TODO confirm against the code that populates InstrData).
PPCOpcode opcode;
// Pointer to the opcode's descriptor; not owned by this struct as far as this
// declaration shows.
const PPCOpcodeInfo* opcode_info;
// Address of the instruction (presumably a guest address -- TODO confirm).
uint32_t address;
};
} // namespace ppc } // namespace ppc
} // namespace cpu } // namespace cpu
} // namespace xe } // namespace xe

View File

@ -31,7 +31,8 @@
#include "third_party/crypto/rijndael-alg-fst.c" #include "third_party/crypto/rijndael-alg-fst.c"
#include "third_party/crypto/rijndael-alg-fst.h" #include "third_party/crypto/rijndael-alg-fst.h"
#include "third_party/pe/pe_image.h" #include "third_party/pe/pe_image.h"
#include "xenia/cpu/ppc/ppc_decode_data.h"
#include "xenia/cpu/ppc/ppc_instr.h"
DEFINE_bool(disable_instruction_infocache, false, DEFINE_bool(disable_instruction_infocache, false,
"Disables caching records of called instructions/mmio accesses.", "Disables caching records of called instructions/mmio accesses.",
"CPU"); "CPU");
@ -1074,12 +1075,13 @@ bool XexModule::LoadContinue() {
image_sha_str_ += &fmtbuf[0]; image_sha_str_ += &fmtbuf[0];
} }
info_cache_.Init(this);
// Find __savegprlr_* and __restgprlr_* and the others. // Find __savegprlr_* and __restgprlr_* and the others.
// We can flag these for special handling (inlining/etc). // We can flag these for special handling (inlining/etc).
if (!FindSaveRest()) { if (!FindSaveRest()) {
return false; return false;
} }
info_cache_.Init(this);
PrecompileDiscoveredFunctions();
// Load a specified module map and diff. // Load a specified module map and diff.
if (cvars::load_module_map.size()) { if (cvars::load_module_map.size()) {
@ -1363,7 +1365,20 @@ InfoCacheFlags* XexModule::GetInstructionAddressFlags(uint32_t guest_addr) {
return info_cache_.LookupFlags(guest_addr); return info_cache_.LookupFlags(guest_addr);
} }
void XexModule::PrecompileDiscoveredFunctions() {
auto others = PreanalyzeCode();
for (auto&& other : others) {
if (other < low_address_ || other >= high_address_) {
continue;
}
auto sym = processor_->LookupFunction(other);
if (!sym || sym->status() != Symbol::Status::kDefined) {
processor_->ResolveFunction(other);
}
}
}
void XexModule::PrecompileKnownFunctions() { void XexModule::PrecompileKnownFunctions() {
if (cvars::disable_function_precompilation) { if (cvars::disable_function_precompilation) {
return; return;
@ -1376,10 +1391,157 @@ void XexModule::PrecompileKnownFunctions() {
} }
for (uint32_t i = 0; i < end; i++) { for (uint32_t i = 0; i < end; i++) {
if (flags[i].was_resolved) { if (flags[i].was_resolved) {
processor_->ResolveFunction(low_address_ + (i * 4)); uint32_t addr = low_address_ + (i * 4);
auto sym = processor_->LookupFunction(addr);
if (!sym || sym->status() != Symbol::Status::kDefined) {
processor_->ResolveFunction(addr);
}
} }
} }
} }
// Returns the guest address targeted by the I-form branch word `wrd` located
// at guest address `current_base`.
//
// `xexmod` is accepted for the caller's convenience but is not used here.
static uint32_t get_bl_called_function(XexModule* xexmod, uint32_t current_base,
                                       ppc::PPCOpcodeBits wrd) {
  // LI is shifted left by two and passed through XEEXTS26 (by its name, a
  // sign-extension from 26 bits) to form the signed byte displacement.
  const int32_t displacement =
      static_cast<int32_t>(ppc::XEEXTS26(wrd.I.LI << 2));
  // With AA set the displacement is itself the target; otherwise it is added
  // to the address of the branch instruction.
  const int32_t origin = wrd.I.AA ? 0 : static_cast<int32_t>(current_base);
  return static_cast<uint32_t>(origin + displacement);
}
// True when `w` (an instruction word already converted to host byte order by
// the caller) has primary opcode 18 in its top six bits and the I-form LK bit
// set, i.e. it is a branch that also links.
static bool is_bl(unsigned w) {
  const unsigned primary_opcode = w >> (32 - 6);
  if (primary_opcode != 18) {
    return false;
  }
  return ppc::PPCOpcodeBits{w}.I.LK != 0;
}
// Heuristically collects guest addresses that look like function entry points
// and returns them sorted ascending with duplicates removed.
//
// Three sources feed the candidate list:
//   1. 8-byte-aligned words equal to the `mfspr r12, lr` encoding, and
//      8-byte-aligned non-zero words preceded by zero padding that follows a
//      `blr`.
//   2. The decoded target of every branch-with-link word in the range.
//   3. The first dword of each 8-byte `.pdata` entry, up to the first entry
//      that falls outside [low_address_, high_address_].
//
// Returned addresses are NOT filtered to the module range (branch targets may
// point anywhere); callers must range-check them.
//
// The candidate list is a growable std::vector. The second pass can add one
// entry per 4-byte word and .pdata adds more, so a fixed buffer sized for one
// entry per 8 bytes is not large enough.
std::vector<uint32_t> XexModule::PreanalyzeCode() {
  const uint32_t low_8_aligned = xe::align<uint32_t>(low_address_, 8);
  const uint32_t high_8_aligned = high_address_ & ~(8U - 1);
  // Guard against an empty/inverted range so the unsigned subtraction below
  // cannot wrap.
  const bool has_scan_range = high_8_aligned > low_8_aligned;

  std::vector<uint32_t> candidates;
  if (has_scan_range) {
    // Initial estimate only: one candidate per 8-byte slot.
    candidates.reserve((high_8_aligned - low_8_aligned) / 8);
  }

  const uint32_t* range_start = reinterpret_cast<const uint32_t*>(
      memory()->TranslateVirtual(low_8_aligned));
  const uint32_t* range_end =
      has_scan_range ? reinterpret_cast<const uint32_t*>(
                           memory()->TranslateVirtual(high_8_aligned))
                     : range_start;

  // Instruction encodings as they appear in guest memory. They are compared
  // against raw, un-swapped words, so they are assembled with memcpy rather
  // than written as host-order integer literals.
  const uint8_t mfspr_r12_lr[4] = {0x7D, 0x88, 0x02, 0xA6};
  const uint8_t blr[4] = {0x4E, 0x80, 0x0, 0x20};
  uint32_t mfspr_r12_lr32 = 0;
  uint32_t blr32 = 0;
  memcpy(&mfspr_r12_lr32, mfspr_r12_lr, sizeof(mfspr_r12_lr32));
  memcpy(&blr32, blr, sizeof(blr32));

  // Converts a host pointer inside the scanned range back to a guest address.
  const auto guest_address_of = [&](const uint32_t* host_word) {
    return low_8_aligned + static_cast<uint32_t>((host_word - range_start) *
                                                 sizeof(uint32_t));
  };

  // First pass: look only at 8-byte-aligned words. All functions seem to start
  // on 8-byte boundaries, except for obvious ones like the save/rest funcs.
  for (const uint32_t* first_pass = range_start; first_pass < range_end;
       first_pass += 2) {
    if (*first_pass == mfspr_r12_lr32) {
      // Link register is saved right here: treat it as a function start.
      candidates.push_back(guest_address_of(first_pass));
      continue;
    }
    // The padding heuristic looks backwards, so it needs at least two words
    // of history inside the scanned range.
    if (first_pass == range_start) {
      continue;
    }
    if (first_pass[-1] == 0 && *first_pass != 0) {
      // Originally this checked for blr followed by a single zero word, but
      // some functions are aligned to larger boundaries (something that
      // appears to be longjmp is 16-byte aligned in most games), so skip all
      // zero padding before looking for the blr.
      const uint32_t* check_iter = &first_pass[-2];
      while (check_iter > range_start && !*check_iter) {
        --check_iter;
      }
      XE_LIKELY_IF(*check_iter == blr32) {
        candidates.push_back(guest_address_of(first_pass));
      }
    }
  }

  // Second pass: every branch-with-link names its callee, and a bl target can
  // safely be assumed to be the start of a function.
  uint32_t current_guestaddr = low_8_aligned;
  for (const uint32_t* second_pass = range_start; second_pass < range_end;
       ++second_pass, current_guestaddr += 4) {
    const uint32_t current_call = xe::byte_swap(*second_pass);
    if (is_bl(current_call)) {
      candidates.push_back(get_bl_called_function(
          this, current_guestaddr, ppc::PPCOpcodeBits{current_call}));
    }
  }

  // Third source: .pdata, read as 8-byte entries whose first dword is a
  // function address.
  auto pdata = this->GetPESection(".pdata");
  if (pdata) {
    const uint32_t* pdata_base = reinterpret_cast<const uint32_t*>(
        this->memory()->TranslateVirtual(pdata->address));
    const uint32_t n_pdata_entries = pdata->raw_size / 8;
    for (uint32_t i = 0; i < n_pdata_entries; ++i) {
      const uint32_t funcaddr =
          xe::load_and_swap<uint32_t>(&pdata_base[i * 2]);
      if (funcaddr >= low_address_ && funcaddr <= high_address_) {
        candidates.push_back(funcaddr);
      } else {
        // We hit 0 (or an out-of-range value) for the function address; that
        // means we're done.
        break;
      }
    }
  }

  // Sort, then drop duplicates so each function start is reported once.
  std::sort(candidates.begin(), candidates.end());
  candidates.erase(std::unique(candidates.begin(), candidates.end()),
                   candidates.end());
  return candidates;
}
bool XexModule::FindSaveRest() { bool XexModule::FindSaveRest() {
// Special stack save/restore functions. // Special stack save/restore functions.
// http://research.microsoft.com/en-us/um/redmond/projects/invisible/src/crt/md/ppc/xxx.s.htm // http://research.microsoft.com/en-us/um/redmond/projects/invisible/src/crt/md/ppc/xxx.s.htm
@ -1552,6 +1714,8 @@ bool XexModule::FindSaveRest() {
auto page_size = base_address_ <= 0x90000000 ? 64 * 1024 : 4 * 1024; auto page_size = base_address_ <= 0x90000000 ? 64 * 1024 : 4 * 1024;
auto sec_header = xex_security_info(); auto sec_header = xex_security_info();
std::vector<uint32_t> resolve_on_exit{};
resolve_on_exit.reserve(256);
for (uint32_t i = 0, page = 0; i < sec_header->page_descriptor_count; i++) { for (uint32_t i = 0, page = 0; i < sec_header->page_descriptor_count; i++) {
// Byteswap the bitfield manually. // Byteswap the bitfield manually.
xex2_page_descriptor desc; xex2_page_descriptor desc;
@ -1586,13 +1750,20 @@ bool XexModule::FindSaveRest() {
// Add function stubs. // Add function stubs.
char name[32]; char name[32];
auto AddXexFunction = [this, &resolve_on_exit](uint32_t address,
Function** function) {
DeclareFunction(address, function);
resolve_on_exit.push_back(address);
};
if (gplr_start) { if (gplr_start) {
uint32_t address = gplr_start; uint32_t address = gplr_start;
for (int n = 14; n <= 31; n++) { for (int n = 14; n <= 31; n++) {
auto format_result = auto format_result =
fmt::format_to_n(name, xe::countof(name), "__savegprlr_{}", n); fmt::format_to_n(name, xe::countof(name), "__savegprlr_{}", n);
Function* function; Function* function;
DeclareFunction(address, &function);
AddXexFunction(address, &function);
function->set_end_address(address + (31 - n) * 4 + 2 * 4); function->set_end_address(address + (31 - n) * 4 + 2 * 4);
function->set_name(std::string_view(name, format_result.size)); function->set_name(std::string_view(name, format_result.size));
// TODO(benvanik): set type fn->type = FunctionSymbol::User; // TODO(benvanik): set type fn->type = FunctionSymbol::User;
@ -1608,7 +1779,7 @@ bool XexModule::FindSaveRest() {
auto format_result = auto format_result =
fmt::format_to_n(name, xe::countof(name), "__restgprlr_{}", n); fmt::format_to_n(name, xe::countof(name), "__restgprlr_{}", n);
Function* function; Function* function;
DeclareFunction(address, &function); AddXexFunction(address, &function);
function->set_end_address(address + (31 - n) * 4 + 3 * 4); function->set_end_address(address + (31 - n) * 4 + 3 * 4);
function->set_name(std::string_view(name, format_result.size)); function->set_name(std::string_view(name, format_result.size));
// TODO(benvanik): set type fn->type = FunctionSymbol::User; // TODO(benvanik): set type fn->type = FunctionSymbol::User;
@ -1625,7 +1796,7 @@ bool XexModule::FindSaveRest() {
auto format_result = auto format_result =
fmt::format_to_n(name, xe::countof(name), "__savefpr_{}", n); fmt::format_to_n(name, xe::countof(name), "__savefpr_{}", n);
Function* function; Function* function;
DeclareFunction(address, &function); AddXexFunction(address, &function);
function->set_end_address(address + (31 - n) * 4 + 1 * 4); function->set_end_address(address + (31 - n) * 4 + 1 * 4);
function->set_name(std::string_view(name, format_result.size)); function->set_name(std::string_view(name, format_result.size));
// TODO(benvanik): set type fn->type = FunctionSymbol::User; // TODO(benvanik): set type fn->type = FunctionSymbol::User;
@ -1641,7 +1812,7 @@ bool XexModule::FindSaveRest() {
auto format_result = auto format_result =
fmt::format_to_n(name, xe::countof(name), "__restfpr_{}", n); fmt::format_to_n(name, xe::countof(name), "__restfpr_{}", n);
Function* function; Function* function;
DeclareFunction(address, &function); AddXexFunction(address, &function);
function->set_end_address(address + (31 - n) * 4 + 1 * 4); function->set_end_address(address + (31 - n) * 4 + 1 * 4);
function->set_name(std::string_view(name, format_result.size)); function->set_name(std::string_view(name, format_result.size));
// TODO(benvanik): set type fn->type = FunctionSymbol::User; // TODO(benvanik): set type fn->type = FunctionSymbol::User;
@ -1663,7 +1834,7 @@ bool XexModule::FindSaveRest() {
auto format_result = auto format_result =
fmt::format_to_n(name, xe::countof(name), "__savevmx_{}", n); fmt::format_to_n(name, xe::countof(name), "__savevmx_{}", n);
Function* function; Function* function;
DeclareFunction(address, &function); AddXexFunction(address, &function);
function->set_name(std::string_view(name, format_result.size)); function->set_name(std::string_view(name, format_result.size));
// TODO(benvanik): set type fn->type = FunctionSymbol::User; // TODO(benvanik): set type fn->type = FunctionSymbol::User;
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveVmx; // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveVmx;
@ -1677,7 +1848,7 @@ bool XexModule::FindSaveRest() {
auto format_result = auto format_result =
fmt::format_to_n(name, xe::countof(name), "__savevmx_{}", n); fmt::format_to_n(name, xe::countof(name), "__savevmx_{}", n);
Function* function; Function* function;
DeclareFunction(address, &function); AddXexFunction(address, &function);
function->set_name(std::string_view(name, format_result.size)); function->set_name(std::string_view(name, format_result.size));
// TODO(benvanik): set type fn->type = FunctionSymbol::User; // TODO(benvanik): set type fn->type = FunctionSymbol::User;
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveVmx; // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveVmx;
@ -1691,7 +1862,7 @@ bool XexModule::FindSaveRest() {
auto format_result = auto format_result =
fmt::format_to_n(name, xe::countof(name), "__restvmx_{}", n); fmt::format_to_n(name, xe::countof(name), "__restvmx_{}", n);
Function* function; Function* function;
DeclareFunction(address, &function); AddXexFunction(address, &function);
function->set_name(std::string_view(name, format_result.size)); function->set_name(std::string_view(name, format_result.size));
// TODO(benvanik): set type fn->type = FunctionSymbol::User; // TODO(benvanik): set type fn->type = FunctionSymbol::User;
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestVmx; // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestVmx;
@ -1705,7 +1876,7 @@ bool XexModule::FindSaveRest() {
auto format_result = auto format_result =
fmt::format_to_n(name, xe::countof(name), "__restvmx_{}", n); fmt::format_to_n(name, xe::countof(name), "__restvmx_{}", n);
Function* function; Function* function;
DeclareFunction(address, &function); AddXexFunction(address, &function);
function->set_name(std::string_view(name, format_result.size)); function->set_name(std::string_view(name, format_result.size));
// TODO(benvanik): set type fn->type = FunctionSymbol::User; // TODO(benvanik): set type fn->type = FunctionSymbol::User;
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestVmx; // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestVmx;
@ -1716,6 +1887,13 @@ bool XexModule::FindSaveRest() {
} }
} }
for (auto&& to_ensure_precompiled : resolve_on_exit) {
// we want to make sure an address for these functions is available before
// any other functions are compiled for code generation purposes but we do
// it outside of our loops, because we also want to make sure we've marked
// up the symbol with info about it being save/rest and whatnot
processor_->ResolveFunction(to_ensure_precompiled);
}
return true; return true;
} }

View File

@ -34,7 +34,8 @@ struct InfoCacheFlags {
uint32_t was_resolved : 1; // has this address ever been called/requested uint32_t was_resolved : 1; // has this address ever been called/requested
// via resolvefunction? // via resolvefunction?
uint32_t accessed_mmio : 1; uint32_t accessed_mmio : 1;
uint32_t reserved : 30; uint32_t is_syscall_func : 1;
uint32_t reserved : 29;
}; };
struct XexInfoCache { struct XexInfoCache {
struct InfoCacheFlagsHeader { struct InfoCacheFlagsHeader {
@ -209,7 +210,8 @@ class XexModule : public xe::cpu::Module {
InfoCacheFlags* GetInstructionAddressFlags(uint32_t guest_addr); InfoCacheFlags* GetInstructionAddressFlags(uint32_t guest_addr);
void PrecompileKnownFunctions(); void PrecompileKnownFunctions();
void PrecompileDiscoveredFunctions();
std::vector<uint32_t> PreanalyzeCode();
protected: protected:
std::unique_ptr<Function> CreateFunction(uint32_t address) override; std::unique_ptr<Function> CreateFunction(uint32_t address) override;

View File

@ -5152,6 +5152,7 @@ void D3D12CommandProcessor::WriteGammaRampSRV(
#define COMMAND_PROCESSOR D3D12CommandProcessor #define COMMAND_PROCESSOR D3D12CommandProcessor
#include "../pm4_command_processor_implement.h" #include "../pm4_command_processor_implement.h"
#undef COMMAND_PROCESSOR
} // namespace d3d12 } // namespace d3d12
} // namespace gpu } // namespace gpu
} // namespace xe } // namespace xe

View File

@ -50,8 +50,9 @@ struct MemExportRange {
}; };
class D3D12CommandProcessor final : public CommandProcessor { class D3D12CommandProcessor final : public CommandProcessor {
protected: protected:
#define OVERRIDING_BASE_CMDPROCESSOR
#include "../pm4_command_processor_declare.h" #include "../pm4_command_processor_declare.h"
#undef OVERRIDING_BASE_CMDPROCESSOR
public: public:
explicit D3D12CommandProcessor(D3D12GraphicsSystem* graphics_system, explicit D3D12CommandProcessor(D3D12GraphicsSystem* graphics_system,
kernel::KernelState* kernel_state); kernel::KernelState* kernel_state);

View File

@ -87,7 +87,7 @@ class D3D12TextureCache final : public TextureCache {
~D3D12TextureCache(); ~D3D12TextureCache();
void ClearCache(); void ClearCache() override;
void BeginSubmission(uint64_t new_submission_index) override; void BeginSubmission(uint64_t new_submission_index) override;
void BeginFrame() override; void BeginFrame() override;

View File

@ -1,8 +1,15 @@
void ExecuteIndirectBuffer(uint32_t ptr, uint32_t count) XE_RESTRICT; #if defined(OVERRIDING_BASE_CMDPROCESSOR)
virtual uint32_t ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index) XE_RESTRICT; #define PM4_OVERRIDE override
virtual bool ExecutePacket(); #else
#define PM4_OVERRIDE
#endif
void ExecuteIndirectBuffer(uint32_t ptr,
uint32_t count) XE_RESTRICT;
virtual uint32_t ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index)
XE_RESTRICT PM4_OVERRIDE;
virtual bool ExecutePacket() PM4_OVERRIDE;
public: public:
void ExecutePacket(uint32_t ptr, uint32_t count); void ExecutePacket(uint32_t ptr, uint32_t count);
@ -111,4 +118,6 @@ XE_COLD
bool ExecutePacketType3_CountOverflow(uint32_t count); bool ExecutePacketType3_CountOverflow(uint32_t count);
XE_NOINLINE XE_NOINLINE
XE_COLD XE_COLD
bool ExecutePacketType0_CountOverflow(uint32_t count); bool ExecutePacketType0_CountOverflow(uint32_t count);
#undef PM4_OVERRIDE

View File

@ -48,8 +48,9 @@ namespace vulkan {
class VulkanCommandProcessor final : public CommandProcessor { class VulkanCommandProcessor final : public CommandProcessor {
protected: protected:
#define OVERRIDING_BASE_CMDPROCESSOR
#include "../pm4_command_processor_declare.h" #include "../pm4_command_processor_declare.h"
#undef OVERRIDING_BASE_CMDPROCESSOR
public: public:
// Single-descriptor layouts for use within a single frame. // Single-descriptor layouts for use within a single frame.
enum class SingleTransientDescriptorLayout { enum class SingleTransientDescriptorLayout {

View File

@ -28,7 +28,11 @@
namespace xe { namespace xe {
namespace kernel { namespace kernel {
namespace xboxkrnl { namespace xboxkrnl {
// Length-plus-pointer string descriptor.
// NOTE(review): presumably mirrors a guest-side counted-string structure;
// byte order and the exact meaning of `pad` are not visible here -- confirm
// against the code that reads it.
struct X_STRING {
// String length (unit -- bytes vs. characters -- not shown here).
unsigned short length;
// Second 16-bit field; treated as padding by its name.
unsigned short pad;
// 32-bit pointer value (presumably a guest address -- TODO confirm).
uint32_t ptr;
};
// https://msdn.microsoft.com/en-us/library/ff561778 // https://msdn.microsoft.com/en-us/library/ff561778
dword_result_t RtlCompareMemory_entry(lpvoid_t source1, lpvoid_t source2, dword_result_t RtlCompareMemory_entry(lpvoid_t source1, lpvoid_t source2,
dword_t length) { dword_t length) {
@ -142,38 +146,81 @@ dword_result_t RtlLowerChar_entry(dword_t in) {
} }
DECLARE_XBOXKRNL_EXPORT1(RtlLowerChar, kNone, kImplemented); DECLARE_XBOXKRNL_EXPORT1(RtlLowerChar, kNone, kImplemented);
dword_result_t RtlCompareString_entry(lpstring_t string_1, lpstring_t string_2,
dword_t case_insensitive) {
int ret = case_insensitive ? xe_strcasecmp(string_1, string_2)
: std::strcmp(string_1, string_2);
// NOTE(review): the next row and the closing `} }` row each carry text from
// two diff columns (the tail of the function above and the helper below) -
// confirm against the real source file.
//
// RtlCompareStringN_impl: byte-wise comparison of two host buffers.
//   string_1 / string_2         - start of each buffer.
//   string_1_len / string_2_len - byte counts; the value 0xFFFFFFFF means the
//                                 buffer is zero-terminated and its length is
//                                 measured here first.
//   case_insensitive            - nonzero: differing bytes are mapped through
//                                 rtl_upper_table (defined outside this span,
//                                 presumably an upper-casing table) before
//                                 being compared.
// Only the first min(string_1_len, string_2_len) bytes are compared. Returns
// the difference of the first pair that differs, otherwise
// string_1_len - string_2_len.
return ret; static int RtlCompareStringN_impl(uint8_t* string_1, unsigned int string_1_len,
uint8_t* string_2, unsigned int string_2_len,
int case_insensitive) {
if (string_1_len == 0xFFFFFFFF) {
uint8_t* string1_strlen_iter = string_1;
// The post-increment leaves the iterator one past the zero byte, which is
// why 1 is subtracted when the length is computed below.
while (*string1_strlen_iter++)
;
string_1_len = static_cast<unsigned int>(string1_strlen_iter - string_1 - 1);
}
if (string_2_len == 0xFFFFFFFF) {
uint8_t* string2_strlen_iter = string_2;
// Same measurement as for string_1.
while (*string2_strlen_iter++)
;
string_2_len = static_cast<unsigned int>(string2_strlen_iter - string_2 - 1);
}
// End of the common prefix: both pointers advance in lock step, so bounding
// string_1 by the shorter length bounds string_2 as well.
uint8_t* string1_end = &string_1[std::min(string_2_len, string_1_len)];
if (case_insensitive) {
while (string_1 < string1_end) {
unsigned c1 = *string_1++;
unsigned c2 = *string_2++;
// Identical raw bytes need no table lookup.
if (c1 != c2) {
unsigned cu1 = rtl_upper_table[c1];
unsigned cu2 = rtl_upper_table[c2];
if (cu1 != cu2) {
// Unsigned difference, converted to the int return type.
return cu1 - cu2;
}
}
}
} else {
while (string_1 < string1_end) {
unsigned c1 = *string_1++;
unsigned c2 = *string_2++;
if (c1 != c2) {
return c1 - c2;
}
}
}
// Every byte of the common prefix matched, so the result is the length
// difference (unsigned subtraction converted to int on return).
// Why? Not sure, but it's the original logic.
return string_1_len - string_2_len;
} }
DECLARE_XBOXKRNL_EXPORT1(RtlCompareString, kNone, kImplemented);
dword_result_t RtlCompareStringN_entry(lpstring_t string_1, dword_result_t RtlCompareStringN_entry(lpstring_t string_1,
dword_t string_1_len, dword_t string_1_len,
lpstring_t string_2, lpstring_t string_2,
dword_t string_2_len, dword_t string_2_len,
dword_t case_insensitive) { dword_t case_insensitive) {
uint32_t len1 = string_1_len; return RtlCompareStringN_impl(
uint32_t len2 = string_2_len; reinterpret_cast<uint8_t*>(string_1.host_address()), string_1_len,
reinterpret_cast<uint8_t*>(string_2.host_address()), string_2_len,
if (string_1_len == 0xFFFF) { case_insensitive);
len1 = uint32_t(std::strlen(string_1));
}
if (string_2_len == 0xFFFF) {
len2 = uint32_t(std::strlen(string_2));
}
auto len = std::min(string_1_len, string_2_len);
int ret = case_insensitive ? xe_strncasecmp(string_1, string_2, len)
: std::strncmp(string_1, string_2, len);
return ret;
} }
DECLARE_XBOXKRNL_EXPORT1(RtlCompareStringN, kNone, kImplemented); DECLARE_XBOXKRNL_EXPORT1(RtlCompareStringN, kNone, kImplemented);
// Compares two counted strings described by guest X_STRING structures.
// string_1 / string_2 point at the descriptors; case_insensitive is passed
// through unchanged. The result is whatever RtlCompareStringN_impl returns
// for the two character buffers and their 16-bit lengths.
dword_result_t RtlCompareString_entry(lpvoid_t string_1, lpvoid_t string_2,
                                      dword_t case_insensitive) {
  X_STRING* lhs_desc = string_1.as<X_STRING*>();
  X_STRING* rhs_desc = string_2.as<X_STRING*>();

  // Descriptor fields are fetched with xe::load_and_swap instead of being
  // dereferenced directly.
  uint32_t lhs_guest_addr = xe::load_and_swap<uint32_t>(&lhs_desc->ptr);
  unsigned lhs_length = xe::load_and_swap<uint16_t>(&lhs_desc->length);
  uint32_t rhs_guest_addr = xe::load_and_swap<uint32_t>(&rhs_desc->ptr);
  unsigned rhs_length = xe::load_and_swap<uint16_t>(&rhs_desc->length);

  // The stored pointers are guest addresses; translate them to host pointers
  // before handing the buffers to the shared comparison helper.
  auto memory = kernel_memory();
  uint8_t* lhs_chars = memory->TranslateVirtual<uint8_t*>(lhs_guest_addr);
  uint8_t* rhs_chars = memory->TranslateVirtual<uint8_t*>(rhs_guest_addr);

  return RtlCompareStringN_impl(lhs_chars, lhs_length, rhs_chars, rhs_length,
                                case_insensitive);
}
DECLARE_XBOXKRNL_EXPORT1(RtlCompareString, kNone, kImplemented);
// https://msdn.microsoft.com/en-us/library/ff561918 // https://msdn.microsoft.com/en-us/library/ff561918
void RtlInitAnsiString_entry(pointer_t<X_ANSI_STRING> destination, void RtlInitAnsiString_entry(pointer_t<X_ANSI_STRING> destination,
lpstring_t source) { lpstring_t source) {
@ -188,13 +235,13 @@ void RtlInitAnsiString_entry(pointer_t<X_ANSI_STRING> destination,
destination->pointer = source.guest_address(); destination->pointer = source.guest_address();
} }
DECLARE_XBOXKRNL_EXPORT1(RtlInitAnsiString, kNone, kImplemented); DECLARE_XBOXKRNL_EXPORT1(RtlInitAnsiString, kNone, kImplemented);
//https://learn.microsoft.com/en-us/windows-hardware/drivers/ddi/wdm/nf-wdm-rtlupcaseunicodechar // https://learn.microsoft.com/en-us/windows-hardware/drivers/ddi/wdm/nf-wdm-rtlupcaseunicodechar
dword_result_t RtlUpcaseUnicodeChar_entry(dword_t SourceCharacter) { dword_result_t RtlUpcaseUnicodeChar_entry(dword_t SourceCharacter) {
return std::use_facet<std::ctype<char16_t>>(std::locale()).toupper(SourceCharacter); return std::use_facet<std::ctype<char16_t>>(std::locale())
.toupper(SourceCharacter);
} }
DECLARE_XBOXKRNL_EXPORT1(RtlUpcaseUnicodeChar, kNone, kImplemented); DECLARE_XBOXKRNL_EXPORT1(RtlUpcaseUnicodeChar, kNone, kImplemented);
// https://msdn.microsoft.com/en-us/library/ff561899 // https://msdn.microsoft.com/en-us/library/ff561899
void RtlFreeAnsiString_entry(pointer_t<X_ANSI_STRING> string) { void RtlFreeAnsiString_entry(pointer_t<X_ANSI_STRING> string) {
if (string->pointer) { if (string->pointer) {

View File

@ -9,7 +9,6 @@
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "xenia/base/atomic.h" #include "xenia/base/atomic.h"
#include "xenia/base/clock.h" #include "xenia/base/clock.h"
#include "xenia/base/logging.h" #include "xenia/base/logging.h"
@ -913,7 +912,7 @@ dword_result_t NtWaitForMultipleObjectsEx_entry(
dword_t count, lpdword_t handles, dword_t wait_type, dword_t wait_mode, dword_t count, lpdword_t handles, dword_t wait_type, dword_t wait_mode,
dword_t alertable, lpqword_t timeout_ptr) { dword_t alertable, lpqword_t timeout_ptr) {
uint64_t timeout = timeout_ptr ? static_cast<uint64_t>(*timeout_ptr) : 0u; uint64_t timeout = timeout_ptr ? static_cast<uint64_t>(*timeout_ptr) : 0u;
if (!count || count > 64 || wait_type != 1 && wait_type) { if (!count || count > 64 || (wait_type != 1 && wait_type)) {
return X_STATUS_INVALID_PARAMETER; return X_STATUS_INVALID_PARAMETER;
} }
return xeNtWaitForMultipleObjectsEx(count, handles, wait_type, wait_mode, return xeNtWaitForMultipleObjectsEx(count, handles, wait_type, wait_mode,
@ -964,7 +963,7 @@ uint32_t xeKeKfAcquireSpinLock(uint32_t* lock, uint64_t r13 = 1) {
PrefetchForCAS(lock); PrefetchForCAS(lock);
assert_true(*lock != static_cast<uint32_t>(r13)); assert_true(*lock != static_cast<uint32_t>(r13));
// Lock. // Lock.
while (!xe::atomic_cas(0, static_cast<uint32_t>(r13), lock)) { while (!xe::atomic_cas(0, xe::byte_swap(static_cast<uint32_t>(r13)), lock)) {
// Spin! // Spin!
// TODO(benvanik): error on deadlock? // TODO(benvanik): error on deadlock?
xe::threading::MaybeYield(); xe::threading::MaybeYield();
@ -978,7 +977,7 @@ uint32_t xeKeKfAcquireSpinLock(uint32_t* lock, uint64_t r13 = 1) {
} }
dword_result_t KfAcquireSpinLock_entry(lpdword_t lock_ptr, dword_result_t KfAcquireSpinLock_entry(lpdword_t lock_ptr,
ppc_context_t& ppc_context) { const ppc_context_t& ppc_context) {
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address()); auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
return xeKeKfAcquireSpinLock(lock, ppc_context->r[13]); return xeKeKfAcquireSpinLock(lock, ppc_context->r[13]);
} }
@ -997,9 +996,7 @@ void xeKeKfReleaseSpinLock(uint32_t* lock, dword_t old_irql) {
} }
void KfReleaseSpinLock_entry(lpdword_t lock_ptr, dword_t old_irql, void KfReleaseSpinLock_entry(lpdword_t lock_ptr, dword_t old_irql,
ppc_context_t& ppc_ctx) { const ppc_context_t& ppc_ctx) {
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
assert_true(*lock_ptr == static_cast<uint32_t>(ppc_ctx->r[13])); assert_true(*lock_ptr == static_cast<uint32_t>(ppc_ctx->r[13]));
*lock_ptr = 0; *lock_ptr = 0;
@ -1014,14 +1011,14 @@ DECLARE_XBOXKRNL_EXPORT2(KfReleaseSpinLock, kThreading, kImplemented,
kHighFrequency); kHighFrequency);
// todo: this is not accurate // todo: this is not accurate
void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr, void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr,
ppc_context_t& ppc_ctx) { const ppc_context_t& ppc_ctx) {
// Lock. // Lock.
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address()); auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
// must not be our own thread // must not be our own thread
assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13])); assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13]));
PrefetchForCAS(lock); PrefetchForCAS(lock);
while (!xe::atomic_cas(0, static_cast<uint32_t>(ppc_ctx->r[13]), lock)) { while (!xe::atomic_cas(0, xe::byte_swap(static_cast<uint32_t>(ppc_ctx->r[13])), lock)) {
#if XE_ARCH_AMD64 == 1 #if XE_ARCH_AMD64 == 1
// todo: this is just a nop if they don't have SMT, which is not great // todo: this is just a nop if they don't have SMT, which is not great
// either... // either...
@ -1036,12 +1033,12 @@ DECLARE_XBOXKRNL_EXPORT3(KeAcquireSpinLockAtRaisedIrql, kThreading,
kImplemented, kBlocking, kHighFrequency); kImplemented, kBlocking, kHighFrequency);
dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry( dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(
lpdword_t lock_ptr, ppc_context_t& ppc_ctx) { lpdword_t lock_ptr, const ppc_context_t& ppc_ctx) {
// Lock. // Lock.
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address()); auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13])); assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13]));
PrefetchForCAS(lock); PrefetchForCAS(lock);
if (!xe::atomic_cas(0, static_cast<uint32_t>(ppc_ctx->r[13]), lock)) { if (!xe::atomic_cas(0, xe::byte_swap(static_cast<uint32_t>(ppc_ctx->r[13])), lock)) {
return 0; return 0;
} }
return 1; return 1;
@ -1050,10 +1047,9 @@ DECLARE_XBOXKRNL_EXPORT4(KeTryToAcquireSpinLockAtRaisedIrql, kThreading,
kImplemented, kBlocking, kHighFrequency, kSketchy); kImplemented, kBlocking, kHighFrequency, kSketchy);
void KeReleaseSpinLockFromRaisedIrql_entry(lpdword_t lock_ptr, void KeReleaseSpinLockFromRaisedIrql_entry(lpdword_t lock_ptr,
ppc_context_t& ppc_ctx) { const ppc_context_t& ppc_ctx) {
// Unlock. // Unlock.
assert_true(*lock_ptr == static_cast<uint32_t>(ppc_ctx->r[13])); assert_true(*lock_ptr == static_cast<uint32_t>(ppc_ctx->r[13]));
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
*lock_ptr = 0; *lock_ptr = 0;
} }
DECLARE_XBOXKRNL_EXPORT2(KeReleaseSpinLockFromRaisedIrql, kThreading, DECLARE_XBOXKRNL_EXPORT2(KeReleaseSpinLockFromRaisedIrql, kThreading,
@ -1283,7 +1279,8 @@ void ExInitializeReadWriteLock_entry(pointer_t<X_ERWLOCK> lock_ptr) {
} }
DECLARE_XBOXKRNL_EXPORT1(ExInitializeReadWriteLock, kThreading, kImplemented); DECLARE_XBOXKRNL_EXPORT1(ExInitializeReadWriteLock, kThreading, kImplemented);
void ExAcquireReadWriteLockExclusive_entry(pointer_t<X_ERWLOCK> lock_ptr, ppc_context_t& ppc_context) { void ExAcquireReadWriteLockExclusive_entry(pointer_t<X_ERWLOCK> lock_ptr,
const ppc_context_t& ppc_context) {
auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
int32_t lock_count = ++lock_ptr->lock_count; int32_t lock_count = ++lock_ptr->lock_count;
@ -1301,7 +1298,7 @@ DECLARE_XBOXKRNL_EXPORT2(ExAcquireReadWriteLockExclusive, kThreading,
kImplemented, kBlocking); kImplemented, kBlocking);
dword_result_t ExTryToAcquireReadWriteLockExclusive_entry( dword_result_t ExTryToAcquireReadWriteLockExclusive_entry(
pointer_t<X_ERWLOCK> lock_ptr, ppc_context_t& ppc_context) { pointer_t<X_ERWLOCK> lock_ptr, const ppc_context_t& ppc_context) {
auto old_irql = auto old_irql =
xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
@ -1320,7 +1317,7 @@ DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockExclusive, kThreading,
kImplemented); kImplemented);
void ExAcquireReadWriteLockShared_entry(pointer_t<X_ERWLOCK> lock_ptr, void ExAcquireReadWriteLockShared_entry(pointer_t<X_ERWLOCK> lock_ptr,
ppc_context_t& ppc_context) { const ppc_context_t& ppc_context) {
auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
int32_t lock_count = ++lock_ptr->lock_count; int32_t lock_count = ++lock_ptr->lock_count;
@ -1340,7 +1337,7 @@ DECLARE_XBOXKRNL_EXPORT2(ExAcquireReadWriteLockShared, kThreading, kImplemented,
kBlocking); kBlocking);
dword_result_t ExTryToAcquireReadWriteLockShared_entry( dword_result_t ExTryToAcquireReadWriteLockShared_entry(
pointer_t<X_ERWLOCK> lock_ptr, ppc_context_t& ppc_context) { pointer_t<X_ERWLOCK> lock_ptr, const ppc_context_t& ppc_context) {
auto old_irql = auto old_irql =
xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
@ -1361,7 +1358,7 @@ DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockShared, kThreading,
kImplemented); kImplemented);
void ExReleaseReadWriteLock_entry(pointer_t<X_ERWLOCK> lock_ptr, void ExReleaseReadWriteLock_entry(pointer_t<X_ERWLOCK> lock_ptr,
ppc_context_t& ppc_context) { const ppc_context_t& ppc_context) {
auto old_irql = auto old_irql =
xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
@ -1404,7 +1401,7 @@ pointer_result_t InterlockedPushEntrySList_entry(
assert_not_null(entry); assert_not_null(entry);
alignas(8) X_SLIST_HEADER old_hdr = *plist_ptr; alignas(8) X_SLIST_HEADER old_hdr = *plist_ptr;
alignas(8) X_SLIST_HEADER new_hdr = {0}; alignas(8) X_SLIST_HEADER new_hdr = {{0}, 0, 0};
uint32_t old_head = 0; uint32_t old_head = 0;
do { do {
old_hdr = *plist_ptr; old_hdr = *plist_ptr;
@ -1428,8 +1425,8 @@ pointer_result_t InterlockedPopEntrySList_entry(
assert_not_null(plist_ptr); assert_not_null(plist_ptr);
uint32_t popped = 0; uint32_t popped = 0;
alignas(8) X_SLIST_HEADER old_hdr = {0}; alignas(8) X_SLIST_HEADER old_hdr = {{0}, 0, 0};
alignas(8) X_SLIST_HEADER new_hdr = {0}; alignas(8) X_SLIST_HEADER new_hdr = {{0}, 0, 0};
do { do {
old_hdr = *plist_ptr; old_hdr = *plist_ptr;
auto next = kernel_memory()->TranslateVirtual<X_SINGLE_LIST_ENTRY*>( auto next = kernel_memory()->TranslateVirtual<X_SINGLE_LIST_ENTRY*>(
@ -1456,7 +1453,7 @@ pointer_result_t InterlockedFlushSList_entry(
assert_not_null(plist_ptr); assert_not_null(plist_ptr);
alignas(8) X_SLIST_HEADER old_hdr = *plist_ptr; alignas(8) X_SLIST_HEADER old_hdr = *plist_ptr;
alignas(8) X_SLIST_HEADER new_hdr = {0}; alignas(8) X_SLIST_HEADER new_hdr = {{0}, 0, 0};
uint32_t first = 0; uint32_t first = 0;
do { do {
old_hdr = *plist_ptr; old_hdr = *plist_ptr;

View File

@ -433,7 +433,7 @@ void VdSwap_entry(
return; return;
} }
gpu_fetch.base_address = frontbuffer_physical_address >> 12; gpu_fetch.base_address = frontbuffer_physical_address >> 12;
XE_MAYBE_UNUSED
auto texture_format = gpu::xenos::TextureFormat(texture_format_ptr.value()); auto texture_format = gpu::xenos::TextureFormat(texture_format_ptr.value());
auto color_space = *color_space_ptr; auto color_space = *color_space_ptr;
assert_true(texture_format == gpu::xenos::TextureFormat::k_8_8_8_8 || assert_true(texture_format == gpu::xenos::TextureFormat::k_8_8_8_8 ||

View File

@ -21,7 +21,7 @@ namespace vfs {
NullDevice::NullDevice(const std::string& mount_path, NullDevice::NullDevice(const std::string& mount_path,
const std::initializer_list<std::string>& null_paths) const std::initializer_list<std::string>& null_paths)
: Device(mount_path), null_paths_(null_paths), name_("NullDevice") {} : Device(mount_path), name_("NullDevice"), null_paths_(null_paths) {}
NullDevice::~NullDevice() = default; NullDevice::~NullDevice() = default;

2
third_party/FFmpeg vendored

@ -1 +1 @@
Subproject commit a437fe6d8efef17c8ad33d39f5815032e7adf5d7 Subproject commit a14f5c03834a79fc401626a4dad7a58a2da0c445

2
third_party/cxxopts vendored

@ -1 +1 @@
Subproject commit b2b8cf2f50a449720874f43445e23d75b77dcc43 Subproject commit ec1cafb9f9ed15306a8f1a16f736b734cca4ae54