Use much faster exp2/cos approximations in ffmpeg; large decrease in CPU usage on the decoder thread on my machine (sketched below)

properly byteswap r13 for spinlock
Add PPCOpcodeBits
Stub out broken FPSCR updating in ppc_hir_builder; right now it's just code that repeatedly does nothing.
add note about 0 opcode bytes being executed to ppc_frontend
Add assert to check that function end is greater than function start; the opposite can happen with malformed functions
Disable prefetch and cachecontrol by default; automatic hardware prefetchers already do the job for the most part
Minor cleanup in simplification_pass: don't loop optimizations ourselves, let the pass manager do it for us
Add experimental "delay_via_maybeyield" cvar, which uses MaybeYield to "emulate" the db16cyc instruction (sketched below)
Add a much faster/simpler way of directly calling guest functions; we no longer have to do a byte-by-byte search through the generated code
Generate label string ids on the fly
Fix unused-function warnings for prefetch on clang; fix many other clang warnings
Eliminated the majority of CallNativeSafe uses by replacing them with naive generic code paths.
^ Vector rotate left, vector shift left, vector shift right, vector shift arithmetic right, and vector average are included
These naive paths are implemented as small loops that stash the two inputs to the stack and load their elements into GPRs from there (C-level sketch below). They are not particularly fast, but should be an order of magnitude faster than a CallNativeSafe to a host function, which would involve a call, stashing all volatile registers, an indirect call, potentially setting up a stack frame for the arrays the inputs get stashed to, the actual operations, a return, reloading all volatile registers, another return, etc.
Added the fast SHR_V128 path back in
Implement signed vector average byte and signed vector average word; previously we were emitting no code for them. Signed vector average byte appears in many games.
Fix bug with signed vector average 32: we were doing an unsigned shift, potentially turning negative values into large positive ones (worked example below)
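
A minimal sketch of the kind of fast exp2 approximation the first item refers to, assuming the usual split-exponent trick; this is a generic illustration only, not the actual ffmpeg change (which is in the diff suppressed below), and the polynomial coefficients are only roughly tuned:

#include <cmath>
#include <cstdint>
#include <cstring>

// 2^x = 2^floor(x) * 2^frac(x): approximate the fractional factor with a small
// polynomial, then build the integer power by writing the biased exponent
// directly into an IEEE-754 float.
static inline float fast_exp2f(float x) {
  x = x < -126.0f ? -126.0f : (x > 127.0f ? 127.0f : x);  // keep exponent valid
  float fi = std::floor(x);
  float f = x - fi;  // fractional part in [0, 1)
  // Roughly fitted cubic for 2^f on [0, 1); accurate to a few decimal digits.
  float p = 1.0f + f * (0.6951f + f * (0.2262f + f * 0.0783f));
  uint32_t bits = static_cast<uint32_t>(static_cast<int32_t>(fi) + 127) << 23;
  float scale;
  std::memcpy(&scale, &bits, sizeof(scale));  // scale == 2^floor(x)
  return scale * p;
}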
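
For the "delay_via_maybeyield" item: db16cyc is the short fixed-delay instruction guest code uses in spin-wait loops. A hedged sketch of the dispatch the cvar implies; DEFINE_bool/cvars and xe::threading::MaybeYield come from this codebase, while the handler name, the default value, and the _mm_pause fallback are assumptions for illustration:

#include <immintrin.h>  // _mm_pause

DEFINE_bool(delay_via_maybeyield, false,
            "Use MaybeYield to \"emulate\" the db16cyc instruction", "CPU");

// Hypothetical handler invoked when the JIT reaches a guest db16cyc.
void EmulateDb16Cyc() {
  if (cvars::delay_via_maybeyield) {
    // Let the OS scheduler run another thread instead of burning the
    // roughly-16 guest cycles the instruction asks for.
    xe::threading::MaybeYield();
  } else {
    // Illustrative default: a tiny pause hint in the spirit of the delay.
    _mm_pause();
  }
}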
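
For the naive generic code paths: this is the C-level equivalent of the removed EmulateVectorShl<uint8_t> helper, and essentially what the emitter now generates inline as a small x64 loop over the two stack-stashed inputs (shl byte [rsp + offset + rdx], cl), minus the CallNativeSafe round trip:

#include <cstddef>
#include <cstdint>

// Per-lane byte shift left; each lane's count comes from the matching lane of
// shamt and is masked to the lane width, as the old helper did.
void NaiveVectorShlU8(uint8_t value[16], const uint8_t shamt[16]) {
  for (size_t i = 0; i < 16; ++i) {
    value[i] = static_cast<uint8_t>(value[i] << (shamt[i] & 7));
  }
}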
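
For the signed vector average 32 fix: the average is (a + b + 1) >> 1 per lane, and the shift must be arithmetic. A small standalone example of how a logical (unsigned) shift corrupts a negative sum:

#include <cstdint>
#include <cstdio>

int main() {
  int64_t a = -10, b = -20;
  int64_t sum = a + b + 1;  // -29
  // Logical shift on the 32-bit pattern drops the sign bit: 0xFFFFFFE3 >> 1.
  uint32_t wrong = static_cast<uint32_t>(sum) >> 1;  // 2147483633
  // Arithmetic shift preserves it: floor(-29 / 2).
  int32_t right = static_cast<int32_t>(sum >> 1);  // -15
  std::printf("logical: %u, arithmetic: %d\n", wrong, right);
  return 0;
}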
This commit is contained in:
chss95cs@gmail.com 2022-10-30 08:48:58 -07:00
parent d8b7b3ecec
commit 550d1d0a7c
26 changed files with 881 additions and 547 deletions

View File

@ -170,8 +170,10 @@ CommandVar<T>::CommandVar(const char* name, T* default_value,
const char* description) const char* description)
: name_(name), : name_(name),
default_value_(*default_value), default_value_(*default_value),
description_(description), current_value_(default_value),
current_value_(default_value) {} commandline_value_(),
description_(description)
{}
template <class T> template <class T>
ConfigVar<T>::ConfigVar(const char* name, T* default_value, ConfigVar<T>::ConfigVar(const char* name, T* default_value,

View File

@ -59,7 +59,7 @@ static void XeCopy16384StreamingAVX(CacheLine* XE_RESTRICT to,
CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3); CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3); CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
#pragma loop(no_vector)
for (uint32_t i = 0; i < num_lines_for_8k; ++i) { for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
xe::swcache::CacheLine line0, line1, line2, line3; xe::swcache::CacheLine line0, line1, line2, line3;
@ -92,7 +92,6 @@ static void XeCopy16384Movdir64M(CacheLine* XE_RESTRICT to,
CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3); CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3); CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
#pragma loop(no_vector)
for (uint32_t i = 0; i < num_lines_for_8k; ++i) { for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
_movdir64b(dest1 + i, src1 + i); _movdir64b(dest1 + i, src1 + i);
_movdir64b(dest2 + i, src2 + i); _movdir64b(dest2 + i, src2 + i);

View File

@ -620,23 +620,23 @@ static void Prefetch(const void* addr) {
} }
template <> template <>
void Prefetch<PrefetchTag::Write>(const void* addr) { XE_MAYBE_UNUSED void Prefetch<PrefetchTag::Write>(const void* addr) {
PrefetchW(addr); PrefetchW(addr);
} }
template <> template <>
void Prefetch<PrefetchTag::Nontemporal>(const void* addr) { XE_MAYBE_UNUSED void Prefetch<PrefetchTag::Nontemporal>(const void* addr) {
PrefetchNTA(addr); PrefetchNTA(addr);
} }
template <> template <>
void Prefetch<PrefetchTag::Level3>(const void* addr) { XE_MAYBE_UNUSED void Prefetch<PrefetchTag::Level3>(const void* addr) {
PrefetchL3(addr); PrefetchL3(addr);
} }
template <> template <>
void Prefetch<PrefetchTag::Level2>(const void* addr) { XE_MAYBE_UNUSED void Prefetch<PrefetchTag::Level2>(const void* addr) {
PrefetchL2(addr); PrefetchL2(addr);
} }
template <> template <>
void Prefetch<PrefetchTag::Level1>(const void* addr) { XE_MAYBE_UNUSED void Prefetch<PrefetchTag::Level1>(const void* addr) {
PrefetchL1(addr); PrefetchL1(addr);
} }
// todo: does aarch64 have streaming stores/loads? // todo: does aarch64 have streaming stores/loads?

View File

@ -25,6 +25,7 @@ namespace xe {
*/ */
class alignas(4096) xe_global_mutex { class alignas(4096) xe_global_mutex {
XE_MAYBE_UNUSED
char detail[64]; char detail[64];
public: public:
@ -38,6 +39,7 @@ class alignas(4096) xe_global_mutex {
using global_mutex_type = xe_global_mutex; using global_mutex_type = xe_global_mutex;
class alignas(64) xe_fast_mutex { class alignas(64) xe_fast_mutex {
XE_MAYBE_UNUSED
char detail[64]; char detail[64];
public: public:
@ -62,8 +64,6 @@ class xe_unlikely_mutex {
~xe_unlikely_mutex() { mut = 0; } ~xe_unlikely_mutex() { mut = 0; }
void lock() { void lock() {
uint32_t lock_expected = 0;
if (XE_LIKELY(_tryget())) { if (XE_LIKELY(_tryget())) {
return; return;
} else { } else {

View File

@ -144,9 +144,11 @@
#define XE_MSVC_OPTIMIZE_SMALL() #define XE_MSVC_OPTIMIZE_SMALL()
#define XE_MSVC_OPTIMIZE_REVERT() #define XE_MSVC_OPTIMIZE_REVERT()
#endif #endif
#if XE_COMPILER_HAS_GNU_EXTENSIONS == 1 #if XE_COMPILER_HAS_GNU_EXTENSIONS == 1
#define XE_LIKELY_IF(...) if (XE_LIKELY(__VA_ARGS__)) #define XE_LIKELY_IF(...) if (XE_LIKELY(__VA_ARGS__))
#define XE_UNLIKELY_IF(...) if (XE_UNLIKELY(__VA_ARGS__)) #define XE_UNLIKELY_IF(...) if (XE_UNLIKELY(__VA_ARGS__))
#define XE_MAYBE_UNUSED __attribute__((unused))
#else #else
#if __cplusplus >= 202002 #if __cplusplus >= 202002
#define XE_LIKELY_IF(...) if (!!(__VA_ARGS__)) [[likely]] #define XE_LIKELY_IF(...) if (!!(__VA_ARGS__)) [[likely]]
@ -155,6 +157,7 @@
#define XE_LIKELY_IF(...) if (!!(__VA_ARGS__)) #define XE_LIKELY_IF(...) if (!!(__VA_ARGS__))
#define XE_UNLIKELY_IF(...) if (!!(__VA_ARGS__)) #define XE_UNLIKELY_IF(...) if (!!(__VA_ARGS__))
#endif #endif
#define XE_MAYBE_UNUSED
#endif #endif
// only use __restrict if MSVC, for clang/gcc we can use -fstrict-aliasing which // only use __restrict if MSVC, for clang/gcc we can use -fstrict-aliasing which
// acts as __restrict across the board todo: __restrict is part of the type // acts as __restrict across the board todo: __restrict is part of the type

View File

@ -78,7 +78,9 @@ size_t RingBuffer::Read(uint8_t* buffer, size_t _count) {
if (read_offset_ < write_offset_) { if (read_offset_ < write_offset_) {
assert_true(read_offset_ + count <= write_offset_); assert_true(read_offset_ + count <= write_offset_);
} else if (read_offset_ + count >= capacity_) { } else if (read_offset_ + count >= capacity_) {
XE_MAYBE_UNUSED
ring_size_t left_half = capacity_ - read_offset_; ring_size_t left_half = capacity_ - read_offset_;
assert_true(count - left_half <= write_offset_); assert_true(count - left_half <= write_offset_);
} }
@ -107,6 +109,7 @@ size_t RingBuffer::Write(const uint8_t* buffer, size_t _count) {
if (write_offset_ < read_offset_) { if (write_offset_ < read_offset_) {
assert_true(write_offset_ + count <= read_offset_); assert_true(write_offset_ + count <= read_offset_);
} else if (write_offset_ + count >= capacity_) { } else if (write_offset_ + count >= capacity_) {
XE_MAYBE_UNUSED
size_t left_half = capacity_ - write_offset_; size_t left_half = capacity_ - write_offset_;
assert_true(count - left_half <= read_offset_); assert_true(count - left_half <= read_offset_);
} }

View File

@ -68,7 +68,6 @@ class RingBuffer {
ring_size_t offset_delta = write_offs - read_offs; ring_size_t offset_delta = write_offs - read_offs;
ring_size_t wrap_read_count = (cap - read_offs) + write_offs; ring_size_t wrap_read_count = (cap - read_offs) + write_offs;
ring_size_t comparison_value = read_offs <= write_offs;
if (XE_LIKELY(read_offs <= write_offs)) { if (XE_LIKELY(read_offs <= write_offs)) {
return offset_delta; // will be 0 if they are equal, semantically return offset_delta; // will be 0 if they are equal, semantically

View File

@ -117,7 +117,7 @@ void set_name(const std::string_view name) {
// checked ntoskrnl, it does not modify delay, so we can place this as a // checked ntoskrnl, it does not modify delay, so we can place this as a
// constant and avoid creating a stack variable // constant and avoid creating a stack variable
static const LARGE_INTEGER sleepdelay0_for_maybeyield{0LL}; static const LARGE_INTEGER sleepdelay0_for_maybeyield{{0LL}};
void MaybeYield() { void MaybeYield() {
#if 0 #if 0
@ -314,7 +314,8 @@ class Win32Event : public Win32Handle<Event> {
} }
#endif #endif
EventInfo Query() { EventInfo result{}; EventInfo Query() override {
EventInfo result{};
NtQueryEventPointer.invoke(handle_, 0, &result, sizeof(EventInfo), nullptr); NtQueryEventPointer.invoke(handle_, 0, &result, sizeof(EventInfo), nullptr);
return result; return result;
} }
@ -429,7 +430,7 @@ class Win32Timer : public Win32Handle<Timer> {
} }
bool SetRepeatingAt(GClock_::time_point due_time, bool SetRepeatingAt(GClock_::time_point due_time,
std::chrono::milliseconds period, std::chrono::milliseconds period,
std::function<void()> opt_callback = nullptr) { std::function<void()> opt_callback = nullptr) override {
return SetRepeatingAt(date::clock_cast<WClock_>(due_time), period, return SetRepeatingAt(date::clock_cast<WClock_>(due_time), period,
std::move(opt_callback)); std::move(opt_callback));
} }

View File

@ -46,10 +46,6 @@ DEFINE_bool(ignore_undefined_externs, true,
DEFINE_bool(emit_source_annotations, false, DEFINE_bool(emit_source_annotations, false,
"Add extra movs and nops to make disassembly easier to read.", "Add extra movs and nops to make disassembly easier to read.",
"CPU"); "CPU");
DEFINE_bool(resolve_rel32_guest_calls, true,
"Experimental optimization, directly call already resolved "
"functions via x86 rel32 call/jmp",
"CPU");
DEFINE_bool(enable_incorrect_roundingmode_behavior, false, DEFINE_bool(enable_incorrect_roundingmode_behavior, false,
"Disables the FPU/VMX MXCSR sharing workaround, potentially " "Disables the FPU/VMX MXCSR sharing workaround, potentially "
@ -78,7 +74,6 @@ using namespace xe::literals;
static const size_t kMaxCodeSize = 1_MiB; static const size_t kMaxCodeSize = 1_MiB;
static const size_t kStashOffset = 32;
// static const size_t kStashOffsetHigh = 32 + 32; // static const size_t kStashOffsetHigh = 32 + 32;
const uint32_t X64Emitter::gpr_reg_map_[X64Emitter::GPR_COUNT] = { const uint32_t X64Emitter::gpr_reg_map_[X64Emitter::GPR_COUNT] = {
@ -141,55 +136,6 @@ bool X64Emitter::Emit(GuestFunction* function, HIRBuilder* builder,
return true; return true;
} }
#pragma pack(push, 1)
struct RGCEmitted {
uint8_t ff_;
uint32_t rgcid_;
};
#pragma pack(pop)
#if 0
void X64Emitter::InjectCallAddresses(void* new_execute_address) {
for (auto&& callsite : call_sites_) {
RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) {
hunter =
reinterpret_cast<RGCEmitted*>(reinterpret_cast<char*>(hunter) + 1);
}
hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8;
hunter->rgcid_ =
static_cast<uint32_t>(static_cast<intptr_t>(callsite.destination_) -
reinterpret_cast<intptr_t>(hunter + 1));
}
}
#else
void X64Emitter::InjectCallAddresses(void* new_execute_address) {
#if 0
RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
std::map<uint32_t, ResolvableGuestCall*> id_to_rgc{};
for (auto&& callsite : call_sites_) {
id_to_rgc[callsite.offset_] = &callsite;
}
#else
RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
for (auto&& callsite : call_sites_) {
while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) {
hunter =
reinterpret_cast<RGCEmitted*>(reinterpret_cast<char*>(hunter) + 1);
}
hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8;
hunter->rgcid_ =
static_cast<uint32_t>(static_cast<intptr_t>(callsite.destination_) -
reinterpret_cast<intptr_t>(hunter + 1));
}
#endif
}
#endif
void* X64Emitter::Emplace(const EmitFunctionInfo& func_info, void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
GuestFunction* function) { GuestFunction* function) {
// To avoid changing xbyak, we do a switcharoo here. // To avoid changing xbyak, we do a switcharoo here.
@ -207,10 +153,6 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
if (function) { if (function) {
code_cache_->PlaceGuestCode(function->address(), top_, func_info, function, code_cache_->PlaceGuestCode(function->address(), top_, func_info, function,
new_execute_address, new_write_address); new_execute_address, new_write_address);
if (cvars::resolve_rel32_guest_calls) {
InjectCallAddresses(new_execute_address);
}
} else { } else {
code_cache_->PlaceHostCode(0, top_, func_info, new_execute_address, code_cache_->PlaceHostCode(0, top_, func_info, new_execute_address,
new_write_address); new_write_address);
@ -219,7 +161,6 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
ready(); ready();
top_ = old_address; top_ = old_address;
reset(); reset();
call_sites_.clear();
tail_code_.clear(); tail_code_.clear();
for (auto&& cached_label : label_cache_) { for (auto&& cached_label : label_cache_) {
delete cached_label; delete cached_label;
@ -336,7 +277,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
// Mark block labels. // Mark block labels.
auto label = block->label_head; auto label = block->label_head;
while (label) { while (label) {
L(label->name); L(std::to_string(label->id));
label = label->next; label = label->next;
} }
@ -418,7 +359,6 @@ void X64Emitter::EmitProfilerEpilogue() {
// actually... lets just try without atomics lol // actually... lets just try without atomics lol
// lock(); // lock();
add(qword[r10], rdx); add(qword[r10], rdx);
} }
#endif #endif
} }
@ -534,44 +474,23 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
auto fn = static_cast<X64Function*>(function); auto fn = static_cast<X64Function*>(function);
// Resolve address to the function to call and store in rax. // Resolve address to the function to call and store in rax.
if (cvars::resolve_rel32_guest_calls && fn->machine_code()) { if (fn->machine_code()) {
ResolvableGuestCall rgc;
rgc.destination_ = uint32_t(uint64_t(fn->machine_code()));
rgc.offset_ = current_rgc_id_;
current_rgc_id_++;
if (!(instr->flags & hir::CALL_TAIL)) { if (!(instr->flags & hir::CALL_TAIL)) {
mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]); mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
db(0xFF); call((void*)fn->machine_code());
rgc.is_jump_ = false;
dd(rgc.offset_);
} else { } else {
// tail call // tail call
EmitTraceUserCallReturn(); EmitTraceUserCallReturn();
EmitProfilerEpilogue();
rgc.is_jump_ = true;
// Pass the callers return address over. // Pass the callers return address over.
mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]); mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
add(rsp, static_cast<uint32_t>(stack_size())); add(rsp, static_cast<uint32_t>(stack_size()));
db(0xFF); jmp((void*)fn->machine_code(), T_NEAR);
dd(rgc.offset_);
} }
call_sites_.push_back(rgc);
return; return;
}
if (fn->machine_code()) {
// TODO(benvanik): is it worth it to do this? It removes the need for
// a ResolveFunction call, but makes the table less useful.
assert_zero(uint64_t(fn->machine_code()) & 0xFFFFFFFF00000000);
// todo: this should be changed so that we can actually do a call to
// fn->machine_code. the code will be emitted near us, so 32 bit rel jmp
// should be possible
mov(eax, uint32_t(uint64_t(fn->machine_code())));
} else if (code_cache_->has_indirection_table()) { } else if (code_cache_->has_indirection_table()) {
// Load the pointer to the indirection table maintained in X64CodeCache. // Load the pointer to the indirection table maintained in X64CodeCache.
// The target dword will either contain the address of the generated code // The target dword will either contain the address of the generated code
@ -1017,7 +936,10 @@ static const vec128_t xmm_consts[] = {
/*XMMSTVLShuffle*/ /*XMMSTVLShuffle*/
v128_setr_bytes(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), v128_setr_bytes(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
/* XMMSTVRSwapMask*/ /* XMMSTVRSwapMask*/
vec128b((uint8_t)0x83)}; vec128b((uint8_t)0x83), /*XMMVSRShlByteshuf*/
v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80),
// XMMVSRMask
vec128b(1)};
void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) { void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
for (auto& vec : xmm_consts) { for (auto& vec : xmm_consts) {

View File

@ -172,7 +172,9 @@ enum XmmConst {
XMMLVLShuffle, XMMLVLShuffle,
XMMLVRCmp16, XMMLVRCmp16,
XMMSTVLShuffle, XMMSTVLShuffle,
XMMSTVRSwapMask // swapwordmask with bit 7 set XMMSTVRSwapMask, // swapwordmask with bit 7 set
XMMVSRShlByteshuf,
XMMVSRMask
}; };
using amdfx::xopcompare_e; using amdfx::xopcompare_e;
@ -190,13 +192,6 @@ class XbyakAllocator : public Xbyak::Allocator {
virtual bool useProtect() const { return false; } virtual bool useProtect() const { return false; }
}; };
class ResolvableGuestCall {
public:
bool is_jump_;
uintptr_t destination_;
// rgcid
unsigned offset_;
};
class X64Emitter; class X64Emitter;
using TailEmitCallback = std::function<void(X64Emitter& e, Xbyak::Label& lbl)>; using TailEmitCallback = std::function<void(X64Emitter& e, Xbyak::Label& lbl)>;
struct TailEmitter { struct TailEmitter {
@ -220,7 +215,6 @@ class X64Emitter : public Xbyak::CodeGenerator {
uint32_t debug_info_flags, FunctionDebugInfo* debug_info, uint32_t debug_info_flags, FunctionDebugInfo* debug_info,
void** out_code_address, size_t* out_code_size, void** out_code_address, size_t* out_code_size,
std::vector<SourceMapEntry>* out_source_map); std::vector<SourceMapEntry>* out_source_map);
void InjectCallAddresses(void* new_execute_addr);
public: public:
// Reserved: rsp, rsi, rdi // Reserved: rsp, rsi, rdi
@ -230,7 +224,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
// xmm4-xmm15 (save to get xmm3) // xmm4-xmm15 (save to get xmm3)
static const int GPR_COUNT = 7; static const int GPR_COUNT = 7;
static const int XMM_COUNT = 12; static const int XMM_COUNT = 12;
static constexpr size_t kStashOffset = 32;
static void SetupReg(const hir::Value* v, Xbyak::Reg8& r) { static void SetupReg(const hir::Value* v, Xbyak::Reg8& r) {
auto idx = gpr_reg_map_[v->reg.index]; auto idx = gpr_reg_map_[v->reg.index];
r = Xbyak::Reg8(idx); r = Xbyak::Reg8(idx);
@ -410,8 +404,6 @@ class X64Emitter : public Xbyak::CodeGenerator {
static const uint32_t gpr_reg_map_[GPR_COUNT]; static const uint32_t gpr_reg_map_[GPR_COUNT];
static const uint32_t xmm_reg_map_[XMM_COUNT]; static const uint32_t xmm_reg_map_[XMM_COUNT];
uint32_t current_rgc_id_ = 0xEEDDF00F;
std::vector<ResolvableGuestCall> call_sites_;
/* /*
set to true if the low 32 bits of membase == 0. set to true if the low 32 bits of membase == 0.
only really advantageous if you are storing 32 bit 0 to a displaced address, only really advantageous if you are storing 32 bit 0 to a displaced address,

View File

@ -25,46 +25,46 @@ static void EmitFusedBranch(X64Emitter& e, const T& i) {
bool valid = i.instr->prev && i.instr->prev->dest == i.src1.value; bool valid = i.instr->prev && i.instr->prev->dest == i.src1.value;
auto opcode = valid ? i.instr->prev->opcode->num : -1; auto opcode = valid ? i.instr->prev->opcode->num : -1;
if (valid) { if (valid) {
auto name = i.src2.value->name; std::string name = i.src2.value->GetIdString();
switch (opcode) { switch (opcode) {
case OPCODE_COMPARE_EQ: case OPCODE_COMPARE_EQ:
e.je(name, e.T_NEAR); e.je(std::move(name), e.T_NEAR);
break; break;
case OPCODE_COMPARE_NE: case OPCODE_COMPARE_NE:
e.jne(name, e.T_NEAR); e.jne(std::move(name), e.T_NEAR);
break; break;
case OPCODE_COMPARE_SLT: case OPCODE_COMPARE_SLT:
e.jl(name, e.T_NEAR); e.jl(std::move(name), e.T_NEAR);
break; break;
case OPCODE_COMPARE_SLE: case OPCODE_COMPARE_SLE:
e.jle(name, e.T_NEAR); e.jle(std::move(name), e.T_NEAR);
break; break;
case OPCODE_COMPARE_SGT: case OPCODE_COMPARE_SGT:
e.jg(name, e.T_NEAR); e.jg(std::move(name), e.T_NEAR);
break; break;
case OPCODE_COMPARE_SGE: case OPCODE_COMPARE_SGE:
e.jge(name, e.T_NEAR); e.jge(std::move(name), e.T_NEAR);
break; break;
case OPCODE_COMPARE_ULT: case OPCODE_COMPARE_ULT:
e.jb(name, e.T_NEAR); e.jb(std::move(name), e.T_NEAR);
break; break;
case OPCODE_COMPARE_ULE: case OPCODE_COMPARE_ULE:
e.jbe(name, e.T_NEAR); e.jbe(std::move(name), e.T_NEAR);
break; break;
case OPCODE_COMPARE_UGT: case OPCODE_COMPARE_UGT:
e.ja(name, e.T_NEAR); e.ja(std::move(name), e.T_NEAR);
break; break;
case OPCODE_COMPARE_UGE: case OPCODE_COMPARE_UGE:
e.jae(name, e.T_NEAR); e.jae(std::move(name), e.T_NEAR);
break; break;
default: default:
e.test(i.src1, i.src1); e.test(i.src1, i.src1);
e.jnz(name, e.T_NEAR); e.jnz(std::move(name), e.T_NEAR);
break; break;
} }
} else { } else {
e.test(i.src1, i.src1); e.test(i.src1, i.src1);
e.jnz(i.src2.value->name, e.T_NEAR); e.jnz(i.src2.value->GetIdString(), e.T_NEAR);
} }
} }
// ============================================================================ // ============================================================================
@ -490,7 +490,7 @@ EMITTER_OPCODE_TABLE(OPCODE_SET_RETURN_ADDRESS, SET_RETURN_ADDRESS);
// ============================================================================ // ============================================================================
struct BRANCH : Sequence<BRANCH, I<OPCODE_BRANCH, VoidOp, LabelOp>> { struct BRANCH : Sequence<BRANCH, I<OPCODE_BRANCH, VoidOp, LabelOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.jmp(i.src1.value->name, e.T_NEAR); e.jmp(i.src1.value->GetIdString(), e.T_NEAR);
} }
}; };
EMITTER_OPCODE_TABLE(OPCODE_BRANCH, BRANCH); EMITTER_OPCODE_TABLE(OPCODE_BRANCH, BRANCH);
@ -534,7 +534,7 @@ struct BRANCH_TRUE_F32
Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
e.vmovd(e.eax, input); e.vmovd(e.eax, input);
e.test(e.eax, e.eax); e.test(e.eax, e.eax);
e.jnz(i.src2.value->name, e.T_NEAR); e.jnz(i.src2.value->GetIdString(), e.T_NEAR);
} }
}; };
struct BRANCH_TRUE_F64 struct BRANCH_TRUE_F64
@ -543,7 +543,7 @@ struct BRANCH_TRUE_F64
Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
e.vmovq(e.rax, input); e.vmovq(e.rax, input);
e.test(e.rax, e.rax); e.test(e.rax, e.rax);
e.jnz(i.src2.value->name, e.T_NEAR); e.jnz(i.src2.value->GetIdString(), e.T_NEAR);
} }
}; };
EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE, BRANCH_TRUE_I8, BRANCH_TRUE_I16, EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE, BRANCH_TRUE_I8, BRANCH_TRUE_I16,
@ -557,7 +557,7 @@ struct BRANCH_FALSE_I8
: Sequence<BRANCH_FALSE_I8, I<OPCODE_BRANCH_FALSE, VoidOp, I8Op, LabelOp>> { : Sequence<BRANCH_FALSE_I8, I<OPCODE_BRANCH_FALSE, VoidOp, I8Op, LabelOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.test(i.src1, i.src1); e.test(i.src1, i.src1);
e.jz(i.src2.value->name, e.T_NEAR); e.jz(i.src2.value->GetIdString(), e.T_NEAR);
} }
}; };
struct BRANCH_FALSE_I16 struct BRANCH_FALSE_I16
@ -565,7 +565,7 @@ struct BRANCH_FALSE_I16
I<OPCODE_BRANCH_FALSE, VoidOp, I16Op, LabelOp>> { I<OPCODE_BRANCH_FALSE, VoidOp, I16Op, LabelOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.test(i.src1, i.src1); e.test(i.src1, i.src1);
e.jz(i.src2.value->name, e.T_NEAR); e.jz(i.src2.value->GetIdString(), e.T_NEAR);
} }
}; };
struct BRANCH_FALSE_I32 struct BRANCH_FALSE_I32
@ -573,7 +573,7 @@ struct BRANCH_FALSE_I32
I<OPCODE_BRANCH_FALSE, VoidOp, I32Op, LabelOp>> { I<OPCODE_BRANCH_FALSE, VoidOp, I32Op, LabelOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.test(i.src1, i.src1); e.test(i.src1, i.src1);
e.jz(i.src2.value->name, e.T_NEAR); e.jz(i.src2.value->GetIdString(), e.T_NEAR);
} }
}; };
struct BRANCH_FALSE_I64 struct BRANCH_FALSE_I64
@ -581,7 +581,7 @@ struct BRANCH_FALSE_I64
I<OPCODE_BRANCH_FALSE, VoidOp, I64Op, LabelOp>> { I<OPCODE_BRANCH_FALSE, VoidOp, I64Op, LabelOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.test(i.src1, i.src1); e.test(i.src1, i.src1);
e.jz(i.src2.value->name, e.T_NEAR); e.jz(i.src2.value->GetIdString(), e.T_NEAR);
} }
}; };
struct BRANCH_FALSE_F32 struct BRANCH_FALSE_F32
@ -591,7 +591,7 @@ struct BRANCH_FALSE_F32
Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
e.vmovd(e.eax, input); e.vmovd(e.eax, input);
e.test(e.eax, e.eax); e.test(e.eax, e.eax);
e.jz(i.src2.value->name, e.T_NEAR); e.jz(i.src2.value->GetIdString(), e.T_NEAR);
} }
}; };
struct BRANCH_FALSE_F64 struct BRANCH_FALSE_F64
@ -601,7 +601,7 @@ struct BRANCH_FALSE_F64
Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
e.vmovq(e.rax, input); e.vmovq(e.rax, input);
e.test(e.rax, e.rax); e.test(e.rax, e.rax);
e.jz(i.src2.value->name, e.T_NEAR); e.jz(i.src2.value->GetIdString(), e.T_NEAR);
} }
}; };
EMITTER_OPCODE_TABLE(OPCODE_BRANCH_FALSE, BRANCH_FALSE_I8, BRANCH_FALSE_I16, EMITTER_OPCODE_TABLE(OPCODE_BRANCH_FALSE, BRANCH_FALSE_I8, BRANCH_FALSE_I16,

View File

@ -805,22 +805,7 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SUB, VECTOR_SUB);
// ============================================================================ // ============================================================================
// OPCODE_VECTOR_SHL // OPCODE_VECTOR_SHL
// ============================================================================ // ============================================================================
template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
static __m128i EmulateVectorShl(void*, __m128i src1, __m128i src2) {
alignas(16) T value[16 / sizeof(T)];
alignas(16) T shamt[16 / sizeof(T)];
// Load SSE registers into a C array.
_mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
_mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
for (size_t i = 0; i < (16 / sizeof(T)); ++i) {
value[i] = value[i] << (shamt[i] & ((sizeof(T) * 8) - 1));
}
// Store result and return it.
return _mm_load_si128(reinterpret_cast<__m128i*>(value));
}
static XmmConst GetShiftmaskForType(unsigned typ) { static XmmConst GetShiftmaskForType(unsigned typ) {
if (typ == INT8_TYPE) { if (typ == INT8_TYPE) {
return XMMXOPByteShiftMask; return XMMXOPByteShiftMask;
@ -914,28 +899,14 @@ struct VECTOR_SHL_V128
} }
} }
if (all_same) { if (all_same) {
// mul by two e.vpmovzxbw(e.ymm0, i.src1);
/*if (seenvalue == 1) { e.vpsllw(e.ymm0, e.ymm0, seenvalue);
e.vpaddb(i.dest, i.src1, i.src1); e.vextracti128(e.xmm1, e.ymm0, 1);
} else if (seenvalue == 2) {
e.vpaddb(i.dest, i.src1, i.src1);
e.vpaddb(i.dest, i.dest, i.dest);
} else if (seenvalue == 3) {
// mul by 8
e.vpaddb(i.dest, i.src1, i.src1);
e.vpaddb(i.dest, i.dest, i.dest);
e.vpaddb(i.dest, i.dest, i.dest);
} else*/
{
e.vpmovzxbw(e.ymm0, i.src1);
e.vpsllw(e.ymm0, e.ymm0, seenvalue);
e.vextracti128(e.xmm1, e.ymm0, 1);
e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMShortsToBytes)); e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMShortsToBytes));
e.vpshufb(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMShortsToBytes)); e.vpshufb(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMShortsToBytes));
e.vpunpcklqdq(i.dest, e.xmm0, e.xmm1); e.vpunpcklqdq(i.dest, e.xmm0, e.xmm1);
return; return;
}
} else { } else {
e.LoadConstantXmm(e.xmm2, constmask); e.LoadConstantXmm(e.xmm2, constmask);
@ -966,14 +937,41 @@ struct VECTOR_SHL_V128
} }
} }
} }
if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
if (i.src1.is_constant) {
e.StashConstantXmm(0, i.src1.constant());
stack_offset_src1 = X64Emitter::kStashOffset;
} else { } else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); if (i.src2.is_constant) {
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint8_t>)); e.StashConstantXmm(1, i.src2.constant());
e.vmovaps(i.dest, e.xmm0); stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else {
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
}
Xbyak::Label looper;
e.xor_(e.edx, e.edx);
e.L(looper);
e.movzx(e.ecx, e.byte[e.rsp + stack_offset_src2 + e.rdx]);
e.shl(e.byte[e.rsp + stack_offset_src1 + e.rdx], e.cl);
if (e.IsFeatureEnabled(kX64FlagsIndependentVars)) {
e.inc(e.edx);
} else {
e.add(e.edx, 1);
}
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
} }
static void EmitInt16(X64Emitter& e, const EmitArgType& i) { static void EmitInt16(X64Emitter& e, const EmitArgType& i) {
Xmm src1; Xmm src1;
@ -1022,14 +1020,32 @@ struct VECTOR_SHL_V128
// TODO(benvanik): native version (with shift magic). // TODO(benvanik): native version (with shift magic).
e.L(emu); e.L(emu);
unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], src1);
if (i.src2.is_constant) { if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); e.StashConstantXmm(1, i.src2.constant());
stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else { } else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, src1));
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint16_t>)); Xbyak::Label looper;
e.vmovaps(i.dest, e.xmm0);
e.xor_(e.edx, e.edx);
e.L(looper);
e.movzx(e.ecx, e.word[e.rsp + stack_offset_src2 + e.rdx]);
e.shl(e.word[e.rsp + stack_offset_src1 + e.rdx], e.cl);
e.add(e.edx, 2);
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
e.L(end); e.L(end);
} }
@ -1098,14 +1114,32 @@ struct VECTOR_SHL_V128
// TODO(benvanik): native version (with shift magic). // TODO(benvanik): native version (with shift magic).
e.L(emu); e.L(emu);
unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], src1);
if (i.src2.is_constant) { if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); e.StashConstantXmm(1, i.src2.constant());
stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else { } else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, src1));
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint32_t>)); Xbyak::Label looper;
e.vmovaps(i.dest, e.xmm0);
e.xor_(e.edx, e.edx);
e.L(looper);
e.mov(e.ecx, e.dword[e.rsp + stack_offset_src2 + e.rdx]);
e.shl(e.dword[e.rsp + stack_offset_src1 + e.rdx], e.cl);
e.add(e.edx, 4);
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
e.L(end); e.L(end);
} }
@ -1116,22 +1150,6 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHL, VECTOR_SHL_V128);
// ============================================================================ // ============================================================================
// OPCODE_VECTOR_SHR // OPCODE_VECTOR_SHR
// ============================================================================ // ============================================================================
template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
static __m128i EmulateVectorShr(void*, __m128i src1, __m128i src2) {
alignas(16) T value[16 / sizeof(T)];
alignas(16) T shamt[16 / sizeof(T)];
// Load SSE registers into a C array.
_mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
_mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
for (size_t i = 0; i < (16 / sizeof(T)); ++i) {
value[i] = value[i] >> (shamt[i] & ((sizeof(T) * 8) - 1));
}
// Store result and return it.
return _mm_load_si128(reinterpret_cast<__m128i*>(value));
}
struct VECTOR_SHR_V128 struct VECTOR_SHR_V128
: Sequence<VECTOR_SHR_V128, I<OPCODE_VECTOR_SHR, V128Op, V128Op, V128Op>> { : Sequence<VECTOR_SHR_V128, I<OPCODE_VECTOR_SHR, V128Op, V128Op, V128Op>> {
@ -1179,34 +1197,63 @@ struct VECTOR_SHR_V128
} }
static void EmitInt8(X64Emitter& e, const EmitArgType& i) { static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
// TODO(benvanik): native version (with shift magic). if (i.src2.is_constant && e.IsFeatureEnabled(kX64EmitGFNI)) {
if (i.src2.is_constant) { const auto& shamt = i.src2.constant();
if (e.IsFeatureEnabled(kX64EmitGFNI)) { bool all_same = true;
const auto& shamt = i.src2.constant(); for (size_t n = 0; n < 16 - n; ++n) {
bool all_same = true; if (shamt.u8[n] != shamt.u8[n + 1]) {
for (size_t n = 0; n < 16 - n; ++n) { all_same = false;
if (shamt.u8[n] != shamt.u8[n + 1]) { break;
all_same = false;
break;
}
}
if (all_same) {
// Every count is the same, so we can use gf2p8affineqb.
const uint8_t shift_amount = shamt.u8[0] & 0b111;
const uint64_t shift_matrix = UINT64_C(0x0102040810204080)
<< (shift_amount * 8);
e.vgf2p8affineqb(i.dest, i.src1,
e.StashConstantXmm(0, vec128q(shift_matrix)), 0);
return;
} }
} }
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); if (all_same) {
} else { // Every count is the same, so we can use gf2p8affineqb.
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); const uint8_t shift_amount = shamt.u8[0] & 0b111;
const uint64_t shift_matrix = UINT64_C(0x0102040810204080)
<< (shift_amount * 8);
e.vgf2p8affineqb(i.dest, i.src1,
e.StashConstantXmm(0, vec128q(shift_matrix)), 0);
return;
}
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint8_t>)); unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
e.vmovaps(i.dest, e.xmm0);
if (i.src1.is_constant) {
e.StashConstantXmm(0, i.src1.constant());
stack_offset_src1 = X64Emitter::kStashOffset;
} else {
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
}
if (i.src2.is_constant) {
e.StashConstantXmm(1, i.src2.constant());
stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else {
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
}
Xbyak::Label looper;
e.xor_(e.edx, e.edx);
e.L(looper);
// movzx is to eliminate any possible dep on previous value of rcx at start
// of loop
e.movzx(e.ecx, e.byte[e.rsp + stack_offset_src2 + e.rdx]);
// maybe using a memory operand as the left side isn't the best idea lol,
// still better than callnativesafe though agners docs have no timing info
// on shx [m], cl so shrug
e.shr(e.byte[e.rsp + stack_offset_src1 + e.rdx], e.cl);
if (e.IsFeatureEnabled(kX64FlagsIndependentVars)) {
e.inc(e.edx);
} else {
e.add(e.edx, 1);
}
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
} }
static void EmitInt16(X64Emitter& e, const EmitArgType& i) { static void EmitInt16(X64Emitter& e, const EmitArgType& i) {
@ -1248,14 +1295,38 @@ struct VECTOR_SHR_V128
// TODO(benvanik): native version (with shift magic). // TODO(benvanik): native version (with shift magic).
e.L(emu); e.L(emu);
if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
if (i.src1.is_constant) {
e.StashConstantXmm(0, i.src1.constant());
stack_offset_src1 = X64Emitter::kStashOffset;
} else { } else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint16_t>)); if (i.src2.is_constant) {
e.vmovaps(i.dest, e.xmm0); e.StashConstantXmm(1, i.src2.constant());
stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else {
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
}
Xbyak::Label looper;
e.xor_(e.edx, e.edx);
e.L(looper);
e.movzx(e.ecx, e.word[e.rsp + stack_offset_src2 + e.rdx]);
e.shr(e.word[e.rsp + stack_offset_src1 + e.rdx], e.cl);
e.add(e.edx, 2);
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
e.L(end); e.L(end);
} }
@ -1324,14 +1395,37 @@ struct VECTOR_SHR_V128
// TODO(benvanik): native version. // TODO(benvanik): native version.
e.L(emu); e.L(emu);
if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
if (i.src1.is_constant) {
e.StashConstantXmm(0, i.src1.constant());
stack_offset_src1 = X64Emitter::kStashOffset;
} else { } else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, src1));
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint32_t>)); if (i.src2.is_constant) {
e.vmovaps(i.dest, e.xmm0); e.StashConstantXmm(1, i.src2.constant());
stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else {
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
}
Xbyak::Label looper;
e.xor_(e.edx, e.edx);
e.L(looper);
e.mov(e.ecx, e.dword[e.rsp + stack_offset_src2 + e.rdx]);
e.shr(e.dword[e.rsp + stack_offset_src1 + e.rdx], e.cl);
e.add(e.edx, 4);
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
e.L(end); e.L(end);
} }
@ -1388,7 +1482,8 @@ struct VECTOR_SHA_V128
} }
static void EmitInt8(X64Emitter& e, const EmitArgType& i) { static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
// TODO(benvanik): native version (with shift magic). unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
if (i.src2.is_constant) { if (i.src2.is_constant) {
const auto& shamt = i.src2.constant(); const auto& shamt = i.src2.constant();
bool all_same = true; bool all_same = true;
@ -1399,7 +1494,6 @@ struct VECTOR_SHA_V128
} }
} }
if (e.IsFeatureEnabled(kX64EmitGFNI)) { if (e.IsFeatureEnabled(kX64EmitGFNI)) {
if (all_same) { if (all_same) {
// Every count is the same, so we can use gf2p8affineqb. // Every count is the same, so we can use gf2p8affineqb.
@ -1412,8 +1506,7 @@ struct VECTOR_SHA_V128
e.StashConstantXmm(0, vec128q(shift_matrix)), 0); e.StashConstantXmm(0, vec128q(shift_matrix)), 0);
return; return;
} }
} } else if (all_same) {
else if (all_same) {
Xmm to_be_shifted = GetInputRegOrConstant(e, i.src1, e.xmm1); Xmm to_be_shifted = GetInputRegOrConstant(e, i.src1, e.xmm1);
e.vpmovsxbw(e.xmm0, to_be_shifted); //_mm_srai_epi16 / psraw e.vpmovsxbw(e.xmm0, to_be_shifted); //_mm_srai_epi16 / psraw
@ -1425,14 +1518,41 @@ struct VECTOR_SHA_V128
return; return;
} }
e.StashConstantXmm(1, i.src2.constant());
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else { } else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int8_t>)); if (i.src1.is_constant) {
e.vmovaps(i.dest, e.xmm0); e.StashConstantXmm(0, i.src1.constant());
stack_offset_src1 = X64Emitter::kStashOffset;
} else {
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
}
Xbyak::Label looper;
e.xor_(e.edx, e.edx);
e.L(looper);
// movzx is to eliminate any possible dep on previous value of rcx at start
// of loop
e.movzx(e.ecx, e.byte[e.rsp + stack_offset_src2 + e.rdx]);
// maybe using a memory operand as the left side isn't the best idea lol,
// still better than callnativesafe though agners docs have no timing info
// on shx [m], cl so shrug
e.sar(e.byte[e.rsp + stack_offset_src1 + e.rdx], e.cl);
if (e.IsFeatureEnabled(kX64FlagsIndependentVars)) {
e.inc(e.edx);
} else {
e.add(e.edx, 1);
}
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
} }
static void EmitInt16(X64Emitter& e, const EmitArgType& i) { static void EmitInt16(X64Emitter& e, const EmitArgType& i) {
@ -1474,14 +1594,38 @@ struct VECTOR_SHA_V128
// TODO(benvanik): native version (with shift magic). // TODO(benvanik): native version (with shift magic).
e.L(emu); e.L(emu);
if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
if (i.src1.is_constant) {
e.StashConstantXmm(0, i.src1.constant());
stack_offset_src1 = X64Emitter::kStashOffset;
} else { } else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int16_t>)); if (i.src2.is_constant) {
e.vmovaps(i.dest, e.xmm0); e.StashConstantXmm(1, i.src2.constant());
stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else {
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
}
Xbyak::Label looper;
e.xor_(e.edx, e.edx);
e.L(looper);
e.movzx(e.ecx, e.word[e.rsp + stack_offset_src2 + e.rdx]);
e.sar(e.word[e.rsp + stack_offset_src1 + e.rdx], e.cl);
e.add(e.edx, 2);
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
e.L(end); e.L(end);
} }
@ -1508,9 +1652,9 @@ struct VECTOR_SHA_V128
// that happens so we mask. // that happens so we mask.
if (i.src2.is_constant) { if (i.src2.is_constant) {
e.LoadConstantXmm(e.xmm0, i.src2.constant()); e.LoadConstantXmm(e.xmm0, i.src2.constant());
e.vandps(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskPS)); e.vpand(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskPS));
} else { } else {
e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
} }
e.vpsravd(i.dest, i.src1, e.xmm0); e.vpsravd(i.dest, i.src1, e.xmm0);
} else { } else {
@ -1535,14 +1679,36 @@ struct VECTOR_SHA_V128
// TODO(benvanik): native version. // TODO(benvanik): native version.
e.L(emu); e.L(emu);
if (i.src2.is_constant) { unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant())); unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
if (i.src1.is_constant) {
e.StashConstantXmm(0, i.src1.constant());
stack_offset_src1 = X64Emitter::kStashOffset;
} else { } else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int32_t>)); if (i.src2.is_constant) {
e.vmovaps(i.dest, e.xmm0); e.StashConstantXmm(1, i.src2.constant());
stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else {
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
}
Xbyak::Label looper;
e.xor_(e.edx, e.edx);
e.L(looper);
e.mov(e.ecx, e.dword[e.rsp + stack_offset_src2 + e.rdx]);
e.sar(e.dword[e.rsp + stack_offset_src1 + e.rdx], e.cl);
e.add(e.edx, 4);
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
e.L(end); e.L(end);
} }
@ -1550,26 +1716,6 @@ struct VECTOR_SHA_V128
}; };
EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA, VECTOR_SHA_V128); EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA, VECTOR_SHA_V128);
// ============================================================================
// OPCODE_VECTOR_ROTATE_LEFT
// ============================================================================
template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
static __m128i EmulateVectorRotateLeft(void*, __m128i src1, __m128i src2) {
alignas(16) T value[16 / sizeof(T)];
alignas(16) T shamt[16 / sizeof(T)];
// Load SSE registers into a C array.
_mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
_mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
for (size_t i = 0; i < (16 / sizeof(T)); ++i) {
value[i] = xe::rotate_left<T>(value[i], shamt[i] & ((sizeof(T) * 8) - 1));
}
// Store result and return it.
return _mm_load_si128(reinterpret_cast<__m128i*>(value));
}
struct VECTOR_ROTATE_LEFT_V128 struct VECTOR_ROTATE_LEFT_V128
: Sequence<VECTOR_ROTATE_LEFT_V128, : Sequence<VECTOR_ROTATE_LEFT_V128,
I<OPCODE_VECTOR_ROTATE_LEFT, V128Op, V128Op, V128Op>> { I<OPCODE_VECTOR_ROTATE_LEFT, V128Op, V128Op, V128Op>> {
@ -1594,33 +1740,72 @@ struct VECTOR_ROTATE_LEFT_V128
} }
} else { } else {
unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
switch (i.instr->flags) { switch (i.instr->flags) {
case INT8_TYPE: case INT8_TYPE: {
// TODO(benvanik): native version (with shift magic). if (i.src1.is_constant) {
if (i.src2.is_constant) { e.StashConstantXmm(0, i.src1.constant());
e.lea(e.GetNativeParam(1), stack_offset_src1 = X64Emitter::kStashOffset;
e.StashConstantXmm(1, i.src2.constant()));
} else { } else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint8_t>));
e.vmovaps(i.dest, e.xmm0);
break;
case INT16_TYPE:
// TODO(benvanik): native version (with shift magic).
if (i.src2.is_constant) { if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant());
e.StashConstantXmm(1, i.src2.constant())); stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else { } else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe( Xbyak::Label rotate_iter;
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint16_t>));
e.vmovaps(i.dest, e.xmm0); e.xor_(e.edx, e.edx);
break;
e.L(rotate_iter);
e.movzx(e.ecx, e.byte[e.rsp + stack_offset_src2 + e.rdx]);
e.rol(e.byte[e.rsp + stack_offset_src1 + e.rdx], e.cl);
e.add(e.edx, 1);
e.cmp(e.edx, 16);
e.jnz(rotate_iter);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
} break;
case INT16_TYPE: {
if (i.src1.is_constant) {
e.StashConstantXmm(0, i.src1.constant());
stack_offset_src1 = X64Emitter::kStashOffset;
} else {
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
}
if (i.src2.is_constant) {
e.StashConstantXmm(1, i.src2.constant());
stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else {
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
}
Xbyak::Label rotate_iter;
e.xor_(e.edx, e.edx);
e.L(rotate_iter);
e.movzx(e.ecx, e.word[e.rsp + stack_offset_src2 + e.rdx]);
e.rol(e.word[e.rsp + stack_offset_src1 + e.rdx], e.cl);
e.add(e.edx, 2);
e.cmp(e.edx, 16);
e.jnz(rotate_iter);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
} break;
case INT32_TYPE: { case INT32_TYPE: {
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) { if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
e.vprolvd(i.dest, i.src1, i.src2); e.vprolvd(i.dest, i.src1, i.src2);
@ -1638,23 +1823,40 @@ struct VECTOR_ROTATE_LEFT_V128
} }
e.vpsllvd(e.xmm1, i.src1, e.xmm0); e.vpsllvd(e.xmm1, i.src1, e.xmm0);
// Shift right (to get low bits): // Shift right (to get low bits):
e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32)); e.vmovdqa(temp, e.GetXmmConstPtr(XMMPI32));
e.vpsubd(temp, e.xmm0); e.vpsubd(temp, e.xmm0);
e.vpsrlvd(i.dest, i.src1, temp); e.vpsrlvd(i.dest, i.src1, temp);
// Merge: // Merge:
e.vpor(i.dest, e.xmm1); e.vpor(i.dest, e.xmm1);
} else { } else {
// TODO(benvanik): non-AVX2 native version. if (i.src1.is_constant) {
if (i.src2.is_constant) { e.StashConstantXmm(0, i.src1.constant());
e.lea(e.GetNativeParam(1), stack_offset_src1 = X64Emitter::kStashOffset;
e.StashConstantXmm(1, i.src2.constant()));
} else { } else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2)); e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
} }
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe( if (i.src2.is_constant) {
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint32_t>)); e.StashConstantXmm(1, i.src2.constant());
e.vmovaps(i.dest, e.xmm0); stack_offset_src2 = X64Emitter::kStashOffset + 16;
} else {
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
}
Xbyak::Label rotate_iter;
e.xor_(e.edx, e.edx);
e.L(rotate_iter);
e.mov(e.ecx, e.dword[e.rsp + stack_offset_src2 + e.rdx]);
e.rol(e.dword[e.rsp + stack_offset_src1 + e.rdx], e.cl);
e.add(e.edx, 4);
e.cmp(e.edx, 16);
e.jnz(rotate_iter);
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
} }
break; break;
} }
@ -1667,80 +1869,120 @@ struct VECTOR_ROTATE_LEFT_V128
}; };
EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ROTATE_LEFT, VECTOR_ROTATE_LEFT_V128); EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ROTATE_LEFT, VECTOR_ROTATE_LEFT_V128);
// ============================================================================
// OPCODE_VECTOR_AVERAGE
// ============================================================================
template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
static __m128i EmulateVectorAverage(void*, __m128i src1, __m128i src2) {
alignas(16) T src1v[16 / sizeof(T)];
alignas(16) T src2v[16 / sizeof(T)];
alignas(16) T value[16 / sizeof(T)];
// Load SSE registers into a C array.
_mm_store_si128(reinterpret_cast<__m128i*>(src1v), src1);
_mm_store_si128(reinterpret_cast<__m128i*>(src2v), src2);
for (size_t i = 0; i < (16 / sizeof(T)); ++i) {
auto t = (uint64_t(src1v[i]) + uint64_t(src2v[i]) + 1) / 2;
value[i] = T(t);
}
// Store result and return it.
return _mm_load_si128(reinterpret_cast<__m128i*>(value));
}
struct VECTOR_AVERAGE struct VECTOR_AVERAGE
: Sequence<VECTOR_AVERAGE, : Sequence<VECTOR_AVERAGE,
I<OPCODE_VECTOR_AVERAGE, V128Op, V128Op, V128Op>> { I<OPCODE_VECTOR_AVERAGE, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
auto i_flags = i.instr->flags;
EmitCommutativeBinaryXmmOp( EmitCommutativeBinaryXmmOp(
e, i, e, i,
[&i](X64Emitter& e, const Xmm& dest, const Xmm& src1, const Xmm& src2) { [i_flags](X64Emitter& e, const Xmm& dest, const Xmm& src1,
const TypeName part_type = const Xmm& src2) {
static_cast<TypeName>(i.instr->flags & 0xFF); const TypeName part_type = static_cast<TypeName>(i_flags & 0xFF);
const uint32_t arithmetic_flags = i.instr->flags >> 8; const uint32_t arithmetic_flags = i_flags >> 8;
bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED);
unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
switch (part_type) { switch (part_type) {
case INT8_TYPE: case INT8_TYPE:
if (is_unsigned) { if (is_unsigned) {
e.vpavgb(dest, src1, src2); e.vpavgb(dest, src1, src2);
} else { } else {
assert_always(); // todo: avx2 version or version that sign extends to two __m128
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], src1);
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], src2);
Xbyak::Label looper;
e.xor_(e.edx, e.edx);
e.L(looper);
e.movsx(e.ecx, e.byte[e.rsp + stack_offset_src2 + e.rdx]);
e.movsx(e.eax, e.byte[e.rsp + stack_offset_src1 + e.rdx]);
e.lea(e.ecx, e.ptr[e.ecx + e.eax + 1]);
e.sar(e.ecx, 1);
e.mov(e.byte[e.rsp + stack_offset_src1 + e.rdx], e.cl);
if (e.IsFeatureEnabled(kX64FlagsIndependentVars)) {
e.inc(e.edx);
} else {
e.add(e.edx, 1);
}
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(dest, e.ptr[e.rsp + stack_offset_src1]);
} }
break; break;
case INT16_TYPE: case INT16_TYPE:
if (is_unsigned) { if (is_unsigned) {
e.vpavgw(dest, src1, src2); e.vpavgw(dest, src1, src2);
} else { } else {
assert_always(); e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], src1);
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], src2);
Xbyak::Label looper;
e.xor_(e.edx, e.edx);
e.L(looper);
e.movsx(e.ecx, e.word[e.rsp + stack_offset_src2 + e.rdx]);
e.movsx(e.eax, e.word[e.rsp + stack_offset_src1 + e.rdx]);
e.lea(e.ecx, e.ptr[e.ecx + e.eax + 1]);
e.sar(e.ecx, 1);
e.mov(e.word[e.rsp + stack_offset_src1 + e.rdx], e.cx);
e.add(e.edx, 2);
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(dest, e.ptr[e.rsp + stack_offset_src1]);
} }
break; break;
case INT32_TYPE: case INT32_TYPE: {
// No 32bit averages in AVX. // No 32bit averages in AVX.
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], src1);
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], src2);
Xbyak::Label looper;
e.xor_(e.edx, e.edx);
e.L(looper);
auto src2_current_ptr =
e.dword[e.rsp + stack_offset_src2 + e.rdx];
auto src1_current_ptr =
e.dword[e.rsp + stack_offset_src1 + e.rdx];
if (is_unsigned) { if (is_unsigned) {
if (i.src2.is_constant) { // implicit zero-ext
e.lea(e.GetNativeParam(1), e.mov(e.ecx, src2_current_ptr);
e.StashConstantXmm(1, i.src2.constant())); e.mov(e.eax, src1_current_ptr);
} else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
}
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(
reinterpret_cast<void*>(EmulateVectorAverage<uint32_t>));
e.vmovaps(i.dest, e.xmm0);
} else { } else {
if (i.src2.is_constant) { e.movsxd(e.rcx, src2_current_ptr);
e.lea(e.GetNativeParam(1), e.movsxd(e.rax, src1_current_ptr);
e.StashConstantXmm(1, i.src2.constant()));
} else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
}
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(
reinterpret_cast<void*>(EmulateVectorAverage<int32_t>));
e.vmovaps(i.dest, e.xmm0);
} }
break;
e.lea(e.rcx, e.ptr[e.rcx + e.rax + 1]);
if (is_unsigned) {
e.shr(e.rcx, 1);
} else {
e.sar(e.rcx, 1);
}
e.mov(e.dword[e.rsp + stack_offset_src1 + e.rdx], e.ecx);
e.add(e.edx, 4);
e.cmp(e.edx, 16);
e.jnz(looper);
e.vmovdqa(dest, e.ptr[e.rsp + stack_offset_src1]);
} break;
default: default:
assert_unhandled_case(part_type); assert_unhandled_case(part_type);
break; break;

File diff suppressed because one or more lines are too long

View File

@ -41,18 +41,6 @@ bool FinalizationPass::Run(HIRBuilder* builder) {
block->ordinal = block_ordinal++; block->ordinal = block_ordinal++;
// Ensure all labels have names. // Ensure all labels have names.
auto label = block->label_head;
while (label) {
if (!label->name) {
const size_t label_len = 6 + 4;
char* name = reinterpret_cast<char*>(arena->Alloc(label_len + 1, 1));
assert_true(label->id <= 9999);
auto end = fmt::format_to_n(name, label_len, "_label{}", label->id);
name[end.size] = '\0';
label->name = name;
}
label = label->next;
}
// Remove unneeded jumps. // Remove unneeded jumps.
auto tail = block->instr_tail; auto tail = block->instr_tail;

View File

@ -23,52 +23,6 @@ using namespace xe::cpu::hir;
using xe::cpu::hir::HIRBuilder; using xe::cpu::hir::HIRBuilder;
using xe::cpu::hir::Instr; using xe::cpu::hir::Instr;
using xe::cpu::hir::Value; using xe::cpu::hir::Value;
using vmask_portion_t = uint64_t;
template <uint32_t Ndwords>
struct Valuemask_t {
vmask_portion_t bits[Ndwords];
static Valuemask_t create_empty(vmask_portion_t fill = 0) {
Valuemask_t result;
for (uint32_t i = 0; i < Ndwords; ++i) {
result.bits[i] = fill;
}
return result;
}
template <typename TCallable>
Valuemask_t operate(TCallable&& oper) const {
Valuemask_t result = create_empty();
for (uint32_t i = 0; i < Ndwords; ++i) {
result.bits[i] = oper(bits[i]);
}
return result;
}
template <typename TCallable>
Valuemask_t operate(TCallable&& oper, Valuemask_t other) const {
Valuemask_t result = create_empty();
for (uint32_t i = 0; i < Ndwords; ++i) {
result.bits[i] = oper(bits[i], other.bits[i]);
}
return result;
}
Valuemask_t operator&(ValueMask other) const {
return operate([](vmask_portion_t x, vmask_portion_t y) { return x & y; },
other);
}
Valuemask_t operator|(ValueMask other) const {
return operate([](vmask_portion_t x, vmask_portion_t y) { return x | y; },
other);
}
Valuemask_t operator^(ValueMask other) const {
return operate([](vmask_portion_t x, vmask_portion_t y) { return x ^ y; },
other);
}
Valuemask_t operator~() const {
return operate([](vmask_portion_t x) { return ~x; }, other);
}
};
SimplificationPass::SimplificationPass() : ConditionalGroupSubpass() {} SimplificationPass::SimplificationPass() : ConditionalGroupSubpass() {}
@ -76,17 +30,13 @@ SimplificationPass::~SimplificationPass() {}
bool SimplificationPass::Run(HIRBuilder* builder, bool& result) { bool SimplificationPass::Run(HIRBuilder* builder, bool& result) {
result = false; result = false;
  // removed: internal fixed-point loop
  bool iter_result = false;
  do {
    iter_result = false;
    iter_result |= SimplifyBitArith(builder);
    iter_result |= EliminateConversions(builder);
    iter_result |= SimplifyAssignments(builder);
    iter_result |= SimplifyBasicArith(builder);
    iter_result |= SimplifyVectorOps(builder);
    result |= iter_result;
  } while (iter_result);
  // added: single run of each simplification; result reports whether anything changed
  result |= SimplifyBitArith(builder);
  result |= EliminateConversions(builder);
  result |= SimplifyAssignments(builder);
  result |= SimplifyBasicArith(builder);
  result |= SimplifyVectorOps(builder);
return true; return true;
} }
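With the internal do/while removed, iterating to a fixed point is left to whatever drives the subpasses. A sketch of what that driver loop might look like, assuming the surrounding headers; the function name and the subpass container are assumptions, not code from this commit:

static bool RunToFixedPoint(HIRBuilder* builder,
                            std::vector<ConditionalGroupSubpass*>& subpasses) {
  bool changed;
  do {
    changed = false;
    for (auto* subpass : subpasses) {
      bool subpass_changed = false;
      if (!subpass->Run(builder, subpass_changed)) {
        return false;  // hard failure, abort the group
      }
      changed |= subpass_changed;  // any change forces another round
    }
  } while (changed);
  return true;
}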
// simplifications that apply to both or and xor // simplifications that apply to both or and xor
@ -735,7 +685,9 @@ bool SimplificationPass::CheckAdd(hir::Instr* i, hir::HIRBuilder* builder) {
auto [added_constant_neg, added_var_neg] = auto [added_constant_neg, added_var_neg] =
i->BinaryValueArrangeAsConstAndVar(); i->BinaryValueArrangeAsConstAndVar();
if (!added_constant_neg) return false; if (!added_constant_neg) {
return false;
}
if (added_constant_neg->AsUint64() & if (added_constant_neg->AsUint64() &
GetScalarSignbitMask(added_constant_neg->type)) { GetScalarSignbitMask(added_constant_neg->type)) {
// adding a value that has its signbit set! // adding a value that has its signbit set!
@ -882,11 +834,6 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
} else if (cmpop == OPCODE_COMPARE_UGT) { } else if (cmpop == OPCODE_COMPARE_UGT) {
// impossible, cannot be greater than mask // impossible, cannot be greater than mask
/* i->Replace(&OPCODE_ASSIGN_info, 0);
i->set_src1(builder->LoadZeroInt8());
return true;
*/
constant_replacement = builder->LoadZeroInt8(); constant_replacement = builder->LoadZeroInt8();
} else if (cmpop == OPCODE_COMPARE_ULE) { // less than or equal to mask = } else if (cmpop == OPCODE_COMPARE_ULE) { // less than or equal to mask =
@ -914,9 +861,9 @@ bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i,
bool istrue = i->opcode == &OPCODE_COMPARE_NE_info; bool istrue = i->opcode == &OPCODE_COMPARE_NE_info;
bool isfalse = i->opcode == &OPCODE_COMPARE_EQ_info; bool isfalse = i->opcode == &OPCODE_COMPARE_EQ_info;
auto [input_cosntant, input] = i->BinaryValueArrangeAsConstAndVar(); auto [input_constant, input] = i->BinaryValueArrangeAsConstAndVar();
if (!input_cosntant || input_cosntant->AsUint64() != 0) { if (!input_constant || input_constant->AsUint64() != 0) {
return false; return false;
} }
@ -957,12 +904,6 @@ bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i,
} }
} }
/* Instr* input_def = input->def;
if (!input_def) {
return false;
}
input_def = input_def->GetDestDefSkipAssigns();*/
return false; return false;
} }
bool SimplificationPass::CheckSHRByConst(hir::Instr* i, bool SimplificationPass::CheckSHRByConst(hir::Instr* i,

View File

@ -1872,18 +1872,14 @@ Value* HIRBuilder::AndNot(Value* value1, Value* value2) {
ASSERT_NON_FLOAT_TYPE(value1); ASSERT_NON_FLOAT_TYPE(value1);
ASSERT_NON_FLOAT_TYPE(value2); ASSERT_NON_FLOAT_TYPE(value2);
ASSERT_TYPES_EQUAL(value1, value2); ASSERT_TYPES_EQUAL(value1, value2);
//only other type it can be used with is INT64_TYPE (andc) // only other type it can be used with is INT64_TYPE (andc)
if (value1->type != VEC128_TYPE) { if (value1->type != VEC128_TYPE) {
return this->And(this->Not(value2), value1); return this->And(this->Not(value2), value1);
} else if (value1 == value2) {
return LoadZero(value1->type);
} else if (value1->IsConstantZero() || value2->IsConstantZero()) {
return value1;
} else { } else {
if (value1 == value2) {
return LoadZero(value1->type);
} else if (value1->IsConstantZero()) {
return value1;
} else if (value2->IsConstantZero()) {
return value1;
}
Instr* i = AppendInstr(OPCODE_AND_NOT_info, 0, AllocValue(value1->type)); Instr* i = AppendInstr(OPCODE_AND_NOT_info, 0, AllocValue(value1->type));
i->set_src1(value1); i->set_src1(value1);
i->set_src2(value2); i->set_src2(value2);
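The early-outs above follow from the scalar identity and_not(a, b) = a & ~b. A quick illustration with plain integers rather than HIR values:

#include <cstdint>

static uint64_t and_not(uint64_t a, uint64_t b) { return a & ~b; }
// and_not(x, x) == 0  -> LoadZero when value1 == value2
// and_not(0, y) == 0  -> returning value1 (which is zero) is exact
// and_not(x, 0) == x  -> returning value1 unchanged is exact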

View File

@ -26,6 +26,13 @@ class Label {
char* name; char* name;
void* tag; void* tag;
// just use stringification of label id
// this will later be used as an input to xbyak. xbyak only accepts
// std::string as a value, not passed by reference, so precomputing the
// stringification does not help
std::string GetIdString() {
return std::to_string(id);
}
}; };
} // namespace hir } // namespace hir
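A sketch of the intended consumer: the x64 backend can feed the stringified id straight to xbyak when binding and jumping to the label. The emitter variable `e` and the exact call sites are assumptions, not code from this commit:

// hypothetical usage inside the x64 backend
e.L(label->GetIdString());    // bind the label at the current position
// ...
e.jnz(label->GetIdString());  // jump referencing the same string id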

View File

@ -16,7 +16,7 @@
#include "xenia/cpu/ppc/ppc_hir_builder.h" #include "xenia/cpu/ppc/ppc_hir_builder.h"
DEFINE_bool( DEFINE_bool(
disable_prefetch_and_cachecontrol, false, disable_prefetch_and_cachecontrol, true,
"Disables translating ppc prefetch/cache flush instructions to host " "Disables translating ppc prefetch/cache flush instructions to host "
"prefetch/cacheflush instructions. This may improve performance as these " "prefetch/cacheflush instructions. This may improve performance as these "
"instructions were written with the Xbox 360's cache in mind, and modern " "instructions were written with the Xbox 360's cache in mind, and modern "

View File

@ -105,6 +105,11 @@ bool PPCFrontend::Initialize() {
} }
bool PPCFrontend::DeclareFunction(GuestFunction* function) { bool PPCFrontend::DeclareFunction(GuestFunction* function) {
//chrispy: make sure we aren't declaring a function that is actually padding data, this will mess up PPCScanner and is hard to debug
//wow, this halo reach actually has branches into 0 opcodes, look into further
//xenia_assert(*reinterpret_cast<const uint32_t*>(
// this->memory()->TranslateVirtual(function->address())) != 0);
// Could scan or something here. // Could scan or something here.
// Could also check to see if it's a well-known function type and classify // Could also check to see if it's a well-known function type and classify
// for later. // for later.

View File

@ -34,6 +34,11 @@ DEFINE_bool(
"unimplemented PowerPC instruction is encountered.", "unimplemented PowerPC instruction is encountered.",
"CPU"); "CPU");
DEFINE_bool(
emit_useless_fpscr_updates, false,
"Emit useless fpscr update instructions (pre-10/30/2022 behavior). ",
"CPU");
namespace xe { namespace xe {
namespace cpu { namespace cpu {
namespace ppc { namespace ppc {
@ -89,6 +94,9 @@ bool PPCHIRBuilder::Emit(GuestFunction* function, uint32_t flags) {
function_ = function; function_ = function;
start_address_ = function_->address(); start_address_ = function_->address();
//chrispy: i've seen this one happen, not sure why but i think from trying to precompile twice
//i've also seen ones with a start and end address that are the same...
assert_true(function_->address() <= function_->end_address());
instr_count_ = (function_->end_address() - function_->address()) / 4 + 1; instr_count_ = (function_->end_address() - function_->address()) / 4 + 1;
with_debug_info_ = (flags & EMIT_DEBUG_COMMENTS) == EMIT_DEBUG_COMMENTS; with_debug_info_ = (flags & EMIT_DEBUG_COMMENTS) == EMIT_DEBUG_COMMENTS;
@ -242,6 +250,7 @@ void PPCHIRBuilder::MaybeBreakOnInstruction(uint32_t address) {
} }
void PPCHIRBuilder::AnnotateLabel(uint32_t address, Label* label) { void PPCHIRBuilder::AnnotateLabel(uint32_t address, Label* label) {
//chrispy: label->name is unused, it would be nice to be able to remove the field and this code
char name_buffer[13]; char name_buffer[13];
auto format_result = fmt::format_to_n(name_buffer, 12, "loc_{:08X}", address); auto format_result = fmt::format_to_n(name_buffer, 12, "loc_{:08X}", address);
name_buffer[format_result.size] = '\0'; name_buffer[format_result.size] = '\0';
@ -447,31 +456,38 @@ void PPCHIRBuilder::StoreFPSCR(Value* value) {
void PPCHIRBuilder::UpdateFPSCR(Value* result, bool update_cr1) { void PPCHIRBuilder::UpdateFPSCR(Value* result, bool update_cr1) {
// TODO(benvanik): detect overflow and nan cases. // TODO(benvanik): detect overflow and nan cases.
// fx and vx are the most important. // fx and vx are the most important.
  // removed:
  Value* fx = LoadConstantInt8(0);
  Value* fex = LoadConstantInt8(0);
  Value* vx = LoadConstantInt8(0);
  Value* ox = LoadConstantInt8(0);
  // added:
  /*
  chrispy: stubbed this out because right now all it does is waste
  memory and CPU time
  */
  if (cvars::emit_useless_fpscr_updates) {
    Value* fx = LoadConstantInt8(0);
    Value* fex = LoadConstantInt8(0);
    Value* vx = LoadConstantInt8(0);
    Value* ox = LoadConstantInt8(0);
if (update_cr1) { if (update_cr1) {
// Store into the CR1 field. // Store into the CR1 field.
// We do this instead of just calling CopyFPSCRToCR1 so that we don't // We do this instead of just calling CopyFPSCRToCR1 so that we don't
// have to read back the bits and do shifting work. // have to read back the bits and do shifting work.
StoreContext(offsetof(PPCContext, cr1.cr1_fx), fx); StoreContext(offsetof(PPCContext, cr1.cr1_fx), fx);
StoreContext(offsetof(PPCContext, cr1.cr1_fex), fex); StoreContext(offsetof(PPCContext, cr1.cr1_fex), fex);
StoreContext(offsetof(PPCContext, cr1.cr1_vx), vx); StoreContext(offsetof(PPCContext, cr1.cr1_vx), vx);
StoreContext(offsetof(PPCContext, cr1.cr1_ox), ox); StoreContext(offsetof(PPCContext, cr1.cr1_ox), ox);
}
// Generate our new bits.
Value* new_bits = Shl(ZeroExtend(fx, INT32_TYPE), 31);
new_bits = Or(new_bits, Shl(ZeroExtend(fex, INT32_TYPE), 30));
new_bits = Or(new_bits, Shl(ZeroExtend(vx, INT32_TYPE), 29));
new_bits = Or(new_bits, Shl(ZeroExtend(ox, INT32_TYPE), 28));
// Mix into fpscr while preserving sticky bits (FX and OX).
Value* bits = LoadFPSCR();
bits = Or(And(bits, LoadConstantUint32(0x9FFFFFFF)), new_bits);
StoreFPSCR(bits);
} }
// Generate our new bits.
Value* new_bits = Shl(ZeroExtend(fx, INT32_TYPE), 31);
new_bits = Or(new_bits, Shl(ZeroExtend(fex, INT32_TYPE), 30));
new_bits = Or(new_bits, Shl(ZeroExtend(vx, INT32_TYPE), 29));
new_bits = Or(new_bits, Shl(ZeroExtend(ox, INT32_TYPE), 28));
// Mix into fpscr while preserving sticky bits (FX and OX).
Value* bits = LoadFPSCR();
bits = Or(And(bits, LoadConstantUint32(0x9FFFFFFF)), new_bits);
StoreFPSCR(bits);
} }
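For clarity, the merge the (now gated) HIR above performs maps to this plain-integer computation. Bit positions follow the code (FX=31, FEX=30, VX=29, OX=28), and 0x9FFFFFFF clears only FEX/VX, leaving the sticky FX/OX bits alone. A minimal sketch, not emitter code:

#include <cstdint>

static uint32_t update_fpscr_bits(uint32_t fpscr, uint32_t fx, uint32_t fex,
                                  uint32_t vx, uint32_t ox) {
  uint32_t new_bits = (fx << 31) | (fex << 30) | (vx << 29) | (ox << 28);
  // keep everything except FEX/VX, then OR in the freshly computed bits
  return (fpscr & 0x9FFFFFFFu) | new_bits;
}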
void PPCHIRBuilder::CopyFPSCRToCR1() { void PPCHIRBuilder::CopyFPSCRToCR1() {

View File

@ -21,13 +21,7 @@ namespace xe {
namespace cpu { namespace cpu {
namespace ppc { namespace ppc {
// removed:
// DEPRECATED
// TODO(benvanik): move code to PPCDecodeData.
struct InstrData {
  PPCOpcode opcode;
  const PPCOpcodeInfo* opcode_info;
  uint32_t address;
// added:
struct PPCOpcodeBits {
union { union {
uint32_t code; uint32_t code;
@ -329,6 +323,14 @@ struct InstrData {
}; };
}; };
// DEPRECATED
// TODO(benvanik): move code to PPCDecodeData.
struct InstrData : public PPCOpcodeBits {
PPCOpcode opcode;
const PPCOpcodeInfo* opcode_info;
uint32_t address;
};
} // namespace ppc } // namespace ppc
} // namespace cpu } // namespace cpu
} // namespace xe } // namespace xe

View File

@ -31,7 +31,8 @@
#include "third_party/crypto/rijndael-alg-fst.c" #include "third_party/crypto/rijndael-alg-fst.c"
#include "third_party/crypto/rijndael-alg-fst.h" #include "third_party/crypto/rijndael-alg-fst.h"
#include "third_party/pe/pe_image.h" #include "third_party/pe/pe_image.h"
#include "xenia/cpu/ppc/ppc_decode_data.h"
#include "xenia/cpu/ppc/ppc_instr.h"
DEFINE_bool(disable_instruction_infocache, false, DEFINE_bool(disable_instruction_infocache, false,
"Disables caching records of called instructions/mmio accesses.", "Disables caching records of called instructions/mmio accesses.",
"CPU"); "CPU");
@ -1074,12 +1075,13 @@ bool XexModule::LoadContinue() {
image_sha_str_ += &fmtbuf[0]; image_sha_str_ += &fmtbuf[0];
} }
info_cache_.Init(this);
// Find __savegprlr_* and __restgprlr_* and the others. // Find __savegprlr_* and __restgprlr_* and the others.
// We can flag these for special handling (inlining/etc). // We can flag these for special handling (inlining/etc).
if (!FindSaveRest()) { if (!FindSaveRest()) {
return false; return false;
} }
info_cache_.Init(this);
PrecompileDiscoveredFunctions();
// Load a specified module map and diff. // Load a specified module map and diff.
if (cvars::load_module_map.size()) { if (cvars::load_module_map.size()) {
@ -1363,7 +1365,20 @@ InfoCacheFlags* XexModule::GetInstructionAddressFlags(uint32_t guest_addr) {
return info_cache_.LookupFlags(guest_addr); return info_cache_.LookupFlags(guest_addr);
} }
void XexModule::PrecompileDiscoveredFunctions() {
auto others = PreanalyzeCode();
for (auto&& other : others) {
if (other < low_address_ || other >= high_address_) {
continue;
}
auto sym = processor_->LookupFunction(other);
if (!sym || sym->status() != Symbol::Status::kDefined) {
processor_->ResolveFunction(other);
}
}
}
void XexModule::PrecompileKnownFunctions() { void XexModule::PrecompileKnownFunctions() {
if (cvars::disable_function_precompilation) { if (cvars::disable_function_precompilation) {
return; return;
@ -1376,10 +1391,157 @@ void XexModule::PrecompileKnownFunctions() {
} }
for (uint32_t i = 0; i < end; i++) { for (uint32_t i = 0; i < end; i++) {
if (flags[i].was_resolved) { if (flags[i].was_resolved) {
processor_->ResolveFunction(low_address_ + (i * 4)); uint32_t addr = low_address_ + (i * 4);
auto sym = processor_->LookupFunction(addr);
if (!sym || sym->status() != Symbol::Status::kDefined) {
processor_->ResolveFunction(addr);
}
} }
} }
} }
static uint32_t get_bl_called_function(XexModule* xexmod, uint32_t current_base,
ppc::PPCOpcodeBits wrd) {
int32_t displ = static_cast<int32_t>(ppc::XEEXTS26(wrd.I.LI << 2));
if (wrd.I.AA) {
return static_cast<uint32_t>(displ);
} else {
return static_cast<uint32_t>(static_cast<int32_t>(current_base) + displ);
}
}
static bool is_bl(unsigned w) {
return (w >> (32 - 6)) == 18 && ppc::PPCOpcodeBits{w}.I.LK;
}
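A worked example of the decode above, assuming the standard PPC I-form layout (6-bit opcode, 24-bit LI, AA, LK). The sample word and base address are made up for illustration:

#include <cstdint>

static uint32_t decode_bl_target_example() {
  uint32_t word = 0x48000015u;  // opcode 18 (bl), LI = 5, AA = 0, LK = 1
  uint32_t li = (word >> 2) & 0x00FFFFFFu;             // 24-bit LI field
  int32_t displ = static_cast<int32_t>(li << 8) >> 6;  // sign-extend LI << 2 from 26 bits
  return 0x82000000u + displ;                          // 0x82000014, since AA == 0
}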
std::vector<uint32_t> XexModule::PreanalyzeCode() {
uint32_t low_8_aligned = xe::align<uint32_t>(low_address_, 8);
uint32_t high_8_aligned = high_address_ & ~(8U - 1);
uint32_t n_possible_8byte_addresses = (high_8_aligned - low_8_aligned) / 8;
uint32_t* funcstart_candidate_stack =
new uint32_t[n_possible_8byte_addresses];
uint32_t* funcstart_candstack2 = new uint32_t[n_possible_8byte_addresses];
uint32_t stack_pos = 0;
{
// all functions seem to start on 8 byte boundaries, except for obvious ones
// like the save/rest funcs
uint32_t* range_start =
(uint32_t*)memory()->TranslateVirtual(low_8_aligned);
uint32_t* range_end = (uint32_t*)memory()->TranslateVirtual(
high_8_aligned); // align down to multiple of 8
const uint8_t mfspr_r12_lr[4] = {0x7D, 0x88, 0x02, 0xA6};
// a blr instruction, with 4 zero bytes afterwards to pad the next address
// to 8 byte alignment
// if we see this prior to our address, we can assume we are a function
// start
const uint8_t blr[4] = {0x4E, 0x80, 0x0, 0x20};
uint32_t blr32 = *reinterpret_cast<const uint32_t*>(&blr[0]);
uint32_t mfspr_r12_lr32 =
*reinterpret_cast<const uint32_t*>(&mfspr_r12_lr[0]);
/*
First pass: detect save of the link register at an eight byte
aligned address
*/
for (uint32_t* first_pass = range_start; first_pass < range_end;
first_pass += 2) {
if (*first_pass == mfspr_r12_lr32) {
// Push our newly discovered function start into our list
// All addresses in the list are sorted until the second pass
funcstart_candidate_stack[stack_pos++] =
static_cast<uint32_t>(reinterpret_cast<uintptr_t>(first_pass) -
reinterpret_cast<uintptr_t>(range_start)) +
low_8_aligned;
} else if (first_pass[-1] == 0 && *first_pass != 0) {
// originally i checked for blr followed by 0, but some functions are
// actually aligned to greater boundaries. something that appears to be
// longjmp (it occurs in most games, so standard library, and loads ctx,
// so longjmp) is aligned to 16 bytes in most games
uint32_t* check_iter = &first_pass[-2];
while (!*check_iter) {
--check_iter;
}
XE_LIKELY_IF(*check_iter == blr32) {
funcstart_candidate_stack[stack_pos++] =
static_cast<uint32_t>(reinterpret_cast<uintptr_t>(first_pass) -
reinterpret_cast<uintptr_t>(range_start)) +
low_8_aligned;
}
}
}
uint32_t current_guestaddr = low_8_aligned;
// Second pass: detect branch with link instructions and decode the target
// address. We can safely assume that if bl is to address, that address is
// the start of the function
for (uint32_t* second_pass = range_start; second_pass < range_end;
second_pass++, current_guestaddr += 4) {
uint32_t current_call = xe::byte_swap(*second_pass);
if (is_bl(current_call)) {
funcstart_candidate_stack[stack_pos++] = get_bl_called_function(
this, current_guestaddr, ppc::PPCOpcodeBits{current_call});
}
}
auto pdata = this->GetPESection(".pdata");
if (pdata) {
uint32_t* pdata_base =
(uint32_t*)this->memory()->TranslateVirtual(pdata->address);
uint32_t n_pdata_entries = pdata->raw_size / 8;
for (uint32_t i = 0; i < n_pdata_entries; ++i) {
uint32_t funcaddr = xe::load_and_swap<uint32_t>(&pdata_base[i * 2]);
if (funcaddr >= low_address_ && funcaddr <= high_address_) {
funcstart_candidate_stack[stack_pos++] = funcaddr;
} else {
// we hit 0 for func addr, that means we're done
break;
}
}
}
}
// Sort the list of function starts and then ensure that all addresses are
// unique
uint32_t n_known_funcaddrs = 0;
{
// make addresses unique
std::sort(funcstart_candidate_stack, funcstart_candidate_stack + stack_pos);
uint32_t read_pos = 0;
uint32_t write_pos = 0;
uint32_t previous_addr = ~0u;
while (read_pos < stack_pos) {
uint32_t current_addr = funcstart_candidate_stack[read_pos++];
if (current_addr != previous_addr) {
previous_addr = current_addr;
funcstart_candstack2[write_pos++] = current_addr;
}
}
n_known_funcaddrs = write_pos;
}
delete[] funcstart_candidate_stack;
std::vector<uint32_t> result;
result.resize(n_known_funcaddrs);
memcpy(&result[0], funcstart_candstack2,
sizeof(uint32_t) * n_known_funcaddrs);
delete[] funcstart_candstack2;
return result;
}
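The manual sort-and-compact above is equivalent to the usual sort/unique idiom; a sketch of the same step with the standard library (behavior, not performance, is the point):

#include <algorithm>
#include <cstdint>
#include <vector>

static std::vector<uint32_t> unique_sorted(std::vector<uint32_t> addrs) {
  std::sort(addrs.begin(), addrs.end());
  addrs.erase(std::unique(addrs.begin(), addrs.end()), addrs.end());
  return addrs;
}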
bool XexModule::FindSaveRest() { bool XexModule::FindSaveRest() {
// Special stack save/restore functions. // Special stack save/restore functions.
// http://research.microsoft.com/en-us/um/redmond/projects/invisible/src/crt/md/ppc/xxx.s.htm // http://research.microsoft.com/en-us/um/redmond/projects/invisible/src/crt/md/ppc/xxx.s.htm
@ -1552,6 +1714,8 @@ bool XexModule::FindSaveRest() {
auto page_size = base_address_ <= 0x90000000 ? 64 * 1024 : 4 * 1024; auto page_size = base_address_ <= 0x90000000 ? 64 * 1024 : 4 * 1024;
auto sec_header = xex_security_info(); auto sec_header = xex_security_info();
std::vector<uint32_t> resolve_on_exit{};
resolve_on_exit.reserve(256);
for (uint32_t i = 0, page = 0; i < sec_header->page_descriptor_count; i++) { for (uint32_t i = 0, page = 0; i < sec_header->page_descriptor_count; i++) {
// Byteswap the bitfield manually. // Byteswap the bitfield manually.
xex2_page_descriptor desc; xex2_page_descriptor desc;
@ -1586,13 +1750,20 @@ bool XexModule::FindSaveRest() {
// Add function stubs. // Add function stubs.
char name[32]; char name[32];
auto AddXexFunction = [this, &resolve_on_exit](uint32_t address,
Function** function) {
DeclareFunction(address, function);
resolve_on_exit.push_back(address);
};
if (gplr_start) { if (gplr_start) {
uint32_t address = gplr_start; uint32_t address = gplr_start;
for (int n = 14; n <= 31; n++) { for (int n = 14; n <= 31; n++) {
auto format_result = auto format_result =
fmt::format_to_n(name, xe::countof(name), "__savegprlr_{}", n); fmt::format_to_n(name, xe::countof(name), "__savegprlr_{}", n);
Function* function; Function* function;
DeclareFunction(address, &function);
AddXexFunction(address, &function);
function->set_end_address(address + (31 - n) * 4 + 2 * 4); function->set_end_address(address + (31 - n) * 4 + 2 * 4);
function->set_name(std::string_view(name, format_result.size)); function->set_name(std::string_view(name, format_result.size));
// TODO(benvanik): set type fn->type = FunctionSymbol::User; // TODO(benvanik): set type fn->type = FunctionSymbol::User;
@ -1608,7 +1779,7 @@ bool XexModule::FindSaveRest() {
auto format_result = auto format_result =
fmt::format_to_n(name, xe::countof(name), "__restgprlr_{}", n); fmt::format_to_n(name, xe::countof(name), "__restgprlr_{}", n);
Function* function; Function* function;
DeclareFunction(address, &function); AddXexFunction(address, &function);
function->set_end_address(address + (31 - n) * 4 + 3 * 4); function->set_end_address(address + (31 - n) * 4 + 3 * 4);
function->set_name(std::string_view(name, format_result.size)); function->set_name(std::string_view(name, format_result.size));
// TODO(benvanik): set type fn->type = FunctionSymbol::User; // TODO(benvanik): set type fn->type = FunctionSymbol::User;
@ -1625,7 +1796,7 @@ bool XexModule::FindSaveRest() {
auto format_result = auto format_result =
fmt::format_to_n(name, xe::countof(name), "__savefpr_{}", n); fmt::format_to_n(name, xe::countof(name), "__savefpr_{}", n);
Function* function; Function* function;
DeclareFunction(address, &function); AddXexFunction(address, &function);
function->set_end_address(address + (31 - n) * 4 + 1 * 4); function->set_end_address(address + (31 - n) * 4 + 1 * 4);
function->set_name(std::string_view(name, format_result.size)); function->set_name(std::string_view(name, format_result.size));
// TODO(benvanik): set type fn->type = FunctionSymbol::User; // TODO(benvanik): set type fn->type = FunctionSymbol::User;
@ -1641,7 +1812,7 @@ bool XexModule::FindSaveRest() {
auto format_result = auto format_result =
fmt::format_to_n(name, xe::countof(name), "__restfpr_{}", n); fmt::format_to_n(name, xe::countof(name), "__restfpr_{}", n);
Function* function; Function* function;
DeclareFunction(address, &function); AddXexFunction(address, &function);
function->set_end_address(address + (31 - n) * 4 + 1 * 4); function->set_end_address(address + (31 - n) * 4 + 1 * 4);
function->set_name(std::string_view(name, format_result.size)); function->set_name(std::string_view(name, format_result.size));
// TODO(benvanik): set type fn->type = FunctionSymbol::User; // TODO(benvanik): set type fn->type = FunctionSymbol::User;
@ -1663,7 +1834,7 @@ bool XexModule::FindSaveRest() {
auto format_result = auto format_result =
fmt::format_to_n(name, xe::countof(name), "__savevmx_{}", n); fmt::format_to_n(name, xe::countof(name), "__savevmx_{}", n);
Function* function; Function* function;
DeclareFunction(address, &function); AddXexFunction(address, &function);
function->set_name(std::string_view(name, format_result.size)); function->set_name(std::string_view(name, format_result.size));
// TODO(benvanik): set type fn->type = FunctionSymbol::User; // TODO(benvanik): set type fn->type = FunctionSymbol::User;
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveVmx; // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveVmx;
@ -1677,7 +1848,7 @@ bool XexModule::FindSaveRest() {
auto format_result = auto format_result =
fmt::format_to_n(name, xe::countof(name), "__savevmx_{}", n); fmt::format_to_n(name, xe::countof(name), "__savevmx_{}", n);
Function* function; Function* function;
DeclareFunction(address, &function); AddXexFunction(address, &function);
function->set_name(std::string_view(name, format_result.size)); function->set_name(std::string_view(name, format_result.size));
// TODO(benvanik): set type fn->type = FunctionSymbol::User; // TODO(benvanik): set type fn->type = FunctionSymbol::User;
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveVmx; // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveVmx;
@ -1691,7 +1862,7 @@ bool XexModule::FindSaveRest() {
auto format_result = auto format_result =
fmt::format_to_n(name, xe::countof(name), "__restvmx_{}", n); fmt::format_to_n(name, xe::countof(name), "__restvmx_{}", n);
Function* function; Function* function;
DeclareFunction(address, &function); AddXexFunction(address, &function);
function->set_name(std::string_view(name, format_result.size)); function->set_name(std::string_view(name, format_result.size));
// TODO(benvanik): set type fn->type = FunctionSymbol::User; // TODO(benvanik): set type fn->type = FunctionSymbol::User;
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestVmx; // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestVmx;
@ -1705,7 +1876,7 @@ bool XexModule::FindSaveRest() {
auto format_result = auto format_result =
fmt::format_to_n(name, xe::countof(name), "__restvmx_{}", n); fmt::format_to_n(name, xe::countof(name), "__restvmx_{}", n);
Function* function; Function* function;
DeclareFunction(address, &function); AddXexFunction(address, &function);
function->set_name(std::string_view(name, format_result.size)); function->set_name(std::string_view(name, format_result.size));
// TODO(benvanik): set type fn->type = FunctionSymbol::User; // TODO(benvanik): set type fn->type = FunctionSymbol::User;
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestVmx; // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestVmx;
@ -1716,6 +1887,13 @@ bool XexModule::FindSaveRest() {
} }
} }
for (auto&& to_ensure_precompiled : resolve_on_exit) {
// we want to make sure an address for these functions is available before
// any other functions are compiled for code generation purposes but we do
// it outside of our loops, because we also want to make sure we've marked
// up the symbol with info about it being save/rest and whatnot
processor_->ResolveFunction(to_ensure_precompiled);
}
return true; return true;
} }

View File

@ -34,7 +34,8 @@ struct InfoCacheFlags {
uint32_t was_resolved : 1; // has this address ever been called/requested uint32_t was_resolved : 1; // has this address ever been called/requested
// via resolvefunction? // via resolvefunction?
uint32_t accessed_mmio : 1; uint32_t accessed_mmio : 1;
uint32_t reserved : 30; uint32_t is_syscall_func : 1;
uint32_t reserved : 29;
}; };
struct XexInfoCache { struct XexInfoCache {
struct InfoCacheFlagsHeader { struct InfoCacheFlagsHeader {
@ -209,7 +210,8 @@ class XexModule : public xe::cpu::Module {
InfoCacheFlags* GetInstructionAddressFlags(uint32_t guest_addr); InfoCacheFlags* GetInstructionAddressFlags(uint32_t guest_addr);
void PrecompileKnownFunctions(); void PrecompileKnownFunctions();
void PrecompileDiscoveredFunctions();
std::vector<uint32_t> PreanalyzeCode();
protected: protected:
std::unique_ptr<Function> CreateFunction(uint32_t address) override; std::unique_ptr<Function> CreateFunction(uint32_t address) override;

View File

@ -9,7 +9,6 @@
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "xenia/base/atomic.h" #include "xenia/base/atomic.h"
#include "xenia/base/clock.h" #include "xenia/base/clock.h"
#include "xenia/base/logging.h" #include "xenia/base/logging.h"
@ -964,7 +963,7 @@ uint32_t xeKeKfAcquireSpinLock(uint32_t* lock, uint64_t r13 = 1) {
PrefetchForCAS(lock); PrefetchForCAS(lock);
assert_true(*lock != static_cast<uint32_t>(r13)); assert_true(*lock != static_cast<uint32_t>(r13));
// Lock. // Lock.
while (!xe::atomic_cas(0, static_cast<uint32_t>(r13), lock)) { while (!xe::atomic_cas(0, xe::byte_swap(static_cast<uint32_t>(r13)), lock)) {
// Spin! // Spin!
// TODO(benvanik): error on deadlock? // TODO(benvanik): error on deadlock?
xe::threading::MaybeYield(); xe::threading::MaybeYield();
@ -978,7 +977,7 @@ uint32_t xeKeKfAcquireSpinLock(uint32_t* lock, uint64_t r13 = 1) {
} }
dword_result_t KfAcquireSpinLock_entry(lpdword_t lock_ptr, dword_result_t KfAcquireSpinLock_entry(lpdword_t lock_ptr,
ppc_context_t& ppc_context) { const ppc_context_t& ppc_context) {
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address()); auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
return xeKeKfAcquireSpinLock(lock, ppc_context->r[13]); return xeKeKfAcquireSpinLock(lock, ppc_context->r[13]);
} }
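The added byte_swap is needed because the guest lock word lives in big-endian guest memory while r13 is a host-endian integer; the compare-exchange must publish the value exactly as guest code will later read and compare it. A condensed sketch of the fixed acquire loop, using the same calls as above:

// store r13 in guest byte order so a subsequent guest-side compare matches
uint32_t guest_r13 = xe::byte_swap(static_cast<uint32_t>(r13));
while (!xe::atomic_cas(0u, guest_r13, lock)) {
  xe::threading::MaybeYield();  // spin politely until the lock is free
}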
@ -997,7 +996,7 @@ void xeKeKfReleaseSpinLock(uint32_t* lock, dword_t old_irql) {
} }
void KfReleaseSpinLock_entry(lpdword_t lock_ptr, dword_t old_irql, void KfReleaseSpinLock_entry(lpdword_t lock_ptr, dword_t old_irql,
ppc_context_t& ppc_ctx) { const ppc_context_t& ppc_ctx) {
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address()); auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
assert_true(*lock_ptr == static_cast<uint32_t>(ppc_ctx->r[13])); assert_true(*lock_ptr == static_cast<uint32_t>(ppc_ctx->r[13]));
@ -1014,14 +1013,14 @@ DECLARE_XBOXKRNL_EXPORT2(KfReleaseSpinLock, kThreading, kImplemented,
kHighFrequency); kHighFrequency);
// todo: this is not accurate // todo: this is not accurate
void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr, void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr,
ppc_context_t& ppc_ctx) { const ppc_context_t& ppc_ctx) {
// Lock. // Lock.
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address()); auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
// must not be our own thread // must not be our own thread
assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13])); assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13]));
PrefetchForCAS(lock); PrefetchForCAS(lock);
while (!xe::atomic_cas(0, static_cast<uint32_t>(ppc_ctx->r[13]), lock)) { while (!xe::atomic_cas(0, xe::byte_swap(static_cast<uint32_t>(ppc_ctx->r[13])), lock)) {
#if XE_ARCH_AMD64 == 1 #if XE_ARCH_AMD64 == 1
// todo: this is just a nop if they don't have SMT, which is not great // todo: this is just a nop if they don't have SMT, which is not great
// either... // either...
@ -1036,12 +1035,12 @@ DECLARE_XBOXKRNL_EXPORT3(KeAcquireSpinLockAtRaisedIrql, kThreading,
kImplemented, kBlocking, kHighFrequency); kImplemented, kBlocking, kHighFrequency);
dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry( dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(
lpdword_t lock_ptr, ppc_context_t& ppc_ctx) { lpdword_t lock_ptr, const ppc_context_t& ppc_ctx) {
// Lock. // Lock.
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address()); auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13])); assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13]));
PrefetchForCAS(lock); PrefetchForCAS(lock);
if (!xe::atomic_cas(0, static_cast<uint32_t>(ppc_ctx->r[13]), lock)) { if (!xe::atomic_cas(0, xe::byte_swap(static_cast<uint32_t>(ppc_ctx->r[13])), lock)) {
return 0; return 0;
} }
return 1; return 1;
@ -1050,7 +1049,7 @@ DECLARE_XBOXKRNL_EXPORT4(KeTryToAcquireSpinLockAtRaisedIrql, kThreading,
kImplemented, kBlocking, kHighFrequency, kSketchy); kImplemented, kBlocking, kHighFrequency, kSketchy);
void KeReleaseSpinLockFromRaisedIrql_entry(lpdword_t lock_ptr, void KeReleaseSpinLockFromRaisedIrql_entry(lpdword_t lock_ptr,
ppc_context_t& ppc_ctx) { const ppc_context_t& ppc_ctx) {
// Unlock. // Unlock.
assert_true(*lock_ptr == static_cast<uint32_t>(ppc_ctx->r[13])); assert_true(*lock_ptr == static_cast<uint32_t>(ppc_ctx->r[13]));
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address()); auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
@ -1283,7 +1282,8 @@ void ExInitializeReadWriteLock_entry(pointer_t<X_ERWLOCK> lock_ptr) {
} }
DECLARE_XBOXKRNL_EXPORT1(ExInitializeReadWriteLock, kThreading, kImplemented); DECLARE_XBOXKRNL_EXPORT1(ExInitializeReadWriteLock, kThreading, kImplemented);
void ExAcquireReadWriteLockExclusive_entry(pointer_t<X_ERWLOCK> lock_ptr, ppc_context_t& ppc_context) { void ExAcquireReadWriteLockExclusive_entry(pointer_t<X_ERWLOCK> lock_ptr,
const ppc_context_t& ppc_context) {
auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
int32_t lock_count = ++lock_ptr->lock_count; int32_t lock_count = ++lock_ptr->lock_count;
@ -1301,7 +1301,7 @@ DECLARE_XBOXKRNL_EXPORT2(ExAcquireReadWriteLockExclusive, kThreading,
kImplemented, kBlocking); kImplemented, kBlocking);
dword_result_t ExTryToAcquireReadWriteLockExclusive_entry( dword_result_t ExTryToAcquireReadWriteLockExclusive_entry(
pointer_t<X_ERWLOCK> lock_ptr, ppc_context_t& ppc_context) { pointer_t<X_ERWLOCK> lock_ptr, const ppc_context_t& ppc_context) {
auto old_irql = auto old_irql =
xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
@ -1320,7 +1320,7 @@ DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockExclusive, kThreading,
kImplemented); kImplemented);
void ExAcquireReadWriteLockShared_entry(pointer_t<X_ERWLOCK> lock_ptr, void ExAcquireReadWriteLockShared_entry(pointer_t<X_ERWLOCK> lock_ptr,
ppc_context_t& ppc_context) { const ppc_context_t& ppc_context) {
auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
int32_t lock_count = ++lock_ptr->lock_count; int32_t lock_count = ++lock_ptr->lock_count;
@ -1340,7 +1340,7 @@ DECLARE_XBOXKRNL_EXPORT2(ExAcquireReadWriteLockShared, kThreading, kImplemented,
kBlocking); kBlocking);
dword_result_t ExTryToAcquireReadWriteLockShared_entry( dword_result_t ExTryToAcquireReadWriteLockShared_entry(
pointer_t<X_ERWLOCK> lock_ptr, ppc_context_t& ppc_context) { pointer_t<X_ERWLOCK> lock_ptr, const ppc_context_t& ppc_context) {
auto old_irql = auto old_irql =
xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
@ -1361,7 +1361,7 @@ DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockShared, kThreading,
kImplemented); kImplemented);
void ExReleaseReadWriteLock_entry(pointer_t<X_ERWLOCK> lock_ptr, void ExReleaseReadWriteLock_entry(pointer_t<X_ERWLOCK> lock_ptr,
ppc_context_t& ppc_context) { const ppc_context_t& ppc_context) {
auto old_irql = auto old_irql =
xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);

View File

@ -21,7 +21,7 @@ namespace vfs {
NullDevice::NullDevice(const std::string& mount_path, NullDevice::NullDevice(const std::string& mount_path,
const std::initializer_list<std::string>& null_paths) const std::initializer_list<std::string>& null_paths)
: Device(mount_path), null_paths_(null_paths), name_("NullDevice") {} : Device(mount_path), name_("NullDevice"), null_paths_(null_paths) {}
NullDevice::~NullDevice() = default; NullDevice::~NullDevice() = default;

third_party/FFmpeg vendored

@ -1 +1 @@
Subproject commit a437fe6d8efef17c8ad33d39f5815032e7adf5d7 Subproject commit fa4f77cf444cd30894a222148efc5a371b3f76a6