Use much faster exp2/cos approximations in ffmpeg; large decrease in CPU usage on the decoder thread on my machine.
Properly byteswap r13 for spinlock.
Add PPCOpcodeBits.
Stub out broken fpscr updating in ppc_hir_builder; right now it is just code that repeatedly does nothing.
Add a note to ppc_frontend about 0 opcode bytes being executed.
Add an assert to check that function end is greater than function start; this can happen with malformed functions.
Disable prefetch and cachecontrol by default; automatic hardware prefetchers already do the job for the most part.
Minor cleanup in simplification_pass: don't loop the optimizations ourselves, let the pass manager do it for us.
Add experimental "delay_via_maybeyield" cvar, which uses MaybeYield to "emulate" the db16cyc instruction.
Add a much faster/simpler way of directly calling guest functions; we no longer have to do a byte-by-byte search through the generated code.
Generate label string ids on the fly.
Fix unused-function warnings for prefetch on clang; fix many other clang warnings.
Eliminate the majority of CallNativeSafes by replacing them with naive generic code paths: vector rotate left, vector shift left, vector shift right, vector shift arithmetic right, and vector average are included. These naive paths are implemented as small loops that stash the two inputs to the stack and work on them through GPRs; they are not particularly fast, but they should be an order of magnitude faster than a CallNativeSafe to a host function, which involves a call, stashing all volatile registers, an indirect call, potentially setting up a stack frame for the arrays the inputs get stashed to, the actual operations, a return, reloading all volatile registers, and another return.
Add the fast SHR_V128 path back in.
Implement signed vector average byte and signed vector average word; previously we were emitting no code for them, and signed vector average byte appears in many games.
Fix a bug with signed vector average 32: we were doing an unsigned shift, potentially turning negative values into large positive ones.
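For context, the removed EmulateVectorShl/EmulateVectorShr helpers (visible in the diff below) define the per-element semantics that the new stack-based loops reproduce inline. A minimal standalone C++ sketch of the per-byte shift-left rule, assuming illustrative names that are not part of the commit:

#include <cstdint>

// Shift each byte of value[] left by the low 3 bits of the matching byte in
// shamt[] - the same per-element rule the old EmulateVectorShl<uint8_t>
// helper applied to the two stashed 128-bit inputs.
static void naive_vector_shl_u8(uint8_t value[16], const uint8_t shamt[16]) {
  for (int i = 0; i < 16; ++i) {
    value[i] = static_cast<uint8_t>(value[i] << (shamt[i] & 7));
  }
}

The emitted x64 version performs the analogous operation with shl byte [rsp+offset], cl over the 16 stashed bytes, avoiding the volatile-register spills and indirect call that CallNativeSafe required.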
This commit is contained in:
parent d8b7b3ecec
commit 550d1d0a7c
@ -170,8 +170,10 @@ CommandVar<T>::CommandVar(const char* name, T* default_value,
|
|||
const char* description)
|
||||
: name_(name),
|
||||
default_value_(*default_value),
|
||||
description_(description),
|
||||
current_value_(default_value) {}
|
||||
current_value_(default_value),
|
||||
commandline_value_(),
|
||||
description_(description)
|
||||
{}
|
||||
|
||||
template <class T>
|
||||
ConfigVar<T>::ConfigVar(const char* name, T* default_value,
|
||||
|
|
|
@ -59,7 +59,7 @@ static void XeCopy16384StreamingAVX(CacheLine* XE_RESTRICT to,
|
|||
|
||||
CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
|
||||
CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
|
||||
#pragma loop(no_vector)
|
||||
|
||||
for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
|
||||
xe::swcache::CacheLine line0, line1, line2, line3;
|
||||
|
||||
|
@ -92,7 +92,6 @@ static void XeCopy16384Movdir64M(CacheLine* XE_RESTRICT to,
|
|||
|
||||
CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
|
||||
CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
|
||||
#pragma loop(no_vector)
|
||||
for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
|
||||
_movdir64b(dest1 + i, src1 + i);
|
||||
_movdir64b(dest2 + i, src2 + i);
|
||||
|
|
|
@ -620,23 +620,23 @@ static void Prefetch(const void* addr) {
|
|||
}
|
||||
|
||||
template <>
|
||||
void Prefetch<PrefetchTag::Write>(const void* addr) {
|
||||
XE_MAYBE_UNUSED void Prefetch<PrefetchTag::Write>(const void* addr) {
|
||||
PrefetchW(addr);
|
||||
}
|
||||
template <>
|
||||
void Prefetch<PrefetchTag::Nontemporal>(const void* addr) {
|
||||
XE_MAYBE_UNUSED void Prefetch<PrefetchTag::Nontemporal>(const void* addr) {
|
||||
PrefetchNTA(addr);
|
||||
}
|
||||
template <>
|
||||
void Prefetch<PrefetchTag::Level3>(const void* addr) {
|
||||
XE_MAYBE_UNUSED void Prefetch<PrefetchTag::Level3>(const void* addr) {
|
||||
PrefetchL3(addr);
|
||||
}
|
||||
template <>
|
||||
void Prefetch<PrefetchTag::Level2>(const void* addr) {
|
||||
XE_MAYBE_UNUSED void Prefetch<PrefetchTag::Level2>(const void* addr) {
|
||||
PrefetchL2(addr);
|
||||
}
|
||||
template <>
|
||||
void Prefetch<PrefetchTag::Level1>(const void* addr) {
|
||||
XE_MAYBE_UNUSED void Prefetch<PrefetchTag::Level1>(const void* addr) {
|
||||
PrefetchL1(addr);
|
||||
}
|
||||
// todo: does aarch64 have streaming stores/loads?
|
||||
|
|
|
@ -25,6 +25,7 @@ namespace xe {
|
|||
*/
|
||||
|
||||
class alignas(4096) xe_global_mutex {
|
||||
XE_MAYBE_UNUSED
|
||||
char detail[64];
|
||||
|
||||
public:
|
||||
|
@ -38,6 +39,7 @@ class alignas(4096) xe_global_mutex {
|
|||
using global_mutex_type = xe_global_mutex;
|
||||
|
||||
class alignas(64) xe_fast_mutex {
|
||||
XE_MAYBE_UNUSED
|
||||
char detail[64];
|
||||
|
||||
public:
|
||||
|
@ -62,8 +64,6 @@ class xe_unlikely_mutex {
|
|||
~xe_unlikely_mutex() { mut = 0; }
|
||||
|
||||
void lock() {
|
||||
uint32_t lock_expected = 0;
|
||||
|
||||
if (XE_LIKELY(_tryget())) {
|
||||
return;
|
||||
} else {
|
||||
|
|
|
@ -144,9 +144,11 @@
|
|||
#define XE_MSVC_OPTIMIZE_SMALL()
|
||||
#define XE_MSVC_OPTIMIZE_REVERT()
|
||||
#endif
|
||||
|
||||
#if XE_COMPILER_HAS_GNU_EXTENSIONS == 1
|
||||
#define XE_LIKELY_IF(...) if (XE_LIKELY(__VA_ARGS__))
|
||||
#define XE_UNLIKELY_IF(...) if (XE_UNLIKELY(__VA_ARGS__))
|
||||
#define XE_MAYBE_UNUSED __attribute__((unused))
|
||||
#else
|
||||
#if __cplusplus >= 202002
|
||||
#define XE_LIKELY_IF(...) if (!!(__VA_ARGS__)) [[likely]]
|
||||
|
@ -155,6 +157,7 @@
|
|||
#define XE_LIKELY_IF(...) if (!!(__VA_ARGS__))
|
||||
#define XE_UNLIKELY_IF(...) if (!!(__VA_ARGS__))
|
||||
#endif
|
||||
#define XE_MAYBE_UNUSED
|
||||
#endif
|
||||
// only use __restrict if MSVC, for clang/gcc we can use -fstrict-aliasing which
|
||||
// acts as __restrict across the board todo: __restrict is part of the type
|
||||
|
|
|
@ -78,7 +78,9 @@ size_t RingBuffer::Read(uint8_t* buffer, size_t _count) {
|
|||
if (read_offset_ < write_offset_) {
|
||||
assert_true(read_offset_ + count <= write_offset_);
|
||||
} else if (read_offset_ + count >= capacity_) {
|
||||
XE_MAYBE_UNUSED
|
||||
ring_size_t left_half = capacity_ - read_offset_;
|
||||
|
||||
assert_true(count - left_half <= write_offset_);
|
||||
}
|
||||
|
||||
|
@ -107,6 +109,7 @@ size_t RingBuffer::Write(const uint8_t* buffer, size_t _count) {
|
|||
if (write_offset_ < read_offset_) {
|
||||
assert_true(write_offset_ + count <= read_offset_);
|
||||
} else if (write_offset_ + count >= capacity_) {
|
||||
XE_MAYBE_UNUSED
|
||||
size_t left_half = capacity_ - write_offset_;
|
||||
assert_true(count - left_half <= read_offset_);
|
||||
}
|
||||
|
|
|
@ -68,7 +68,6 @@ class RingBuffer {
|
|||
ring_size_t offset_delta = write_offs - read_offs;
|
||||
ring_size_t wrap_read_count = (cap - read_offs) + write_offs;
|
||||
|
||||
ring_size_t comparison_value = read_offs <= write_offs;
|
||||
|
||||
if (XE_LIKELY(read_offs <= write_offs)) {
|
||||
return offset_delta; // will be 0 if they are equal, semantically
|
||||
|
|
|
@ -117,7 +117,7 @@ void set_name(const std::string_view name) {
|
|||
|
||||
// checked ntoskrnl, it does not modify delay, so we can place this as a
|
||||
// constant and avoid creating a stack variable
|
||||
static const LARGE_INTEGER sleepdelay0_for_maybeyield{0LL};
|
||||
static const LARGE_INTEGER sleepdelay0_for_maybeyield{{0LL}};
|
||||
|
||||
void MaybeYield() {
|
||||
#if 0
|
||||
|
@ -314,7 +314,8 @@ class Win32Event : public Win32Handle<Event> {
|
|||
}
|
||||
#endif
|
||||
|
||||
EventInfo Query() { EventInfo result{};
|
||||
EventInfo Query() override {
|
||||
EventInfo result{};
|
||||
NtQueryEventPointer.invoke(handle_, 0, &result, sizeof(EventInfo), nullptr);
|
||||
return result;
|
||||
}
|
||||
|
@ -429,7 +430,7 @@ class Win32Timer : public Win32Handle<Timer> {
|
|||
}
|
||||
bool SetRepeatingAt(GClock_::time_point due_time,
|
||||
std::chrono::milliseconds period,
|
||||
std::function<void()> opt_callback = nullptr) {
|
||||
std::function<void()> opt_callback = nullptr) override {
|
||||
return SetRepeatingAt(date::clock_cast<WClock_>(due_time), period,
|
||||
std::move(opt_callback));
|
||||
}
|
||||
|
|
|
@ -46,10 +46,6 @@ DEFINE_bool(ignore_undefined_externs, true,
|
|||
DEFINE_bool(emit_source_annotations, false,
|
||||
"Add extra movs and nops to make disassembly easier to read.",
|
||||
"CPU");
|
||||
DEFINE_bool(resolve_rel32_guest_calls, true,
|
||||
"Experimental optimization, directly call already resolved "
|
||||
"functions via x86 rel32 call/jmp",
|
||||
"CPU");
|
||||
|
||||
DEFINE_bool(enable_incorrect_roundingmode_behavior, false,
|
||||
"Disables the FPU/VMX MXCSR sharing workaround, potentially "
|
||||
|
@ -78,7 +74,6 @@ using namespace xe::literals;
|
|||
|
||||
static const size_t kMaxCodeSize = 1_MiB;
|
||||
|
||||
static const size_t kStashOffset = 32;
|
||||
// static const size_t kStashOffsetHigh = 32 + 32;
|
||||
|
||||
const uint32_t X64Emitter::gpr_reg_map_[X64Emitter::GPR_COUNT] = {
|
||||
|
@ -141,55 +136,6 @@ bool X64Emitter::Emit(GuestFunction* function, HIRBuilder* builder,
|
|||
|
||||
return true;
|
||||
}
|
||||
#pragma pack(push, 1)
|
||||
struct RGCEmitted {
|
||||
uint8_t ff_;
|
||||
uint32_t rgcid_;
|
||||
};
|
||||
#pragma pack(pop)
|
||||
|
||||
#if 0
|
||||
void X64Emitter::InjectCallAddresses(void* new_execute_address) {
|
||||
for (auto&& callsite : call_sites_) {
|
||||
RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
|
||||
while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) {
|
||||
hunter =
|
||||
reinterpret_cast<RGCEmitted*>(reinterpret_cast<char*>(hunter) + 1);
|
||||
}
|
||||
|
||||
hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8;
|
||||
hunter->rgcid_ =
|
||||
static_cast<uint32_t>(static_cast<intptr_t>(callsite.destination_) -
|
||||
reinterpret_cast<intptr_t>(hunter + 1));
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
void X64Emitter::InjectCallAddresses(void* new_execute_address) {
|
||||
#if 0
|
||||
RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
|
||||
|
||||
std::map<uint32_t, ResolvableGuestCall*> id_to_rgc{};
|
||||
|
||||
for (auto&& callsite : call_sites_) {
|
||||
id_to_rgc[callsite.offset_] = &callsite;
|
||||
}
|
||||
#else
|
||||
RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
|
||||
for (auto&& callsite : call_sites_) {
|
||||
while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) {
|
||||
hunter =
|
||||
reinterpret_cast<RGCEmitted*>(reinterpret_cast<char*>(hunter) + 1);
|
||||
}
|
||||
|
||||
hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8;
|
||||
hunter->rgcid_ =
|
||||
static_cast<uint32_t>(static_cast<intptr_t>(callsite.destination_) -
|
||||
reinterpret_cast<intptr_t>(hunter + 1));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
|
||||
GuestFunction* function) {
|
||||
// To avoid changing xbyak, we do a switcharoo here.
|
||||
|
@ -207,10 +153,6 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
|
|||
if (function) {
|
||||
code_cache_->PlaceGuestCode(function->address(), top_, func_info, function,
|
||||
new_execute_address, new_write_address);
|
||||
|
||||
if (cvars::resolve_rel32_guest_calls) {
|
||||
InjectCallAddresses(new_execute_address);
|
||||
}
|
||||
} else {
|
||||
code_cache_->PlaceHostCode(0, top_, func_info, new_execute_address,
|
||||
new_write_address);
|
||||
|
@ -219,7 +161,6 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
|
|||
ready();
|
||||
top_ = old_address;
|
||||
reset();
|
||||
call_sites_.clear();
|
||||
tail_code_.clear();
|
||||
for (auto&& cached_label : label_cache_) {
|
||||
delete cached_label;
|
||||
|
@ -336,7 +277,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
|
|||
// Mark block labels.
|
||||
auto label = block->label_head;
|
||||
while (label) {
|
||||
L(label->name);
|
||||
L(std::to_string(label->id));
|
||||
label = label->next;
|
||||
}
|
||||
|
||||
|
@ -418,7 +359,6 @@ void X64Emitter::EmitProfilerEpilogue() {
|
|||
// actually... lets just try without atomics lol
|
||||
// lock();
|
||||
add(qword[r10], rdx);
|
||||
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
@ -534,44 +474,23 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
|
|||
auto fn = static_cast<X64Function*>(function);
|
||||
// Resolve address to the function to call and store in rax.
|
||||
|
||||
if (cvars::resolve_rel32_guest_calls && fn->machine_code()) {
|
||||
ResolvableGuestCall rgc;
|
||||
rgc.destination_ = uint32_t(uint64_t(fn->machine_code()));
|
||||
rgc.offset_ = current_rgc_id_;
|
||||
current_rgc_id_++;
|
||||
|
||||
if (fn->machine_code()) {
|
||||
if (!(instr->flags & hir::CALL_TAIL)) {
|
||||
mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
|
||||
|
||||
db(0xFF);
|
||||
rgc.is_jump_ = false;
|
||||
|
||||
dd(rgc.offset_);
|
||||
call((void*)fn->machine_code());
|
||||
|
||||
} else {
|
||||
// tail call
|
||||
EmitTraceUserCallReturn();
|
||||
|
||||
rgc.is_jump_ = true;
|
||||
EmitProfilerEpilogue();
|
||||
// Pass the callers return address over.
|
||||
mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
|
||||
|
||||
add(rsp, static_cast<uint32_t>(stack_size()));
|
||||
db(0xFF);
|
||||
dd(rgc.offset_);
|
||||
jmp((void*)fn->machine_code(), T_NEAR);
|
||||
}
|
||||
call_sites_.push_back(rgc);
|
||||
return;
|
||||
}
|
||||
|
||||
if (fn->machine_code()) {
|
||||
// TODO(benvanik): is it worth it to do this? It removes the need for
|
||||
// a ResolveFunction call, but makes the table less useful.
|
||||
assert_zero(uint64_t(fn->machine_code()) & 0xFFFFFFFF00000000);
|
||||
// todo: this should be changed so that we can actually do a call to
|
||||
// fn->machine_code. the code will be emitted near us, so 32 bit rel jmp
|
||||
// should be possible
|
||||
mov(eax, uint32_t(uint64_t(fn->machine_code())));
|
||||
} else if (code_cache_->has_indirection_table()) {
|
||||
// Load the pointer to the indirection table maintained in X64CodeCache.
|
||||
// The target dword will either contain the address of the generated code
|
||||
|
@ -1017,7 +936,10 @@ static const vec128_t xmm_consts[] = {
|
|||
/*XMMSTVLShuffle*/
|
||||
v128_setr_bytes(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
|
||||
/* XMMSTVRSwapMask*/
|
||||
vec128b((uint8_t)0x83)};
|
||||
vec128b((uint8_t)0x83), /*XMMVSRShlByteshuf*/
|
||||
v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80),
|
||||
// XMMVSRMask
|
||||
vec128b(1)};
|
||||
|
||||
void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
|
||||
for (auto& vec : xmm_consts) {
|
||||
|
|
|
@ -172,7 +172,9 @@ enum XmmConst {
|
|||
XMMLVLShuffle,
|
||||
XMMLVRCmp16,
|
||||
XMMSTVLShuffle,
|
||||
XMMSTVRSwapMask // swapwordmask with bit 7 set
|
||||
XMMSTVRSwapMask, // swapwordmask with bit 7 set
|
||||
XMMVSRShlByteshuf,
|
||||
XMMVSRMask
|
||||
|
||||
};
|
||||
using amdfx::xopcompare_e;
|
||||
|
@ -190,13 +192,6 @@ class XbyakAllocator : public Xbyak::Allocator {
|
|||
virtual bool useProtect() const { return false; }
|
||||
};
|
||||
|
||||
class ResolvableGuestCall {
|
||||
public:
|
||||
bool is_jump_;
|
||||
uintptr_t destination_;
|
||||
// rgcid
|
||||
unsigned offset_;
|
||||
};
|
||||
class X64Emitter;
|
||||
using TailEmitCallback = std::function<void(X64Emitter& e, Xbyak::Label& lbl)>;
|
||||
struct TailEmitter {
|
||||
|
@ -220,7 +215,6 @@ class X64Emitter : public Xbyak::CodeGenerator {
|
|||
uint32_t debug_info_flags, FunctionDebugInfo* debug_info,
|
||||
void** out_code_address, size_t* out_code_size,
|
||||
std::vector<SourceMapEntry>* out_source_map);
|
||||
void InjectCallAddresses(void* new_execute_addr);
|
||||
|
||||
public:
|
||||
// Reserved: rsp, rsi, rdi
|
||||
|
@ -230,7 +224,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
|
|||
// xmm4-xmm15 (save to get xmm3)
|
||||
static const int GPR_COUNT = 7;
|
||||
static const int XMM_COUNT = 12;
|
||||
|
||||
static constexpr size_t kStashOffset = 32;
|
||||
static void SetupReg(const hir::Value* v, Xbyak::Reg8& r) {
|
||||
auto idx = gpr_reg_map_[v->reg.index];
|
||||
r = Xbyak::Reg8(idx);
|
||||
|
@ -410,8 +404,6 @@ class X64Emitter : public Xbyak::CodeGenerator {
|
|||
|
||||
static const uint32_t gpr_reg_map_[GPR_COUNT];
|
||||
static const uint32_t xmm_reg_map_[XMM_COUNT];
|
||||
uint32_t current_rgc_id_ = 0xEEDDF00F;
|
||||
std::vector<ResolvableGuestCall> call_sites_;
|
||||
/*
|
||||
set to true if the low 32 bits of membase == 0.
|
||||
only really advantageous if you are storing 32 bit 0 to a displaced address,
|
||||
|
|
|
@ -25,46 +25,46 @@ static void EmitFusedBranch(X64Emitter& e, const T& i) {
|
|||
bool valid = i.instr->prev && i.instr->prev->dest == i.src1.value;
|
||||
auto opcode = valid ? i.instr->prev->opcode->num : -1;
|
||||
if (valid) {
|
||||
auto name = i.src2.value->name;
|
||||
std::string name = i.src2.value->GetIdString();
|
||||
switch (opcode) {
|
||||
case OPCODE_COMPARE_EQ:
|
||||
e.je(name, e.T_NEAR);
|
||||
e.je(std::move(name), e.T_NEAR);
|
||||
break;
|
||||
case OPCODE_COMPARE_NE:
|
||||
e.jne(name, e.T_NEAR);
|
||||
e.jne(std::move(name), e.T_NEAR);
|
||||
break;
|
||||
case OPCODE_COMPARE_SLT:
|
||||
e.jl(name, e.T_NEAR);
|
||||
e.jl(std::move(name), e.T_NEAR);
|
||||
break;
|
||||
case OPCODE_COMPARE_SLE:
|
||||
e.jle(name, e.T_NEAR);
|
||||
e.jle(std::move(name), e.T_NEAR);
|
||||
break;
|
||||
case OPCODE_COMPARE_SGT:
|
||||
e.jg(name, e.T_NEAR);
|
||||
e.jg(std::move(name), e.T_NEAR);
|
||||
break;
|
||||
case OPCODE_COMPARE_SGE:
|
||||
e.jge(name, e.T_NEAR);
|
||||
e.jge(std::move(name), e.T_NEAR);
|
||||
break;
|
||||
case OPCODE_COMPARE_ULT:
|
||||
e.jb(name, e.T_NEAR);
|
||||
e.jb(std::move(name), e.T_NEAR);
|
||||
break;
|
||||
case OPCODE_COMPARE_ULE:
|
||||
e.jbe(name, e.T_NEAR);
|
||||
e.jbe(std::move(name), e.T_NEAR);
|
||||
break;
|
||||
case OPCODE_COMPARE_UGT:
|
||||
e.ja(name, e.T_NEAR);
|
||||
e.ja(std::move(name), e.T_NEAR);
|
||||
break;
|
||||
case OPCODE_COMPARE_UGE:
|
||||
e.jae(name, e.T_NEAR);
|
||||
e.jae(std::move(name), e.T_NEAR);
|
||||
break;
|
||||
default:
|
||||
e.test(i.src1, i.src1);
|
||||
e.jnz(name, e.T_NEAR);
|
||||
e.jnz(std::move(name), e.T_NEAR);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
e.test(i.src1, i.src1);
|
||||
e.jnz(i.src2.value->name, e.T_NEAR);
|
||||
e.jnz(i.src2.value->GetIdString(), e.T_NEAR);
|
||||
}
|
||||
}
|
||||
// ============================================================================
|
||||
|
@ -490,7 +490,7 @@ EMITTER_OPCODE_TABLE(OPCODE_SET_RETURN_ADDRESS, SET_RETURN_ADDRESS);
|
|||
// ============================================================================
|
||||
struct BRANCH : Sequence<BRANCH, I<OPCODE_BRANCH, VoidOp, LabelOp>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.jmp(i.src1.value->name, e.T_NEAR);
|
||||
e.jmp(i.src1.value->GetIdString(), e.T_NEAR);
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_BRANCH, BRANCH);
|
||||
|
@ -534,7 +534,7 @@ struct BRANCH_TRUE_F32
|
|||
Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
|
||||
e.vmovd(e.eax, input);
|
||||
e.test(e.eax, e.eax);
|
||||
e.jnz(i.src2.value->name, e.T_NEAR);
|
||||
e.jnz(i.src2.value->GetIdString(), e.T_NEAR);
|
||||
}
|
||||
};
|
||||
struct BRANCH_TRUE_F64
|
||||
|
@ -543,7 +543,7 @@ struct BRANCH_TRUE_F64
|
|||
Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
|
||||
e.vmovq(e.rax, input);
|
||||
e.test(e.rax, e.rax);
|
||||
e.jnz(i.src2.value->name, e.T_NEAR);
|
||||
e.jnz(i.src2.value->GetIdString(), e.T_NEAR);
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE, BRANCH_TRUE_I8, BRANCH_TRUE_I16,
|
||||
|
@ -557,7 +557,7 @@ struct BRANCH_FALSE_I8
|
|||
: Sequence<BRANCH_FALSE_I8, I<OPCODE_BRANCH_FALSE, VoidOp, I8Op, LabelOp>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.test(i.src1, i.src1);
|
||||
e.jz(i.src2.value->name, e.T_NEAR);
|
||||
e.jz(i.src2.value->GetIdString(), e.T_NEAR);
|
||||
}
|
||||
};
|
||||
struct BRANCH_FALSE_I16
|
||||
|
@ -565,7 +565,7 @@ struct BRANCH_FALSE_I16
|
|||
I<OPCODE_BRANCH_FALSE, VoidOp, I16Op, LabelOp>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.test(i.src1, i.src1);
|
||||
e.jz(i.src2.value->name, e.T_NEAR);
|
||||
e.jz(i.src2.value->GetIdString(), e.T_NEAR);
|
||||
}
|
||||
};
|
||||
struct BRANCH_FALSE_I32
|
||||
|
@ -573,7 +573,7 @@ struct BRANCH_FALSE_I32
|
|||
I<OPCODE_BRANCH_FALSE, VoidOp, I32Op, LabelOp>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.test(i.src1, i.src1);
|
||||
e.jz(i.src2.value->name, e.T_NEAR);
|
||||
e.jz(i.src2.value->GetIdString(), e.T_NEAR);
|
||||
}
|
||||
};
|
||||
struct BRANCH_FALSE_I64
|
||||
|
@ -581,7 +581,7 @@ struct BRANCH_FALSE_I64
|
|||
I<OPCODE_BRANCH_FALSE, VoidOp, I64Op, LabelOp>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.test(i.src1, i.src1);
|
||||
e.jz(i.src2.value->name, e.T_NEAR);
|
||||
e.jz(i.src2.value->GetIdString(), e.T_NEAR);
|
||||
}
|
||||
};
|
||||
struct BRANCH_FALSE_F32
|
||||
|
@ -591,7 +591,7 @@ struct BRANCH_FALSE_F32
|
|||
Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
|
||||
e.vmovd(e.eax, input);
|
||||
e.test(e.eax, e.eax);
|
||||
e.jz(i.src2.value->name, e.T_NEAR);
|
||||
e.jz(i.src2.value->GetIdString(), e.T_NEAR);
|
||||
}
|
||||
};
|
||||
struct BRANCH_FALSE_F64
|
||||
|
@ -601,7 +601,7 @@ struct BRANCH_FALSE_F64
|
|||
Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
|
||||
e.vmovq(e.rax, input);
|
||||
e.test(e.rax, e.rax);
|
||||
e.jz(i.src2.value->name, e.T_NEAR);
|
||||
e.jz(i.src2.value->GetIdString(), e.T_NEAR);
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_BRANCH_FALSE, BRANCH_FALSE_I8, BRANCH_FALSE_I16,
|
||||
|
|
|
@ -805,22 +805,7 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SUB, VECTOR_SUB);
|
|||
// ============================================================================
|
||||
// OPCODE_VECTOR_SHL
|
||||
// ============================================================================
|
||||
template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
|
||||
static __m128i EmulateVectorShl(void*, __m128i src1, __m128i src2) {
|
||||
alignas(16) T value[16 / sizeof(T)];
|
||||
alignas(16) T shamt[16 / sizeof(T)];
|
||||
|
||||
// Load SSE registers into a C array.
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
|
||||
|
||||
for (size_t i = 0; i < (16 / sizeof(T)); ++i) {
|
||||
value[i] = value[i] << (shamt[i] & ((sizeof(T) * 8) - 1));
|
||||
}
|
||||
|
||||
// Store result and return it.
|
||||
return _mm_load_si128(reinterpret_cast<__m128i*>(value));
|
||||
}
|
||||
static XmmConst GetShiftmaskForType(unsigned typ) {
|
||||
if (typ == INT8_TYPE) {
|
||||
return XMMXOPByteShiftMask;
|
||||
|
@ -914,28 +899,14 @@ struct VECTOR_SHL_V128
|
|||
}
|
||||
}
|
||||
if (all_same) {
|
||||
// mul by two
|
||||
/*if (seenvalue == 1) {
|
||||
e.vpaddb(i.dest, i.src1, i.src1);
|
||||
} else if (seenvalue == 2) {
|
||||
e.vpaddb(i.dest, i.src1, i.src1);
|
||||
e.vpaddb(i.dest, i.dest, i.dest);
|
||||
} else if (seenvalue == 3) {
|
||||
// mul by 8
|
||||
e.vpaddb(i.dest, i.src1, i.src1);
|
||||
e.vpaddb(i.dest, i.dest, i.dest);
|
||||
e.vpaddb(i.dest, i.dest, i.dest);
|
||||
} else*/
|
||||
{
|
||||
e.vpmovzxbw(e.ymm0, i.src1);
|
||||
e.vpsllw(e.ymm0, e.ymm0, seenvalue);
|
||||
e.vextracti128(e.xmm1, e.ymm0, 1);
|
||||
e.vpmovzxbw(e.ymm0, i.src1);
|
||||
e.vpsllw(e.ymm0, e.ymm0, seenvalue);
|
||||
e.vextracti128(e.xmm1, e.ymm0, 1);
|
||||
|
||||
e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMShortsToBytes));
|
||||
e.vpshufb(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMShortsToBytes));
|
||||
e.vpunpcklqdq(i.dest, e.xmm0, e.xmm1);
|
||||
return;
|
||||
}
|
||||
e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMShortsToBytes));
|
||||
e.vpshufb(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMShortsToBytes));
|
||||
e.vpunpcklqdq(i.dest, e.xmm0, e.xmm1);
|
||||
return;
|
||||
|
||||
} else {
|
||||
e.LoadConstantXmm(e.xmm2, constmask);
|
||||
|
@ -966,14 +937,41 @@ struct VECTOR_SHL_V128
|
|||
}
|
||||
}
|
||||
}
|
||||
if (i.src2.is_constant) {
|
||||
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
|
||||
|
||||
unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
|
||||
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
|
||||
|
||||
if (i.src1.is_constant) {
|
||||
e.StashConstantXmm(0, i.src1.constant());
|
||||
stack_offset_src1 = X64Emitter::kStashOffset;
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
|
||||
}
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint8_t>));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
if (i.src2.is_constant) {
|
||||
e.StashConstantXmm(1, i.src2.constant());
|
||||
stack_offset_src2 = X64Emitter::kStashOffset + 16;
|
||||
} else {
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
|
||||
}
|
||||
|
||||
Xbyak::Label looper;
|
||||
|
||||
e.xor_(e.edx, e.edx);
|
||||
|
||||
e.L(looper);
|
||||
e.movzx(e.ecx, e.byte[e.rsp + stack_offset_src2 + e.rdx]);
|
||||
|
||||
e.shl(e.byte[e.rsp + stack_offset_src1 + e.rdx], e.cl);
|
||||
|
||||
if (e.IsFeatureEnabled(kX64FlagsIndependentVars)) {
|
||||
e.inc(e.edx);
|
||||
} else {
|
||||
e.add(e.edx, 1);
|
||||
}
|
||||
|
||||
e.cmp(e.edx, 16);
|
||||
e.jnz(looper);
|
||||
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
|
||||
}
|
||||
static void EmitInt16(X64Emitter& e, const EmitArgType& i) {
|
||||
Xmm src1;
|
||||
|
@ -1022,14 +1020,32 @@ struct VECTOR_SHL_V128
|
|||
|
||||
// TODO(benvanik): native version (with shift magic).
|
||||
e.L(emu);
|
||||
|
||||
unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
|
||||
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
|
||||
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], src1);
|
||||
if (i.src2.is_constant) {
|
||||
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
|
||||
e.StashConstantXmm(1, i.src2.constant());
|
||||
stack_offset_src2 = X64Emitter::kStashOffset + 16;
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
|
||||
}
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, src1));
|
||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint16_t>));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
|
||||
Xbyak::Label looper;
|
||||
|
||||
e.xor_(e.edx, e.edx);
|
||||
|
||||
e.L(looper);
|
||||
e.movzx(e.ecx, e.word[e.rsp + stack_offset_src2 + e.rdx]);
|
||||
|
||||
e.shl(e.word[e.rsp + stack_offset_src1 + e.rdx], e.cl);
|
||||
|
||||
e.add(e.edx, 2);
|
||||
|
||||
e.cmp(e.edx, 16);
|
||||
e.jnz(looper);
|
||||
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
|
||||
|
||||
e.L(end);
|
||||
}
|
||||
|
@ -1098,14 +1114,32 @@ struct VECTOR_SHL_V128
|
|||
|
||||
// TODO(benvanik): native version (with shift magic).
|
||||
e.L(emu);
|
||||
|
||||
unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
|
||||
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
|
||||
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], src1);
|
||||
if (i.src2.is_constant) {
|
||||
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
|
||||
e.StashConstantXmm(1, i.src2.constant());
|
||||
stack_offset_src2 = X64Emitter::kStashOffset + 16;
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
|
||||
}
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, src1));
|
||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint32_t>));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
|
||||
Xbyak::Label looper;
|
||||
|
||||
e.xor_(e.edx, e.edx);
|
||||
|
||||
e.L(looper);
|
||||
e.mov(e.ecx, e.dword[e.rsp + stack_offset_src2 + e.rdx]);
|
||||
|
||||
e.shl(e.dword[e.rsp + stack_offset_src1 + e.rdx], e.cl);
|
||||
|
||||
e.add(e.edx, 4);
|
||||
|
||||
e.cmp(e.edx, 16);
|
||||
e.jnz(looper);
|
||||
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
|
||||
|
||||
e.L(end);
|
||||
}
|
||||
|
@ -1116,22 +1150,6 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHL, VECTOR_SHL_V128);
|
|||
// ============================================================================
|
||||
// OPCODE_VECTOR_SHR
|
||||
// ============================================================================
|
||||
template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
|
||||
static __m128i EmulateVectorShr(void*, __m128i src1, __m128i src2) {
|
||||
alignas(16) T value[16 / sizeof(T)];
|
||||
alignas(16) T shamt[16 / sizeof(T)];
|
||||
|
||||
// Load SSE registers into a C array.
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
|
||||
|
||||
for (size_t i = 0; i < (16 / sizeof(T)); ++i) {
|
||||
value[i] = value[i] >> (shamt[i] & ((sizeof(T) * 8) - 1));
|
||||
}
|
||||
|
||||
// Store result and return it.
|
||||
return _mm_load_si128(reinterpret_cast<__m128i*>(value));
|
||||
}
|
||||
|
||||
struct VECTOR_SHR_V128
|
||||
: Sequence<VECTOR_SHR_V128, I<OPCODE_VECTOR_SHR, V128Op, V128Op, V128Op>> {
|
||||
|
@ -1179,34 +1197,63 @@ struct VECTOR_SHR_V128
|
|||
}
|
||||
|
||||
static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
|
||||
// TODO(benvanik): native version (with shift magic).
|
||||
if (i.src2.is_constant) {
|
||||
if (e.IsFeatureEnabled(kX64EmitGFNI)) {
|
||||
const auto& shamt = i.src2.constant();
|
||||
bool all_same = true;
|
||||
for (size_t n = 0; n < 16 - n; ++n) {
|
||||
if (shamt.u8[n] != shamt.u8[n + 1]) {
|
||||
all_same = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (all_same) {
|
||||
// Every count is the same, so we can use gf2p8affineqb.
|
||||
const uint8_t shift_amount = shamt.u8[0] & 0b111;
|
||||
const uint64_t shift_matrix = UINT64_C(0x0102040810204080)
|
||||
<< (shift_amount * 8);
|
||||
e.vgf2p8affineqb(i.dest, i.src1,
|
||||
e.StashConstantXmm(0, vec128q(shift_matrix)), 0);
|
||||
return;
|
||||
if (i.src2.is_constant && e.IsFeatureEnabled(kX64EmitGFNI)) {
|
||||
const auto& shamt = i.src2.constant();
|
||||
bool all_same = true;
|
||||
for (size_t n = 0; n < 16 - n; ++n) {
|
||||
if (shamt.u8[n] != shamt.u8[n + 1]) {
|
||||
all_same = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
|
||||
if (all_same) {
|
||||
// Every count is the same, so we can use gf2p8affineqb.
|
||||
const uint8_t shift_amount = shamt.u8[0] & 0b111;
|
||||
const uint64_t shift_matrix = UINT64_C(0x0102040810204080)
|
||||
<< (shift_amount * 8);
|
||||
e.vgf2p8affineqb(i.dest, i.src1,
|
||||
e.StashConstantXmm(0, vec128q(shift_matrix)), 0);
|
||||
return;
|
||||
}
|
||||
}
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint8_t>));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
|
||||
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
|
||||
|
||||
if (i.src1.is_constant) {
|
||||
e.StashConstantXmm(0, i.src1.constant());
|
||||
stack_offset_src1 = X64Emitter::kStashOffset;
|
||||
} else {
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
|
||||
}
|
||||
if (i.src2.is_constant) {
|
||||
e.StashConstantXmm(1, i.src2.constant());
|
||||
stack_offset_src2 = X64Emitter::kStashOffset + 16;
|
||||
} else {
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
|
||||
}
|
||||
|
||||
Xbyak::Label looper;
|
||||
|
||||
e.xor_(e.edx, e.edx);
|
||||
|
||||
e.L(looper);
|
||||
// movzx is to eliminate any possible dep on previous value of rcx at start
|
||||
// of loop
|
||||
e.movzx(e.ecx, e.byte[e.rsp + stack_offset_src2 + e.rdx]);
|
||||
// maybe using a memory operand as the left side isn't the best idea lol,
|
||||
// still better than callnativesafe though agners docs have no timing info
|
||||
// on shx [m], cl so shrug
|
||||
e.shr(e.byte[e.rsp + stack_offset_src1 + e.rdx], e.cl);
|
||||
|
||||
if (e.IsFeatureEnabled(kX64FlagsIndependentVars)) {
|
||||
e.inc(e.edx);
|
||||
} else {
|
||||
e.add(e.edx, 1);
|
||||
}
|
||||
|
||||
e.cmp(e.edx, 16);
|
||||
e.jnz(looper);
|
||||
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
|
||||
}
|
||||
|
||||
static void EmitInt16(X64Emitter& e, const EmitArgType& i) {
|
||||
|
@ -1248,14 +1295,38 @@ struct VECTOR_SHR_V128
|
|||
|
||||
// TODO(benvanik): native version (with shift magic).
|
||||
e.L(emu);
|
||||
if (i.src2.is_constant) {
|
||||
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
|
||||
|
||||
unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
|
||||
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
|
||||
if (i.src1.is_constant) {
|
||||
e.StashConstantXmm(0, i.src1.constant());
|
||||
stack_offset_src1 = X64Emitter::kStashOffset;
|
||||
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
|
||||
}
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint16_t>));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
|
||||
if (i.src2.is_constant) {
|
||||
e.StashConstantXmm(1, i.src2.constant());
|
||||
stack_offset_src2 = X64Emitter::kStashOffset + 16;
|
||||
} else {
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
|
||||
}
|
||||
|
||||
Xbyak::Label looper;
|
||||
|
||||
e.xor_(e.edx, e.edx);
|
||||
|
||||
e.L(looper);
|
||||
e.movzx(e.ecx, e.word[e.rsp + stack_offset_src2 + e.rdx]);
|
||||
|
||||
e.shr(e.word[e.rsp + stack_offset_src1 + e.rdx], e.cl);
|
||||
|
||||
e.add(e.edx, 2);
|
||||
|
||||
e.cmp(e.edx, 16);
|
||||
e.jnz(looper);
|
||||
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
|
||||
|
||||
e.L(end);
|
||||
}
|
||||
|
@ -1324,14 +1395,37 @@ struct VECTOR_SHR_V128
|
|||
|
||||
// TODO(benvanik): native version.
|
||||
e.L(emu);
|
||||
if (i.src2.is_constant) {
|
||||
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
|
||||
|
||||
unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
|
||||
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
|
||||
if (i.src1.is_constant) {
|
||||
e.StashConstantXmm(0, i.src1.constant());
|
||||
stack_offset_src1 = X64Emitter::kStashOffset;
|
||||
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
|
||||
}
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, src1));
|
||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint32_t>));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
|
||||
if (i.src2.is_constant) {
|
||||
e.StashConstantXmm(1, i.src2.constant());
|
||||
stack_offset_src2 = X64Emitter::kStashOffset + 16;
|
||||
} else {
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
|
||||
}
|
||||
|
||||
Xbyak::Label looper;
|
||||
|
||||
e.xor_(e.edx, e.edx);
|
||||
|
||||
e.L(looper);
|
||||
e.mov(e.ecx, e.dword[e.rsp + stack_offset_src2 + e.rdx]);
|
||||
e.shr(e.dword[e.rsp + stack_offset_src1 + e.rdx], e.cl);
|
||||
|
||||
e.add(e.edx, 4);
|
||||
|
||||
e.cmp(e.edx, 16);
|
||||
e.jnz(looper);
|
||||
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
|
||||
|
||||
e.L(end);
|
||||
}
|
||||
|
@ -1388,7 +1482,8 @@ struct VECTOR_SHA_V128
|
|||
}
|
||||
|
||||
static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
|
||||
// TODO(benvanik): native version (with shift magic).
|
||||
unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
|
||||
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
|
||||
if (i.src2.is_constant) {
|
||||
const auto& shamt = i.src2.constant();
|
||||
bool all_same = true;
|
||||
|
@ -1399,7 +1494,6 @@ struct VECTOR_SHA_V128
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
if (e.IsFeatureEnabled(kX64EmitGFNI)) {
|
||||
if (all_same) {
|
||||
// Every count is the same, so we can use gf2p8affineqb.
|
||||
|
@ -1412,8 +1506,7 @@ struct VECTOR_SHA_V128
|
|||
e.StashConstantXmm(0, vec128q(shift_matrix)), 0);
|
||||
return;
|
||||
}
|
||||
}
|
||||
else if (all_same) {
|
||||
} else if (all_same) {
|
||||
Xmm to_be_shifted = GetInputRegOrConstant(e, i.src1, e.xmm1);
|
||||
|
||||
e.vpmovsxbw(e.xmm0, to_be_shifted); //_mm_srai_epi16 / psraw
|
||||
|
@ -1425,14 +1518,41 @@ struct VECTOR_SHA_V128
|
|||
return;
|
||||
}
|
||||
|
||||
|
||||
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
|
||||
e.StashConstantXmm(1, i.src2.constant());
|
||||
stack_offset_src2 = X64Emitter::kStashOffset + 16;
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
|
||||
}
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int8_t>));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
|
||||
if (i.src1.is_constant) {
|
||||
e.StashConstantXmm(0, i.src1.constant());
|
||||
stack_offset_src1 = X64Emitter::kStashOffset;
|
||||
} else {
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
|
||||
}
|
||||
|
||||
Xbyak::Label looper;
|
||||
|
||||
e.xor_(e.edx, e.edx);
|
||||
|
||||
e.L(looper);
|
||||
// movzx is to eliminate any possible dep on previous value of rcx at start
|
||||
// of loop
|
||||
e.movzx(e.ecx, e.byte[e.rsp + stack_offset_src2 + e.rdx]);
|
||||
// maybe using a memory operand as the left side isn't the best idea lol,
|
||||
// still better than callnativesafe though agners docs have no timing info
|
||||
// on shx [m], cl so shrug
|
||||
e.sar(e.byte[e.rsp + stack_offset_src1 + e.rdx], e.cl);
|
||||
|
||||
if (e.IsFeatureEnabled(kX64FlagsIndependentVars)) {
|
||||
e.inc(e.edx);
|
||||
} else {
|
||||
e.add(e.edx, 1);
|
||||
}
|
||||
|
||||
e.cmp(e.edx, 16);
|
||||
e.jnz(looper);
|
||||
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
|
||||
}
|
||||
|
||||
static void EmitInt16(X64Emitter& e, const EmitArgType& i) {
|
||||
|
@ -1474,14 +1594,38 @@ struct VECTOR_SHA_V128
|
|||
|
||||
// TODO(benvanik): native version (with shift magic).
|
||||
e.L(emu);
|
||||
if (i.src2.is_constant) {
|
||||
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
|
||||
|
||||
unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
|
||||
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
|
||||
if (i.src1.is_constant) {
|
||||
e.StashConstantXmm(0, i.src1.constant());
|
||||
stack_offset_src1 = X64Emitter::kStashOffset;
|
||||
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
|
||||
}
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int16_t>));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
|
||||
if (i.src2.is_constant) {
|
||||
e.StashConstantXmm(1, i.src2.constant());
|
||||
stack_offset_src2 = X64Emitter::kStashOffset + 16;
|
||||
} else {
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
|
||||
}
|
||||
|
||||
Xbyak::Label looper;
|
||||
|
||||
e.xor_(e.edx, e.edx);
|
||||
|
||||
e.L(looper);
|
||||
e.movzx(e.ecx, e.word[e.rsp + stack_offset_src2 + e.rdx]);
|
||||
|
||||
e.sar(e.word[e.rsp + stack_offset_src1 + e.rdx], e.cl);
|
||||
|
||||
e.add(e.edx, 2);
|
||||
|
||||
e.cmp(e.edx, 16);
|
||||
e.jnz(looper);
|
||||
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
|
||||
|
||||
e.L(end);
|
||||
}
|
||||
|
@ -1508,9 +1652,9 @@ struct VECTOR_SHA_V128
|
|||
// that happens so we mask.
|
||||
if (i.src2.is_constant) {
|
||||
e.LoadConstantXmm(e.xmm0, i.src2.constant());
|
||||
e.vandps(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskPS));
|
||||
e.vpand(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskPS));
|
||||
} else {
|
||||
e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
|
||||
e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
|
||||
}
|
||||
e.vpsravd(i.dest, i.src1, e.xmm0);
|
||||
} else {
|
||||
|
@ -1535,14 +1679,36 @@ struct VECTOR_SHA_V128
|
|||
|
||||
// TODO(benvanik): native version.
|
||||
e.L(emu);
|
||||
if (i.src2.is_constant) {
|
||||
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
|
||||
unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
|
||||
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
|
||||
if (i.src1.is_constant) {
|
||||
e.StashConstantXmm(0, i.src1.constant());
|
||||
stack_offset_src1 = X64Emitter::kStashOffset;
|
||||
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
|
||||
}
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int32_t>));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
|
||||
if (i.src2.is_constant) {
|
||||
e.StashConstantXmm(1, i.src2.constant());
|
||||
stack_offset_src2 = X64Emitter::kStashOffset + 16;
|
||||
} else {
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
|
||||
}
|
||||
|
||||
Xbyak::Label looper;
|
||||
|
||||
e.xor_(e.edx, e.edx);
|
||||
|
||||
e.L(looper);
|
||||
e.mov(e.ecx, e.dword[e.rsp + stack_offset_src2 + e.rdx]);
|
||||
e.sar(e.dword[e.rsp + stack_offset_src1 + e.rdx], e.cl);
|
||||
|
||||
e.add(e.edx, 4);
|
||||
|
||||
e.cmp(e.edx, 16);
|
||||
e.jnz(looper);
|
||||
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
|
||||
|
||||
e.L(end);
|
||||
}
|
||||
|
@ -1550,26 +1716,6 @@ struct VECTOR_SHA_V128
|
|||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA, VECTOR_SHA_V128);
|
||||
|
||||
// ============================================================================
|
||||
// OPCODE_VECTOR_ROTATE_LEFT
|
||||
// ============================================================================
|
||||
template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
|
||||
static __m128i EmulateVectorRotateLeft(void*, __m128i src1, __m128i src2) {
|
||||
alignas(16) T value[16 / sizeof(T)];
|
||||
alignas(16) T shamt[16 / sizeof(T)];
|
||||
|
||||
// Load SSE registers into a C array.
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
|
||||
|
||||
for (size_t i = 0; i < (16 / sizeof(T)); ++i) {
|
||||
value[i] = xe::rotate_left<T>(value[i], shamt[i] & ((sizeof(T) * 8) - 1));
|
||||
}
|
||||
|
||||
// Store result and return it.
|
||||
return _mm_load_si128(reinterpret_cast<__m128i*>(value));
|
||||
}
|
||||
|
||||
struct VECTOR_ROTATE_LEFT_V128
|
||||
: Sequence<VECTOR_ROTATE_LEFT_V128,
|
||||
I<OPCODE_VECTOR_ROTATE_LEFT, V128Op, V128Op, V128Op>> {
|
||||
|
@ -1594,33 +1740,72 @@ struct VECTOR_ROTATE_LEFT_V128
|
|||
}
|
||||
|
||||
} else {
|
||||
unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
|
||||
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
|
||||
switch (i.instr->flags) {
|
||||
case INT8_TYPE:
|
||||
// TODO(benvanik): native version (with shift magic).
|
||||
if (i.src2.is_constant) {
|
||||
e.lea(e.GetNativeParam(1),
|
||||
e.StashConstantXmm(1, i.src2.constant()));
|
||||
case INT8_TYPE: {
|
||||
if (i.src1.is_constant) {
|
||||
e.StashConstantXmm(0, i.src1.constant());
|
||||
stack_offset_src1 = X64Emitter::kStashOffset;
|
||||
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
|
||||
}
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
e.CallNativeSafe(
|
||||
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint8_t>));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
// TODO(benvanik): native version (with shift magic).
|
||||
|
||||
if (i.src2.is_constant) {
|
||||
e.lea(e.GetNativeParam(1),
|
||||
e.StashConstantXmm(1, i.src2.constant()));
|
||||
e.StashConstantXmm(1, i.src2.constant());
|
||||
stack_offset_src2 = X64Emitter::kStashOffset + 16;
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
|
||||
}
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
e.CallNativeSafe(
|
||||
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint16_t>));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
break;
|
||||
|
||||
Xbyak::Label rotate_iter;
|
||||
|
||||
e.xor_(e.edx, e.edx);
|
||||
|
||||
e.L(rotate_iter);
|
||||
e.movzx(e.ecx, e.byte[e.rsp + stack_offset_src2 + e.rdx]);
|
||||
|
||||
e.rol(e.byte[e.rsp + stack_offset_src1 + e.rdx], e.cl);
|
||||
|
||||
e.add(e.edx, 1);
|
||||
|
||||
e.cmp(e.edx, 16);
|
||||
e.jnz(rotate_iter);
|
||||
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
|
||||
|
||||
} break;
|
||||
case INT16_TYPE: {
|
||||
if (i.src1.is_constant) {
|
||||
e.StashConstantXmm(0, i.src1.constant());
|
||||
stack_offset_src1 = X64Emitter::kStashOffset;
|
||||
|
||||
} else {
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
|
||||
}
|
||||
|
||||
if (i.src2.is_constant) {
|
||||
e.StashConstantXmm(1, i.src2.constant());
|
||||
stack_offset_src2 = X64Emitter::kStashOffset + 16;
|
||||
} else {
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
|
||||
}
|
||||
|
||||
Xbyak::Label rotate_iter;
|
||||
|
||||
e.xor_(e.edx, e.edx);
|
||||
|
||||
e.L(rotate_iter);
|
||||
e.movzx(e.ecx, e.word[e.rsp + stack_offset_src2 + e.rdx]);
|
||||
e.rol(e.word[e.rsp + stack_offset_src1 + e.rdx], e.cl);
|
||||
|
||||
e.add(e.edx, 2);
|
||||
|
||||
e.cmp(e.edx, 16);
|
||||
e.jnz(rotate_iter);
|
||||
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
|
||||
|
||||
} break;
|
||||
case INT32_TYPE: {
|
||||
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
|
||||
e.vprolvd(i.dest, i.src1, i.src2);
|
||||
|
@ -1638,23 +1823,40 @@ struct VECTOR_ROTATE_LEFT_V128
|
|||
}
|
||||
e.vpsllvd(e.xmm1, i.src1, e.xmm0);
|
||||
// Shift right (to get low bits):
|
||||
e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32));
|
||||
e.vmovdqa(temp, e.GetXmmConstPtr(XMMPI32));
|
||||
e.vpsubd(temp, e.xmm0);
|
||||
e.vpsrlvd(i.dest, i.src1, temp);
|
||||
// Merge:
|
||||
e.vpor(i.dest, e.xmm1);
|
||||
} else {
|
||||
// TODO(benvanik): non-AVX2 native version.
|
||||
if (i.src2.is_constant) {
|
||||
e.lea(e.GetNativeParam(1),
|
||||
e.StashConstantXmm(1, i.src2.constant()));
|
||||
if (i.src1.is_constant) {
|
||||
e.StashConstantXmm(0, i.src1.constant());
|
||||
stack_offset_src1 = X64Emitter::kStashOffset;
|
||||
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], i.src1);
|
||||
}
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
e.CallNativeSafe(
|
||||
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint32_t>));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
|
||||
if (i.src2.is_constant) {
|
||||
e.StashConstantXmm(1, i.src2.constant());
|
||||
stack_offset_src2 = X64Emitter::kStashOffset + 16;
|
||||
} else {
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], i.src2);
|
||||
}
|
||||
|
||||
Xbyak::Label rotate_iter;
|
||||
|
||||
e.xor_(e.edx, e.edx);
|
||||
|
||||
e.L(rotate_iter);
|
||||
e.mov(e.ecx, e.dword[e.rsp + stack_offset_src2 + e.rdx]);
|
||||
e.rol(e.dword[e.rsp + stack_offset_src1 + e.rdx], e.cl);
|
||||
|
||||
e.add(e.edx, 4);
|
||||
|
||||
e.cmp(e.edx, 16);
|
||||
e.jnz(rotate_iter);
|
||||
e.vmovdqa(i.dest, e.byte[e.rsp + stack_offset_src1]);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -1667,80 +1869,120 @@ struct VECTOR_ROTATE_LEFT_V128
|
|||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ROTATE_LEFT, VECTOR_ROTATE_LEFT_V128);
|
||||
|
||||
// ============================================================================
|
||||
// OPCODE_VECTOR_AVERAGE
|
||||
// ============================================================================
|
||||
template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
|
||||
static __m128i EmulateVectorAverage(void*, __m128i src1, __m128i src2) {
|
||||
alignas(16) T src1v[16 / sizeof(T)];
|
||||
alignas(16) T src2v[16 / sizeof(T)];
|
||||
alignas(16) T value[16 / sizeof(T)];
|
||||
|
||||
// Load SSE registers into a C array.
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(src1v), src1);
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(src2v), src2);
|
||||
|
||||
for (size_t i = 0; i < (16 / sizeof(T)); ++i) {
|
||||
auto t = (uint64_t(src1v[i]) + uint64_t(src2v[i]) + 1) / 2;
|
||||
value[i] = T(t);
|
||||
}
|
||||
|
||||
// Store result and return it.
|
||||
return _mm_load_si128(reinterpret_cast<__m128i*>(value));
|
||||
}
|
||||
|
||||
struct VECTOR_AVERAGE
|
||||
: Sequence<VECTOR_AVERAGE,
|
||||
I<OPCODE_VECTOR_AVERAGE, V128Op, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
auto i_flags = i.instr->flags;
|
||||
EmitCommutativeBinaryXmmOp(
|
||||
e, i,
|
||||
[&i](X64Emitter& e, const Xmm& dest, const Xmm& src1, const Xmm& src2) {
|
||||
const TypeName part_type =
|
||||
static_cast<TypeName>(i.instr->flags & 0xFF);
|
||||
const uint32_t arithmetic_flags = i.instr->flags >> 8;
|
||||
[i_flags](X64Emitter& e, const Xmm& dest, const Xmm& src1,
|
||||
const Xmm& src2) {
|
||||
const TypeName part_type = static_cast<TypeName>(i_flags & 0xFF);
|
||||
const uint32_t arithmetic_flags = i_flags >> 8;
|
||||
bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED);
|
||||
unsigned stack_offset_src1 = StackLayout::GUEST_SCRATCH;
|
||||
unsigned stack_offset_src2 = StackLayout::GUEST_SCRATCH + 16;
|
||||
switch (part_type) {
|
||||
case INT8_TYPE:
|
||||
if (is_unsigned) {
|
||||
e.vpavgb(dest, src1, src2);
|
||||
} else {
|
||||
assert_always();
|
||||
// todo: avx2 version or version that sign extends to two __m128
|
||||
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], src1);
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], src2);
|
||||
|
||||
Xbyak::Label looper;
|
||||
|
||||
e.xor_(e.edx, e.edx);
|
||||
|
||||
e.L(looper);
|
||||
|
||||
e.movsx(e.ecx, e.byte[e.rsp + stack_offset_src2 + e.rdx]);
|
||||
e.movsx(e.eax, e.byte[e.rsp + stack_offset_src1 + e.rdx]);
|
||||
|
||||
e.lea(e.ecx, e.ptr[e.ecx + e.eax + 1]);
|
||||
e.sar(e.ecx, 1);
|
||||
e.mov(e.byte[e.rsp + stack_offset_src1 + e.rdx], e.cl);
|
||||
|
||||
if (e.IsFeatureEnabled(kX64FlagsIndependentVars)) {
|
||||
e.inc(e.edx);
|
||||
} else {
|
||||
e.add(e.edx, 1);
|
||||
}
|
||||
|
||||
e.cmp(e.edx, 16);
|
||||
e.jnz(looper);
|
||||
e.vmovdqa(dest, e.ptr[e.rsp + stack_offset_src1]);
|
||||
}
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
if (is_unsigned) {
|
||||
e.vpavgw(dest, src1, src2);
|
||||
} else {
|
||||
assert_always();
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], src1);
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], src2);
|
||||
|
||||
Xbyak::Label looper;
|
||||
|
||||
e.xor_(e.edx, e.edx);
|
||||
|
||||
e.L(looper);
|
||||
|
||||
e.movsx(e.ecx, e.word[e.rsp + stack_offset_src2 + e.rdx]);
|
||||
e.movsx(e.eax, e.word[e.rsp + stack_offset_src1 + e.rdx]);
|
||||
|
||||
e.lea(e.ecx, e.ptr[e.ecx + e.eax + 1]);
|
||||
e.sar(e.ecx, 1);
|
||||
e.mov(e.word[e.rsp + stack_offset_src1 + e.rdx], e.cx);
|
||||
|
||||
e.add(e.edx, 2);
|
||||
|
||||
e.cmp(e.edx, 16);
|
||||
e.jnz(looper);
|
||||
e.vmovdqa(dest, e.ptr[e.rsp + stack_offset_src1]);
|
||||
}
|
||||
break;
|
||||
case INT32_TYPE:
|
||||
case INT32_TYPE: {
|
||||
// No 32bit averages in AVX.
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src1], src1);
|
||||
e.vmovdqa(e.ptr[e.rsp + stack_offset_src2], src2);
|
||||
|
||||
Xbyak::Label looper;
|
||||
|
||||
e.xor_(e.edx, e.edx);
|
||||
|
||||
e.L(looper);
|
||||
auto src2_current_ptr =
|
||||
e.dword[e.rsp + stack_offset_src2 + e.rdx];
|
||||
auto src1_current_ptr =
|
||||
e.dword[e.rsp + stack_offset_src1 + e.rdx];
|
||||
|
||||
if (is_unsigned) {
|
||||
if (i.src2.is_constant) {
|
||||
e.lea(e.GetNativeParam(1),
|
||||
e.StashConstantXmm(1, i.src2.constant()));
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
|
||||
}
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
e.CallNativeSafe(
|
||||
reinterpret_cast<void*>(EmulateVectorAverage<uint32_t>));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
// implicit zero-ext
|
||||
e.mov(e.ecx, src2_current_ptr);
|
||||
e.mov(e.eax, src1_current_ptr);
|
||||
} else {
|
||||
if (i.src2.is_constant) {
|
||||
e.lea(e.GetNativeParam(1),
|
||||
e.StashConstantXmm(1, i.src2.constant()));
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
|
||||
}
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
e.CallNativeSafe(
|
||||
reinterpret_cast<void*>(EmulateVectorAverage<int32_t>));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
e.movsxd(e.rcx, src2_current_ptr);
|
||||
e.movsxd(e.rax, src1_current_ptr);
|
||||
}
|
||||
break;
|
||||
|
||||
e.lea(e.rcx, e.ptr[e.rcx + e.rax + 1]);
|
||||
if (is_unsigned) {
|
||||
e.shr(e.rcx, 1);
|
||||
} else {
|
||||
e.sar(e.rcx, 1);
|
||||
}
|
||||
e.mov(e.dword[e.rsp + stack_offset_src1 + e.rdx], e.ecx);
|
||||
|
||||
e.add(e.edx, 4);
|
||||
|
||||
e.cmp(e.edx, 16);
|
||||
e.jnz(looper);
|
||||
e.vmovdqa(dest, e.ptr[e.rsp + stack_offset_src1]);
|
||||
} break;
|
||||
|
||||
default:
|
||||
assert_unhandled_case(part_type);
|
||||
break;
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -41,18 +41,6 @@ bool FinalizationPass::Run(HIRBuilder* builder) {
|
|||
block->ordinal = block_ordinal++;
|
||||
|
||||
// Ensure all labels have names.
|
||||
auto label = block->label_head;
|
||||
while (label) {
|
||||
if (!label->name) {
|
||||
const size_t label_len = 6 + 4;
|
||||
char* name = reinterpret_cast<char*>(arena->Alloc(label_len + 1, 1));
|
||||
assert_true(label->id <= 9999);
|
||||
auto end = fmt::format_to_n(name, label_len, "_label{}", label->id);
|
||||
name[end.size] = '\0';
|
||||
label->name = name;
|
||||
}
|
||||
label = label->next;
|
||||
}
|
||||
|
||||
// Remove unneeded jumps.
|
||||
auto tail = block->instr_tail;
|
||||
|
|
|
@@ -23,52 +23,6 @@ using namespace xe::cpu::hir;
using xe::cpu::hir::HIRBuilder;
using xe::cpu::hir::Instr;
using xe::cpu::hir::Value;

using vmask_portion_t = uint64_t;
template <uint32_t Ndwords>
struct Valuemask_t {
  vmask_portion_t bits[Ndwords];

  static Valuemask_t create_empty(vmask_portion_t fill = 0) {
    Valuemask_t result;
    for (uint32_t i = 0; i < Ndwords; ++i) {
      result.bits[i] = fill;
    }
    return result;
  }
  template <typename TCallable>
  Valuemask_t operate(TCallable&& oper) const {
    Valuemask_t result = create_empty();

    for (uint32_t i = 0; i < Ndwords; ++i) {
      result.bits[i] = oper(bits[i]);
    }
    return result;
  }
  template <typename TCallable>
  Valuemask_t operate(TCallable&& oper, Valuemask_t other) const {
    Valuemask_t result = create_empty();

    for (uint32_t i = 0; i < Ndwords; ++i) {
      result.bits[i] = oper(bits[i], other.bits[i]);
    }
    return result;
  }
  Valuemask_t operator&(ValueMask other) const {
    return operate([](vmask_portion_t x, vmask_portion_t y) { return x & y; },
                   other);
  }
  Valuemask_t operator|(ValueMask other) const {
    return operate([](vmask_portion_t x, vmask_portion_t y) { return x | y; },
                   other);
  }
  Valuemask_t operator^(ValueMask other) const {
    return operate([](vmask_portion_t x, vmask_portion_t y) { return x ^ y; },
                   other);
  }
  Valuemask_t operator~() const {
    return operate([](vmask_portion_t x) { return ~x; }, other);
  }
};

SimplificationPass::SimplificationPass() : ConditionalGroupSubpass() {}

@@ -76,17 +30,13 @@ SimplificationPass::~SimplificationPass() {}

bool SimplificationPass::Run(HIRBuilder* builder, bool& result) {
  result = false;
  bool iter_result = false;

  do {
    iter_result = false;
    iter_result |= SimplifyBitArith(builder);
    iter_result |= EliminateConversions(builder);
    iter_result |= SimplifyAssignments(builder);
    iter_result |= SimplifyBasicArith(builder);
    iter_result |= SimplifyVectorOps(builder);
    result |= iter_result;
  } while (iter_result);
  result |= SimplifyBitArith(builder);
  result |= EliminateConversions(builder);
  result |= SimplifyAssignments(builder);
  result |= SimplifyBasicArith(builder);
  result |= SimplifyVectorOps(builder);

  return true;
}
// simplifications that apply to both or and xor
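The hunk above drops the pass-internal do/while fixpoint loop: SimplificationPass::Run now makes a single sweep and reports through `result` whether anything changed, leaving re-iteration to the surrounding pass manager. A minimal sketch of what such a caller-side fixpoint driver looks like, using hypothetical subpass callables (an illustration, not the project's actual pass-manager code):

#include <functional>
#include <vector>

// Re-run a group of subpasses until an entire sweep reports no changes.
bool RunUntilFixpoint(const std::vector<std::function<bool()>>& subpasses) {
  bool any_change = false;
  bool changed;
  do {
    changed = false;
    for (const auto& pass : subpasses) {
      changed |= pass();  // each subpass returns true if it modified the IR
    }
    any_change |= changed;
  } while (changed);
  return any_change;
}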
@@ -735,7 +685,9 @@ bool SimplificationPass::CheckAdd(hir::Instr* i, hir::HIRBuilder* builder) {
  auto [added_constant_neg, added_var_neg] =
      i->BinaryValueArrangeAsConstAndVar();

  if (!added_constant_neg) return false;
  if (!added_constant_neg) {
    return false;
  }
  if (added_constant_neg->AsUint64() &
      GetScalarSignbitMask(added_constant_neg->type)) {
    // adding a value that has its signbit set!
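CheckAdd treats an add whose constant operand has its sign bit set as a subtraction in disguise, since such a constant is a negative two's-complement value. A standalone illustration of the arithmetic on plain 32-bit integers, independent of the HIR types:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t signbit_mask = 0x80000000u;  // what a signbit mask for a 32-bit type looks like
  uint32_t constant = 0xFFFFFFFEu;            // two's-complement -2
  assert((constant & signbit_mask) != 0);     // the check performed above
  uint32_t x = 5;
  assert(x + constant == x - 2u);             // the "add" really subtracts
  return 0;
}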
@@ -882,11 +834,6 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,

  } else if (cmpop == OPCODE_COMPARE_UGT) {
    // impossible, cannot be greater than mask

    /* i->Replace(&OPCODE_ASSIGN_info, 0);
    i->set_src1(builder->LoadZeroInt8());
    return true;
    */
    constant_replacement = builder->LoadZeroInt8();

  } else if (cmpop == OPCODE_COMPARE_ULE) {  // less than or equal to mask =

@@ -914,9 +861,9 @@ bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i,
  bool istrue = i->opcode == &OPCODE_COMPARE_NE_info;
  bool isfalse = i->opcode == &OPCODE_COMPARE_EQ_info;

  auto [input_cosntant, input] = i->BinaryValueArrangeAsConstAndVar();
  auto [input_constant, input] = i->BinaryValueArrangeAsConstAndVar();

  if (!input_cosntant || input_cosntant->AsUint64() != 0) {
  if (!input_constant || input_constant->AsUint64() != 0) {
    return false;
  }

@@ -957,12 +904,6 @@ bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i,
    }
  }

  /* Instr* input_def = input->def;
  if (!input_def) {
    return false;
  }

  input_def = input_def->GetDestDefSkipAssigns();*/
  return false;
}
bool SimplificationPass::CheckSHRByConst(hir::Instr* i,
@@ -1872,18 +1872,14 @@ Value* HIRBuilder::AndNot(Value* value1, Value* value2) {
  ASSERT_NON_FLOAT_TYPE(value1);
  ASSERT_NON_FLOAT_TYPE(value2);
  ASSERT_TYPES_EQUAL(value1, value2);
  //only other type it can be used with is INT64_TYPE (andc)
  // only other type it can be used with is INT64_TYPE (andc)
  if (value1->type != VEC128_TYPE) {
    return this->And(this->Not(value2), value1);
  } else if (value1 == value2) {
    return LoadZero(value1->type);
  } else if (value1->IsConstantZero() || value2->IsConstantZero()) {
    return value1;
  } else {
    if (value1 == value2) {
      return LoadZero(value1->type);
    } else if (value1->IsConstantZero()) {
      return value1;
    } else if (value2->IsConstantZero()) {
      return value1;
    }

    Instr* i = AppendInstr(OPCODE_AND_NOT_info, 0, AllocValue(value1->type));
    i->set_src1(value1);
    i->set_src2(value2);
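The rewritten AndNot folds the cheap cases before emitting an OPCODE_AND_NOT instruction: non-vector types lower to And(Not(value2), value1), identical operands fold to zero, and a constant-zero operand on either side returns value1 directly. A scalar sketch of those identities, using a hypothetical helper rather than the HIRBuilder API:

#include <cassert>
#include <cstdint>

// andc(a, b) == a & ~b, the PPC "andc" semantics the comment above refers to.
uint64_t AndNotScalar(uint64_t a, uint64_t b) { return a & ~b; }

int main() {
  assert(AndNotScalar(0xF0F0u, 0xF0F0u) == 0);  // value1 == value2 -> zero
  assert(AndNotScalar(0xF0F0u, 0) == 0xF0F0u);  // value2 == 0      -> value1
  assert(AndNotScalar(0, 0xF0F0u) == 0);        // value1 == 0      -> value1 (which is zero)
  return 0;
}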
@@ -26,6 +26,13 @@ class Label {
  char* name;

  void* tag;
  // just use stringification of label id
  // this will later be used as an input to xbyak. xbyak only accepts
  // std::string as a value, not passed by reference, so precomputing the
  // stringification does not help
  std::string GetIdString() {
    return std::to_string(id);
  }
};

}  // namespace hir
@@ -16,7 +16,7 @@
#include "xenia/cpu/ppc/ppc_hir_builder.h"

DEFINE_bool(
    disable_prefetch_and_cachecontrol, false,
    disable_prefetch_and_cachecontrol, true,
    "Disables translating ppc prefetch/cache flush instructions to host "
    "prefetch/cacheflush instructions. This may improve performance as these "
    "instructions were written with the Xbox 360's cache in mind, and modern "
@@ -105,6 +105,11 @@ bool PPCFrontend::Initialize() {
}

bool PPCFrontend::DeclareFunction(GuestFunction* function) {
  // chrispy: make sure we aren't declaring a function that is actually padding data, this will mess up PPCScanner and is hard to debug
  // wow, this halo reach actually has branches into 0 opcodes, look into further
  // xenia_assert(*reinterpret_cast<const uint32_t*>(
  //     this->memory()->TranslateVirtual(function->address())) != 0);
  // Could scan or something here.
  // Could also check to see if it's a well-known function type and classify
  // for later.
@@ -34,6 +34,11 @@ DEFINE_bool(
    "unimplemented PowerPC instruction is encountered.",
    "CPU");

DEFINE_bool(
    emit_useless_fpscr_updates, false,
    "Emit useless fpscr update instructions (pre-10/30/2022 behavior). ",
    "CPU");

namespace xe {
namespace cpu {
namespace ppc {
@@ -89,6 +94,9 @@ bool PPCHIRBuilder::Emit(GuestFunction* function, uint32_t flags) {

  function_ = function;
  start_address_ = function_->address();
  // chrispy: i've seen this one happen, not sure why but i think from trying to precompile twice
  // i've also seen ones with a start and end address that are the same...
  assert_true(function_->address() <= function_->end_address());
  instr_count_ = (function_->end_address() - function_->address()) / 4 + 1;

  with_debug_info_ = (flags & EMIT_DEBUG_COMMENTS) == EMIT_DEBUG_COMMENTS;
@@ -242,6 +250,7 @@ void PPCHIRBuilder::MaybeBreakOnInstruction(uint32_t address) {
}

void PPCHIRBuilder::AnnotateLabel(uint32_t address, Label* label) {
  // chrispy: label->name is unused, it would be nice to be able to remove the field and this code
  char name_buffer[13];
  auto format_result = fmt::format_to_n(name_buffer, 12, "loc_{:08X}", address);
  name_buffer[format_result.size] = '\0';
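AnnotateLabel formats into a fixed buffer with fmt::format_to_n, which does not append a trailing NUL, hence the explicit write of '\0' at format_result.size. A minimal standalone example of the same pattern:

#include <cstdio>
#include <fmt/format.h>

int main() {
  char buf[13];  // "loc_" + 8 hex digits + terminator
  auto result = fmt::format_to_n(buf, sizeof(buf) - 1, "loc_{:08X}", 0x82001234u);
  buf[result.size] = '\0';  // format_to_n leaves the buffer unterminated
  std::puts(buf);           // prints loc_82001234
  return 0;
}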
@@ -447,31 +456,38 @@ void PPCHIRBuilder::StoreFPSCR(Value* value) {
void PPCHIRBuilder::UpdateFPSCR(Value* result, bool update_cr1) {
  // TODO(benvanik): detect overflow and nan cases.
  // fx and vx are the most important.
  Value* fx = LoadConstantInt8(0);
  Value* fex = LoadConstantInt8(0);
  Value* vx = LoadConstantInt8(0);
  Value* ox = LoadConstantInt8(0);
  /*
    chrispy: stubbed this out because right now all it does is waste
    memory and CPU time
  */
  if (cvars::emit_useless_fpscr_updates) {
    Value* fx = LoadConstantInt8(0);
    Value* fex = LoadConstantInt8(0);
    Value* vx = LoadConstantInt8(0);
    Value* ox = LoadConstantInt8(0);

    if (update_cr1) {
      // Store into the CR1 field.
      // We do this instead of just calling CopyFPSCRToCR1 so that we don't
      // have to read back the bits and do shifting work.
      StoreContext(offsetof(PPCContext, cr1.cr1_fx), fx);
      StoreContext(offsetof(PPCContext, cr1.cr1_fex), fex);
      StoreContext(offsetof(PPCContext, cr1.cr1_vx), vx);
      StoreContext(offsetof(PPCContext, cr1.cr1_ox), ox);
  if (update_cr1) {
    // Store into the CR1 field.
    // We do this instead of just calling CopyFPSCRToCR1 so that we don't
    // have to read back the bits and do shifting work.
    StoreContext(offsetof(PPCContext, cr1.cr1_fx), fx);
    StoreContext(offsetof(PPCContext, cr1.cr1_fex), fex);
    StoreContext(offsetof(PPCContext, cr1.cr1_vx), vx);
    StoreContext(offsetof(PPCContext, cr1.cr1_ox), ox);
    }

    // Generate our new bits.
    Value* new_bits = Shl(ZeroExtend(fx, INT32_TYPE), 31);
    new_bits = Or(new_bits, Shl(ZeroExtend(fex, INT32_TYPE), 30));
    new_bits = Or(new_bits, Shl(ZeroExtend(vx, INT32_TYPE), 29));
    new_bits = Or(new_bits, Shl(ZeroExtend(ox, INT32_TYPE), 28));

    // Mix into fpscr while preserving sticky bits (FX and OX).
    Value* bits = LoadFPSCR();
    bits = Or(And(bits, LoadConstantUint32(0x9FFFFFFF)), new_bits);
    StoreFPSCR(bits);
  }

  // Generate our new bits.
  Value* new_bits = Shl(ZeroExtend(fx, INT32_TYPE), 31);
  new_bits = Or(new_bits, Shl(ZeroExtend(fex, INT32_TYPE), 30));
  new_bits = Or(new_bits, Shl(ZeroExtend(vx, INT32_TYPE), 29));
  new_bits = Or(new_bits, Shl(ZeroExtend(ox, INT32_TYPE), 28));

  // Mix into fpscr while preserving sticky bits (FX and OX).
  Value* bits = LoadFPSCR();
  bits = Or(And(bits, LoadConstantUint32(0x9FFFFFFF)), new_bits);
  StoreFPSCR(bits);
}
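Both the gated and the old update paths end by merging the freshly built FX/FEX/VX/OX nibble into FPSCR while keeping the sticky FX (bit 31) and OX (bit 28) from the previous value: the mask 0x9FFFFFFF clears only bits 30 and 29 before the OR with new_bits. A quick standalone check of that bit arithmetic:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t fpscr = 0xF0000000u;     // FX, FEX, VX, OX all set previously
  uint32_t new_bits = 0x20000000u;  // only VX raised by the current result
  uint32_t merged = (fpscr & 0x9FFFFFFFu) | new_bits;
  // FX (bit 31) and OX (bit 28) survive as sticky bits; FEX (30) is dropped,
  // VX (29) is re-asserted by new_bits.
  assert(merged == 0xB0000000u);
  return 0;
}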

void PPCHIRBuilder::CopyFPSCRToCR1() {
@@ -21,13 +21,7 @@ namespace xe {
namespace cpu {
namespace ppc {

// DEPRECATED
// TODO(benvanik): move code to PPCDecodeData.
struct InstrData {
  PPCOpcode opcode;
  const PPCOpcodeInfo* opcode_info;
  uint32_t address;

struct PPCOpcodeBits {
  union {
    uint32_t code;

@@ -329,6 +323,14 @@ struct InstrData {
  };
};

// DEPRECATED
// TODO(benvanik): move code to PPCDecodeData.
struct InstrData : public PPCOpcodeBits {
  PPCOpcode opcode;
  const PPCOpcodeInfo* opcode_info;
  uint32_t address;
};

}  // namespace ppc
}  // namespace cpu
}  // namespace xe
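Splitting the raw encoding into a PPCOpcodeBits base lets code that only needs field access (such as the preanalysis added later in this commit, which reads wrd.I.LI, wrd.I.AA and wrd.I.LK) build it straight from an instruction word without the deprecated InstrData bookkeeping. A cut-down illustration of the union-of-bitfields idea for just the I form; the field names follow the PowerPC manual, the layout assumes the LSB-first bitfield packing used by the compilers the emulator targets, and the type-punning through the union mirrors the existing pattern rather than strictly conforming C++:

#include <cstdint>

union OpcodeBitsSketch {
  uint32_t code;
  struct {
    uint32_t LK : 1;    // link bit: record the return address in LR
    uint32_t AA : 1;    // absolute-address bit
    uint32_t LI : 24;   // branch displacement in instruction words
    uint32_t OPCD : 6;  // primary opcode (18 for b/bl/ba/bla)
  } I;
};

int main() {
  OpcodeBitsSketch w{};
  w.code = 0x48000009u;  // "bl +8": opcode 18, LI = 2, AA = 0, LK = 1
  return (w.I.OPCD == 18 && w.I.LK == 1 && w.I.LI == 2) ? 0 : 1;
}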
@@ -31,7 +31,8 @@
#include "third_party/crypto/rijndael-alg-fst.c"
#include "third_party/crypto/rijndael-alg-fst.h"
#include "third_party/pe/pe_image.h"

#include "xenia/cpu/ppc/ppc_decode_data.h"
#include "xenia/cpu/ppc/ppc_instr.h"
DEFINE_bool(disable_instruction_infocache, false,
            "Disables caching records of called instructions/mmio accesses.",
            "CPU");
@@ -1074,12 +1075,13 @@ bool XexModule::LoadContinue() {
    image_sha_str_ += &fmtbuf[0];
  }

  info_cache_.Init(this);
  // Find __savegprlr_* and __restgprlr_* and the others.
  // We can flag these for special handling (inlining/etc).
  if (!FindSaveRest()) {
    return false;
  }
  info_cache_.Init(this);
  PrecompileDiscoveredFunctions();

  // Load a specified module map and diff.
  if (cvars::load_module_map.size()) {
|
|||
|
||||
return info_cache_.LookupFlags(guest_addr);
|
||||
}
|
||||
void XexModule::PrecompileDiscoveredFunctions() {
|
||||
auto others = PreanalyzeCode();
|
||||
|
||||
for (auto&& other : others) {
|
||||
if (other < low_address_ || other >= high_address_) {
|
||||
continue;
|
||||
}
|
||||
auto sym = processor_->LookupFunction(other);
|
||||
|
||||
if (!sym || sym->status() != Symbol::Status::kDefined) {
|
||||
processor_->ResolveFunction(other);
|
||||
}
|
||||
}
|
||||
}
|
||||
void XexModule::PrecompileKnownFunctions() {
|
||||
if (cvars::disable_function_precompilation) {
|
||||
return;
|
||||
|
@ -1376,10 +1391,157 @@ void XexModule::PrecompileKnownFunctions() {
|
|||
}
|
||||
for (uint32_t i = 0; i < end; i++) {
|
||||
if (flags[i].was_resolved) {
|
||||
processor_->ResolveFunction(low_address_ + (i * 4));
|
||||
uint32_t addr = low_address_ + (i * 4);
|
||||
auto sym = processor_->LookupFunction(addr);
|
||||
|
||||
if (!sym || sym->status() != Symbol::Status::kDefined) {
|
||||
processor_->ResolveFunction(addr);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static uint32_t get_bl_called_function(XexModule* xexmod, uint32_t current_base,
                                       ppc::PPCOpcodeBits wrd) {
  int32_t displ = static_cast<int32_t>(ppc::XEEXTS26(wrd.I.LI << 2));

  if (wrd.I.AA) {
    return static_cast<uint32_t>(displ);
  } else {
    return static_cast<uint32_t>(static_cast<int32_t>(current_base) + displ);
  }
}
static bool is_bl(unsigned w) {
  return (w >> (32 - 6)) == 18 && ppc::PPCOpcodeBits{w}.I.LK;
}

std::vector<uint32_t> XexModule::PreanalyzeCode() {
  uint32_t low_8_aligned = xe::align<uint32_t>(low_address_, 8);
  uint32_t high_8_aligned = high_address_ & ~(8U - 1);

  uint32_t n_possible_8byte_addresses = (high_8_aligned - low_8_aligned) / 8;
  uint32_t* funcstart_candidate_stack =
      new uint32_t[n_possible_8byte_addresses];
  uint32_t* funcstart_candstack2 = new uint32_t[n_possible_8byte_addresses];

  uint32_t stack_pos = 0;
  {
    // all functions seem to start on 8 byte boundaries, except for obvious ones
    // like the save/rest funcs
    uint32_t* range_start =
        (uint32_t*)memory()->TranslateVirtual(low_8_aligned);
    uint32_t* range_end = (uint32_t*)memory()->TranslateVirtual(
        high_8_aligned);  // align down to multiple of 8

    const uint8_t mfspr_r12_lr[4] = {0x7D, 0x88, 0x02, 0xA6};

    // a blr instruction, with 4 zero bytes afterwards to pad the next address
    // to 8 byte alignment
    // if we see this prior to our address, we can assume we are a function
    // start
    const uint8_t blr[4] = {0x4E, 0x80, 0x0, 0x20};

    uint32_t blr32 = *reinterpret_cast<const uint32_t*>(&blr[0]);

    uint32_t mfspr_r12_lr32 =
        *reinterpret_cast<const uint32_t*>(&mfspr_r12_lr[0]);
    /*
      First pass: detect save of the link register at an eight byte
      aligned address
    */
    for (uint32_t* first_pass = range_start; first_pass < range_end;
         first_pass += 2) {
      if (*first_pass == mfspr_r12_lr32) {
        // Push our newly discovered function start into our list
        // All addresses in the list are sorted until the second pass
        funcstart_candidate_stack[stack_pos++] =
            static_cast<uint32_t>(reinterpret_cast<uintptr_t>(first_pass) -
                                  reinterpret_cast<uintptr_t>(range_start)) +
            low_8_aligned;
      } else if (first_pass[-1] == 0 && *first_pass != 0) {
        // originally i checked for blr followed by 0, but some functions are
        // actually aligned to greater boundaries. something that appears to be
        // longjmp (it occurs in most games, so standard library, and loads ctx,
        // so longjmp) is aligned to 16 bytes in most games
        uint32_t* check_iter = &first_pass[-2];

        while (!*check_iter) {
          --check_iter;
        }

        XE_LIKELY_IF(*check_iter == blr32) {
          funcstart_candidate_stack[stack_pos++] =
              static_cast<uint32_t>(reinterpret_cast<uintptr_t>(first_pass) -
                                    reinterpret_cast<uintptr_t>(range_start)) +
              low_8_aligned;
        }
      }
    }
    uint32_t current_guestaddr = low_8_aligned;
    // Second pass: detect branch with link instructions and decode the target
    // address. We can safely assume that if bl is to address, that address is
    // the start of the function
    for (uint32_t* second_pass = range_start; second_pass < range_end;
         second_pass++, current_guestaddr += 4) {
      uint32_t current_call = xe::byte_swap(*second_pass);

      if (is_bl(current_call)) {
        funcstart_candidate_stack[stack_pos++] = get_bl_called_function(
            this, current_guestaddr, ppc::PPCOpcodeBits{current_call});
      }
    }

    auto pdata = this->GetPESection(".pdata");

    if (pdata) {
      uint32_t* pdata_base =
          (uint32_t*)this->memory()->TranslateVirtual(pdata->address);

      uint32_t n_pdata_entries = pdata->raw_size / 8;

      for (uint32_t i = 0; i < n_pdata_entries; ++i) {
        uint32_t funcaddr = xe::load_and_swap<uint32_t>(&pdata_base[i * 2]);
        if (funcaddr >= low_address_ && funcaddr <= high_address_) {
          funcstart_candidate_stack[stack_pos++] = funcaddr;
        } else {
          // we hit 0 for func addr, that means we're done
          break;
        }
      }
    }
  }

  // Sort the list of function starts and then ensure that all addresses are
  // unique
  uint32_t n_known_funcaddrs = 0;
  {
    // make addresses unique

    std::sort(funcstart_candidate_stack, funcstart_candidate_stack + stack_pos);

    uint32_t read_pos = 0;
    uint32_t write_pos = 0;
    uint32_t previous_addr = ~0u;
    while (read_pos < stack_pos) {
      uint32_t current_addr = funcstart_candidate_stack[read_pos++];

      if (current_addr != previous_addr) {
        previous_addr = current_addr;
        funcstart_candstack2[write_pos++] = current_addr;
      }
    }
    n_known_funcaddrs = write_pos;
  }

  delete[] funcstart_candidate_stack;

  std::vector<uint32_t> result;
  result.resize(n_known_funcaddrs);
  memcpy(&result[0], funcstart_candstack2,
         sizeof(uint32_t) * n_known_funcaddrs);
  delete[] funcstart_candstack2;
  return result;
}
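get_bl_called_function recovers a call target from an I-form branch: the 24-bit LI field is shifted left by two, sign-extended as a 26-bit quantity (XEEXTS26), and then either used as an absolute address (AA set) or added to the branch's own address. A self-contained sketch of the same decode with the field extraction written out by hand instead of going through PPCOpcodeBits (illustration only; the constants follow the PowerPC I-form layout):

#include <cassert>
#include <cstdint>

// Sign-extend the low 26 bits of v.
static int32_t exts26(uint32_t v) {
  return static_cast<int32_t>(v << 6) >> 6;
}

// Target of a b/bl instruction word located at guest address pc.
static uint32_t branch_target(uint32_t instr, uint32_t pc) {
  uint32_t li_times_4 = instr & 0x03FFFFFCu;  // LI field, already scaled by 4
  int32_t displ = exts26(li_times_4);
  bool aa = (instr & 0x2u) != 0;              // absolute-address bit
  return aa ? static_cast<uint32_t>(displ)
            : static_cast<uint32_t>(static_cast<int32_t>(pc) + displ);
}

int main() {
  // 0x48000009 is "bl +8": primary opcode 18, LI = 2, AA = 0, LK = 1.
  assert((0x48000009u >> 26) == 18);
  assert(branch_target(0x48000009u, 0x82000000u) == 0x82000008u);
  return 0;
}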
bool XexModule::FindSaveRest() {
  // Special stack save/restore functions.
  // http://research.microsoft.com/en-us/um/redmond/projects/invisible/src/crt/md/ppc/xxx.s.htm

@@ -1552,6 +1714,8 @@ bool XexModule::FindSaveRest() {

  auto page_size = base_address_ <= 0x90000000 ? 64 * 1024 : 4 * 1024;
  auto sec_header = xex_security_info();
  std::vector<uint32_t> resolve_on_exit{};
  resolve_on_exit.reserve(256);
  for (uint32_t i = 0, page = 0; i < sec_header->page_descriptor_count; i++) {
    // Byteswap the bitfield manually.
    xex2_page_descriptor desc;
@@ -1586,13 +1750,20 @@ bool XexModule::FindSaveRest() {

  // Add function stubs.
  char name[32];

  auto AddXexFunction = [this, &resolve_on_exit](uint32_t address,
                                                 Function** function) {
    DeclareFunction(address, function);
    resolve_on_exit.push_back(address);
  };
  if (gplr_start) {
    uint32_t address = gplr_start;
    for (int n = 14; n <= 31; n++) {
      auto format_result =
          fmt::format_to_n(name, xe::countof(name), "__savegprlr_{}", n);
      Function* function;
      DeclareFunction(address, &function);

      AddXexFunction(address, &function);
      function->set_end_address(address + (31 - n) * 4 + 2 * 4);
      function->set_name(std::string_view(name, format_result.size));
      // TODO(benvanik): set type fn->type = FunctionSymbol::User;

@@ -1608,7 +1779,7 @@ bool XexModule::FindSaveRest() {
      auto format_result =
          fmt::format_to_n(name, xe::countof(name), "__restgprlr_{}", n);
      Function* function;
      DeclareFunction(address, &function);
      AddXexFunction(address, &function);
      function->set_end_address(address + (31 - n) * 4 + 3 * 4);
      function->set_name(std::string_view(name, format_result.size));
      // TODO(benvanik): set type fn->type = FunctionSymbol::User;

@@ -1625,7 +1796,7 @@ bool XexModule::FindSaveRest() {
      auto format_result =
          fmt::format_to_n(name, xe::countof(name), "__savefpr_{}", n);
      Function* function;
      DeclareFunction(address, &function);
      AddXexFunction(address, &function);
      function->set_end_address(address + (31 - n) * 4 + 1 * 4);
      function->set_name(std::string_view(name, format_result.size));
      // TODO(benvanik): set type fn->type = FunctionSymbol::User;

@@ -1641,7 +1812,7 @@ bool XexModule::FindSaveRest() {
      auto format_result =
          fmt::format_to_n(name, xe::countof(name), "__restfpr_{}", n);
      Function* function;
      DeclareFunction(address, &function);
      AddXexFunction(address, &function);
      function->set_end_address(address + (31 - n) * 4 + 1 * 4);
      function->set_name(std::string_view(name, format_result.size));
      // TODO(benvanik): set type fn->type = FunctionSymbol::User;

@@ -1663,7 +1834,7 @@ bool XexModule::FindSaveRest() {
      auto format_result =
          fmt::format_to_n(name, xe::countof(name), "__savevmx_{}", n);
      Function* function;
      DeclareFunction(address, &function);
      AddXexFunction(address, &function);
      function->set_name(std::string_view(name, format_result.size));
      // TODO(benvanik): set type fn->type = FunctionSymbol::User;
      // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveVmx;

@@ -1677,7 +1848,7 @@ bool XexModule::FindSaveRest() {
      auto format_result =
          fmt::format_to_n(name, xe::countof(name), "__savevmx_{}", n);
      Function* function;
      DeclareFunction(address, &function);
      AddXexFunction(address, &function);
      function->set_name(std::string_view(name, format_result.size));
      // TODO(benvanik): set type fn->type = FunctionSymbol::User;
      // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveVmx;

@@ -1691,7 +1862,7 @@ bool XexModule::FindSaveRest() {
      auto format_result =
          fmt::format_to_n(name, xe::countof(name), "__restvmx_{}", n);
      Function* function;
      DeclareFunction(address, &function);
      AddXexFunction(address, &function);
      function->set_name(std::string_view(name, format_result.size));
      // TODO(benvanik): set type fn->type = FunctionSymbol::User;
      // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestVmx;

@@ -1705,7 +1876,7 @@ bool XexModule::FindSaveRest() {
      auto format_result =
          fmt::format_to_n(name, xe::countof(name), "__restvmx_{}", n);
      Function* function;
      DeclareFunction(address, &function);
      AddXexFunction(address, &function);
      function->set_name(std::string_view(name, format_result.size));
      // TODO(benvanik): set type fn->type = FunctionSymbol::User;
      // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestVmx;

@@ -1716,6 +1887,13 @@ bool XexModule::FindSaveRest() {
    }
  }

  for (auto&& to_ensure_precompiled : resolve_on_exit) {
    // we want to make sure an address for these functions is available before
    // any other functions are compiled for code generation purposes but we do
    // it outside of our loops, because we also want to make sure we've marked
    // up the symbol with info about it being save/rest and whatnot
    processor_->ResolveFunction(to_ensure_precompiled);
  }
  return true;
}
@@ -34,7 +34,8 @@ struct InfoCacheFlags {
  uint32_t was_resolved : 1;  // has this address ever been called/requested
                              // via resolvefunction?
  uint32_t accessed_mmio : 1;
  uint32_t reserved : 30;
  uint32_t is_syscall_func : 1;
  uint32_t reserved : 29;
};
struct XexInfoCache {
  struct InfoCacheFlagsHeader {

@@ -209,7 +210,8 @@ class XexModule : public xe::cpu::Module {

  InfoCacheFlags* GetInstructionAddressFlags(uint32_t guest_addr);
  void PrecompileKnownFunctions();

  void PrecompileDiscoveredFunctions();
  std::vector<uint32_t> PreanalyzeCode();

 protected:
  std::unique_ptr<Function> CreateFunction(uint32_t address) override;
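In the InfoCacheFlags hunk above, the new is_syscall_func bit takes one bit away from the reserved field, so the whole record still packs into a single 32-bit word (1 + 1 + 1 + 29 = 32); keeping the size fixed presumably matters because XexInfoCache stores one of these records per instruction address. A quick standalone check of that layout (the struct below is just a copy of the fields from the hunk, kept separate for the assertion):

#include <cstdint>

struct InfoCacheFlags {
  uint32_t was_resolved : 1;
  uint32_t accessed_mmio : 1;
  uint32_t is_syscall_func : 1;
  uint32_t reserved : 29;
};

static_assert(sizeof(InfoCacheFlags) == sizeof(uint32_t),
              "flags must stay exactly one 32-bit word");

int main() { return 0; }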
@@ -9,7 +9,6 @@

#include <algorithm>
#include <vector>

#include "xenia/base/atomic.h"
#include "xenia/base/clock.h"
#include "xenia/base/logging.h"
@@ -964,7 +963,7 @@ uint32_t xeKeKfAcquireSpinLock(uint32_t* lock, uint64_t r13 = 1) {
  PrefetchForCAS(lock);
  assert_true(*lock != static_cast<uint32_t>(r13));
  // Lock.
  while (!xe::atomic_cas(0, static_cast<uint32_t>(r13), lock)) {
  while (!xe::atomic_cas(0, xe::byte_swap(static_cast<uint32_t>(r13)), lock)) {
    // Spin!
    // TODO(benvanik): error on deadlock?
    xe::threading::MaybeYield();

@@ -978,7 +977,7 @@ uint32_t xeKeKfAcquireSpinLock(uint32_t* lock, uint64_t r13 = 1) {
}

dword_result_t KfAcquireSpinLock_entry(lpdword_t lock_ptr,
                                       ppc_context_t& ppc_context) {
                                       const ppc_context_t& ppc_context) {
  auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
  return xeKeKfAcquireSpinLock(lock, ppc_context->r[13]);
}

@@ -997,7 +996,7 @@ void xeKeKfReleaseSpinLock(uint32_t* lock, dword_t old_irql) {
}

void KfReleaseSpinLock_entry(lpdword_t lock_ptr, dword_t old_irql,
                             ppc_context_t& ppc_ctx) {
                             const ppc_context_t& ppc_ctx) {
  auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());

  assert_true(*lock_ptr == static_cast<uint32_t>(ppc_ctx->r[13]));

@@ -1014,14 +1013,14 @@ DECLARE_XBOXKRNL_EXPORT2(KfReleaseSpinLock, kThreading, kImplemented,
                         kHighFrequency);
// todo: this is not accurate
void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr,
                                         ppc_context_t& ppc_ctx) {
                                         const ppc_context_t& ppc_ctx) {
  // Lock.
  auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
  // must not be our own thread
  assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13]));

  PrefetchForCAS(lock);
  while (!xe::atomic_cas(0, static_cast<uint32_t>(ppc_ctx->r[13]), lock)) {
  while (!xe::atomic_cas(0, xe::byte_swap(static_cast<uint32_t>(ppc_ctx->r[13])), lock)) {
#if XE_ARCH_AMD64 == 1
    // todo: this is just a nop if they don't have SMT, which is not great
    // either...

@@ -1036,12 +1035,12 @@ DECLARE_XBOXKRNL_EXPORT3(KeAcquireSpinLockAtRaisedIrql, kThreading,
                         kImplemented, kBlocking, kHighFrequency);

dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(
    lpdword_t lock_ptr, ppc_context_t& ppc_ctx) {
    lpdword_t lock_ptr, const ppc_context_t& ppc_ctx) {
  // Lock.
  auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
  assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13]));
  PrefetchForCAS(lock);
  if (!xe::atomic_cas(0, static_cast<uint32_t>(ppc_ctx->r[13]), lock)) {
  if (!xe::atomic_cas(0, xe::byte_swap(static_cast<uint32_t>(ppc_ctx->r[13])), lock)) {
    return 0;
  }
  return 1;

@@ -1050,7 +1049,7 @@ DECLARE_XBOXKRNL_EXPORT4(KeTryToAcquireSpinLockAtRaisedIrql, kThreading,
                         kImplemented, kBlocking, kHighFrequency, kSketchy);

void KeReleaseSpinLockFromRaisedIrql_entry(lpdword_t lock_ptr,
                                           ppc_context_t& ppc_ctx) {
                                           const ppc_context_t& ppc_ctx) {
  // Unlock.
  assert_true(*lock_ptr == static_cast<uint32_t>(ppc_ctx->r[13]));
  auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());

@@ -1283,7 +1282,8 @@ void ExInitializeReadWriteLock_entry(pointer_t<X_ERWLOCK> lock_ptr) {
}
DECLARE_XBOXKRNL_EXPORT1(ExInitializeReadWriteLock, kThreading, kImplemented);

void ExAcquireReadWriteLockExclusive_entry(pointer_t<X_ERWLOCK> lock_ptr, ppc_context_t& ppc_context) {
void ExAcquireReadWriteLockExclusive_entry(pointer_t<X_ERWLOCK> lock_ptr,
                                           const ppc_context_t& ppc_context) {
  auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);

  int32_t lock_count = ++lock_ptr->lock_count;

@@ -1301,7 +1301,7 @@ DECLARE_XBOXKRNL_EXPORT2(ExAcquireReadWriteLockExclusive, kThreading,
                         kImplemented, kBlocking);

dword_result_t ExTryToAcquireReadWriteLockExclusive_entry(
    pointer_t<X_ERWLOCK> lock_ptr, ppc_context_t& ppc_context) {
    pointer_t<X_ERWLOCK> lock_ptr, const ppc_context_t& ppc_context) {
  auto old_irql =
      xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);

@@ -1320,7 +1320,7 @@ DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockExclusive, kThreading,
                         kImplemented);

void ExAcquireReadWriteLockShared_entry(pointer_t<X_ERWLOCK> lock_ptr,
                                        ppc_context_t& ppc_context) {
                                        const ppc_context_t& ppc_context) {
  auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);

  int32_t lock_count = ++lock_ptr->lock_count;

@@ -1340,7 +1340,7 @@ DECLARE_XBOXKRNL_EXPORT2(ExAcquireReadWriteLockShared, kThreading, kImplemented,
                         kBlocking);

dword_result_t ExTryToAcquireReadWriteLockShared_entry(
    pointer_t<X_ERWLOCK> lock_ptr, ppc_context_t& ppc_context) {
    pointer_t<X_ERWLOCK> lock_ptr, const ppc_context_t& ppc_context) {
  auto old_irql =
      xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);

@@ -1361,7 +1361,7 @@ DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockShared, kThreading,
                         kImplemented);

void ExReleaseReadWriteLock_entry(pointer_t<X_ERWLOCK> lock_ptr,
                                  ppc_context_t& ppc_context) {
                                  const ppc_context_t& ppc_context) {
  auto old_irql =
      xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
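The spinlock hunks above byte-swap the r13 value before the compare-and-swap: the lock word lives in guest memory and is read by guest code with big-endian loads, so the owner value the host stores must be in guest byte order for guest-side checks to line up (the asserts compare through the lpdword_t guest pointer, which presumably performs its own byte swap on access). A minimal sketch of the idea using std::atomic in place of xe::atomic_cas and a hand-rolled byteswap standing in for xe::byte_swap; assumes a little-endian host:

#include <atomic>
#include <cassert>
#include <cstdint>

static uint32_t bswap32(uint32_t v) {
  return (v >> 24) | ((v >> 8) & 0x0000FF00u) | ((v << 8) & 0x00FF0000u) |
         (v << 24);
}

int main() {
  std::atomic<uint32_t> lock_word{0};  // guest-visible lock, big-endian contents
  uint32_t r13 = 0x00010000u;          // host-order per-thread pointer (the "owner" id)

  // Acquire: succeed only if the lock is free, storing the owner in guest order.
  uint32_t expected = 0;
  bool acquired = lock_word.compare_exchange_strong(expected, bswap32(r13));
  assert(acquired);

  // A guest big-endian load of the word now observes r13 again.
  assert(bswap32(lock_word.load()) == r13);
  return 0;
}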
@@ -21,7 +21,7 @@ namespace vfs {

NullDevice::NullDevice(const std::string& mount_path,
                       const std::initializer_list<std::string>& null_paths)
    : Device(mount_path), null_paths_(null_paths), name_("NullDevice") {}
    : Device(mount_path), name_("NullDevice"), null_paths_(null_paths) {}

NullDevice::~NullDevice() = default;
@@ -1 +1 @@
Subproject commit a437fe6d8efef17c8ad33d39f5815032e7adf5d7
Subproject commit fa4f77cf444cd30894a222148efc5a371b3f76a6