Merge branch 'canary_experimental' of https://github.com/xenia-canary/xenia-canary into canary_experimental

commit 6730ffb7d3
@@ -688,7 +688,12 @@ void X64ThunkEmitter::EmitLoadNonvolatileRegs() {
  vmovaps(xmm15, qword[rsp + offsetof(StackLayout::Thunk, xmm[9])]);
#endif
}

void X64Backend::InitializeBackendContext(void* ctx) {
  X64BackendContext* bctx = reinterpret_cast<X64BackendContext*>(
      reinterpret_cast<intptr_t>(ctx) - sizeof(X64BackendContext));
  bctx->ResolveFunction_Ptr = reinterpret_cast<void*>(&ResolveFunction);
  bctx->Ox1000 = 0x1000;
}
}  // namespace x64
}  // namespace backend
}  // namespace cpu

@@ -31,6 +31,16 @@ typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1);
typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1);
typedef void (*ResolveFunctionThunk)();

// located prior to the ctx register
// some things it would be nice to have be per-emulator instance instead of per
// context (somehow placing a global X64BackendCtx prior to membase, so we can
// negatively index the membase reg)
struct X64BackendContext {
  void* ResolveFunction_Ptr;  // cached pointer to resolvefunction
  unsigned int Ox1000;  // constant 0x1000 so we can shrink each tail emitted
                        // add of it by... 2 bytes lol
};

class X64Backend : public Backend {
 public:
  static const uint32_t kForceReturnAddress = 0x9FFF0000u;
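The comment above describes the layout this struct relies on: the backend context is allocated immediately before the guest context, so JIT'd code can reach it at a negative displacement from the context register (see InitializeBackendContext earlier in this diff and GetBackendCtxPtr further down). A standalone sketch of that addressing, with the hypothetical helper name BackendFromGuestCtx standing in for the backend's real plumbing:

  #include <cstdint>

  struct X64BackendContext {
    void* ResolveFunction_Ptr;
    unsigned int Ox1000;
  };

  // Assumed layout: [X64BackendContext][guest PPCContext ...]
  // The JIT keeps a register pointing at the *guest* context, so backend
  // fields are found at negative offsets from that pointer.
  inline X64BackendContext* BackendFromGuestCtx(void* guest_ctx) {
    return reinterpret_cast<X64BackendContext*>(
        reinterpret_cast<intptr_t>(guest_ctx) - sizeof(X64BackendContext));
  }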
@@ -65,6 +75,7 @@ class X64Backend : public Backend {
  void InstallBreakpoint(Breakpoint* breakpoint) override;
  void InstallBreakpoint(Breakpoint* breakpoint, Function* fn) override;
  void UninstallBreakpoint(Breakpoint* breakpoint) override;
  virtual void InitializeBackendContext(void* ctx) override;

 private:
  static bool ExceptionCallbackThunk(Exception* ex, void* data);

@@ -105,6 +105,7 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
  TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
  TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
  TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
  TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
#undef TEST_EMIT_FEATURE
  /*
    fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
@@ -121,6 +122,10 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
  bool is_zennish = cpu_.displayFamily >= 0x17;

  if (is_zennish) {
    // ik that i heard somewhere that this is the case for zen, but i need to
    // verify. cant find my original source for that.
    // todo: ask agner?
    feature_flags_ |= kX64FlagsIndependentVars;
    feature_flags_ |= kX64FastJrcx;

    if (cpu_.displayFamily > 0x17) {
@@ -132,6 +137,9 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
      // for my cpu, which is ripper90
    }
  }
  may_use_membase32_as_zero_reg_ =
      static_cast<uint32_t>(reinterpret_cast<uintptr_t>(
          processor()->memory()->virtual_membase())) == 0;
}

X64Emitter::~X64Emitter() = default;
@@ -210,6 +218,11 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
    top_ = old_address;
    reset();
    call_sites_.clear();
    tail_code_.clear();
    for (auto&& cached_label : label_cache_) {
      delete cached_label;
    }
    label_cache_.clear();
    return new_execute_address;
  }

@@ -261,13 +274,14 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {

  code_offsets.prolog_stack_alloc = getSize();
  code_offsets.body = getSize();

  xor_(eax, eax);
  /*
   * chrispy: removed this, it serves no purpose
  mov(qword[rsp + StackLayout::GUEST_CTX_HOME], GetContextReg());
  */
  mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rcx);
  mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], 0);

  mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], rax);  // 0

  // Safe now to do some tracing.
  if (debug_info_flags_ & DebugInfoFlags::kDebugInfoTraceFunctions) {
@@ -343,6 +357,13 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {

  add(rsp, (uint32_t)stack_size);
  ret();
  // todo: do some kind of sorting by alignment?
  for (auto&& tail_item : tail_code_) {
    if (tail_item.alignment) {
      align(tail_item.alignment);
    }
    tail_item.func(*this, tail_item.label);
  }

  code_offsets.tail = getSize();

@@ -605,12 +626,10 @@ void X64Emitter::CallExtern(const hir::Instr* instr, const Function* function) {
    // rdx = arg0
    // r8 = arg1
    // r9 = arg2
    auto thunk = backend()->guest_to_host_thunk();
    mov(rax, reinterpret_cast<uint64_t>(thunk));
    mov(rcx, reinterpret_cast<uint64_t>(builtin_function->handler()));
    mov(rdx, reinterpret_cast<uint64_t>(builtin_function->arg0()));
    mov(r8, reinterpret_cast<uint64_t>(builtin_function->arg1()));
    call(rax);
    call(backend()->guest_to_host_thunk());
    // rax = host return
  }
} else if (function->behavior() == Function::Behavior::kExtern) {
@@ -621,12 +640,10 @@ void X64Emitter::CallExtern(const hir::Instr* instr, const Function* function) {
    // rdx = arg0
    // r8 = arg1
    // r9 = arg2
    auto thunk = backend()->guest_to_host_thunk();
    mov(rax, reinterpret_cast<uint64_t>(thunk));
    mov(rcx, reinterpret_cast<uint64_t>(extern_function->extern_handler()));
    mov(rdx,
        qword[GetContextReg() + offsetof(ppc::PPCContext, kernel_state)]);
    call(rax);
    call(backend()->guest_to_host_thunk());
    // rax = host return
  }
}
@@ -656,10 +673,8 @@ void X64Emitter::CallNativeSafe(void* fn) {
  // rdx = arg0
  // r8 = arg1
  // r9 = arg2
  auto thunk = backend()->guest_to_host_thunk();
  mov(rax, reinterpret_cast<uint64_t>(thunk));
  mov(rcx, reinterpret_cast<uint64_t>(fn));
  call(rax);
  call(backend()->guest_to_host_thunk());
  // rax = host return
}

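The GuestToHostThunk typedef in x64_backend.h takes (target, arg0, arg1), and the call sites above load rcx, rdx and r8 with exactly those values before calling the thunk, so on the host side the emitted sequence behaves like the plain call below (a simplified illustration, not the backend's actual dispatch code; CallThroughThunk is a made-up name):

  // What the emitted "mov rcx/rdx/r8, ... ; call thunk" amounts to:
  typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1);

  static void* CallThroughThunk(GuestToHostThunk thunk, void* handler,
                                void* arg0, void* arg1) {
    // rcx = handler, rdx = arg0, r8 = arg1; rax carries the host return.
    return thunk(handler, arg0, arg1);
  }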
@@ -715,24 +730,50 @@ bool X64Emitter::ConstantFitsIn32Reg(uint64_t v) {
  }
  return false;
}

/*
  WARNING: do not use any regs here, addr is often produced by
  ComputeAddressOffset, which may use rax/rdx/rcx in its addr expression
*/
void X64Emitter::MovMem64(const Xbyak::RegExp& addr, uint64_t v) {
  if ((v & ~0x7FFFFFFF) == 0) {
  uint32_t lowpart = static_cast<uint32_t>(v);
  uint32_t highpart = static_cast<uint32_t>(v >> 32);
  // check whether the constant coincidentally collides with our membase
  if (v == (uintptr_t)processor()->memory()->virtual_membase()) {
    mov(qword[addr], GetMembaseReg());
  } else if ((v & ~0x7FFFFFFF) == 0) {
    // Fits under 31 bits, so just load using normal mov.

    mov(qword[addr], v);
  } else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) {
    // Negative number that fits in 32bits.
    mov(qword[addr], v);
  } else if (!(v >> 32)) {
  } else if (!highpart) {
    // All high bits are zero. It'd be nice if we had a way to load a 32bit
    // immediate without sign extending!
    // TODO(benvanik): this is super common, find a better way.
    mov(dword[addr], static_cast<uint32_t>(v));
    mov(dword[addr + 4], 0);
    if (lowpart == 0 && CanUseMembaseLow32As0()) {
      mov(dword[addr], GetMembaseReg().cvt32());
    } else {
      mov(dword[addr], static_cast<uint32_t>(v));
    }
    if (CanUseMembaseLow32As0()) {
      mov(dword[addr + 4], GetMembaseReg().cvt32());
    } else {
      mov(dword[addr + 4], 0);
    }
  } else {
    // 64bit number that needs double movs.
    mov(dword[addr], static_cast<uint32_t>(v));
    mov(dword[addr + 4], static_cast<uint32_t>(v >> 32));

    if (lowpart == 0 && CanUseMembaseLow32As0()) {
      mov(dword[addr], GetMembaseReg().cvt32());
    } else {
      mov(dword[addr], lowpart);
    }
    if (highpart == 0 && CanUseMembaseLow32As0()) {
      mov(dword[addr + 4], GetMembaseReg().cvt32());
    } else {
      mov(dword[addr + 4], highpart);
    }
  }
}
static inline vec128_t v128_setr_bytes(unsigned char v0, unsigned char v1,
@@ -893,7 +934,13 @@ static const vec128_t xmm_consts[] = {
    /* XMMThreeFloatMask */
    vec128i(~0U, ~0U, ~0U, 0U),
    /*XMMXenosF16ExtRangeStart*/
    vec128f(65504)};
    vec128f(65504),
    /*XMMVSRShlByteshuf*/
    v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80),
    // XMMVSRMask
    vec128b(1)

};

void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
  for (auto& vec : xmm_consts) {
@@ -1300,6 +1347,27 @@ SimdDomain X64Emitter::DeduceSimdDomain(const hir::Value* for_value) {

  return SimdDomain::DONTCARE;
}
Xbyak::Address X64Emitter::GetBackendCtxPtr(int offset_in_x64backendctx) {
  /*
    index context ptr negatively to get to backend ctx field
  */
  ptrdiff_t delta = (-static_cast<ptrdiff_t>(sizeof(X64BackendContext))) +
                    offset_in_x64backendctx;
  return ptr[GetContextReg() + static_cast<int>(delta)];
}
Xbyak::Label& X64Emitter::AddToTail(TailEmitCallback callback,
                                    uint32_t alignment) {
  TailEmitter emitter{};
  emitter.func = std::move(callback);
  emitter.alignment = alignment;
  tail_code_.push_back(std::move(emitter));
  return tail_code_.back().label;
}
Xbyak::Label& X64Emitter::NewCachedLabel() {
  Xbyak::Label* tmp = new Xbyak::Label;
  label_cache_.push_back(tmp);
  return *tmp;
}
}  // namespace x64
}  // namespace backend
}  // namespace cpu

@@ -155,7 +155,15 @@ enum XmmConst {
  XMMLVSRTableBase,
  XMMSingleDenormalMask,
  XMMThreeFloatMask,  // for clearing the fourth float prior to DOT_PRODUCT_3
  XMMXenosF16ExtRangeStart
  XMMXenosF16ExtRangeStart,
  XMMVSRShlByteshuf,
  XMMVSRMask
};
// X64Backend specific Instr->runtime_flags
enum : uint32_t {
  INSTR_X64_FLAGS_ELIMINATED =
      1,  // another sequence marked this instruction as not needing codegen,
          // meaning they likely already handled it
};

// Unfortunately due to the design of xbyak we have to pass this to the ctor.
@@ -185,7 +193,13 @@ enum X64EmitterFeatureFlags {
  kX64FastJrcx = 1 << 12,  // jrcxz is as fast as any other jump ( >= Zen1)
  kX64FastLoop =
      1 << 13,  // loop/loope/loopne is as fast as any other jump ( >= Zen2)
  kX64EmitAVX512VBMI = 1 << 14
  kX64EmitAVX512VBMI = 1 << 14,
  kX64FlagsIndependentVars =
      1 << 15,  // if true, instructions that only modify some flags (like
                // inc/dec) do not introduce false dependencies on EFLAGS
                // because the individual flags are treated as different vars by
                // the processor. (this applies to zen)
  kX64EmitPrefetchW = 1 << 16
};
class ResolvableGuestCall {
 public:
@@ -194,6 +208,13 @@ class ResolvableGuestCall {
  // rgcid
  unsigned offset_;
};
class X64Emitter;
using TailEmitCallback = std::function<void(X64Emitter& e, Xbyak::Label& lbl)>;
struct TailEmitter {
  Xbyak::Label label;
  uint32_t alignment;
  TailEmitCallback func;
};

class X64Emitter : public Xbyak::CodeGenerator {
 public:
@@ -264,7 +285,7 @@ class X64Emitter : public Xbyak::CodeGenerator {

  Xbyak::Reg64 GetContextReg();
  Xbyak::Reg64 GetMembaseReg();

  bool CanUseMembaseLow32As0() const { return may_use_membase32_as_zero_reg_; }
  void ReloadMembase();

  void nop(size_t length = 1);
@@ -274,6 +295,8 @@ class X64Emitter : public Xbyak::CodeGenerator {
  void MovMem64(const Xbyak::RegExp& addr, uint64_t v);

  Xbyak::Address GetXmmConstPtr(XmmConst id);
  Xbyak::Address GetBackendCtxPtr(int offset_in_x64backendctx);

  void LoadConstantXmm(Xbyak::Xmm dest, float v);
  void LoadConstantXmm(Xbyak::Xmm dest, double v);
  void LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v);
@@ -289,6 +312,8 @@ class X64Emitter : public Xbyak::CodeGenerator {
    return (feature_flags_ & feature_flag) == feature_flag;
  }

  Xbyak::Label& AddToTail(TailEmitCallback callback, uint32_t alignment = 0);
  Xbyak::Label& NewCachedLabel();
  FunctionDebugInfo* debug_info() const { return debug_info_; }

  size_t stack_size() const { return stack_size_; }
@@ -324,6 +349,16 @@ class X64Emitter : public Xbyak::CodeGenerator {
  static const uint32_t xmm_reg_map_[XMM_COUNT];
  uint32_t current_rgc_id_ = 0xEEDDF00F;
  std::vector<ResolvableGuestCall> call_sites_;
  /*
    set to true if the low 32 bits of membase == 0.
    only really advantageous if you are storing 32 bit 0 to a displaced address,
    which would have to represent 0 as 4 bytes
  */
  bool may_use_membase32_as_zero_reg_;
  std::vector<TailEmitter> tail_code_;
  std::vector<Xbyak::Label*>
      label_cache_;  // for creating labels that need to be referenced much
                     // later by tail emitters
};

}  // namespace x64

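A note on the may_use_membase32_as_zero_reg_ / CanUseMembaseLow32As0 trick declared above: when the low 32 bits of the membase pointer are known to be zero, storing that register's low half is a shorter encoding than storing a 4-byte zero immediate. A rough sketch of the size difference (register names and displacement are illustrative only, not the emitter's actual assignments):

  // mov dword [rcx+0x20], 0     -> C7 41 20 00 00 00 00  (7 bytes, imm32)
  // mov dword [rcx+0x20], ebp   -> 89 69 20              (3 bytes, reg source)
  //
  // so, when membase's low 32 bits are known to be 0, the sequences below
  // prefer:
  //   e.mov(e.dword[addr], e.GetMembaseReg().cvt32());
  // over:
  //   e.mov(e.dword[addr], 0);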
@ -109,7 +109,6 @@ struct DEBUG_BREAK_TRUE_I32
|
|||
: Sequence<DEBUG_BREAK_TRUE_I32,
|
||||
I<OPCODE_DEBUG_BREAK_TRUE, VoidOp, I32Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
|
||||
if (e.IsFeatureEnabled(kX64FastJrcx)) {
|
||||
e.mov(e.ecx, i.src1);
|
||||
Xbyak::Label skip;
|
||||
|
@ -187,77 +186,48 @@ EMITTER_OPCODE_TABLE(OPCODE_TRAP, TRAP);
|
|||
struct TRAP_TRUE_I8
|
||||
: Sequence<TRAP_TRUE_I8, I<OPCODE_TRAP_TRUE, VoidOp, I8Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
Xbyak::Label& after = e.NewCachedLabel();
|
||||
unsigned flags = i.instr->flags;
|
||||
Xbyak::Label& dotrap =
|
||||
e.AddToTail([flags, &after](X64Emitter& e, Xbyak::Label& me) {
|
||||
e.L(me);
|
||||
e.Trap(flags);
|
||||
// does Trap actually return control to the guest?
|
||||
e.jmp(after, X64Emitter::T_NEAR);
|
||||
});
|
||||
e.test(i.src1, i.src1);
|
||||
Xbyak::Label skip;
|
||||
e.jz(skip);
|
||||
e.Trap(i.instr->flags);
|
||||
e.L(skip);
|
||||
e.jnz(dotrap, X64Emitter::T_NEAR);
|
||||
e.L(after);
|
||||
}
|
||||
};
|
||||
struct TRAP_TRUE_I16
|
||||
: Sequence<TRAP_TRUE_I16, I<OPCODE_TRAP_TRUE, VoidOp, I16Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.test(i.src1, i.src1);
|
||||
Xbyak::Label skip;
|
||||
e.jz(skip);
|
||||
e.Trap(i.instr->flags);
|
||||
e.L(skip);
|
||||
assert_impossible_sequence(TRAP_TRUE_I16);
|
||||
}
|
||||
};
|
||||
struct TRAP_TRUE_I32
|
||||
: Sequence<TRAP_TRUE_I32, I<OPCODE_TRAP_TRUE, VoidOp, I32Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
if (e.IsFeatureEnabled(kX64FastJrcx)) {
|
||||
e.mov(e.ecx, i.src1);
|
||||
Xbyak::Label skip;
|
||||
e.jrcxz(skip);
|
||||
e.Trap(i.instr->flags);
|
||||
e.L(skip);
|
||||
} else {
|
||||
e.test(i.src1, i.src1);
|
||||
Xbyak::Label skip;
|
||||
e.jz(skip);
|
||||
e.Trap(i.instr->flags);
|
||||
e.L(skip);
|
||||
}
|
||||
assert_impossible_sequence(TRAP_TRUE_I32);
|
||||
}
|
||||
};
|
||||
struct TRAP_TRUE_I64
|
||||
: Sequence<TRAP_TRUE_I64, I<OPCODE_TRAP_TRUE, VoidOp, I64Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
if (e.IsFeatureEnabled(kX64FastJrcx)) {
|
||||
e.mov(e.rcx, i.src1);
|
||||
Xbyak::Label skip;
|
||||
e.jrcxz(skip);
|
||||
e.Trap(i.instr->flags);
|
||||
e.L(skip);
|
||||
} else {
|
||||
e.test(i.src1, i.src1);
|
||||
Xbyak::Label skip;
|
||||
e.jz(skip);
|
||||
e.Trap(i.instr->flags);
|
||||
e.L(skip);
|
||||
}
|
||||
assert_impossible_sequence(TRAP_TRUE_I64);
|
||||
}
|
||||
};
|
||||
struct TRAP_TRUE_F32
|
||||
: Sequence<TRAP_TRUE_F32, I<OPCODE_TRAP_TRUE, VoidOp, F32Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.vptest(i.src1, i.src1);
|
||||
Xbyak::Label skip;
|
||||
e.jz(skip);
|
||||
e.Trap(i.instr->flags);
|
||||
e.L(skip);
|
||||
assert_impossible_sequence(TRAP_TRUE_F32);
|
||||
}
|
||||
};
|
||||
struct TRAP_TRUE_F64
|
||||
: Sequence<TRAP_TRUE_F64, I<OPCODE_TRAP_TRUE, VoidOp, F64Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.vptest(i.src1, i.src1);
|
||||
Xbyak::Label skip;
|
||||
e.jz(skip);
|
||||
e.Trap(i.instr->flags);
|
||||
e.L(skip);
|
||||
assert_impossible_sequence(TRAP_TRUE_F64);
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_TRAP_TRUE, TRAP_TRUE_I8, TRAP_TRUE_I16,
|
||||
|
@ -333,6 +303,7 @@ struct CALL_TRUE_F32
|
|||
e.L(skip);
|
||||
}
|
||||
};
|
||||
|
||||
struct CALL_TRUE_F64
|
||||
: Sequence<CALL_TRUE_F64, I<OPCODE_CALL_TRUE, VoidOp, F64Op, SymbolOp>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
|
@ -388,7 +359,6 @@ struct CALL_INDIRECT_TRUE_I32
|
|||
: Sequence<CALL_INDIRECT_TRUE_I32,
|
||||
I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, I32Op, I64Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
|
||||
if (e.IsFeatureEnabled(kX64FastJrcx)) {
|
||||
e.mov(e.ecx, i.src1);
|
||||
Xbyak::Label skip;
|
||||
|
|
|
@@ -14,6 +14,7 @@

#include "xenia/base/cvar.h"
#include "xenia/base/memory.h"
#include "xenia/cpu/backend/x64/x64_backend.h"
#include "xenia/cpu/backend/x64/x64_op.h"
#include "xenia/cpu/backend/x64/x64_tracers.h"
#include "xenia/cpu/ppc/ppc_context.h"
@@ -28,8 +29,127 @@ namespace cpu {
namespace backend {
namespace x64 {

struct LoadModStore {
  const hir::Instr* load;
  hir::Instr* modify;
  hir::Instr* store;

  bool is_constant[3];
  void Consume();
};
void LoadModStore::Consume() {
  modify->backend_flags |= INSTR_X64_FLAGS_ELIMINATED;
  store->backend_flags |= INSTR_X64_FLAGS_ELIMINATED;
}
static bool GetLoadModStore(const hir::Instr* loadinsn, LoadModStore* out) {
  if (IsTracingData()) {
    return false;
  }
  // if (!loadinsn->dest->HasSingleUse()) {
  // allow the value to be used multiple times, as long as it is by the same
  // instruction
  if (!loadinsn->dest->AllUsesByOneInsn()) {
    return false;
  }
  hir::Instr* use = loadinsn->dest->use_head->instr;

  if (!use->dest || !use->dest->HasSingleUse() ||
      use->GetNonFakePrev() != loadinsn) {
    return false;
  }

  hir::Instr* shouldbstore = use->dest->use_head->instr;

  if (shouldbstore->dest || shouldbstore->GetNonFakePrev() != use) {
    return false;  // store insns have no destination
  }
  use->VisitValueOperands([out](Value* v, uint32_t idx) {
    out->is_constant[idx] = v->IsConstant();
  });
  out->load = loadinsn;
  out->modify = use;
  out->store = shouldbstore;
  return true;
}
struct LoadModStoreContext : public LoadModStore {
  uint64_t offset;  // ctx offset
  TypeName type;
  Opcode op;
  bool is_commutative;
  bool is_unary;
  bool is_binary;
  bool
      binary_uses_twice;  // true if binary_other == our value. (for instance,
                          // add r11, r10, r10, which can be gen'ed for r10 * 2)
  hir::Value* binary_other;

  hir::Value::ConstantValue* other_const;
  uint32_t other_index;
};
static bool GetLoadModStoreContext(const hir::Instr* loadinsn,
                                   LoadModStoreContext* out) {
  if (!GetLoadModStore(loadinsn, out)) {
    return false;
  }

  if (out->load->opcode->num != OPCODE_LOAD_CONTEXT ||
      out->store->opcode->num != OPCODE_STORE_CONTEXT) {
    return false;
  }

  if (out->modify->opcode->flags &
      (OPCODE_FLAG_VOLATILE | OPCODE_FLAG_MEMORY)) {
    return false;
  }
  uint64_t offs = out->load->src1.offset;

  if (offs != out->store->src1.offset) {
    return false;
  }

  TypeName typ = out->load->dest->type;
  // can happen if op is a conversion
  if (typ != out->store->src2.value->type) {
    return false;
  }
  /*
    set up a whole bunch of convenience fields for the caller
  */
  out->offset = offs;
  out->type = typ;
  const OpcodeInfo& opinf = *out->modify->opcode;
  out->op = opinf.num;
  out->is_commutative = opinf.flags & OPCODE_FLAG_COMMUNATIVE;
  out->is_unary = IsOpcodeUnaryValue(opinf.signature);
  out->is_binary = IsOpcodeBinaryValue(opinf.signature);
  out->binary_uses_twice = false;
  out->binary_other = nullptr;
  out->other_const = nullptr;
  out->other_index = ~0U;
  if (out->is_binary) {
    if (out->modify->src1.value == out->load->dest) {
      out->binary_other = out->modify->src2.value;
      out->other_index = 1;
    } else {
      out->binary_other = out->modify->src1.value;
      out->other_index = 0;
    }
    if (out->binary_other && out->is_constant[out->other_index]) {
      out->other_const = &out->binary_other->constant;
    }
    if (out->binary_other == out->load->dest) {
      out->binary_uses_twice = true;
    }
  }
  return true;
}
volatile int anchor_memory = 0;

static void Do0x1000Add(X64Emitter& e, Reg32 reg) {
  e.add(reg, e.GetBackendCtxPtr(offsetof(X64BackendContext, Ox1000)));
  // e.add(reg, 0x1000);
}

// Note: all types are always aligned in the context.
RegExp ComputeContextAddress(X64Emitter& e, const OffsetOp& offset) {
  return e.GetContextReg() + offset.value;
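To make the intent of these load/modify/store helpers concrete: they let the LOAD_CONTEXT_I64 sequence further down collapse a load-context / ALU-op / store-context triple into one read-modify-write x86 instruction on the context block. A sketch of the transformation, using a made-up context offset:

  // HIR before the peephole:
  //   v0 = load_context +0x128 (i64)
  //   v1 = add v0, 1
  //   store_context +0x128, v1
  //
  // With GetLoadModStoreContext matching and kX64FlagsIndependentVars set,
  // HandleLMS64Binary (later in this diff) emits a single RMW instead:
  //   inc qword [ctx + 0x128]
  // and LoadModStore::Consume() tags the add and the store with
  // INSTR_X64_FLAGS_ELIMINATED so their own sequences skip codegen.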
@ -58,51 +178,6 @@ static bool is_definitely_not_eo(const T& v) {
|
|||
|
||||
return is_eo_def(v.value);
|
||||
}
|
||||
template <typename T>
|
||||
RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest,
|
||||
const T& offset) {
|
||||
assert_true(offset.is_constant);
|
||||
int32_t offset_const = static_cast<int32_t>(offset.constant());
|
||||
|
||||
if (guest.is_constant) {
|
||||
uint32_t address = static_cast<uint32_t>(guest.constant());
|
||||
address += offset_const;
|
||||
if (address < 0x80000000) {
|
||||
return e.GetMembaseReg() + address;
|
||||
} else {
|
||||
if (address >= 0xE0000000 &&
|
||||
xe::memory::allocation_granularity() > 0x1000) {
|
||||
e.mov(e.eax, address + 0x1000);
|
||||
} else {
|
||||
e.mov(e.eax, address);
|
||||
}
|
||||
return e.GetMembaseReg() + e.rax;
|
||||
}
|
||||
} else {
|
||||
if (xe::memory::allocation_granularity() > 0x1000 &&
|
||||
!is_definitely_not_eo(guest)) {
|
||||
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
|
||||
// it via memory mapping.
|
||||
|
||||
// todo: do branching or use an alt membase and cmov
|
||||
e.xor_(e.eax, e.eax);
|
||||
e.lea(e.edx, e.ptr[guest.reg().cvt32() + offset_const]);
|
||||
|
||||
e.cmp(e.edx, e.GetContextReg().cvt32());
|
||||
e.setae(e.al);
|
||||
e.shl(e.eax, 12);
|
||||
e.add(e.eax, e.edx);
|
||||
return e.GetMembaseReg() + e.rax;
|
||||
|
||||
} else {
|
||||
// Clear the top 32 bits, as they are likely garbage.
|
||||
// TODO(benvanik): find a way to avoid doing this.
|
||||
|
||||
e.mov(e.eax, guest.reg().cvt32());
|
||||
}
|
||||
return e.GetMembaseReg() + e.rax + offset_const;
|
||||
}
|
||||
}
|
||||
// Note: most *should* be aligned, but needs to be checked!
|
||||
template <typename T>
|
||||
RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
|
||||
|
@ -127,11 +202,23 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
|
|||
!is_definitely_not_eo(guest)) {
|
||||
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
|
||||
// it via memory mapping.
|
||||
e.xor_(e.eax, e.eax);
|
||||
Xbyak::Label& jmpback = e.NewCachedLabel();
|
||||
|
||||
e.mov(e.eax, guest.reg().cvt32());
|
||||
|
||||
e.cmp(guest.reg().cvt32(), e.GetContextReg().cvt32());
|
||||
e.setae(e.al);
|
||||
e.shl(e.eax, 12);
|
||||
e.add(e.eax, guest.reg().cvt32());
|
||||
|
||||
Xbyak::Label& fixup_label =
|
||||
e.AddToTail([&jmpback](X64Emitter& e, Xbyak::Label& our_tail_label) {
|
||||
e.L(our_tail_label);
|
||||
Do0x1000Add(e, e.eax);
|
||||
e.jmp(jmpback, e.T_NEAR);
|
||||
});
|
||||
e.jae(fixup_label, e.T_NEAR);
|
||||
|
||||
e.L(jmpback);
|
||||
return e.GetMembaseReg() + e.rax;
|
||||
|
||||
} else {
|
||||
// Clear the top 32 bits, as they are likely garbage.
|
||||
// TODO(benvanik): find a way to avoid doing this.
|
||||
|
@ -140,6 +227,64 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
|
|||
return e.GetMembaseReg() + e.rax;
|
||||
}
|
||||
}
|
||||
template <typename T>
|
||||
RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest,
|
||||
const T& offset) {
|
||||
assert_true(offset.is_constant);
|
||||
int32_t offset_const = static_cast<int32_t>(offset.constant());
|
||||
if (offset_const == 0) {
|
||||
return ComputeMemoryAddress(e, guest);
|
||||
}
|
||||
if (guest.is_constant) {
|
||||
uint32_t address = static_cast<uint32_t>(guest.constant());
|
||||
address += offset_const;
|
||||
if (address < 0x80000000) {
|
||||
return e.GetMembaseReg() + address;
|
||||
} else {
|
||||
if (address >= 0xE0000000 &&
|
||||
xe::memory::allocation_granularity() > 0x1000) {
|
||||
e.mov(e.eax, address + 0x1000);
|
||||
} else {
|
||||
e.mov(e.eax, address);
|
||||
}
|
||||
return e.GetMembaseReg() + e.rax;
|
||||
}
|
||||
} else {
|
||||
if (xe::memory::allocation_granularity() > 0x1000 &&
|
||||
!is_definitely_not_eo(guest)) {
|
||||
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
|
||||
// it via memory mapping.
|
||||
|
||||
// todo: do branching or use an alt membase and cmov
|
||||
|
||||
Xbyak::Label& tmplbl = e.NewCachedLabel();
|
||||
|
||||
e.lea(e.edx, e.ptr[guest.reg().cvt32() + offset_const]);
|
||||
|
||||
e.cmp(e.edx, e.GetContextReg().cvt32());
|
||||
|
||||
Xbyak::Label& fixup_label =
|
||||
e.AddToTail([&tmplbl](X64Emitter& e, Xbyak::Label& our_tail_label) {
|
||||
e.L(our_tail_label);
|
||||
|
||||
Do0x1000Add(e, e.edx);
|
||||
|
||||
e.jmp(tmplbl, e.T_NEAR);
|
||||
});
|
||||
e.jae(fixup_label, e.T_NEAR);
|
||||
|
||||
e.L(tmplbl);
|
||||
return e.GetMembaseReg() + e.rdx;
|
||||
|
||||
} else {
|
||||
// Clear the top 32 bits, as they are likely garbage.
|
||||
// TODO(benvanik): find a way to avoid doing this.
|
||||
|
||||
e.mov(e.eax, guest.reg().cvt32());
|
||||
}
|
||||
return e.GetMembaseReg() + e.rax + offset_const;
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// OPCODE_ATOMIC_EXCHANGE
|
||||
|
@ -214,11 +359,20 @@ struct ATOMIC_COMPARE_EXCHANGE_I32
|
|||
if (xe::memory::allocation_granularity() > 0x1000) {
|
||||
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
|
||||
// it via memory mapping.
|
||||
e.mov(e.ecx, i.src1.reg().cvt32());
|
||||
e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
|
||||
e.setae(e.cl);
|
||||
e.movzx(e.ecx, e.cl);
|
||||
e.shl(e.ecx, 12);
|
||||
e.add(e.ecx, i.src1.reg().cvt32());
|
||||
Xbyak::Label& backtous = e.NewCachedLabel();
|
||||
|
||||
Xbyak::Label& fixup_label =
|
||||
e.AddToTail([&backtous](X64Emitter& e, Xbyak::Label& our_tail_label) {
|
||||
e.L(our_tail_label);
|
||||
|
||||
Do0x1000Add(e, e.ecx);
|
||||
|
||||
e.jmp(backtous, e.T_NEAR);
|
||||
});
|
||||
e.jae(fixup_label, e.T_NEAR);
|
||||
e.L(backtous);
|
||||
} else {
|
||||
e.mov(e.ecx, i.src1.reg().cvt32());
|
||||
}
|
||||
|
@ -235,11 +389,20 @@ struct ATOMIC_COMPARE_EXCHANGE_I64
|
|||
if (xe::memory::allocation_granularity() > 0x1000) {
|
||||
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
|
||||
// it via memory mapping.
|
||||
e.mov(e.ecx, i.src1.reg().cvt32());
|
||||
e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
|
||||
e.setae(e.cl);
|
||||
e.movzx(e.ecx, e.cl);
|
||||
e.shl(e.ecx, 12);
|
||||
e.add(e.ecx, i.src1.reg().cvt32());
|
||||
Xbyak::Label& backtous = e.NewCachedLabel();
|
||||
|
||||
Xbyak::Label& fixup_label =
|
||||
e.AddToTail([&backtous](X64Emitter& e, Xbyak::Label& our_tail_label) {
|
||||
e.L(our_tail_label);
|
||||
|
||||
Do0x1000Add(e, e.ecx);
|
||||
|
||||
e.jmp(backtous, e.T_NEAR);
|
||||
});
|
||||
e.jae(fixup_label, e.T_NEAR);
|
||||
e.L(backtous);
|
||||
} else {
|
||||
e.mov(e.ecx, i.src1.reg().cvt32());
|
||||
}
|
||||
|
@ -319,25 +482,44 @@ struct STORE_LOCAL_I8
|
|||
e.mov(e.byte[e.rsp + i.src1.constant()], i.src2);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
static bool LocalStoreMayUseMembaseLow(X64Emitter& e, const T& i) {
|
||||
return i.src2.is_constant && i.src2.constant() == 0 &&
|
||||
e.CanUseMembaseLow32As0();
|
||||
}
|
||||
struct STORE_LOCAL_I16
|
||||
: Sequence<STORE_LOCAL_I16, I<OPCODE_STORE_LOCAL, VoidOp, I32Op, I16Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
// e.TraceStoreI16(DATA_LOCAL, i.src1.constant, i.src2);
|
||||
e.mov(e.word[e.rsp + i.src1.constant()], i.src2);
|
||||
if (LocalStoreMayUseMembaseLow(e, i)) {
|
||||
e.mov(e.word[e.rsp + i.src1.constant()], e.GetMembaseReg().cvt16());
|
||||
} else {
|
||||
e.mov(e.word[e.rsp + i.src1.constant()], i.src2);
|
||||
}
|
||||
}
|
||||
};
|
||||
struct STORE_LOCAL_I32
|
||||
: Sequence<STORE_LOCAL_I32, I<OPCODE_STORE_LOCAL, VoidOp, I32Op, I32Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
// e.TraceStoreI32(DATA_LOCAL, i.src1.constant, i.src2);
|
||||
e.mov(e.dword[e.rsp + i.src1.constant()], i.src2);
|
||||
if (LocalStoreMayUseMembaseLow(e, i)) {
|
||||
e.mov(e.dword[e.rsp + i.src1.constant()], e.GetMembaseReg().cvt32());
|
||||
} else {
|
||||
e.mov(e.dword[e.rsp + i.src1.constant()], i.src2);
|
||||
}
|
||||
}
|
||||
};
|
||||
struct STORE_LOCAL_I64
|
||||
: Sequence<STORE_LOCAL_I64, I<OPCODE_STORE_LOCAL, VoidOp, I32Op, I64Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
// e.TraceStoreI64(DATA_LOCAL, i.src1.constant, i.src2);
|
||||
e.mov(e.qword[e.rsp + i.src1.constant()], i.src2);
|
||||
if (i.src2.is_constant && i.src2.constant() == 0) {
|
||||
e.xor_(e.eax, e.eax);
|
||||
e.mov(e.qword[e.rsp + i.src1.constant()], e.rax);
|
||||
} else {
|
||||
e.mov(e.qword[e.rsp + i.src1.constant()], i.src2);
|
||||
}
|
||||
}
|
||||
};
|
||||
struct STORE_LOCAL_F32
|
||||
|
@ -404,10 +586,133 @@ struct LOAD_CONTEXT_I32
|
|||
}
|
||||
}
|
||||
};
|
||||
template <typename EmitArgType>
|
||||
static bool HandleLMS64Binary(X64Emitter& e, const EmitArgType& i,
|
||||
LoadModStoreContext& lms, Xbyak::RegExp& addr) {
|
||||
uint64_t other_const_val = 0;
|
||||
bool const_fits_in_insn = false;
|
||||
if (lms.other_const) {
|
||||
other_const_val = lms.other_const->u64;
|
||||
const_fits_in_insn = e.ConstantFitsIn32Reg(other_const_val);
|
||||
}
|
||||
|
||||
/*
|
||||
this check is here because we currently cannot handle other variables
|
||||
with this
|
||||
*/
|
||||
if (!lms.other_const && !lms.binary_uses_twice) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (lms.op == OPCODE_ADD) {
|
||||
if (lms.other_const) {
|
||||
if (const_fits_in_insn) {
|
||||
if (other_const_val == 1 &&
|
||||
e.IsFeatureEnabled(kX64FlagsIndependentVars)) {
|
||||
e.inc(e.qword[addr]);
|
||||
} else {
|
||||
e.add(e.qword[addr], (uint32_t)other_const_val);
|
||||
}
|
||||
|
||||
} else {
|
||||
e.mov(e.rax, other_const_val);
|
||||
e.add(e.qword[addr], e.rax);
|
||||
}
|
||||
return true;
|
||||
} else if (lms.binary_uses_twice) {
|
||||
// we're being added to ourselves, we are a multiply by 2
|
||||
|
||||
e.shl(e.qword[addr], 1);
|
||||
return true;
|
||||
} else if (lms.binary_other) {
|
||||
return false; // cannot handle other variables right now.
|
||||
}
|
||||
} else if (lms.op == OPCODE_SUB) {
|
||||
if (lms.other_index != 1) {
|
||||
return false; // if we are the second operand, we cant combine memory
|
||||
// access and operation
|
||||
}
|
||||
|
||||
if (lms.other_const) {
|
||||
if (const_fits_in_insn) {
|
||||
if (other_const_val == 1 &&
|
||||
e.IsFeatureEnabled(kX64FlagsIndependentVars)) {
|
||||
e.dec(e.qword[addr]);
|
||||
} else {
|
||||
e.sub(e.qword[addr], (uint32_t)other_const_val);
|
||||
}
|
||||
|
||||
} else {
|
||||
e.mov(e.rax, other_const_val);
|
||||
e.sub(e.qword[addr], e.rax);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} else if (lms.op == OPCODE_AND) {
|
||||
if (lms.other_const) {
|
||||
if (const_fits_in_insn) {
|
||||
e.and_(e.qword[addr], (uint32_t)other_const_val);
|
||||
} else {
|
||||
e.mov(e.rax, other_const_val);
|
||||
e.and_(e.qword[addr], e.rax);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
} else if (lms.op == OPCODE_OR) {
|
||||
if (lms.other_const) {
|
||||
if (const_fits_in_insn) {
|
||||
e.or_(e.qword[addr], (uint32_t)other_const_val);
|
||||
} else {
|
||||
e.mov(e.rax, other_const_val);
|
||||
e.or_(e.qword[addr], e.rax);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
} else if (lms.op == OPCODE_XOR) {
|
||||
if (lms.other_const) {
|
||||
if (const_fits_in_insn) {
|
||||
e.xor_(e.qword[addr], (uint32_t)other_const_val);
|
||||
} else {
|
||||
e.mov(e.rax, other_const_val);
|
||||
e.xor_(e.qword[addr], e.rax);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
template <typename EmitArgType>
|
||||
static bool HandleLMS64Unary(X64Emitter& e, const EmitArgType& i,
|
||||
LoadModStoreContext& lms, Xbyak::RegExp& addr) {
|
||||
Opcode op = lms.op;
|
||||
|
||||
if (op == OPCODE_NOT) {
|
||||
e.not_(e.qword[addr]);
|
||||
return true;
|
||||
} else if (op == OPCODE_NEG) {
|
||||
e.neg(e.qword[addr]);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
struct LOAD_CONTEXT_I64
|
||||
: Sequence<LOAD_CONTEXT_I64, I<OPCODE_LOAD_CONTEXT, I64Op, OffsetOp>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
auto addr = ComputeContextAddress(e, i.src1);
|
||||
LoadModStoreContext lms{};
|
||||
if (GetLoadModStoreContext(i.instr, &lms)) {
|
||||
if (lms.is_binary && HandleLMS64Binary(e, i, lms, addr)) {
|
||||
lms.Consume();
|
||||
return;
|
||||
} else if (lms.is_unary && HandleLMS64Unary(e, i, lms, addr)) {
|
||||
lms.Consume();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
e.mov(i.dest, e.qword[addr]);
|
||||
if (IsTracingData()) {
|
||||
e.mov(e.GetNativeParam(1), e.qword[addr]);
|
||||
|
@ -483,7 +788,11 @@ struct STORE_CONTEXT_I16
|
|||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
auto addr = ComputeContextAddress(e, i.src1);
|
||||
if (i.src2.is_constant) {
|
||||
e.mov(e.word[addr], i.src2.constant());
|
||||
if (i.src2.constant() == 0 && e.CanUseMembaseLow32As0()) {
|
||||
e.mov(e.word[addr], e.GetMembaseReg().cvt16());
|
||||
} else {
|
||||
e.mov(e.word[addr], i.src2.constant());
|
||||
}
|
||||
} else {
|
||||
e.mov(e.word[addr], i.src2);
|
||||
}
|
||||
|
@ -500,7 +809,11 @@ struct STORE_CONTEXT_I32
|
|||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
auto addr = ComputeContextAddress(e, i.src1);
|
||||
if (i.src2.is_constant) {
|
||||
e.mov(e.dword[addr], i.src2.constant());
|
||||
if (i.src2.constant() == 0 && e.CanUseMembaseLow32As0()) {
|
||||
e.mov(e.dword[addr], e.GetMembaseReg().cvt32());
|
||||
} else {
|
||||
e.mov(e.dword[addr], i.src2.constant());
|
||||
}
|
||||
} else {
|
||||
e.mov(e.dword[addr], i.src2);
|
||||
}
|
||||
|
@ -569,9 +882,14 @@ struct STORE_CONTEXT_V128
|
|||
auto addr = ComputeContextAddress(e, i.src1);
|
||||
if (i.src2.is_constant) {
|
||||
e.LoadConstantXmm(e.xmm0, i.src2.constant());
|
||||
e.vmovaps(e.ptr[addr], e.xmm0);
|
||||
e.vmovdqa(e.ptr[addr], e.xmm0);
|
||||
} else {
|
||||
e.vmovaps(e.ptr[addr], i.src2);
|
||||
SimdDomain domain = e.DeduceSimdDomain(i.src2.value);
|
||||
if (domain == SimdDomain::FLOATING) {
|
||||
e.vmovaps(e.ptr[addr], i.src2);
|
||||
} else {
|
||||
e.vmovdqa(e.ptr[addr], i.src2);
|
||||
}
|
||||
}
|
||||
if (IsTracingData()) {
|
||||
e.lea(e.GetNativeParam(1), e.ptr[addr]);
|
||||
|
@ -735,7 +1053,11 @@ struct STORE_OFFSET_I16
|
|||
}
|
||||
} else {
|
||||
if (i.src3.is_constant) {
|
||||
e.mov(e.word[addr], i.src3.constant());
|
||||
if (i.src3.constant() == 0 && e.CanUseMembaseLow32As0()) {
|
||||
e.mov(e.word[addr], e.GetMembaseReg().cvt16());
|
||||
} else {
|
||||
e.mov(e.word[addr], i.src3.constant());
|
||||
}
|
||||
} else {
|
||||
e.mov(e.word[addr], i.src3);
|
||||
}
|
||||
|
@ -757,7 +1079,11 @@ struct STORE_OFFSET_I32
|
|||
}
|
||||
} else {
|
||||
if (i.src3.is_constant) {
|
||||
e.mov(e.dword[addr], i.src3.constant());
|
||||
if (i.src3.constant() == 0 && e.CanUseMembaseLow32As0()) {
|
||||
e.mov(e.dword[addr], e.GetMembaseReg().cvt32());
|
||||
} else {
|
||||
e.mov(e.dword[addr], i.src3.constant());
|
||||
}
|
||||
} else {
|
||||
e.mov(e.dword[addr], i.src3);
|
||||
}
|
||||
|
@ -895,7 +1221,7 @@ struct LOAD_V128 : Sequence<LOAD_V128, I<OPCODE_LOAD, V128Op, I64Op>> {
|
|||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
auto addr = ComputeMemoryAddress(e, i.src1);
|
||||
// TODO(benvanik): we should try to stick to movaps if possible.
|
||||
e.vmovups(i.dest, e.ptr[addr]);
|
||||
e.vmovdqa(i.dest, e.ptr[addr]);
|
||||
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
|
||||
// TODO(benvanik): find a way to do this without the memory load.
|
||||
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteSwapMask));
|
||||
|
@ -1054,13 +1380,15 @@ struct STORE_V128
|
|||
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
|
||||
assert_false(i.src2.is_constant);
|
||||
e.vpshufb(e.xmm0, i.src2, e.GetXmmConstPtr(XMMByteSwapMask));
|
||||
e.vmovaps(e.ptr[addr], e.xmm0);
|
||||
// changed from vmovaps, the penalty on the vpshufb is unavoidable but
|
||||
// we dont need to incur another here too
|
||||
e.vmovdqa(e.ptr[addr], e.xmm0);
|
||||
} else {
|
||||
if (i.src2.is_constant) {
|
||||
e.LoadConstantXmm(e.xmm0, i.src2.constant());
|
||||
e.vmovaps(e.ptr[addr], e.xmm0);
|
||||
e.vmovdqa(e.ptr[addr], e.xmm0);
|
||||
} else {
|
||||
e.vmovaps(e.ptr[addr], i.src2);
|
||||
e.vmovdqa(e.ptr[addr], i.src2);
|
||||
}
|
||||
}
|
||||
if (IsTracingData()) {
|
||||
|
@ -1081,10 +1409,12 @@ struct CACHE_CONTROL
|
|||
: Sequence<CACHE_CONTROL,
|
||||
I<OPCODE_CACHE_CONTROL, VoidOp, I64Op, OffsetOp>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
bool is_clflush = false, is_prefetch = false;
|
||||
bool is_clflush = false, is_prefetch = false, is_prefetchw = false;
|
||||
switch (CacheControlType(i.instr->flags)) {
|
||||
case CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH:
|
||||
case CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH_FOR_STORE:
|
||||
is_prefetchw = true;
|
||||
break;
|
||||
case CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH:
|
||||
is_prefetch = true;
|
||||
break;
|
||||
case CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE:
|
||||
|
@ -1095,6 +1425,11 @@ struct CACHE_CONTROL
|
|||
assert_unhandled_case(CacheControlType(i.instr->flags));
|
||||
return;
|
||||
}
|
||||
if (is_prefetchw && !e.IsFeatureEnabled(kX64EmitPrefetchW)) {
|
||||
is_prefetchw = false;
|
||||
is_prefetch = true; // cant prefetchw, cpu doesnt have it (unlikely to
|
||||
// happen). just prefetcht0
|
||||
}
|
||||
size_t cache_line_size = i.src2.value;
|
||||
|
||||
RegExp addr;
|
||||
|
@ -1117,13 +1452,24 @@ struct CACHE_CONTROL
|
|||
}
|
||||
} else {
|
||||
if (xe::memory::allocation_granularity() > 0x1000) {
|
||||
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
|
||||
// it via memory mapping.
|
||||
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't
|
||||
// do it via memory mapping.
|
||||
e.mov(e.eax, i.src1.reg().cvt32());
|
||||
|
||||
e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
|
||||
e.setae(e.al);
|
||||
e.movzx(e.eax, e.al);
|
||||
e.shl(e.eax, 12);
|
||||
e.add(e.eax, i.src1.reg().cvt32());
|
||||
|
||||
Xbyak::Label& tmplbl = e.NewCachedLabel();
|
||||
|
||||
Xbyak::Label& fixup_label =
|
||||
e.AddToTail([&tmplbl](X64Emitter& e, Xbyak::Label& our_tail_label) {
|
||||
e.L(our_tail_label);
|
||||
|
||||
Do0x1000Add(e, e.eax);
|
||||
|
||||
e.jmp(tmplbl, e.T_NEAR);
|
||||
});
|
||||
e.jae(fixup_label, e.T_NEAR);
|
||||
e.L(tmplbl);
|
||||
} else {
|
||||
// Clear the top 32 bits, as they are likely garbage.
|
||||
// TODO(benvanik): find a way to avoid doing this.
|
||||
|
@ -1131,12 +1477,17 @@ struct CACHE_CONTROL
|
|||
}
|
||||
addr = e.GetMembaseReg() + e.rax;
|
||||
}
|
||||
// todo: use clflushopt + sfence on cpus that support it
|
||||
if (is_clflush) {
|
||||
e.clflush(e.ptr[addr]);
|
||||
}
|
||||
|
||||
if (is_prefetch) {
|
||||
e.prefetcht0(e.ptr[addr]);
|
||||
}
|
||||
if (is_prefetchw) {
|
||||
e.prefetchw(e.ptr[addr]);
|
||||
}
|
||||
|
||||
if (cache_line_size >= 128) {
|
||||
// Prefetch the other 64 bytes of the 128-byte cache line.
|
||||
|
@ -1151,6 +1502,9 @@ struct CACHE_CONTROL
|
|||
if (is_prefetch) {
|
||||
e.prefetcht0(e.ptr[addr]);
|
||||
}
|
||||
if (is_prefetchw) {
|
||||
e.prefetchw(e.ptr[addr]);
|
||||
}
|
||||
assert_true(cache_line_size == 128);
|
||||
}
|
||||
}
|
||||
|
@ -1178,20 +1532,24 @@ struct MEMSET_I64_I8_I64
|
|||
assert_true(i.src2.constant() == 0);
|
||||
e.vpxor(e.xmm0, e.xmm0);
|
||||
auto addr = ComputeMemoryAddress(e, i.src1);
|
||||
/*
|
||||
chrispy: changed to vmovdqa, the mismatch between vpxor and vmovaps
|
||||
was causing a 1 cycle stall before the first store
|
||||
*/
|
||||
switch (i.src3.constant()) {
|
||||
case 32:
|
||||
e.vmovaps(e.ptr[addr + 0 * 16], e.xmm0);
|
||||
e.vmovaps(e.ptr[addr + 1 * 16], e.xmm0);
|
||||
|
||||
e.vmovdqa(e.ptr[addr], e.ymm0);
|
||||
break;
|
||||
case 128:
|
||||
e.vmovaps(e.ptr[addr + 0 * 16], e.xmm0);
|
||||
e.vmovaps(e.ptr[addr + 1 * 16], e.xmm0);
|
||||
e.vmovaps(e.ptr[addr + 2 * 16], e.xmm0);
|
||||
e.vmovaps(e.ptr[addr + 3 * 16], e.xmm0);
|
||||
e.vmovaps(e.ptr[addr + 4 * 16], e.xmm0);
|
||||
e.vmovaps(e.ptr[addr + 5 * 16], e.xmm0);
|
||||
e.vmovaps(e.ptr[addr + 6 * 16], e.xmm0);
|
||||
e.vmovaps(e.ptr[addr + 7 * 16], e.xmm0);
|
||||
// probably should lea the address beforehand
|
||||
e.vmovdqa(e.ptr[addr + 0 * 16], e.ymm0);
|
||||
|
||||
e.vmovdqa(e.ptr[addr + 2 * 16], e.ymm0);
|
||||
|
||||
e.vmovdqa(e.ptr[addr + 4 * 16], e.ymm0);
|
||||
|
||||
e.vmovdqa(e.ptr[addr + 6 * 16], e.ymm0);
|
||||
break;
|
||||
default:
|
||||
assert_unhandled_case(i.src3.constant());
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -13,6 +13,8 @@
|
|||
#include "xenia/cpu/hir/instr.h"
|
||||
|
||||
#include <unordered_map>
|
||||
#define assert_impossible_sequence(name) \
|
||||
assert_always("impossible sequence hit" #name);
|
||||
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
|
|
|
@@ -749,7 +749,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            result = true;
          }
          break;


        case OPCODE_PERMUTE: {
          if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
              i->src3.value->IsConstant() &&
@@ -760,17 +760,20 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            result = true;
          }

          else if (i->src2.value->IsConstantZero() && i->src3.value->IsConstantZero() &&
          else if (i->src2.value->IsConstantZero() &&
                   i->src3.value->IsConstantZero() &&
                   i->flags == INT8_TYPE /*probably safe for int16 too*/) {
            /*
              chrispy: hoisted this check here from x64_seq_vector where if src1 is not constant, but src2 and src3 are zero, then we know the result will always be zero
              chrispy: hoisted this check here from x64_seq_vector where if
              src1 is not constant, but src2 and src3 are zero, then we know
              the result will always be zero
            */

            v->set_zero(VEC128_TYPE);
            i->Remove();
            result = true;
          }


          break;
        }
        case OPCODE_INSERT:
@@ -930,6 +933,14 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            result = true;
          }
          break;
        case OPCODE_TO_SINGLE:
          if (i->src1.value->IsConstant()) {
            v->set_from(i->src1.value);
            v->ToSingle();
            i->Remove();
            result = true;
          }
          break;
        default:
          // Ignored.
          break;

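For reference on the OPCODE_TO_SINGLE folding added above: the opcode (defined near the end of this diff) models PPC single-precision result rounding, and its constant evaluation in Value::ToSingle is just a double-to-float-to-double round trip. A minimal standalone equivalent of that folding step:

  #include <cstdio>

  // Same rounding the pass applies to a constant FLOAT64 value when it folds
  // OPCODE_TO_SINGLE (see Value::ToSingle later in this diff).
  static double ToSingle(double v) {
    return static_cast<double>(static_cast<float>(v));
  }

  int main() {
    // 1/3 is not exactly representable; rounding to float drops precision.
    std::printf("%.17g -> %.17g\n", 1.0 / 3.0, ToSingle(1.0 / 3.0));
    return 0;
  }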
@ -10,6 +10,7 @@
|
|||
#include "xenia/cpu/compiler/passes/simplification_pass.h"
|
||||
|
||||
#include "xenia/base/byte_order.h"
|
||||
#include "xenia/base/logging.h"
|
||||
#include "xenia/base/profiling.h"
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
|
@ -82,7 +83,7 @@ bool SimplificationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
iter_result |= SimplifyBitArith(builder);
|
||||
iter_result |= EliminateConversions(builder);
|
||||
iter_result |= SimplifyAssignments(builder);
|
||||
iter_result |= BackpropTruncations(builder);
|
||||
|
||||
result |= iter_result;
|
||||
} while (iter_result);
|
||||
return true;
|
||||
|
@ -1207,71 +1208,6 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) {
|
|||
return result;
|
||||
}
|
||||
|
||||
struct TruncateSimplifier {
|
||||
TypeName type_from, type_to;
|
||||
uint32_t sizeof_from, sizeof_to;
|
||||
uint32_t bit_sizeof_from, bit_sizeof_to;
|
||||
uint64_t typemask_from, typemask_to;
|
||||
hir::HIRBuilder* builder;
|
||||
hir::Instr* truncate_instr;
|
||||
hir::Value* truncated_value;
|
||||
hir::Instr* truncated_value_def;
|
||||
};
|
||||
bool SimplificationPass::BackpropTruncations(hir::Instr* i,
|
||||
hir::HIRBuilder* builder) {
|
||||
if (i->opcode != &OPCODE_TRUNCATE_info) {
|
||||
return false;
|
||||
}
|
||||
TypeName type_from = i->src1.value->type;
|
||||
TypeName type_to = i->dest->type;
|
||||
|
||||
uint32_t sizeof_from = static_cast<uint32_t>(GetTypeSize(type_from));
|
||||
uint32_t sizeof_to = static_cast<uint32_t>(GetTypeSize(type_to));
|
||||
|
||||
Instr* input_def = i->src1.value->GetDefSkipAssigns();
|
||||
if (!input_def) {
|
||||
return false;
|
||||
}
|
||||
Opcode input_opc = input_def->opcode->num;
|
||||
|
||||
if (input_opc == OPCODE_SHL && input_def->src2.value->IsConstant()) {
|
||||
uint32_t src2_shift = input_def->src2.value->AsUint32();
|
||||
if (src2_shift < (sizeof_to * CHAR_BIT)) {
|
||||
Value* truncated_preshift =
|
||||
builder->Truncate(input_def->src1.value, type_to);
|
||||
|
||||
truncated_preshift->def->MoveBefore(i);
|
||||
i->Replace(&OPCODE_SHL_info, 0);
|
||||
i->set_src1(truncated_preshift);
|
||||
i->set_src2(input_def->src2.value);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if (input_opc == OPCODE_LOAD_CONTEXT) {
|
||||
if (sizeof_from == 8 && sizeof_to == 4) {
|
||||
Value* loadof = builder->LoadContext(input_def->src1.offset, INT32_TYPE);
|
||||
loadof->def->MoveBefore(input_def);
|
||||
i->Replace(&OPCODE_ASSIGN_info, 0);
|
||||
i->set_src1(loadof);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
bool SimplificationPass::BackpropTruncations(hir::HIRBuilder* builder) {
|
||||
bool result = false;
|
||||
auto block = builder->first_block();
|
||||
while (block) {
|
||||
auto i = block->instr_head;
|
||||
while (i) {
|
||||
result |= BackpropTruncations(i, builder);
|
||||
i = i->next;
|
||||
}
|
||||
block = block->next;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
Value* SimplificationPass::CheckValue(Value* value, bool& result) {
|
||||
auto def = value->def;
|
||||
if (def && def->opcode == &OPCODE_ASSIGN_info) {
|
||||
|
|
|
@ -32,8 +32,6 @@ class SimplificationPass : public ConditionalGroupSubpass {
|
|||
bool SimplifyAssignments(hir::HIRBuilder* builder);
|
||||
hir::Value* CheckValue(hir::Value* value, bool& result);
|
||||
bool SimplifyBitArith(hir::HIRBuilder* builder);
|
||||
bool BackpropTruncations(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
bool BackpropTruncations(hir::HIRBuilder* builder);
|
||||
// handle either or or xor with 0
|
||||
bool CheckOrXorZero(hir::Instr* i);
|
||||
bool CheckOr(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
|
|
|
@ -692,6 +692,7 @@ Instr* HIRBuilder::AppendInstr(const OpcodeInfo& opcode_info, uint16_t flags,
|
|||
instr->block = block;
|
||||
instr->opcode = &opcode_info;
|
||||
instr->flags = flags;
|
||||
instr->backend_flags = 0;
|
||||
instr->dest = dest;
|
||||
instr->src1.value = instr->src2.value = instr->src3.value = NULL;
|
||||
instr->src1_use = instr->src2_use = instr->src3_use = NULL;
|
||||
|
@ -1492,7 +1493,6 @@ Value* HIRBuilder::VectorCompareUGE(Value* value1, Value* value2,
|
|||
part_type);
|
||||
}
|
||||
Value* HIRBuilder::VectorDenormFlush(Value* value1) {
|
||||
return value1;
|
||||
ASSERT_VECTOR_TYPE(value1);
|
||||
Instr* i =
|
||||
AppendInstr(OPCODE_VECTOR_DENORMFLUSH_info, 0, AllocValue(VEC128_TYPE));
|
||||
|
@ -1501,6 +1501,14 @@ Value* HIRBuilder::VectorDenormFlush(Value* value1) {
|
|||
i->src3.value = nullptr;
|
||||
return i->dest;
|
||||
}
|
||||
Value* HIRBuilder::ToSingle(Value* value) {
|
||||
assert_true(value->type == FLOAT64_TYPE);
|
||||
Instr* i = AppendInstr(OPCODE_TO_SINGLE_info, 0, AllocValue(FLOAT64_TYPE));
|
||||
i->set_src1(value);
|
||||
i->src2.value = nullptr;
|
||||
i->src3.value = nullptr;
|
||||
return i->dest;
|
||||
}
|
||||
Value* HIRBuilder::Add(Value* value1, Value* value2,
|
||||
uint32_t arithmetic_flags) {
|
||||
ASSERT_TYPES_EQUAL(value1, value2);
|
||||
|
@ -1720,7 +1728,6 @@ Value* HIRBuilder::Log2(Value* value) {
|
|||
return i->dest;
|
||||
}
|
||||
|
||||
|
||||
Value* HIRBuilder::DotProduct3(Value* value1, Value* value2) {
|
||||
ASSERT_VECTOR_TYPE(value1);
|
||||
ASSERT_VECTOR_TYPE(value2);
|
||||
|
|
|
@ -200,7 +200,7 @@ class HIRBuilder {
|
|||
Value* VectorCompareUGT(Value* value1, Value* value2, TypeName part_type);
|
||||
Value* VectorCompareUGE(Value* value1, Value* value2, TypeName part_type);
|
||||
Value* VectorDenormFlush(Value* value1);
|
||||
|
||||
Value* ToSingle(Value* value);
|
||||
Value* Add(Value* value1, Value* value2, uint32_t arithmetic_flags = 0);
|
||||
Value* AddWithCarry(Value* value1, Value* value2, Value* value3,
|
||||
uint32_t arithmetic_flags = 0);
|
||||
|
|
|
@ -180,6 +180,26 @@ exit_loop:
|
|||
*tunnel_flags = traversed_types;
|
||||
return current_def;
|
||||
}
|
||||
bool Instr::IsFake() const {
|
||||
Opcode num = opcode->num;
|
||||
switch (num) {
|
||||
case OPCODE_NOP:
|
||||
case OPCODE_COMMENT:
|
||||
case OPCODE_CONTEXT_BARRIER:
|
||||
case OPCODE_SOURCE_OFFSET:
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
const Instr* Instr::GetNonFakePrev() const {
|
||||
const Instr* curr = prev;
|
||||
|
||||
while (curr && curr->IsFake()) {
|
||||
curr = curr->prev;
|
||||
}
|
||||
return curr;
|
||||
}
|
||||
} // namespace hir
|
||||
} // namespace cpu
|
||||
} // namespace xe
|
||||
|
|
|
@ -42,6 +42,7 @@ class Instr {
|
|||
|
||||
const OpcodeInfo* opcode;
|
||||
uint16_t flags;
|
||||
uint16_t backend_flags; // backends may do whatever they wish with this
|
||||
uint32_t ordinal;
|
||||
|
||||
typedef union {
|
||||
|
@ -158,6 +159,11 @@ if both are constant, return nullptr, nullptr
|
|||
call_for_values(src3.value, 2);
|
||||
}
|
||||
}
|
||||
bool IsFake() const;
|
||||
|
||||
// gets previous instr, skipping instrs like COMMENT, OPCODE_CONTEXT_BARRIER,
|
||||
// OPCODE_SOURCE_OFFSET
|
||||
const hir::Instr* GetNonFakePrev() const;
|
||||
};
|
||||
|
||||
} // namespace hir
|
||||
|
|
|
@ -281,7 +281,10 @@ enum Opcode {
|
|||
OPCODE_ATOMIC_COMPARE_EXCHANGE,
|
||||
OPCODE_SET_ROUNDING_MODE,
|
||||
OPCODE_VECTOR_DENORMFLUSH, // converts denormals to signed zeros in a vector
|
||||
__OPCODE_MAX_VALUE, // Keep at end.
|
||||
OPCODE_TO_SINGLE, // i could not find a decent name to assign to this opcode,
|
||||
// as we already have OPCODE_ROUND. round double to float (
|
||||
// ppc "single" fpu instruction result rounding behavior )
|
||||
__OPCODE_MAX_VALUE, // Keep at end.
|
||||
};
|
||||
|
||||
enum OpcodeFlags {
|
||||
|
@ -352,7 +355,9 @@ static bool IsOpcodeBinaryValue(uint32_t signature) {
|
|||
return (signature & ~(0x7)) ==
|
||||
((OPCODE_SIG_TYPE_V << 3) | (OPCODE_SIG_TYPE_V << 6));
|
||||
}
|
||||
|
||||
static bool IsOpcodeUnaryValue(uint32_t signature) {
|
||||
return (signature & ~(0x7)) == ((OPCODE_SIG_TYPE_V << 3));
|
||||
}
|
||||
static void UnpackOpcodeSig(uint32_t sig, OpcodeSignatureType& dest,
|
||||
OpcodeSignatureType& src1,
|
||||
OpcodeSignatureType& src2,
|
||||
|
|
|
@ -679,4 +679,11 @@ DEFINE_OPCODE(
|
|||
"vector_denormflush",
|
||||
OPCODE_SIG_V_V,
|
||||
0
|
||||
)
|
||||
|
||||
DEFINE_OPCODE(
|
||||
OPCODE_TO_SINGLE,
|
||||
"to_single",
|
||||
OPCODE_SIG_V_V,
|
||||
0
|
||||
)
|
|
@ -1643,6 +1643,11 @@ void Value::DenormalFlush() {
|
|||
constant.v128.u32[i] = current_element;
|
||||
}
|
||||
}
|
||||
void Value::ToSingle() {
|
||||
assert_true(type == FLOAT64_TYPE);
|
||||
|
||||
constant.f64 = static_cast<double>(static_cast<float>(constant.f64));
|
||||
}
|
||||
void Value::CountLeadingZeros(const Value* other) {
|
||||
switch (other->type) {
|
||||
case INT8_TYPE:
|
||||
|
@ -1805,6 +1810,25 @@ hir::Instr* Value::GetDefTunnelMovs(unsigned int* tunnel_flags) {
|
|||
return nullptr;
|
||||
}
|
||||
}
|
||||
// does the value only have one instr that uses it?
|
||||
bool Value::HasSingleUse() const {
|
||||
return use_head && use_head->next == nullptr;
|
||||
}
|
||||
bool Value::AllUsesByOneInsn() const {
|
||||
if (!use_head) {
|
||||
return false;
|
||||
}
|
||||
const Use* first_use = use_head;
|
||||
const Instr* should_match = first_use->instr;
|
||||
|
||||
for (const Use* current_use = first_use->next; current_use;
|
||||
current_use = current_use->next) {
|
||||
if (current_use->instr != should_match) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
} // namespace hir
|
||||
} // namespace cpu
|
||||
} // namespace xe
|
||||
|
|
|
@ -226,6 +226,15 @@ class Value {
|
|||
return (flags & VALUE_IS_CONSTANT) ? nullptr : local_slot;
|
||||
}
|
||||
inline bool IsConstant() const { return !!(flags & VALUE_IS_CONSTANT); }
|
||||
|
||||
inline bool IsEqual(const Value* other) const {
|
||||
if (this == other) {
|
||||
return true;
|
||||
} else if ((this->flags & other->flags) & VALUE_IS_CONSTANT) {
|
||||
return this->IsConstantEQ(other);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
bool IsConstantTrue() const {
|
||||
if (type == VEC128_TYPE) {
|
||||
assert_always();
|
||||
|
@ -327,7 +336,7 @@ class Value {
|
|||
return false;
|
||||
}
|
||||
}
|
||||
bool IsConstantEQ(Value* other) const {
|
||||
bool IsConstantEQ(const Value* other) const {
|
||||
if (type == VEC128_TYPE) {
|
||||
assert_always();
|
||||
}
|
||||
|
@ -594,13 +603,19 @@ class Value {
|
|||
bool saturate);
|
||||
void ByteSwap();
|
||||
void DenormalFlush();
|
||||
|
||||
void ToSingle();
|
||||
void CountLeadingZeros(const Value* other);
|
||||
bool Compare(Opcode opcode, Value* other);
|
||||
hir::Instr* GetDefSkipAssigns();
|
||||
// tunnel_flags is updated to the kinds we actually traversed
|
||||
hir::Instr* GetDefTunnelMovs(unsigned int* tunnel_flags);
|
||||
|
||||
// does the value only have one instr that uses it?
|
||||
bool HasSingleUse() const;
|
||||
// returns true if every single use is as an operand to a single instruction
|
||||
// (add var2, var1, var1)
|
||||
bool AllUsesByOneInsn() const;
|
||||
|
||||
private:
|
||||
static bool CompareInt8(Opcode opcode, Value* a, Value* b);
|
||||
static bool CompareInt16(Opcode opcode, Value* a, Value* b);
|
||||
|
|
|
@ -379,7 +379,7 @@ typedef struct alignas(64) PPCContext_s {
|
|||
uint64_t lr; // 0x10 Link register
|
||||
double f[32]; // 0x120 Floating-point registers
|
||||
vec128_t v[128]; // 0x220 VMX128 vector registers
|
||||
|
||||
vec128_t vscr_vec;
|
||||
// XER register:
|
||||
// Split to make it easier to do individual updates.
|
||||
uint8_t xer_ca;
|
||||
|
@@ -422,7 +422,7 @@ typedef struct alignas(64) PPCContext_s {
  // Value of last reserved load
  uint64_t reserved_val;
  ThreadState* thread_state;
  uint8_t* virtual_membase;
  static std::string GetRegisterName(PPCRegister reg);
  std::string GetStringFromValue(PPCRegister reg) const;
  void SetValueFromString(PPCRegister reg, std::string value);
@@ -432,6 +432,7 @@ typedef struct alignas(64) PPCContext_s {
                        std::string& result) const;
} PPCContext;
#pragma pack(pop)
constexpr size_t ppcctx_size = sizeof(PPCContext);
static_assert(sizeof(PPCContext) % 64 == 0, "64b padded");

}  // namespace ppc
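If the backend ends up touching the new vscr_vec slot with aligned 128-bit moves, a compile-time guard on its offset would be cheap insurance; a sketch under that assumption (not part of the commit):

#include <cstddef>

#include "xenia/cpu/ppc/ppc_context.h"

// PPCContext is alignas(64), so this only constrains the field's position
// inside the struct.
static_assert(offsetof(xe::cpu::ppc::PPCContext, vscr_vec) % 16 == 0,
              "vscr_vec should stay 16-byte aligned for aligned vector access");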
@@ -355,13 +355,18 @@ int InstrEmit_stvrxl128(PPCHIRBuilder& f, const InstrData& i) {
}

int InstrEmit_mfvscr(PPCHIRBuilder& f, const InstrData& i) {
  XEINSTRNOTIMPLEMENTED();
  return 1;
  // is this the right format?

  f.StoreVR(i.VX128_1.RB,
            f.LoadContext(offsetof(PPCContext, vscr_vec), VEC128_TYPE));
  return 0;
}

int InstrEmit_mtvscr(PPCHIRBuilder& f, const InstrData& i) {
  XEINSTRNOTIMPLEMENTED();
  return 1;
  // is this the right format?
  Value* v = f.LoadVR(i.VX128_1.RB);
  f.StoreContext(offsetof(PPCContext, vscr_vec), v);
  return 0;
}

int InstrEmit_vaddcuw(PPCHIRBuilder& f, const InstrData& i) {
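A minimal host-side sketch of the semantics these two emitters implement, assuming VSCR simply round-trips through the context slot (the stand-in types below are illustrative, not the real vec128_t/PPCContext):

#include <cassert>
#include <cstdint>
#include <cstring>

struct Vec128 { uint32_t u32[4]; };   // stand-in for the real vec128_t
struct Context { Vec128 vscr_vec; };  // stand-in for PPCContext

int main() {
  Context ctx{};
  Vec128 vr3 = {{0x00010000u, 0u, 0u, 0u}};  // some 128-bit register value
  ctx.vscr_vec = vr3;                        // mtvscr v3
  Vec128 vr4 = ctx.vscr_vec;                 // mfvscr v4
  assert(std::memcmp(&vr3, &vr4, sizeof(vr3)) == 0);  // value is preserved
  return 0;
}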
@@ -1105,7 +1110,7 @@ int InstrEmit_vmsum3fp128(PPCHIRBuilder& f, const InstrData& i) {
  // Dot product XYZ.
  // (VD.xyzw) = (VA.x * VB.x) + (VA.y * VB.y) + (VA.z * VB.z)
  Value* v = f.DotProduct3(f.LoadVR(VX128_VA128), f.LoadVR(VX128_VB128));
  //chrispy: denormal outputs for Dot product are unconditionally made 0
  // chrispy: denormal outputs for Dot product are unconditionally made 0
  v = f.VectorDenormFlush(v);
  f.StoreVR(VX128_VD128, v);
  return 0;
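For reference, "denormal outputs are unconditionally made 0" means any lane whose result has a zero exponent field is replaced by a signed zero; a host-side model of a single lane (illustrative, not the emitter's actual code path):

#include <cassert>
#include <cstdint>
#include <cstring>

static float FlushDenormalToZero(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  if ((bits & 0x7F800000u) == 0) {  // exponent == 0: zero or denormal
    bits &= 0x80000000u;            // keep only the sign -> signed zero
  }
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

int main() {
  assert(FlushDenormalToZero(1e-45f) == 0.0f);  // denormal flushed to +0
  assert(FlushDenormalToZero(1.5f) == 1.5f);    // normal values pass through
  return 0;
}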
@@ -336,6 +336,7 @@ int InstrEmit_mulhwx(PPCHIRBuilder& f, const InstrData& i) {
    XEINSTRNOTIMPLEMENTED();
    return 1;
  }

  Value* v = f.SignExtend(f.MulHi(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE),
                                  f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE)),
                          INT64_TYPE);

@@ -353,6 +354,7 @@ int InstrEmit_mulhwux(PPCHIRBuilder& f, const InstrData& i) {
    XEINSTRNOTIMPLEMENTED();
    return 1;
  }

  Value* v = f.ZeroExtend(
      f.MulHi(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE),
              f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE), ARITHMETIC_UNSIGNED),
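A host-side reference for what the emitted HIR computes here (illustrative, not from this commit): mulhw/mulhwu produce the high 32 bits of a 32x32 multiply, signed and unsigned respectively.

#include <cassert>
#include <cstdint>

static int32_t mulhw(int32_t ra, int32_t rb) {
  return static_cast<int32_t>(
      (static_cast<int64_t>(ra) * static_cast<int64_t>(rb)) >> 32);
}

static uint32_t mulhwu(uint32_t ra, uint32_t rb) {
  return static_cast<uint32_t>(
      (static_cast<uint64_t>(ra) * static_cast<uint64_t>(rb)) >> 32);
}

int main() {
  assert(mulhw(-2, 0x40000000) == -1);    // high word of -0x80000000
  assert(mulhwu(0x80000000u, 2u) == 1u);  // high word of 0x1'00000000
  return 0;
}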
@@ -46,7 +46,7 @@ int InstrEmit_faddx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_faddsx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- (frA) + (frB)
  Value* v = f.Add(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRB));
  v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
  v = f.ToSingle(v);
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;
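f.ToSingle presumably wraps the new OPCODE_TO_SINGLE in the same unary-op pattern the other HIRBuilder wrappers use; a sketch under that assumption (the actual builder body is not shown in this diff):

Value* HIRBuilder::ToSingle(Value* value) {
  // Assumed to mirror existing unary wrappers such as Convert():
  // allocate a FLOAT64 result and append a single OPCODE_TO_SINGLE instr.
  Instr* i = AppendInstr(OPCODE_TO_SINGLE_info, 0, AllocValue(FLOAT64_TYPE));
  i->set_src1(value);
  i->src2.value = i->src3.value = NULL;
  return i->dest;
}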
@@ -63,7 +63,7 @@ int InstrEmit_fdivx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fdivsx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- frA / frB
  Value* v = f.Div(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRB));
  v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
  v = f.ToSingle(v);
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;

@@ -80,7 +80,7 @@ int InstrEmit_fmulx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fmulsx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- (frA) x (frC)
  Value* v = f.Mul(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC));
  v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
  v = f.ToSingle(v);
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;

@@ -88,9 +88,9 @@ int InstrEmit_fmulsx(PPCHIRBuilder& f, const InstrData& i) {

int InstrEmit_fresx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- 1.0 / (frB)
  Value* v = f.Convert(f.Div(f.LoadConstantFloat32(1.0f),
                             f.Convert(f.LoadFPR(i.A.FRB), FLOAT32_TYPE)),
                       FLOAT64_TYPE);

  Value* v = f.Recip(f.LoadFPR(i.A.FRB));
  v = f.ToSingle(v);
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;

@@ -116,7 +116,7 @@ int InstrEmit_fsubx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fsubsx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- (frA) - (frB)
  Value* v = f.Sub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRB));
  v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
  v = f.ToSingle(v);
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;
@@ -132,64 +132,63 @@ int InstrEmit_fselx(PPCHIRBuilder& f, const InstrData& i) {
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;
}

int InstrEmit_fsqrtx(PPCHIRBuilder& f, const InstrData& i) {
  // Double precision:
static int InstrEmit_fsqrt(PPCHIRBuilder& f, const InstrData& i, bool single) {
  // frD <- sqrt(frB)
  Value* v = f.Sqrt(f.LoadFPR(i.A.FRB));
  if (single) {
    v = f.ToSingle(v);
  }
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;
}
int InstrEmit_fsqrtx(PPCHIRBuilder& f, const InstrData& i) {
  return InstrEmit_fsqrt(f, i, false);
}

int InstrEmit_fsqrtsx(PPCHIRBuilder& f, const InstrData& i) {
  // Single precision:
  // frD <- sqrt(frB)
  Value* v = f.Sqrt(f.LoadFPR(i.A.FRB));
  v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;
  return InstrEmit_fsqrt(f, i, true);
}

// Floating-point multiply-add (A-9)

int InstrEmit_fmaddx(PPCHIRBuilder& f, const InstrData& i) {
static int InstrEmit_fmadd(PPCHIRBuilder& f, const InstrData& i, bool single) {
  // frD <- (frA x frC) + frB
  Value* v =
      f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB));
  if (single) {
    v = f.ToSingle(v);
  }
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;
}

int InstrEmit_fmaddx(PPCHIRBuilder& f, const InstrData& i) {
  return InstrEmit_fmadd(f, i, false);
}

int InstrEmit_fmaddsx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- (frA x frC) + frB
  return InstrEmit_fmadd(f, i, true);
}

static int InstrEmit_fmsub(PPCHIRBuilder& f, const InstrData& i, bool single) {
  // frD <- (frA x frC) - frB
  Value* v =
      f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB));
  v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
      f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB));
  if (single) {
    v = f.ToSingle(v);
  }
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;
}

int InstrEmit_fmsubx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- (frA x frC) - frB
  Value* v =
      f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB));
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;
  return InstrEmit_fmsub(f, i, false);
}

int InstrEmit_fmsubsx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- (frA x frC) - frB
  Value* v =
      f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB));
  v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;
  return InstrEmit_fmsub(f, i, true);
}

int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) {
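The "shared static helper plus a single flag" shape used for fsqrt/fmadd/fmsub above could fold the remaining paired emitters the same way; a hypothetical sketch for fadd/fadds (not part of this commit):

static int InstrEmit_fadd_common(PPCHIRBuilder& f, const InstrData& i,
                                 bool single) {
  // frD <- (frA) + (frB)
  Value* v = f.Add(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRB));
  if (single) {
    v = f.ToSingle(v);
  }
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;
}

int InstrEmit_faddx(PPCHIRBuilder& f, const InstrData& i) {
  return InstrEmit_fadd_common(f, i, false);
}

int InstrEmit_faddsx(PPCHIRBuilder& f, const InstrData& i) {
  return InstrEmit_fadd_common(f, i, true);
}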
@@ -205,7 +204,7 @@ int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- -([frA x frC] + frB)
  Value* v = f.Neg(
      f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
  v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
  v = f.ToSingle(v);
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;

@@ -224,7 +223,7 @@ int InstrEmit_fnmsubsx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- -([frA x frC] - frB)
  Value* v = f.Neg(
      f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
  v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
  v = f.ToSingle(v);
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;