Merge branch 'canary_experimental' of https://github.com/xenia-canary/xenia-canary into canary_experimental

This commit is contained in:
Gliniak 2022-07-24 17:58:48 +02:00
commit 6730ffb7d3
23 changed files with 1299 additions and 786 deletions

View File

@ -688,7 +688,12 @@ void X64ThunkEmitter::EmitLoadNonvolatileRegs() {
vmovaps(xmm15, qword[rsp + offsetof(StackLayout::Thunk, xmm[9])]);
#endif
}
void X64Backend::InitializeBackendContext(void* ctx) {
X64BackendContext* bctx = reinterpret_cast<X64BackendContext*>(
reinterpret_cast<intptr_t>(ctx) - sizeof(X64BackendContext));
bctx->ResolveFunction_Ptr = reinterpret_cast<void*>(&ResolveFunction);
bctx->Ox1000 = 0x1000;
}
} // namespace x64
} // namespace backend
} // namespace cpu
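The negative offset used in InitializeBackendContext works because the backend context is carved out of the same allocation as the guest context, directly below the pointer the JIT receives. A minimal sketch of that layout with a hypothetical allocation helper (the helper name and allocation scheme are illustrative, not the emulator's actual allocator):

#include <cstdint>
#include <cstdlib>

#include "xenia/cpu/backend/x64/x64_backend.h"  // X64BackendContext

// Illustrative only: reserve space for the backend context immediately below
// the guest context so that (ctx - sizeof(X64BackendContext)) is always valid.
void* AllocateContextBlock(size_t guest_context_size) {
  using xe::cpu::backend::x64::X64BackendContext;
  uint8_t* block = static_cast<uint8_t*>(
      std::calloc(1, sizeof(X64BackendContext) + guest_context_size));
  // Hand out the pointer just past the backend context; InitializeBackendContext
  // then recovers the backend struct by subtracting its size, and emitted code
  // reaches its fields through GetBackendCtxPtr's negative displacement.
  return block + sizeof(X64BackendContext);
}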

View File

@ -31,6 +31,16 @@ typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1);
typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1);
typedef void (*ResolveFunctionThunk)();
// located immediately prior to the ctx register
// some of these fields would ideally be per-emulator instance instead of per
// context (e.g. by placing a global X64BackendCtx prior to membase, so we can
// negatively index the membase reg)
struct X64BackendContext {
void* ResolveFunction_Ptr; // cached pointer to ResolveFunction
unsigned int Ox1000; // constant 0x1000, kept here so each tail-emitted add
// of it can be encoded two bytes shorter
};
class X64Backend : public Backend {
public:
static const uint32_t kForceReturnAddress = 0x9FFF0000u;
@ -65,6 +75,7 @@ class X64Backend : public Backend {
void InstallBreakpoint(Breakpoint* breakpoint) override;
void InstallBreakpoint(Breakpoint* breakpoint, Function* fn) override;
void UninstallBreakpoint(Breakpoint* breakpoint) override;
virtual void InitializeBackendContext(void* ctx) override;
private:
static bool ExceptionCallbackThunk(Exception* ex, void* data);

View File

@ -105,6 +105,7 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
#undef TEST_EMIT_FEATURE
/*
fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
@ -121,6 +122,10 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
bool is_zennish = cpu_.displayFamily >= 0x17;
if (is_zennish) {
// i recall hearing that this is the case for zen, but i still need to
// verify it; i can't find my original source for that.
// todo: ask agner?
feature_flags_ |= kX64FlagsIndependentVars;
feature_flags_ |= kX64FastJrcx;
if (cpu_.displayFamily > 0x17) {
@ -132,6 +137,9 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
// for my cpu, which is ripper90
}
}
may_use_membase32_as_zero_reg_ =
static_cast<uint32_t>(reinterpret_cast<uintptr_t>(
processor()->memory()->virtual_membase())) == 0;
}
X64Emitter::~X64Emitter() = default;
@ -210,6 +218,11 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
top_ = old_address;
reset();
call_sites_.clear();
tail_code_.clear();
for (auto&& cached_label : label_cache_) {
delete cached_label;
}
label_cache_.clear();
return new_execute_address;
}
@ -261,13 +274,14 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();
xor_(eax, eax);
/*
* chrispy: removed this, it serves no purpose
mov(qword[rsp + StackLayout::GUEST_CTX_HOME], GetContextReg());
*/
mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rcx);
mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], 0);
mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], rax); // 0
// Safe now to do some tracing.
if (debug_info_flags_ & DebugInfoFlags::kDebugInfoTraceFunctions) {
@ -343,6 +357,13 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
add(rsp, (uint32_t)stack_size);
ret();
// todo: do some kind of sorting by alignment?
for (auto&& tail_item : tail_code_) {
if (tail_item.alignment) {
align(tail_item.alignment);
}
tail_item.func(*this, tail_item.label);
}
code_offsets.tail = getSize();
@ -605,12 +626,10 @@ void X64Emitter::CallExtern(const hir::Instr* instr, const Function* function) {
// rdx = arg0
// r8 = arg1
// r9 = arg2
auto thunk = backend()->guest_to_host_thunk();
mov(rax, reinterpret_cast<uint64_t>(thunk));
mov(rcx, reinterpret_cast<uint64_t>(builtin_function->handler()));
mov(rdx, reinterpret_cast<uint64_t>(builtin_function->arg0()));
mov(r8, reinterpret_cast<uint64_t>(builtin_function->arg1()));
call(rax);
call(backend()->guest_to_host_thunk());
// rax = host return
}
} else if (function->behavior() == Function::Behavior::kExtern) {
@ -621,12 +640,10 @@ void X64Emitter::CallExtern(const hir::Instr* instr, const Function* function) {
// rdx = arg0
// r8 = arg1
// r9 = arg2
auto thunk = backend()->guest_to_host_thunk();
mov(rax, reinterpret_cast<uint64_t>(thunk));
mov(rcx, reinterpret_cast<uint64_t>(extern_function->extern_handler()));
mov(rdx,
qword[GetContextReg() + offsetof(ppc::PPCContext, kernel_state)]);
call(rax);
call(backend()->guest_to_host_thunk());
// rax = host return
}
}
@ -656,10 +673,8 @@ void X64Emitter::CallNativeSafe(void* fn) {
// rdx = arg0
// r8 = arg1
// r9 = arg2
auto thunk = backend()->guest_to_host_thunk();
mov(rax, reinterpret_cast<uint64_t>(thunk));
mov(rcx, reinterpret_cast<uint64_t>(fn));
call(rax);
call(backend()->guest_to_host_thunk());
// rax = host return
}
@ -715,24 +730,50 @@ bool X64Emitter::ConstantFitsIn32Reg(uint64_t v) {
}
return false;
}
/*
WARNING: do not use any regs here, addr is often produced by
ComputeAddressOffset, which may use rax/rdx/rcx in its addr expression
*/
void X64Emitter::MovMem64(const Xbyak::RegExp& addr, uint64_t v) {
if ((v & ~0x7FFFFFFF) == 0) {
uint32_t lowpart = static_cast<uint32_t>(v);
uint32_t highpart = static_cast<uint32_t>(v >> 32);
// check whether the constant coincidentally collides with our membase
if (v == (uintptr_t)processor()->memory()->virtual_membase()) {
mov(qword[addr], GetMembaseReg());
} else if ((v & ~0x7FFFFFFF) == 0) {
// Fits under 31 bits, so just load using normal mov.
mov(qword[addr], v);
} else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) {
// Negative number that fits in 32bits.
mov(qword[addr], v);
} else if (!(v >> 32)) {
} else if (!highpart) {
// All high bits are zero. It'd be nice if we had a way to load a 32bit
// immediate without sign extending!
// TODO(benvanik): this is super common, find a better way.
mov(dword[addr], static_cast<uint32_t>(v));
mov(dword[addr + 4], 0);
if (lowpart == 0 && CanUseMembaseLow32As0()) {
mov(dword[addr], GetMembaseReg().cvt32());
} else {
mov(dword[addr], static_cast<uint32_t>(v));
}
if (CanUseMembaseLow32As0()) {
mov(dword[addr + 4], GetMembaseReg().cvt32());
} else {
mov(dword[addr + 4], 0);
}
} else {
// 64bit number that needs double movs.
mov(dword[addr], static_cast<uint32_t>(v));
mov(dword[addr + 4], static_cast<uint32_t>(v >> 32));
if (lowpart == 0 && CanUseMembaseLow32As0()) {
mov(dword[addr], GetMembaseReg().cvt32());
} else {
mov(dword[addr], lowpart);
}
if (highpart == 0 && CanUseMembaseLow32As0()) {
mov(dword[addr + 4], GetMembaseReg().cvt32());
} else {
mov(dword[addr + 4], highpart);
}
}
}
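The size win from CanUseMembaseLow32As0 above is purely about x86 encoding: storing an immediate 0 carries a 4-byte imm32, while storing from a register does not. A rough comparison using the standard encodings (the register here is chosen arbitrarily for the example, not necessarily the actual membase register):

// C7 44 24 10 00 00 00 00   mov dword [rsp+0x10], 0     ; 8 bytes
// 89 5C 24 10               mov dword [rsp+0x10], ebx   ; 4 bytes
// When the low 32 bits of membase are known to be zero, the register form
// stores the same value in roughly half the bytes.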
static inline vec128_t v128_setr_bytes(unsigned char v0, unsigned char v1,
@ -893,7 +934,13 @@ static const vec128_t xmm_consts[] = {
/* XMMThreeFloatMask */
vec128i(~0U, ~0U, ~0U, 0U),
/*XMMXenosF16ExtRangeStart*/
vec128f(65504)};
vec128f(65504),
/*XMMVSRShlByteshuf*/
v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80),
// XMMVSRMask
vec128b(1)
};
void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
for (auto& vec : xmm_consts) {
@ -1300,6 +1347,27 @@ SimdDomain X64Emitter::DeduceSimdDomain(const hir::Value* for_value) {
return SimdDomain::DONTCARE;
}
Xbyak::Address X64Emitter::GetBackendCtxPtr(int offset_in_x64backendctx) {
/*
index context ptr negatively to get to backend ctx field
*/
ptrdiff_t delta = (-static_cast<ptrdiff_t>(sizeof(X64BackendContext))) +
offset_in_x64backendctx;
return ptr[GetContextReg() + static_cast<int>(delta)];
}
Xbyak::Label& X64Emitter::AddToTail(TailEmitCallback callback,
uint32_t alignment) {
TailEmitter emitter{};
emitter.func = std::move(callback);
emitter.alignment = alignment;
tail_code_.push_back(std::move(emitter));
return tail_code_.back().label;
}
Xbyak::Label& X64Emitter::NewCachedLabel() {
Xbyak::Label* tmp = new Xbyak::Label;
label_cache_.push_back(tmp);
return *tmp;
}
} // namespace x64
} // namespace backend
} // namespace cpu

View File

@ -155,7 +155,15 @@ enum XmmConst {
XMMLVSRTableBase,
XMMSingleDenormalMask,
XMMThreeFloatMask, // for clearing the fourth float prior to DOT_PRODUCT_3
XMMXenosF16ExtRangeStart
XMMXenosF16ExtRangeStart,
XMMVSRShlByteshuf,
XMMVSRMask
};
// X64Backend specific Instr->runtime_flags
enum : uint32_t {
INSTR_X64_FLAGS_ELIMINATED =
1, // another sequence marked this instruction as not needing codegen,
// meaning they likely already handled it
};
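A consuming sequence honors this flag by bailing out before emitting anything. Roughly (a sketch only; the matching check is presumed to live in the store sequences paired with the load/modify/store fusion later in this commit):

// Sketch: skip codegen for an instruction another sequence already handled.
static void Emit(X64Emitter& e, const EmitArgType& i) {
  if (i.instr->backend_flags & INSTR_X64_FLAGS_ELIMINATED) {
    return;  // the paired load sequence emitted the combined memory op
  }
  // ... normal emission ...
}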
// Unfortunately due to the design of xbyak we have to pass this to the ctor.
@ -185,7 +193,13 @@ enum X64EmitterFeatureFlags {
kX64FastJrcx = 1 << 12, // jrcxz is as fast as any other jump ( >= Zen1)
kX64FastLoop =
1 << 13, // loop/loope/loopne is as fast as any other jump ( >= Zen2)
kX64EmitAVX512VBMI = 1 << 14
kX64EmitAVX512VBMI = 1 << 14,
kX64FlagsIndependentVars =
1 << 15, // if true, instructions that only modify some flags (like
// inc/dec) do not introduce false dependencies on EFLAGS
// because the individual flags are treated as different vars by
// the processor. (this applies to zen)
kX64EmitPrefetchW = 1 << 16
};
class ResolvableGuestCall {
public:
@ -194,6 +208,13 @@ class ResolvableGuestCall {
// rgcid
unsigned offset_;
};
class X64Emitter;
using TailEmitCallback = std::function<void(X64Emitter& e, Xbyak::Label& lbl)>;
struct TailEmitter {
Xbyak::Label label;
uint32_t alignment;
TailEmitCallback func;
};
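A typical use of the tail-emit machinery declared here keeps a rarely-taken fixup out of the hot path; the pattern, mirroring the memory sequences later in this commit, looks roughly like this:

// Sketch: emit the unlikely path at the function tail, jump back when done.
Xbyak::Label& resume = e.NewCachedLabel();
Xbyak::Label& fixup =
    e.AddToTail([&resume](X64Emitter& e, Xbyak::Label& me) {
      e.L(me);
      // ... slow-path work ...
      e.jmp(resume, X64Emitter::T_NEAR);
    });
e.jae(fixup, X64Emitter::T_NEAR);  // rarely taken
e.L(resume);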
class X64Emitter : public Xbyak::CodeGenerator {
public:
@ -264,7 +285,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
Xbyak::Reg64 GetContextReg();
Xbyak::Reg64 GetMembaseReg();
bool CanUseMembaseLow32As0() const { return may_use_membase32_as_zero_reg_; }
void ReloadMembase();
void nop(size_t length = 1);
@ -274,6 +295,8 @@ class X64Emitter : public Xbyak::CodeGenerator {
void MovMem64(const Xbyak::RegExp& addr, uint64_t v);
Xbyak::Address GetXmmConstPtr(XmmConst id);
Xbyak::Address GetBackendCtxPtr(int offset_in_x64backendctx);
void LoadConstantXmm(Xbyak::Xmm dest, float v);
void LoadConstantXmm(Xbyak::Xmm dest, double v);
void LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v);
@ -289,6 +312,8 @@ class X64Emitter : public Xbyak::CodeGenerator {
return (feature_flags_ & feature_flag) == feature_flag;
}
Xbyak::Label& AddToTail(TailEmitCallback callback, uint32_t alignment = 0);
Xbyak::Label& NewCachedLabel();
FunctionDebugInfo* debug_info() const { return debug_info_; }
size_t stack_size() const { return stack_size_; }
@ -324,6 +349,16 @@ class X64Emitter : public Xbyak::CodeGenerator {
static const uint32_t xmm_reg_map_[XMM_COUNT];
uint32_t current_rgc_id_ = 0xEEDDF00F;
std::vector<ResolvableGuestCall> call_sites_;
/*
set to true if the low 32 bits of membase == 0.
only really advantageous when storing a 32-bit 0 to a displaced address,
where an immediate 0 would otherwise take 4 bytes in the encoding
*/
bool may_use_membase32_as_zero_reg_;
std::vector<TailEmitter> tail_code_;
std::vector<Xbyak::Label*>
label_cache_; // for creating labels that need to be referenced much
// later by tail emitters
};
} // namespace x64

View File

@ -109,7 +109,6 @@ struct DEBUG_BREAK_TRUE_I32
: Sequence<DEBUG_BREAK_TRUE_I32,
I<OPCODE_DEBUG_BREAK_TRUE, VoidOp, I32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (e.IsFeatureEnabled(kX64FastJrcx)) {
e.mov(e.ecx, i.src1);
Xbyak::Label skip;
@ -187,77 +186,48 @@ EMITTER_OPCODE_TABLE(OPCODE_TRAP, TRAP);
struct TRAP_TRUE_I8
: Sequence<TRAP_TRUE_I8, I<OPCODE_TRAP_TRUE, VoidOp, I8Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
Xbyak::Label& after = e.NewCachedLabel();
unsigned flags = i.instr->flags;
Xbyak::Label& dotrap =
e.AddToTail([flags, &after](X64Emitter& e, Xbyak::Label& me) {
e.L(me);
e.Trap(flags);
// does Trap actually return control to the guest?
e.jmp(after, X64Emitter::T_NEAR);
});
e.test(i.src1, i.src1);
Xbyak::Label skip;
e.jz(skip);
e.Trap(i.instr->flags);
e.L(skip);
e.jnz(dotrap, X64Emitter::T_NEAR);
e.L(after);
}
};
struct TRAP_TRUE_I16
: Sequence<TRAP_TRUE_I16, I<OPCODE_TRAP_TRUE, VoidOp, I16Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.test(i.src1, i.src1);
Xbyak::Label skip;
e.jz(skip);
e.Trap(i.instr->flags);
e.L(skip);
assert_impossible_sequence(TRAP_TRUE_I16);
}
};
struct TRAP_TRUE_I32
: Sequence<TRAP_TRUE_I32, I<OPCODE_TRAP_TRUE, VoidOp, I32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (e.IsFeatureEnabled(kX64FastJrcx)) {
e.mov(e.ecx, i.src1);
Xbyak::Label skip;
e.jrcxz(skip);
e.Trap(i.instr->flags);
e.L(skip);
} else {
e.test(i.src1, i.src1);
Xbyak::Label skip;
e.jz(skip);
e.Trap(i.instr->flags);
e.L(skip);
}
assert_impossible_sequence(TRAP_TRUE_I32);
}
};
struct TRAP_TRUE_I64
: Sequence<TRAP_TRUE_I64, I<OPCODE_TRAP_TRUE, VoidOp, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (e.IsFeatureEnabled(kX64FastJrcx)) {
e.mov(e.rcx, i.src1);
Xbyak::Label skip;
e.jrcxz(skip);
e.Trap(i.instr->flags);
e.L(skip);
} else {
e.test(i.src1, i.src1);
Xbyak::Label skip;
e.jz(skip);
e.Trap(i.instr->flags);
e.L(skip);
}
assert_impossible_sequence(TRAP_TRUE_I64);
}
};
struct TRAP_TRUE_F32
: Sequence<TRAP_TRUE_F32, I<OPCODE_TRAP_TRUE, VoidOp, F32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vptest(i.src1, i.src1);
Xbyak::Label skip;
e.jz(skip);
e.Trap(i.instr->flags);
e.L(skip);
assert_impossible_sequence(TRAP_TRUE_F32);
}
};
struct TRAP_TRUE_F64
: Sequence<TRAP_TRUE_F64, I<OPCODE_TRAP_TRUE, VoidOp, F64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vptest(i.src1, i.src1);
Xbyak::Label skip;
e.jz(skip);
e.Trap(i.instr->flags);
e.L(skip);
assert_impossible_sequence(TRAP_TRUE_F64);
}
};
EMITTER_OPCODE_TABLE(OPCODE_TRAP_TRUE, TRAP_TRUE_I8, TRAP_TRUE_I16,
@ -333,6 +303,7 @@ struct CALL_TRUE_F32
e.L(skip);
}
};
struct CALL_TRUE_F64
: Sequence<CALL_TRUE_F64, I<OPCODE_CALL_TRUE, VoidOp, F64Op, SymbolOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
@ -388,7 +359,6 @@ struct CALL_INDIRECT_TRUE_I32
: Sequence<CALL_INDIRECT_TRUE_I32,
I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, I32Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (e.IsFeatureEnabled(kX64FastJrcx)) {
e.mov(e.ecx, i.src1);
Xbyak::Label skip;

View File

@ -14,6 +14,7 @@
#include "xenia/base/cvar.h"
#include "xenia/base/memory.h"
#include "xenia/cpu/backend/x64/x64_backend.h"
#include "xenia/cpu/backend/x64/x64_op.h"
#include "xenia/cpu/backend/x64/x64_tracers.h"
#include "xenia/cpu/ppc/ppc_context.h"
@ -28,8 +29,127 @@ namespace cpu {
namespace backend {
namespace x64 {
struct LoadModStore {
const hir::Instr* load;
hir::Instr* modify;
hir::Instr* store;
bool is_constant[3];
void Consume();
};
void LoadModStore::Consume() {
modify->backend_flags |= INSTR_X64_FLAGS_ELIMINATED;
store->backend_flags |= INSTR_X64_FLAGS_ELIMINATED;
}
static bool GetLoadModStore(const hir::Instr* loadinsn, LoadModStore* out) {
if (IsTracingData()) {
return false;
}
// if (!loadinsn->dest->HasSingleUse()) {
// allow the value to be used multiple times, as long as it is by the same
// instruction
if (!loadinsn->dest->AllUsesByOneInsn()) {
return false;
}
hir::Instr* use = loadinsn->dest->use_head->instr;
if (!use->dest || !use->dest->HasSingleUse() ||
use->GetNonFakePrev() != loadinsn) {
return false;
}
hir::Instr* shouldbstore = use->dest->use_head->instr;
if (shouldbstore->dest || shouldbstore->GetNonFakePrev() != use) {
return false; // store insns have no destination
}
use->VisitValueOperands([out](Value* v, uint32_t idx) {
out->is_constant[idx] = v->IsConstant();
});
out->load = loadinsn;
out->modify = use;
out->store = shouldbstore;
return true;
}
struct LoadModStoreContext : public LoadModStore {
uint64_t offset; // ctx offset
TypeName type;
Opcode op;
bool is_commutative;
bool is_unary;
bool is_binary;
bool
binary_uses_twice; // true if binary_other == our value (for instance,
// add r11, r10, r10, which can be generated for r10 * 2)
hir::Value* binary_other;
hir::Value::ConstantValue* other_const;
uint32_t other_index;
};
static bool GetLoadModStoreContext(const hir::Instr* loadinsn,
LoadModStoreContext* out) {
if (!GetLoadModStore(loadinsn, out)) {
return false;
}
if (out->load->opcode->num != OPCODE_LOAD_CONTEXT ||
out->store->opcode->num != OPCODE_STORE_CONTEXT) {
return false;
}
if (out->modify->opcode->flags &
(OPCODE_FLAG_VOLATILE | OPCODE_FLAG_MEMORY)) {
return false;
}
uint64_t offs = out->load->src1.offset;
if (offs != out->store->src1.offset) {
return false;
}
TypeName typ = out->load->dest->type;
// can happen if op is a conversion
if (typ != out->store->src2.value->type) {
return false;
}
/*
set up a whole bunch of convenience fields for the caller
*/
out->offset = offs;
out->type = typ;
const OpcodeInfo& opinf = *out->modify->opcode;
out->op = opinf.num;
out->is_commutative = opinf.flags & OPCODE_FLAG_COMMUNATIVE;
out->is_unary = IsOpcodeUnaryValue(opinf.signature);
out->is_binary = IsOpcodeBinaryValue(opinf.signature);
out->binary_uses_twice = false;
out->binary_other = nullptr;
out->other_const = nullptr;
out->other_index = ~0U;
if (out->is_binary) {
if (out->modify->src1.value == out->load->dest) {
out->binary_other = out->modify->src2.value;
out->other_index = 1;
} else {
out->binary_other = out->modify->src1.value;
out->other_index = 0;
}
if (out->binary_other && out->is_constant[out->other_index]) {
out->other_const = &out->binary_other->constant;
}
if (out->binary_other == out->load->dest) {
out->binary_uses_twice = true;
}
}
return true;
}
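In HIR terms, the pattern recognized above is a context field that is loaded, combined with one other operand, and stored straight back. Expressed through the HIRBuilder API (the field and constant are chosen purely for illustration, and the f.LoadConstantUint64 helper is assumed):

// Illustrative HIR that the fusion handlers below turn into one
// read-modify-write op on the context field:
Value* v0 = f.LoadContext(offsetof(PPCContext, lr), INT64_TYPE);
Value* v1 = f.Add(v0, f.LoadConstantUint64(1));
f.StoreContext(offsetof(PPCContext, lr), v1);
// HandleLMS64Binary then emits a single `add qword [ctx + offset], 1`
// (or `inc` when kX64FlagsIndependentVars is set) instead of a separate
// load, add, and store.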
volatile int anchor_memory = 0;
static void Do0x1000Add(X64Emitter& e, Reg32 reg) {
e.add(reg, e.GetBackendCtxPtr(offsetof(X64BackendContext, Ox1000)));
// e.add(reg, 0x1000);
}
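Loading the 0x1000 constant from the backend context rather than encoding it as an immediate is the "two bytes shorter" mentioned in the X64BackendContext comment; a rough encoding comparison (rsi as the context register and the -12 displacement are illustrative):

// 05 00 10 00 00   add eax, 0x1000              ; 5 bytes
// 03 46 F4         add eax, dword [rsi - 12]    ; 3 bytes
// The real displacement is -sizeof(X64BackendContext) + offsetof(..., Ox1000),
// which stays within disp8 range.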
// Note: all types are always aligned in the context.
RegExp ComputeContextAddress(X64Emitter& e, const OffsetOp& offset) {
return e.GetContextReg() + offset.value;
@ -58,51 +178,6 @@ static bool is_definitely_not_eo(const T& v) {
return is_eo_def(v.value);
}
template <typename T>
RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest,
const T& offset) {
assert_true(offset.is_constant);
int32_t offset_const = static_cast<int32_t>(offset.constant());
if (guest.is_constant) {
uint32_t address = static_cast<uint32_t>(guest.constant());
address += offset_const;
if (address < 0x80000000) {
return e.GetMembaseReg() + address;
} else {
if (address >= 0xE0000000 &&
xe::memory::allocation_granularity() > 0x1000) {
e.mov(e.eax, address + 0x1000);
} else {
e.mov(e.eax, address);
}
return e.GetMembaseReg() + e.rax;
}
} else {
if (xe::memory::allocation_granularity() > 0x1000 &&
!is_definitely_not_eo(guest)) {
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
// it via memory mapping.
// todo: do branching or use an alt membase and cmov
e.xor_(e.eax, e.eax);
e.lea(e.edx, e.ptr[guest.reg().cvt32() + offset_const]);
e.cmp(e.edx, e.GetContextReg().cvt32());
e.setae(e.al);
e.shl(e.eax, 12);
e.add(e.eax, e.edx);
return e.GetMembaseReg() + e.rax;
} else {
// Clear the top 32 bits, as they are likely garbage.
// TODO(benvanik): find a way to avoid doing this.
e.mov(e.eax, guest.reg().cvt32());
}
return e.GetMembaseReg() + e.rax + offset_const;
}
}
// Note: most *should* be aligned, but needs to be checked!
template <typename T>
RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
@ -127,11 +202,23 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
!is_definitely_not_eo(guest)) {
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
// it via memory mapping.
e.xor_(e.eax, e.eax);
Xbyak::Label& jmpback = e.NewCachedLabel();
e.mov(e.eax, guest.reg().cvt32());
e.cmp(guest.reg().cvt32(), e.GetContextReg().cvt32());
e.setae(e.al);
e.shl(e.eax, 12);
e.add(e.eax, guest.reg().cvt32());
Xbyak::Label& fixup_label =
e.AddToTail([&jmpback](X64Emitter& e, Xbyak::Label& our_tail_label) {
e.L(our_tail_label);
Do0x1000Add(e, e.eax);
e.jmp(jmpback, e.T_NEAR);
});
e.jae(fixup_label, e.T_NEAR);
e.L(jmpback);
return e.GetMembaseReg() + e.rax;
} else {
// Clear the top 32 bits, as they are likely garbage.
// TODO(benvanik): find a way to avoid doing this.
@ -140,6 +227,64 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
return e.GetMembaseReg() + e.rax;
}
}
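What the jae/tail fixup above implements is, conceptually, a conditional 4 KB bias applied to the guest address before it is added to membase. As straight C++ (a sketch with a hypothetical helper name; the 0xE0000000 threshold comes from the constant-address path below and the comment above):

#include <cstdint>

// Host offset for a guest address when allocation granularity > 4 KB and the
// 0xE0000000+ range had to be mapped 4 KB higher.
static uint32_t HostOffsetForGuestAddress(uint32_t guest_address) {
  uint32_t host_offset = guest_address;
  if (guest_address >= 0xE0000000u) {
    host_offset += 0x1000u;  // same constant Do0x1000Add applies on the slow path
  }
  return host_offset;  // index off of membase
}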
template <typename T>
RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest,
const T& offset) {
assert_true(offset.is_constant);
int32_t offset_const = static_cast<int32_t>(offset.constant());
if (offset_const == 0) {
return ComputeMemoryAddress(e, guest);
}
if (guest.is_constant) {
uint32_t address = static_cast<uint32_t>(guest.constant());
address += offset_const;
if (address < 0x80000000) {
return e.GetMembaseReg() + address;
} else {
if (address >= 0xE0000000 &&
xe::memory::allocation_granularity() > 0x1000) {
e.mov(e.eax, address + 0x1000);
} else {
e.mov(e.eax, address);
}
return e.GetMembaseReg() + e.rax;
}
} else {
if (xe::memory::allocation_granularity() > 0x1000 &&
!is_definitely_not_eo(guest)) {
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
// it via memory mapping.
// todo: do branching or use an alt membase and cmov
Xbyak::Label& tmplbl = e.NewCachedLabel();
e.lea(e.edx, e.ptr[guest.reg().cvt32() + offset_const]);
e.cmp(e.edx, e.GetContextReg().cvt32());
Xbyak::Label& fixup_label =
e.AddToTail([&tmplbl](X64Emitter& e, Xbyak::Label& our_tail_label) {
e.L(our_tail_label);
Do0x1000Add(e, e.edx);
e.jmp(tmplbl, e.T_NEAR);
});
e.jae(fixup_label, e.T_NEAR);
e.L(tmplbl);
return e.GetMembaseReg() + e.rdx;
} else {
// Clear the top 32 bits, as they are likely garbage.
// TODO(benvanik): find a way to avoid doing this.
e.mov(e.eax, guest.reg().cvt32());
}
return e.GetMembaseReg() + e.rax + offset_const;
}
}
// ============================================================================
// OPCODE_ATOMIC_EXCHANGE
@ -214,11 +359,20 @@ struct ATOMIC_COMPARE_EXCHANGE_I32
if (xe::memory::allocation_granularity() > 0x1000) {
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
// it via memory mapping.
e.mov(e.ecx, i.src1.reg().cvt32());
e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
e.setae(e.cl);
e.movzx(e.ecx, e.cl);
e.shl(e.ecx, 12);
e.add(e.ecx, i.src1.reg().cvt32());
Xbyak::Label& backtous = e.NewCachedLabel();
Xbyak::Label& fixup_label =
e.AddToTail([&backtous](X64Emitter& e, Xbyak::Label& our_tail_label) {
e.L(our_tail_label);
Do0x1000Add(e, e.ecx);
e.jmp(backtous, e.T_NEAR);
});
e.jae(fixup_label, e.T_NEAR);
e.L(backtous);
} else {
e.mov(e.ecx, i.src1.reg().cvt32());
}
@ -235,11 +389,20 @@ struct ATOMIC_COMPARE_EXCHANGE_I64
if (xe::memory::allocation_granularity() > 0x1000) {
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
// it via memory mapping.
e.mov(e.ecx, i.src1.reg().cvt32());
e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
e.setae(e.cl);
e.movzx(e.ecx, e.cl);
e.shl(e.ecx, 12);
e.add(e.ecx, i.src1.reg().cvt32());
Xbyak::Label& backtous = e.NewCachedLabel();
Xbyak::Label& fixup_label =
e.AddToTail([&backtous](X64Emitter& e, Xbyak::Label& our_tail_label) {
e.L(our_tail_label);
Do0x1000Add(e, e.ecx);
e.jmp(backtous, e.T_NEAR);
});
e.jae(fixup_label, e.T_NEAR);
e.L(backtous);
} else {
e.mov(e.ecx, i.src1.reg().cvt32());
}
@ -319,25 +482,44 @@ struct STORE_LOCAL_I8
e.mov(e.byte[e.rsp + i.src1.constant()], i.src2);
}
};
template <typename T>
static bool LocalStoreMayUseMembaseLow(X64Emitter& e, const T& i) {
return i.src2.is_constant && i.src2.constant() == 0 &&
e.CanUseMembaseLow32As0();
}
struct STORE_LOCAL_I16
: Sequence<STORE_LOCAL_I16, I<OPCODE_STORE_LOCAL, VoidOp, I32Op, I16Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// e.TraceStoreI16(DATA_LOCAL, i.src1.constant, i.src2);
e.mov(e.word[e.rsp + i.src1.constant()], i.src2);
if (LocalStoreMayUseMembaseLow(e, i)) {
e.mov(e.word[e.rsp + i.src1.constant()], e.GetMembaseReg().cvt16());
} else {
e.mov(e.word[e.rsp + i.src1.constant()], i.src2);
}
}
};
struct STORE_LOCAL_I32
: Sequence<STORE_LOCAL_I32, I<OPCODE_STORE_LOCAL, VoidOp, I32Op, I32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// e.TraceStoreI32(DATA_LOCAL, i.src1.constant, i.src2);
e.mov(e.dword[e.rsp + i.src1.constant()], i.src2);
if (LocalStoreMayUseMembaseLow(e, i)) {
e.mov(e.dword[e.rsp + i.src1.constant()], e.GetMembaseReg().cvt32());
} else {
e.mov(e.dword[e.rsp + i.src1.constant()], i.src2);
}
}
};
struct STORE_LOCAL_I64
: Sequence<STORE_LOCAL_I64, I<OPCODE_STORE_LOCAL, VoidOp, I32Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// e.TraceStoreI64(DATA_LOCAL, i.src1.constant, i.src2);
e.mov(e.qword[e.rsp + i.src1.constant()], i.src2);
if (i.src2.is_constant && i.src2.constant() == 0) {
e.xor_(e.eax, e.eax);
e.mov(e.qword[e.rsp + i.src1.constant()], e.rax);
} else {
e.mov(e.qword[e.rsp + i.src1.constant()], i.src2);
}
}
};
struct STORE_LOCAL_F32
@ -404,10 +586,133 @@ struct LOAD_CONTEXT_I32
}
}
};
template <typename EmitArgType>
static bool HandleLMS64Binary(X64Emitter& e, const EmitArgType& i,
LoadModStoreContext& lms, Xbyak::RegExp& addr) {
uint64_t other_const_val = 0;
bool const_fits_in_insn = false;
if (lms.other_const) {
other_const_val = lms.other_const->u64;
const_fits_in_insn = e.ConstantFitsIn32Reg(other_const_val);
}
/*
this check is here because we currently cannot handle an other operand
that is neither a constant nor the loaded value itself
*/
if (!lms.other_const && !lms.binary_uses_twice) {
return false;
}
if (lms.op == OPCODE_ADD) {
if (lms.other_const) {
if (const_fits_in_insn) {
if (other_const_val == 1 &&
e.IsFeatureEnabled(kX64FlagsIndependentVars)) {
e.inc(e.qword[addr]);
} else {
e.add(e.qword[addr], (uint32_t)other_const_val);
}
} else {
e.mov(e.rax, other_const_val);
e.add(e.qword[addr], e.rax);
}
return true;
} else if (lms.binary_uses_twice) {
// we're being added to ourselves, we are a multiply by 2
e.shl(e.qword[addr], 1);
return true;
} else if (lms.binary_other) {
return false; // cannot handle other variables right now.
}
} else if (lms.op == OPCODE_SUB) {
if (lms.other_index != 1) {
return false; // if we are the second operand, we can't combine the memory
// access and the operation
}
if (lms.other_const) {
if (const_fits_in_insn) {
if (other_const_val == 1 &&
e.IsFeatureEnabled(kX64FlagsIndependentVars)) {
e.dec(e.qword[addr]);
} else {
e.sub(e.qword[addr], (uint32_t)other_const_val);
}
} else {
e.mov(e.rax, other_const_val);
e.sub(e.qword[addr], e.rax);
}
return true;
}
} else if (lms.op == OPCODE_AND) {
if (lms.other_const) {
if (const_fits_in_insn) {
e.and_(e.qword[addr], (uint32_t)other_const_val);
} else {
e.mov(e.rax, other_const_val);
e.and_(e.qword[addr], e.rax);
}
return true;
}
} else if (lms.op == OPCODE_OR) {
if (lms.other_const) {
if (const_fits_in_insn) {
e.or_(e.qword[addr], (uint32_t)other_const_val);
} else {
e.mov(e.rax, other_const_val);
e.or_(e.qword[addr], e.rax);
}
return true;
}
} else if (lms.op == OPCODE_XOR) {
if (lms.other_const) {
if (const_fits_in_insn) {
e.xor_(e.qword[addr], (uint32_t)other_const_val);
} else {
e.mov(e.rax, other_const_val);
e.xor_(e.qword[addr], e.rax);
}
return true;
}
}
return false;
}
template <typename EmitArgType>
static bool HandleLMS64Unary(X64Emitter& e, const EmitArgType& i,
LoadModStoreContext& lms, Xbyak::RegExp& addr) {
Opcode op = lms.op;
if (op == OPCODE_NOT) {
e.not_(e.qword[addr]);
return true;
} else if (op == OPCODE_NEG) {
e.neg(e.qword[addr]);
return true;
}
return false;
}
struct LOAD_CONTEXT_I64
: Sequence<LOAD_CONTEXT_I64, I<OPCODE_LOAD_CONTEXT, I64Op, OffsetOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
auto addr = ComputeContextAddress(e, i.src1);
LoadModStoreContext lms{};
if (GetLoadModStoreContext(i.instr, &lms)) {
if (lms.is_binary && HandleLMS64Binary(e, i, lms, addr)) {
lms.Consume();
return;
} else if (lms.is_unary && HandleLMS64Unary(e, i, lms, addr)) {
lms.Consume();
return;
}
}
e.mov(i.dest, e.qword[addr]);
if (IsTracingData()) {
e.mov(e.GetNativeParam(1), e.qword[addr]);
@ -483,7 +788,11 @@ struct STORE_CONTEXT_I16
static void Emit(X64Emitter& e, const EmitArgType& i) {
auto addr = ComputeContextAddress(e, i.src1);
if (i.src2.is_constant) {
e.mov(e.word[addr], i.src2.constant());
if (i.src2.constant() == 0 && e.CanUseMembaseLow32As0()) {
e.mov(e.word[addr], e.GetMembaseReg().cvt16());
} else {
e.mov(e.word[addr], i.src2.constant());
}
} else {
e.mov(e.word[addr], i.src2);
}
@ -500,7 +809,11 @@ struct STORE_CONTEXT_I32
static void Emit(X64Emitter& e, const EmitArgType& i) {
auto addr = ComputeContextAddress(e, i.src1);
if (i.src2.is_constant) {
e.mov(e.dword[addr], i.src2.constant());
if (i.src2.constant() == 0 && e.CanUseMembaseLow32As0()) {
e.mov(e.dword[addr], e.GetMembaseReg().cvt32());
} else {
e.mov(e.dword[addr], i.src2.constant());
}
} else {
e.mov(e.dword[addr], i.src2);
}
@ -569,9 +882,14 @@ struct STORE_CONTEXT_V128
auto addr = ComputeContextAddress(e, i.src1);
if (i.src2.is_constant) {
e.LoadConstantXmm(e.xmm0, i.src2.constant());
e.vmovaps(e.ptr[addr], e.xmm0);
e.vmovdqa(e.ptr[addr], e.xmm0);
} else {
e.vmovaps(e.ptr[addr], i.src2);
SimdDomain domain = e.DeduceSimdDomain(i.src2.value);
if (domain == SimdDomain::FLOATING) {
e.vmovaps(e.ptr[addr], i.src2);
} else {
e.vmovdqa(e.ptr[addr], i.src2);
}
}
if (IsTracingData()) {
e.lea(e.GetNativeParam(1), e.ptr[addr]);
@ -735,7 +1053,11 @@ struct STORE_OFFSET_I16
}
} else {
if (i.src3.is_constant) {
e.mov(e.word[addr], i.src3.constant());
if (i.src3.constant() == 0 && e.CanUseMembaseLow32As0()) {
e.mov(e.word[addr], e.GetMembaseReg().cvt16());
} else {
e.mov(e.word[addr], i.src3.constant());
}
} else {
e.mov(e.word[addr], i.src3);
}
@ -757,7 +1079,11 @@ struct STORE_OFFSET_I32
}
} else {
if (i.src3.is_constant) {
e.mov(e.dword[addr], i.src3.constant());
if (i.src3.constant() == 0 && e.CanUseMembaseLow32As0()) {
e.mov(e.dword[addr], e.GetMembaseReg().cvt32());
} else {
e.mov(e.dword[addr], i.src3.constant());
}
} else {
e.mov(e.dword[addr], i.src3);
}
@ -895,7 +1221,7 @@ struct LOAD_V128 : Sequence<LOAD_V128, I<OPCODE_LOAD, V128Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
auto addr = ComputeMemoryAddress(e, i.src1);
// TODO(benvanik): we should try to stick to movaps if possible.
e.vmovups(i.dest, e.ptr[addr]);
e.vmovdqa(i.dest, e.ptr[addr]);
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
// TODO(benvanik): find a way to do this without the memory load.
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteSwapMask));
@ -1054,13 +1380,15 @@ struct STORE_V128
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
assert_false(i.src2.is_constant);
e.vpshufb(e.xmm0, i.src2, e.GetXmmConstPtr(XMMByteSwapMask));
e.vmovaps(e.ptr[addr], e.xmm0);
// changed from vmovaps: the penalty on the vpshufb is unavoidable, but
// we don't need to incur another one here too
e.vmovdqa(e.ptr[addr], e.xmm0);
} else {
if (i.src2.is_constant) {
e.LoadConstantXmm(e.xmm0, i.src2.constant());
e.vmovaps(e.ptr[addr], e.xmm0);
e.vmovdqa(e.ptr[addr], e.xmm0);
} else {
e.vmovaps(e.ptr[addr], i.src2);
e.vmovdqa(e.ptr[addr], i.src2);
}
}
if (IsTracingData()) {
@ -1081,10 +1409,12 @@ struct CACHE_CONTROL
: Sequence<CACHE_CONTROL,
I<OPCODE_CACHE_CONTROL, VoidOp, I64Op, OffsetOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
bool is_clflush = false, is_prefetch = false;
bool is_clflush = false, is_prefetch = false, is_prefetchw = false;
switch (CacheControlType(i.instr->flags)) {
case CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH:
case CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH_FOR_STORE:
is_prefetchw = true;
break;
case CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH:
is_prefetch = true;
break;
case CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE:
@ -1095,6 +1425,11 @@ struct CACHE_CONTROL
assert_unhandled_case(CacheControlType(i.instr->flags));
return;
}
if (is_prefetchw && !e.IsFeatureEnabled(kX64EmitPrefetchW)) {
is_prefetchw = false;
is_prefetch = true; // can't prefetchw, the cpu doesn't have it (unlikely
// to happen); just prefetcht0 instead
}
size_t cache_line_size = i.src2.value;
RegExp addr;
@ -1117,13 +1452,24 @@ struct CACHE_CONTROL
}
} else {
if (xe::memory::allocation_granularity() > 0x1000) {
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
// it via memory mapping.
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't
// do it via memory mapping.
e.mov(e.eax, i.src1.reg().cvt32());
e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
e.setae(e.al);
e.movzx(e.eax, e.al);
e.shl(e.eax, 12);
e.add(e.eax, i.src1.reg().cvt32());
Xbyak::Label& tmplbl = e.NewCachedLabel();
Xbyak::Label& fixup_label =
e.AddToTail([&tmplbl](X64Emitter& e, Xbyak::Label& our_tail_label) {
e.L(our_tail_label);
Do0x1000Add(e, e.eax);
e.jmp(tmplbl, e.T_NEAR);
});
e.jae(fixup_label, e.T_NEAR);
e.L(tmplbl);
} else {
// Clear the top 32 bits, as they are likely garbage.
// TODO(benvanik): find a way to avoid doing this.
@ -1131,12 +1477,17 @@ struct CACHE_CONTROL
}
addr = e.GetMembaseReg() + e.rax;
}
// todo: use clflushopt + sfence on cpus that support it
if (is_clflush) {
e.clflush(e.ptr[addr]);
}
if (is_prefetch) {
e.prefetcht0(e.ptr[addr]);
}
if (is_prefetchw) {
e.prefetchw(e.ptr[addr]);
}
if (cache_line_size >= 128) {
// Prefetch the other 64 bytes of the 128-byte cache line.
@ -1151,6 +1502,9 @@ struct CACHE_CONTROL
if (is_prefetch) {
e.prefetcht0(e.ptr[addr]);
}
if (is_prefetchw) {
e.prefetchw(e.ptr[addr]);
}
assert_true(cache_line_size == 128);
}
}
@ -1178,20 +1532,24 @@ struct MEMSET_I64_I8_I64
assert_true(i.src2.constant() == 0);
e.vpxor(e.xmm0, e.xmm0);
auto addr = ComputeMemoryAddress(e, i.src1);
/*
chrispy: changed to vmovdqa, the mismatch between vpxor and vmovaps
was causing a 1 cycle stall before the first store
*/
switch (i.src3.constant()) {
case 32:
e.vmovaps(e.ptr[addr + 0 * 16], e.xmm0);
e.vmovaps(e.ptr[addr + 1 * 16], e.xmm0);
e.vmovdqa(e.ptr[addr], e.ymm0);
break;
case 128:
e.vmovaps(e.ptr[addr + 0 * 16], e.xmm0);
e.vmovaps(e.ptr[addr + 1 * 16], e.xmm0);
e.vmovaps(e.ptr[addr + 2 * 16], e.xmm0);
e.vmovaps(e.ptr[addr + 3 * 16], e.xmm0);
e.vmovaps(e.ptr[addr + 4 * 16], e.xmm0);
e.vmovaps(e.ptr[addr + 5 * 16], e.xmm0);
e.vmovaps(e.ptr[addr + 6 * 16], e.xmm0);
e.vmovaps(e.ptr[addr + 7 * 16], e.xmm0);
// probably should lea the address beforehand
e.vmovdqa(e.ptr[addr + 0 * 16], e.ymm0);
e.vmovdqa(e.ptr[addr + 2 * 16], e.ymm0);
e.vmovdqa(e.ptr[addr + 4 * 16], e.ymm0);
e.vmovdqa(e.ptr[addr + 6 * 16], e.ymm0);
break;
default:
assert_unhandled_case(i.src3.constant());

File diff suppressed because one or more lines are too long

View File

@ -13,6 +13,8 @@
#include "xenia/cpu/hir/instr.h"
#include <unordered_map>
#define assert_impossible_sequence(name) \
assert_always("impossible sequence hit: " #name);
namespace xe {
namespace cpu {

View File

@ -749,7 +749,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
result = true;
}
break;
case OPCODE_PERMUTE: {
if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
i->src3.value->IsConstant() &&
@ -760,17 +760,20 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
result = true;
}
else if (i->src2.value->IsConstantZero() && i->src3.value->IsConstantZero() &&
else if (i->src2.value->IsConstantZero() &&
i->src3.value->IsConstantZero() &&
i->flags == INT8_TYPE /*probably safe for int16 too*/) {
/*
chrispy: hoisted this check here from x64_seq_vector where if src1 is not constant, but src2 and src3 are zero, then we know the result will always be zero
chrispy: hoisted this check here from x64_seq_vector where if
src1 is not constant, but src2 and src3 are zero, then we know
the result will always be zero
*/
v->set_zero(VEC128_TYPE);
i->Remove();
result = true;
}
break;
}
case OPCODE_INSERT:
@ -930,6 +933,14 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
result = true;
}
break;
case OPCODE_TO_SINGLE:
if (i->src1.value->IsConstant()) {
v->set_from(i->src1.value);
v->ToSingle();
i->Remove();
result = true;
}
break;
default:
// Ignored.
break;

View File

@ -10,6 +10,7 @@
#include "xenia/cpu/compiler/passes/simplification_pass.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/logging.h"
#include "xenia/base/profiling.h"
namespace xe {
namespace cpu {
@ -82,7 +83,7 @@ bool SimplificationPass::Run(HIRBuilder* builder, bool& result) {
iter_result |= SimplifyBitArith(builder);
iter_result |= EliminateConversions(builder);
iter_result |= SimplifyAssignments(builder);
iter_result |= BackpropTruncations(builder);
result |= iter_result;
} while (iter_result);
return true;
@ -1207,71 +1208,6 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) {
return result;
}
struct TruncateSimplifier {
TypeName type_from, type_to;
uint32_t sizeof_from, sizeof_to;
uint32_t bit_sizeof_from, bit_sizeof_to;
uint64_t typemask_from, typemask_to;
hir::HIRBuilder* builder;
hir::Instr* truncate_instr;
hir::Value* truncated_value;
hir::Instr* truncated_value_def;
};
bool SimplificationPass::BackpropTruncations(hir::Instr* i,
hir::HIRBuilder* builder) {
if (i->opcode != &OPCODE_TRUNCATE_info) {
return false;
}
TypeName type_from = i->src1.value->type;
TypeName type_to = i->dest->type;
uint32_t sizeof_from = static_cast<uint32_t>(GetTypeSize(type_from));
uint32_t sizeof_to = static_cast<uint32_t>(GetTypeSize(type_to));
Instr* input_def = i->src1.value->GetDefSkipAssigns();
if (!input_def) {
return false;
}
Opcode input_opc = input_def->opcode->num;
if (input_opc == OPCODE_SHL && input_def->src2.value->IsConstant()) {
uint32_t src2_shift = input_def->src2.value->AsUint32();
if (src2_shift < (sizeof_to * CHAR_BIT)) {
Value* truncated_preshift =
builder->Truncate(input_def->src1.value, type_to);
truncated_preshift->def->MoveBefore(i);
i->Replace(&OPCODE_SHL_info, 0);
i->set_src1(truncated_preshift);
i->set_src2(input_def->src2.value);
return true;
}
}
if (input_opc == OPCODE_LOAD_CONTEXT) {
if (sizeof_from == 8 && sizeof_to == 4) {
Value* loadof = builder->LoadContext(input_def->src1.offset, INT32_TYPE);
loadof->def->MoveBefore(input_def);
i->Replace(&OPCODE_ASSIGN_info, 0);
i->set_src1(loadof);
return true;
}
}
return false;
}
bool SimplificationPass::BackpropTruncations(hir::HIRBuilder* builder) {
bool result = false;
auto block = builder->first_block();
while (block) {
auto i = block->instr_head;
while (i) {
result |= BackpropTruncations(i, builder);
i = i->next;
}
block = block->next;
}
return result;
}
Value* SimplificationPass::CheckValue(Value* value, bool& result) {
auto def = value->def;
if (def && def->opcode == &OPCODE_ASSIGN_info) {

View File

@ -32,8 +32,6 @@ class SimplificationPass : public ConditionalGroupSubpass {
bool SimplifyAssignments(hir::HIRBuilder* builder);
hir::Value* CheckValue(hir::Value* value, bool& result);
bool SimplifyBitArith(hir::HIRBuilder* builder);
bool BackpropTruncations(hir::Instr* i, hir::HIRBuilder* builder);
bool BackpropTruncations(hir::HIRBuilder* builder);
// handle either or or xor with 0
bool CheckOrXorZero(hir::Instr* i);
bool CheckOr(hir::Instr* i, hir::HIRBuilder* builder);

View File

@ -692,6 +692,7 @@ Instr* HIRBuilder::AppendInstr(const OpcodeInfo& opcode_info, uint16_t flags,
instr->block = block;
instr->opcode = &opcode_info;
instr->flags = flags;
instr->backend_flags = 0;
instr->dest = dest;
instr->src1.value = instr->src2.value = instr->src3.value = NULL;
instr->src1_use = instr->src2_use = instr->src3_use = NULL;
@ -1492,7 +1493,6 @@ Value* HIRBuilder::VectorCompareUGE(Value* value1, Value* value2,
part_type);
}
Value* HIRBuilder::VectorDenormFlush(Value* value1) {
return value1;
ASSERT_VECTOR_TYPE(value1);
Instr* i =
AppendInstr(OPCODE_VECTOR_DENORMFLUSH_info, 0, AllocValue(VEC128_TYPE));
@ -1501,6 +1501,14 @@ Value* HIRBuilder::VectorDenormFlush(Value* value1) {
i->src3.value = nullptr;
return i->dest;
}
Value* HIRBuilder::ToSingle(Value* value) {
assert_true(value->type == FLOAT64_TYPE);
Instr* i = AppendInstr(OPCODE_TO_SINGLE_info, 0, AllocValue(FLOAT64_TYPE));
i->set_src1(value);
i->src2.value = nullptr;
i->src3.value = nullptr;
return i->dest;
}
Value* HIRBuilder::Add(Value* value1, Value* value2,
uint32_t arithmetic_flags) {
ASSERT_TYPES_EQUAL(value1, value2);
@ -1720,7 +1728,6 @@ Value* HIRBuilder::Log2(Value* value) {
return i->dest;
}
Value* HIRBuilder::DotProduct3(Value* value1, Value* value2) {
ASSERT_VECTOR_TYPE(value1);
ASSERT_VECTOR_TYPE(value2);

View File

@ -200,7 +200,7 @@ class HIRBuilder {
Value* VectorCompareUGT(Value* value1, Value* value2, TypeName part_type);
Value* VectorCompareUGE(Value* value1, Value* value2, TypeName part_type);
Value* VectorDenormFlush(Value* value1);
Value* ToSingle(Value* value);
Value* Add(Value* value1, Value* value2, uint32_t arithmetic_flags = 0);
Value* AddWithCarry(Value* value1, Value* value2, Value* value3,
uint32_t arithmetic_flags = 0);

View File

@ -180,6 +180,26 @@ exit_loop:
*tunnel_flags = traversed_types;
return current_def;
}
bool Instr::IsFake() const {
Opcode num = opcode->num;
switch (num) {
case OPCODE_NOP:
case OPCODE_COMMENT:
case OPCODE_CONTEXT_BARRIER:
case OPCODE_SOURCE_OFFSET:
return true;
}
return false;
}
const Instr* Instr::GetNonFakePrev() const {
const Instr* curr = prev;
while (curr && curr->IsFake()) {
curr = curr->prev;
}
return curr;
}
} // namespace hir
} // namespace cpu
} // namespace xe

View File

@ -42,6 +42,7 @@ class Instr {
const OpcodeInfo* opcode;
uint16_t flags;
uint16_t backend_flags; // backends may do whatever they wish with this
uint32_t ordinal;
typedef union {
@ -158,6 +159,11 @@ if both are constant, return nullptr, nullptr
call_for_values(src3.value, 2);
}
}
bool IsFake() const;
// gets previous instr, skipping instrs like COMMENT, OPCODE_CONTEXT_BARRIER,
// OPCODE_SOURCE_OFFSET
const hir::Instr* GetNonFakePrev() const;
};
} // namespace hir

View File

@ -281,7 +281,10 @@ enum Opcode {
OPCODE_ATOMIC_COMPARE_EXCHANGE,
OPCODE_SET_ROUNDING_MODE,
OPCODE_VECTOR_DENORMFLUSH, // converts denormals to signed zeros in a vector
__OPCODE_MAX_VALUE, // Keep at end.
OPCODE_TO_SINGLE, // could not find a better name for this opcode, as we
// already have OPCODE_ROUND. rounds a double to float precision
// (ppc "single" fpu instruction result rounding behavior)
__OPCODE_MAX_VALUE, // Keep at end.
};
enum OpcodeFlags {
@ -352,7 +355,9 @@ static bool IsOpcodeBinaryValue(uint32_t signature) {
return (signature & ~(0x7)) ==
((OPCODE_SIG_TYPE_V << 3) | (OPCODE_SIG_TYPE_V << 6));
}
static bool IsOpcodeUnaryValue(uint32_t signature) {
return (signature & ~(0x7)) == ((OPCODE_SIG_TYPE_V << 3));
}
static void UnpackOpcodeSig(uint32_t sig, OpcodeSignatureType& dest,
OpcodeSignatureType& src1,
OpcodeSignatureType& src2,

View File

@ -679,4 +679,11 @@ DEFINE_OPCODE(
"vector_denormflush",
OPCODE_SIG_V_V,
0
)
DEFINE_OPCODE(
OPCODE_TO_SINGLE,
"to_single",
OPCODE_SIG_V_V,
0
)

View File

@ -1643,6 +1643,11 @@ void Value::DenormalFlush() {
constant.v128.u32[i] = current_element;
}
}
void Value::ToSingle() {
assert_true(type == FLOAT64_TYPE);
constant.f64 = static_cast<double>(static_cast<float>(constant.f64));
}
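Since the constant-folded form above is just a round trip through float, a standalone C++ snippet shows exactly what the new opcode does to a value (values chosen for illustration):

#include <cstdio>

int main() {
  // 1.0000000001 cannot be represented exactly in single precision, so the
  // round trip through float drops the extra digits; this is the same rounding
  // the PPC single-precision arithmetic instructions apply to their results.
  double d = 1.0000000001;
  double as_single = static_cast<double>(static_cast<float>(d));
  std::printf("%.17g -> %.17g\n", d, as_single);
  return 0;
}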
void Value::CountLeadingZeros(const Value* other) {
switch (other->type) {
case INT8_TYPE:
@ -1805,6 +1810,25 @@ hir::Instr* Value::GetDefTunnelMovs(unsigned int* tunnel_flags) {
return nullptr;
}
}
// does the value only have one instr that uses it?
bool Value::HasSingleUse() const {
return use_head && use_head->next == nullptr;
}
bool Value::AllUsesByOneInsn() const {
if (!use_head) {
return false;
}
const Use* first_use = use_head;
const Instr* should_match = first_use->instr;
for (const Use* current_use = first_use->next; current_use;
current_use = current_use->next) {
if (current_use->instr != should_match) {
return false;
}
}
return true;
}
} // namespace hir
} // namespace cpu
} // namespace xe

View File

@ -226,6 +226,15 @@ class Value {
return (flags & VALUE_IS_CONSTANT) ? nullptr : local_slot;
}
inline bool IsConstant() const { return !!(flags & VALUE_IS_CONSTANT); }
inline bool IsEqual(const Value* other) const {
if (this == other) {
return true;
} else if ((this->flags & other->flags) & VALUE_IS_CONSTANT) {
return this->IsConstantEQ(other);
}
return false;
}
bool IsConstantTrue() const {
if (type == VEC128_TYPE) {
assert_always();
@ -327,7 +336,7 @@ class Value {
return false;
}
}
bool IsConstantEQ(Value* other) const {
bool IsConstantEQ(const Value* other) const {
if (type == VEC128_TYPE) {
assert_always();
}
@ -594,13 +603,19 @@ class Value {
bool saturate);
void ByteSwap();
void DenormalFlush();
void ToSingle();
void CountLeadingZeros(const Value* other);
bool Compare(Opcode opcode, Value* other);
hir::Instr* GetDefSkipAssigns();
// tunnel_flags is updated to the kinds we actually traversed
hir::Instr* GetDefTunnelMovs(unsigned int* tunnel_flags);
// does the value only have one instr that uses it?
bool HasSingleUse() const;
// returns true if every single use is as an operand to a single instruction
// (add var2, var1, var1)
bool AllUsesByOneInsn() const;
private:
static bool CompareInt8(Opcode opcode, Value* a, Value* b);
static bool CompareInt16(Opcode opcode, Value* a, Value* b);

View File

@ -379,7 +379,7 @@ typedef struct alignas(64) PPCContext_s {
uint64_t lr; // 0x10 Link register
double f[32]; // 0x120 Floating-point registers
vec128_t v[128]; // 0x220 VMX128 vector registers
vec128_t vscr_vec;
// XER register:
// Split to make it easier to do individual updates.
uint8_t xer_ca;
@ -422,7 +422,7 @@ typedef struct alignas(64) PPCContext_s {
// Value of last reserved load
uint64_t reserved_val;
ThreadState* thread_state;
uint8_t* virtual_membase;
static std::string GetRegisterName(PPCRegister reg);
std::string GetStringFromValue(PPCRegister reg) const;
void SetValueFromString(PPCRegister reg, std::string value);
@ -432,6 +432,7 @@ typedef struct alignas(64) PPCContext_s {
std::string& result) const;
} PPCContext;
#pragma pack(pop)
constexpr size_t ppcctx_size = sizeof(PPCContext);
static_assert(sizeof(PPCContext) % 64 == 0, "64b padded");
} // namespace ppc

View File

@ -355,13 +355,18 @@ int InstrEmit_stvrxl128(PPCHIRBuilder& f, const InstrData& i) {
}
int InstrEmit_mfvscr(PPCHIRBuilder& f, const InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
// is this the right format?
f.StoreVR(i.VX128_1.RB,
f.LoadContext(offsetof(PPCContext, vscr_vec), VEC128_TYPE));
return 0;
}
int InstrEmit_mtvscr(PPCHIRBuilder& f, const InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
// is this the right format?
Value* v = f.LoadVR(i.VX128_1.RB);
f.StoreContext(offsetof(PPCContext, vscr_vec), v);
return 0;
}
int InstrEmit_vaddcuw(PPCHIRBuilder& f, const InstrData& i) {
@ -1105,7 +1110,7 @@ int InstrEmit_vmsum3fp128(PPCHIRBuilder& f, const InstrData& i) {
// Dot product XYZ.
// (VD.xyzw) = (VA.x * VB.x) + (VA.y * VB.y) + (VA.z * VB.z)
Value* v = f.DotProduct3(f.LoadVR(VX128_VA128), f.LoadVR(VX128_VB128));
//chrispy: denormal outputs for Dot product are unconditionally made 0
// chrispy: denormal outputs for Dot product are unconditionally made 0
v = f.VectorDenormFlush(v);
f.StoreVR(VX128_VD128, v);
return 0;

View File

@ -336,6 +336,7 @@ int InstrEmit_mulhwx(PPCHIRBuilder& f, const InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
}
Value* v = f.SignExtend(f.MulHi(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE),
f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE)),
INT64_TYPE);
@ -353,6 +354,7 @@ int InstrEmit_mulhwux(PPCHIRBuilder& f, const InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
}
Value* v = f.ZeroExtend(
f.MulHi(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE),
f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE), ARITHMETIC_UNSIGNED),

View File

@ -46,7 +46,7 @@ int InstrEmit_faddx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_faddsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- (frA) + (frB)
Value* v = f.Add(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRB));
v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
@ -63,7 +63,7 @@ int InstrEmit_fdivx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fdivsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- frA / frB
Value* v = f.Div(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRB));
v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
@ -80,7 +80,7 @@ int InstrEmit_fmulx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fmulsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- (frA) x (frC)
Value* v = f.Mul(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC));
v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
@ -88,9 +88,9 @@ int InstrEmit_fmulsx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fresx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- 1.0 / (frB)
Value* v = f.Convert(f.Div(f.LoadConstantFloat32(1.0f),
f.Convert(f.LoadFPR(i.A.FRB), FLOAT32_TYPE)),
FLOAT64_TYPE);
Value* v = f.Recip(f.LoadFPR(i.A.FRB));
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
@ -116,7 +116,7 @@ int InstrEmit_fsubx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fsubsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- (frA) - (frB)
Value* v = f.Sub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRB));
v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
@ -132,64 +132,63 @@ int InstrEmit_fselx(PPCHIRBuilder& f, const InstrData& i) {
f.UpdateFPSCR(v, i.A.Rc);
return 0;
}
int InstrEmit_fsqrtx(PPCHIRBuilder& f, const InstrData& i) {
// Double precision:
static int InstrEmit_fsqrt(PPCHIRBuilder& f, const InstrData& i, bool single) {
// frD <- sqrt(frB)
Value* v = f.Sqrt(f.LoadFPR(i.A.FRB));
if (single) {
v = f.ToSingle(v);
}
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
}
int InstrEmit_fsqrtx(PPCHIRBuilder& f, const InstrData& i) {
return InstrEmit_fsqrt(f, i, false);
}
int InstrEmit_fsqrtsx(PPCHIRBuilder& f, const InstrData& i) {
// Single precision:
// frD <- sqrt(frB)
Value* v = f.Sqrt(f.LoadFPR(i.A.FRB));
v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
return InstrEmit_fsqrt(f, i, true);
}
// Floating-point multiply-add (A-9)
int InstrEmit_fmaddx(PPCHIRBuilder& f, const InstrData& i) {
static int InstrEmit_fmadd(PPCHIRBuilder& f, const InstrData& i, bool single) {
// frD <- (frA x frC) + frB
Value* v =
f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB));
if (single) {
v = f.ToSingle(v);
}
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
}
int InstrEmit_fmaddx(PPCHIRBuilder& f, const InstrData& i) {
return InstrEmit_fmadd(f, i, false);
}
int InstrEmit_fmaddsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- (frA x frC) + frB
return InstrEmit_fmadd(f, i, true);
}
static int InstrEmit_fmsub(PPCHIRBuilder& f, const InstrData& i, bool single) {
// frD <- (frA x frC) - frB
Value* v =
f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB));
v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB));
if (single) {
v = f.ToSingle(v);
}
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
}
int InstrEmit_fmsubx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- (frA x frC) - frB
Value* v =
f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB));
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
return InstrEmit_fmsub(f, i, false);
}
int InstrEmit_fmsubsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- (frA x frC) - frB
Value* v =
f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB));
v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
return InstrEmit_fmsub(f, i, true);
}
int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) {
@ -205,7 +204,7 @@ int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] + frB)
Value* v = f.Neg(
f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
@ -224,7 +223,7 @@ int InstrEmit_fnmsubsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] - frB)
Value* v = f.Neg(
f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE);
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;