Merge pull request #52 from chrisps/canary_experimental

Fix previous batch of CPU changes
Radosław Gliński 2022-07-18 09:20:35 +02:00 committed by GitHub
commit 3757580f45
15 changed files with 856 additions and 170 deletions

View File

@ -63,6 +63,10 @@ class Backend {
virtual void InstallBreakpoint(Breakpoint* breakpoint) {}
virtual void InstallBreakpoint(Breakpoint* breakpoint, Function* fn) {}
virtual void UninstallBreakpoint(Breakpoint* breakpoint) {}
// ctx points to the start of a PPCContext. The region from
// ctx - page_allocation_granularity up to the start of ctx may be used by the
// backend to store whatever data it wants.
virtual void InitializeBackendContext(void* ctx) {}
protected:
Processor* processor_ = nullptr;
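For context on the new InitializeBackendContext hook, a minimal standalone sketch (not part of this commit) of how a backend might place per-thread data in the granule just below ctx; BackendScratch and GetBackendScratch are hypothetical names used only for illustration.

#include <cstdint>

// Hypothetical per-thread data a backend might keep immediately below the
// guest context; the layout is an assumption for illustration only.
struct BackendScratch {
  uint64_t resolve_scratch;
  uint8_t flags;
};

// ctx points at the PPCContext, but the allocation actually begins one
// allocation granule earlier, so small negative offsets from ctx are valid.
inline BackendScratch* GetBackendScratch(void* ctx) {
  return reinterpret_cast<BackendScratch*>(static_cast<uint8_t*>(ctx) -
                                           sizeof(BackendScratch));
}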

View File

@ -32,6 +32,9 @@
#include "xenia/cpu/cpu_flags.h"
#include "xenia/cpu/function.h"
#include "xenia/cpu/function_debug_info.h"
#include "xenia/cpu/hir/instr.h"
#include "xenia/cpu/hir/opcodes.h"
#include "xenia/cpu/hir/value.h"
#include "xenia/cpu/processor.h"
#include "xenia/cpu/symbol.h"
#include "xenia/cpu/thread_state.h"
@ -393,7 +396,8 @@ void X64Emitter::DebugBreak() {
}
uint64_t TrapDebugPrint(void* raw_context, uint64_t address) {
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
auto thread_state =
reinterpret_cast<ppc::PPCContext_s*>(raw_context)->thread_state;
uint32_t str_ptr = uint32_t(thread_state->context()->r[3]);
// uint16_t str_len = uint16_t(thread_state->context()->r[4]);
auto str = thread_state->memory()->TranslateVirtual<const char*>(str_ptr);
@ -408,7 +412,8 @@ uint64_t TrapDebugPrint(void* raw_context, uint64_t address) {
}
uint64_t TrapDebugBreak(void* raw_context, uint64_t address) {
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
auto thread_state =
reinterpret_cast<ppc::PPCContext_s*>(raw_context)->thread_state;
XELOGE("tw/td forced trap hit! This should be a crash!");
if (cvars::break_on_debugbreak) {
xe::debugging::Break();
@ -447,7 +452,8 @@ void X64Emitter::UnimplementedInstr(const hir::Instr* i) {
// This is used by the X64ThunkEmitter's ResolveFunctionThunk.
uint64_t ResolveFunction(void* raw_context, uint64_t target_address) {
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
auto thread_state =
reinterpret_cast<ppc::PPCContext_s*>(raw_context)->thread_state;
// TODO(benvanik): required?
assert_not_zero(target_address);
@ -1191,7 +1197,109 @@ Xbyak::Address X64Emitter::StashConstantXmm(int index, const vec128_t& v) {
MovMem64(addr + 8, v.high);
return ptr[addr];
}
static bool IsVectorCompare(const Instr* i) {
hir::Opcode op = i->opcode->num;
return op >= hir::OPCODE_VECTOR_COMPARE_EQ &&
op <= hir::OPCODE_VECTOR_COMPARE_UGE;
}
static bool IsFlaggedVectorOp(const Instr* i) {
if (IsVectorCompare(i)) {
return true;
}
hir::Opcode op = i->opcode->num;
using namespace hir;
switch (op) {
case OPCODE_VECTOR_SUB:
case OPCODE_VECTOR_ADD:
case OPCODE_SWIZZLE:
return true;
}
return false;
}
static SimdDomain GetDomainForFlaggedVectorOp(const hir::Instr* df) {
switch (df->flags) { // check what datatype we compared as
case hir::INT16_TYPE:
case hir::INT32_TYPE:
case hir::INT8_TYPE:
case hir::INT64_TYPE:
return SimdDomain::INTEGER;
case hir::FLOAT32_TYPE:
case hir::FLOAT64_TYPE: // pretty sure float64 doesn't occur with vectors;
// here for completeness
return SimdDomain::FLOATING;
default:
return SimdDomain::DONTCARE;
}
return SimdDomain::DONTCARE;
}
// this list is incomplete
static bool IsDefiniteIntegerDomainOpcode(hir::Opcode opc) {
using namespace hir;
switch (opc) {
case OPCODE_LOAD_VECTOR_SHL:
case OPCODE_LOAD_VECTOR_SHR:
case OPCODE_VECTOR_CONVERT_F2I:
case OPCODE_VECTOR_MIN: // there is apparently no FLOAT32_TYPE for min/max
// flags
case OPCODE_VECTOR_MAX:
case OPCODE_VECTOR_SHL:
case OPCODE_VECTOR_SHR:
case OPCODE_VECTOR_SHA:
case OPCODE_VECTOR_ROTATE_LEFT:
case OPCODE_VECTOR_AVERAGE: // apparently no float32 type for this
case OPCODE_EXTRACT:
case OPCODE_INSERT: // apparently no f32 type for these two
return true;
}
return false;
}
static bool IsDefiniteFloatingDomainOpcode(hir::Opcode opc) {
using namespace hir;
switch (opc) {
case OPCODE_VECTOR_CONVERT_I2F:
case OPCODE_VECTOR_DENORMFLUSH:
case OPCODE_DOT_PRODUCT_3:
case OPCODE_DOT_PRODUCT_4:
case OPCODE_LOG2:
case OPCODE_POW2:
case OPCODE_RECIP:
case OPCODE_ROUND:
case OPCODE_SQRT:
case OPCODE_MUL:
case OPCODE_MUL_SUB:
case OPCODE_MUL_ADD:
case OPCODE_ABS:
return true;
}
return false;
}
SimdDomain X64Emitter::DeduceSimdDomain(const hir::Value* for_value) {
hir::Instr* df = for_value->def;
if (!df) {
// todo: visit uses to figure out domain
return SimdDomain::DONTCARE;
} else {
SimdDomain result = SimdDomain::DONTCARE;
if (IsFlaggedVectorOp(df)) {
result = GetDomainForFlaggedVectorOp(df);
} else if (IsDefiniteIntegerDomainOpcode(df->opcode->num)) {
result = SimdDomain::INTEGER;
} else if (IsDefiniteFloatingDomainOpcode(df->opcode->num)) {
result = SimdDomain::FLOATING;
}
// todo: check if still dontcare, if so, visit uses of the value to figure
// it out
return result;
}
return SimdDomain::DONTCARE;
}
} // namespace x64
} // namespace backend
} // namespace cpu

View File

@ -44,7 +44,39 @@ enum RegisterFlags {
REG_DEST = (1 << 0),
REG_ABCD = (1 << 1),
};
/*
SSE/AVX/AVX512 has separate move and shuffle instructions for float data and
for int data for a reason: most processors implement two distinct pipelines,
one for the integer domain and one for the floating-point domain. Currently,
xenia makes no distinction between the two. Crossing domains is expensive:
on Zen processors the penalty is one cycle each time you cross, plus the two
pipelines need to synchronize. Often xenia will emit an integer instruction,
then a floating instruction, then integer again; this effectively adds at
least two cycles to the time taken. These values will in the future be used
as tags on operations that tell them which domain to operate in, if it is at
all possible to avoid crossing.
*/
enum class SimdDomain : uint32_t {
FLOATING,
INTEGER,
DONTCARE,
CONFLICTING // just used as a special result for PickDomain, different from
// DONTCARE (DONTCARE means we just don't know the domain;
// CONFLICTING means it is used in multiple domains)
};
static SimdDomain PickDomain2(SimdDomain dom1, SimdDomain dom2) {
if (dom1 == dom2) {
return dom1;
}
if (dom1 == SimdDomain::DONTCARE) {
return dom2;
}
if (dom2 == SimdDomain::DONTCARE) {
return dom1;
}
return SimdDomain::CONFLICTING;
}
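To make the combination rule concrete, here is a small standalone sketch (not part of the commit) that mirrors the SimdDomain enum and PickDomain2 above and checks how two deduced operand domains resolve; the example inputs are made up.

#include <cassert>
#include <cstdint>

enum class SimdDomain : uint32_t { FLOATING, INTEGER, DONTCARE, CONFLICTING };

// Same combination rule as PickDomain2 above: agreement wins, DONTCARE defers
// to the other operand, and genuine disagreement is CONFLICTING.
static SimdDomain PickDomain2(SimdDomain dom1, SimdDomain dom2) {
  if (dom1 == dom2) return dom1;
  if (dom1 == SimdDomain::DONTCARE) return dom2;
  if (dom2 == SimdDomain::DONTCARE) return dom1;
  return SimdDomain::CONFLICTING;
}

int main() {
  // e.g. an AND of two values that were both produced by float compares
  assert(PickDomain2(SimdDomain::FLOATING, SimdDomain::FLOATING) ==
         SimdDomain::FLOATING);
  // one operand of unknown domain defers to the known one
  assert(PickDomain2(SimdDomain::DONTCARE, SimdDomain::INTEGER) ==
         SimdDomain::INTEGER);
  // mixed int/float usage cannot be resolved to a single domain
  assert(PickDomain2(SimdDomain::INTEGER, SimdDomain::FLOATING) ==
         SimdDomain::CONFLICTING);
  return 0;
}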
enum XmmConst {
XMMZero = 0,
XMMOne,
@ -122,7 +154,7 @@ enum XmmConst {
XMMLVSLTableBase,
XMMLVSRTableBase,
XMMSingleDenormalMask,
XMMThreeFloatMask, //for clearing the fourth float prior to DOT_PRODUCT_3
XMMThreeFloatMask, // for clearing the fourth float prior to DOT_PRODUCT_3
XMMXenosF16ExtRangeStart
};
@ -150,8 +182,9 @@ enum X64EmitterFeatureFlags {
kX64EmitAVX512Ortho = kX64EmitAVX512F | kX64EmitAVX512VL,
kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ,
kX64FastJrcx = 1 << 12, //jrcxz is as fast as any other jump ( >= Zen1)
kX64FastLoop = 1 << 13, //loop/loope/loopne is as fast as any other jump ( >= Zen2)
kX64FastJrcx = 1 << 12, // jrcxz is as fast as any other jump ( >= Zen1)
kX64FastLoop =
1 << 13, // loop/loope/loopne is as fast as any other jump ( >= Zen2)
kX64EmitAVX512VBMI = 1 << 14
};
class ResolvableGuestCall {
@ -259,6 +292,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
FunctionDebugInfo* debug_info() const { return debug_info_; }
size_t stack_size() const { return stack_size_; }
SimdDomain DeduceSimdDomain(const hir::Value* for_value);
protected:
void* Emplace(const EmitFunctionInfo& func_info,

View File

@ -12,11 +12,11 @@
#include <algorithm>
#include <cstring>
#include "xenia/base/cvar.h"
#include "xenia/base/memory.h"
#include "xenia/cpu/backend/x64/x64_op.h"
#include "xenia/cpu/backend/x64/x64_tracers.h"
#include "xenia/cpu/ppc/ppc_context.h"
#include "xenia/base/cvar.h"
DEFINE_bool(
elide_e0_check, false,
@ -83,11 +83,17 @@ RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest,
!is_definitely_not_eo(guest)) {
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
// it via memory mapping.
// todo: do branching or use an alt membase and cmov
e.xor_(e.eax, e.eax);
e.cmp(guest.reg().cvt32(), 0xE0000000 - offset_const);
e.lea(e.edx, e.ptr[guest.reg().cvt32() + offset_const]);
e.cmp(e.edx, e.GetContextReg().cvt32());
e.setae(e.al);
e.shl(e.eax, 12);
e.add(e.eax, guest.reg().cvt32());
e.add(e.eax, e.edx);
return e.GetMembaseReg() + e.rax;
} else {
// Clear the top 32 bits, as they are likely garbage.
// TODO(benvanik): find a way to avoid doing this.
@ -122,7 +128,7 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
// it via memory mapping.
e.xor_(e.eax, e.eax);
e.cmp(guest.reg().cvt32(), 0xE0000000);
e.cmp(guest.reg().cvt32(), e.GetContextReg().cvt32());
e.setae(e.al);
e.shl(e.eax, 12);
e.add(e.eax, guest.reg().cvt32());
@ -208,7 +214,7 @@ struct ATOMIC_COMPARE_EXCHANGE_I32
if (xe::memory::allocation_granularity() > 0x1000) {
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
// it via memory mapping.
e.cmp(i.src1.reg().cvt32(), 0xE0000000);
e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
e.setae(e.cl);
e.movzx(e.ecx, e.cl);
e.shl(e.ecx, 12);
@ -229,7 +235,7 @@ struct ATOMIC_COMPARE_EXCHANGE_I64
if (xe::memory::allocation_granularity() > 0x1000) {
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
// it via memory mapping.
e.cmp(i.src1.reg().cvt32(), 0xE0000000);
e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
e.setae(e.cl);
e.movzx(e.ecx, e.cl);
e.shl(e.ecx, 12);
@ -1113,7 +1119,7 @@ struct CACHE_CONTROL
if (xe::memory::allocation_granularity() > 0x1000) {
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
// it via memory mapping.
e.cmp(i.src1.reg().cvt32(), 0xE0000000);
e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
e.setae(e.al);
e.movzx(e.eax, e.al);
e.shl(e.eax, 12);
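For reference, a standalone scalar sketch (not from the commit) of the host-address computation these rewritten sequences perform; it assumes, as the thread_state.cc change later in this commit arranges, that the low 32 bits of the context register are exactly 0xE0000000.

#include <cstdint>

// Host address computed when the allocation granularity is > 4 KB and the
// 0xE0000000 mirror cannot be produced by memory mapping. context_low32
// stands in for the low 32 bits of the context register, which this commit
// guarantees to be 0xE0000000, so no separate immediate is needed.
inline uint64_t EmulateE0Offset(uint64_t membase, uint32_t guest_addr,
                                uint32_t offset_const,
                                uint32_t context_low32 = 0xE0000000u) {
  uint32_t address = guest_addr + offset_const;
  // Addresses at or above 0xE0000000 are shifted up by one 4 KB page.
  uint32_t adjust = (address >= context_low32) ? 0x1000u : 0u;
  return membase + address + adjust;
}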

View File

@ -1826,7 +1826,7 @@ struct PERMUTE_I32
}
}
};
//todo: use this on const src1
// todo: use this on const src1
static vec128_t FixupConstantShuf8(vec128_t input) {
for (uint32_t i = 0; i < 16; ++i) {
input.u8[i] ^= 0x03;
@ -1984,7 +1984,11 @@ struct SWIZZLE
} else {
src1 = i.src1;
}
if (element_type == INT32_TYPE) {
e.vpshufd(i.dest, src1, swizzle_mask);
} else if (element_type == FLOAT32_TYPE) {
e.vshufps(i.dest, src1, src1, swizzle_mask);
}
} else if (element_type == INT64_TYPE || element_type == FLOAT64_TYPE) {
assert_always();
} else {

View File

@ -717,6 +717,9 @@ struct SELECT_V128_I8
static void Emit(X64Emitter& e, const EmitArgType& i) {
// TODO(benvanik): find a shorter sequence.
// dest = src1 != 0 ? src2 : src3
/*
chrispy: this is dead code, this sequence is never emitted
*/
e.movzx(e.eax, i.src1);
e.vmovd(e.xmm1, e.eax);
e.vpbroadcastd(e.xmm1, e.xmm1);
@ -737,11 +740,46 @@ struct SELECT_V128_I8
e.vpor(i.dest, e.xmm1);
}
};
enum class PermittedBlend : uint32_t { NotPermitted, Int8, Ps };
static bool IsVectorCompare(const Instr* i) {
Opcode op = i->opcode->num;
return op >= OPCODE_VECTOR_COMPARE_EQ && op <= OPCODE_VECTOR_COMPARE_UGE;
}
/*
OPCODE_SELECT does a bit-by-bit selection; however, if the selector is the
result of a comparison, or if each element may only be all ones or 0, we may
use a blend instruction instead.
*/
static PermittedBlend GetPermittedBlendForSelectV128(const Value* src1v) {
const Instr* df = src1v->def;
if (!df) {
return PermittedBlend::NotPermitted;
} else {
if (!IsVectorCompare(df)) {
return PermittedBlend::NotPermitted; // todo: check ors, ands of
// condition
} else {
switch (df->flags) { // check what datatype we compared as
case INT16_TYPE:
case INT32_TYPE:
case INT8_TYPE:
return PermittedBlend::Int8; // use vpblendvb
case FLOAT32_TYPE:
return PermittedBlend::Ps; // use vblendvps
default: // unknown type! just ignore
return PermittedBlend::NotPermitted;
}
}
}
}
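A small standalone model (not part of the commit) of why the blend is safe when every lane of the selector is all-ones or all-zeros, as it is for a vector-compare result: the bitwise select emitted in the fallback path and a top-bit blend such as vblendvps/vpblendvb then agree. The byte-granular vpblendvb is fine for 16- and 32-bit compares because every byte within a compared lane carries the same value.

#include <cassert>
#include <cstdint>

// Scalar model over one 32-bit lane.
inline uint32_t BitwiseSelect(uint32_t mask, uint32_t a, uint32_t b) {
  // matches the fallback path: dest = (~mask & a) | (mask & b)
  return (~mask & a) | (mask & b);
}
inline uint32_t BlendByTopBit(uint32_t mask, uint32_t a, uint32_t b) {
  // matches vblendvps/vpblendvb semantics: the selector's top bit picks b
  return (mask & 0x80000000u) ? b : a;
}

int main() {
  const uint32_t a = 0x11223344u, b = 0xAABBCCDDu;
  const uint32_t masks[] = {0x00000000u, 0xFFFFFFFFu};
  for (uint32_t mask : masks) {
    assert(BitwiseSelect(mask, a, b) == BlendByTopBit(mask, a, b));
  }
  return 0;
}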
struct SELECT_V128_V128
: Sequence<SELECT_V128_V128,
I<OPCODE_SELECT, V128Op, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
Xmm src1 = i.src1.is_constant ? e.xmm0 : i.src1;
PermittedBlend mayblend = GetPermittedBlendForSelectV128(i.src1.value);
// todo: detect whether src1 is only 0 or all ones per element and use blends
// if so; currently we only detect cmps
if (i.src1.is_constant) {
e.LoadConstantXmm(src1, i.src1.constant());
}
@ -756,11 +794,17 @@ struct SELECT_V128_V128
e.LoadConstantXmm(src3, i.src3.constant());
}
if (mayblend == PermittedBlend::Int8) {
e.vpblendvb(i.dest, src2, src3, src1);
} else if (mayblend == PermittedBlend::Ps) {
e.vblendvps(i.dest, src2, src3, src1);
} else {
// src1 ? src2 : src3;
e.vpandn(e.xmm3, src1, src2);
e.vpand(i.dest, src1, src3);
e.vpor(i.dest, i.dest, e.xmm3);
}
}
};
EMITTER_OPCODE_TABLE(OPCODE_SELECT, SELECT_I8, SELECT_I16, SELECT_I32,
SELECT_I64, SELECT_F32, SELECT_F64, SELECT_V128_I8,
@ -2122,7 +2166,8 @@ struct MUL_ADD_V128
// TODO(benvanik): the vfmadd sequence produces slightly different results
// than vmul+vadd and it'd be nice to know why. Until we know, it's
// disabled so tests pass.
if (false && e.IsFeatureEnabled(kX64EmitFMA)) {
// chrispy: reenabled, i have added the DAZ behavior that was missing
if (true && e.IsFeatureEnabled(kX64EmitFMA)) {
EmitCommutativeBinaryXmmOp(e, i,
[&i](X64Emitter& e, const Xmm& dest,
const Xmm& src1, const Xmm& src2) {
@ -2139,7 +2184,11 @@ struct MUL_ADD_V128
e.vfmadd231ps(i.dest, src1, src2);
} else {
// Dest not equal to anything
e.vmovdqa(i.dest, src1);
// e.vmovdqa(i.dest, src1);
// chrispy: vmovdqa was a domain pipeline hazard
e.vmovaps(i.dest, src1);
e.vfmadd213ps(i.dest, src2, src3);
}
});
@ -2152,7 +2201,8 @@ struct MUL_ADD_V128
// If i.dest == i.src3, back up i.src3 so we don't overwrite it.
src3 = i.src3;
if (i.dest == i.src3) {
e.vmovdqa(e.xmm1, i.src3);
// e.vmovdqa(e.xmm1, i.src3);
e.vmovaps(e.xmm1, i.src3);
src3 = e.xmm1;
}
}
@ -2384,17 +2434,17 @@ EMITTER_OPCODE_TABLE(OPCODE_NEG, NEG_I8, NEG_I16, NEG_I32, NEG_I64, NEG_F32,
// ============================================================================
struct ABS_F32 : Sequence<ABS_F32, I<OPCODE_ABS, F32Op, F32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS));
e.vandps(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS));
}
};
struct ABS_F64 : Sequence<ABS_F64, I<OPCODE_ABS, F64Op, F64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPD));
e.vandpd(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPD));
}
};
struct ABS_V128 : Sequence<ABS_V128, I<OPCODE_ABS, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS));
e.vandps(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS));
}
};
EMITTER_OPCODE_TABLE(OPCODE_ABS, ABS_F32, ABS_F64, ABS_V128);
@ -2634,6 +2684,8 @@ struct DOT_PRODUCT_3_V128
*/
e.vstmxcsr(mxcsr_storage);
e.vmovaps(e.xmm2, e.GetXmmConstPtr(XMMThreeFloatMask));
e.mov(e.eax, 8);
auto src1v = e.xmm0;
@ -2655,8 +2707,8 @@ struct DOT_PRODUCT_3_V128
// so that in the future this could be optimized away if the top is known to
// be zero. Right now I'm not sure that happens often, though, and it's
// currently not worth it. Also, maybe pre-AND if constant.
e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask));
e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask));
e.vandps(e.xmm3, src1v, e.xmm2);
e.vandps(e.xmm2, src2v, e.xmm2);
e.and_(mxcsr_storage, e.eax);
e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to
@ -2682,8 +2734,7 @@ struct DOT_PRODUCT_3_V128
Xbyak::Label ret_qnan;
Xbyak::Label done;
e.jnz(ret_qnan);
// e.vshufps(i.dest, e.xmm1,e.xmm1, 0); // broadcast
e.vbroadcastss(i.dest, e.xmm1);
e.vshufps(i.dest, e.xmm1, e.xmm1, 0); // broadcast
e.jmp(done);
e.L(ret_qnan);
e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN));
@ -2728,27 +2779,7 @@ struct DOT_PRODUCT_4_V128
e.vcvtps2pd(e.ymm0, src1v);
e.vcvtps2pd(e.ymm1, src2v);
/*
e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask));
e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask));
e.and_(mxcsr_storage, e.eax);
e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to
// go
e.vcvtps2pd(e.ymm0, e.xmm3);
e.vcvtps2pd(e.ymm1, e.xmm2);
e.vmulpd(e.ymm5, e.ymm0, e.ymm1);
e.vextractf128(e.xmm4, e.ymm5, 1);
e.vunpckhpd(e.xmm3, e.xmm5, e.xmm5); // get element [1] in xmm3
e.vaddsd(e.xmm5, e.xmm5, e.xmm4);
e.not_(e.eax);
e.vaddsd(e.xmm2, e.xmm5, e.xmm3);
e.vcvtsd2ss(e.xmm1, e.xmm2);
*/
e.vmulpd(e.ymm3, e.ymm0, e.ymm1);
e.vextractf128(e.xmm2, e.ymm3, 1);
e.vaddpd(e.xmm3, e.xmm3, e.xmm2);
@ -2765,8 +2796,7 @@ struct DOT_PRODUCT_4_V128
Xbyak::Label ret_qnan;
Xbyak::Label done;
e.jnz(ret_qnan); // reorder these jmps later, just want to get this fix in
// e.vshufps(i.dest, e.xmm1, e.xmm1, 0);
e.vbroadcastss(i.dest, e.xmm1);
e.vshufps(i.dest, e.xmm1, e.xmm1, 0);
e.jmp(done);
e.L(ret_qnan);
e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN));
@ -2846,9 +2876,16 @@ struct AND_I64 : Sequence<AND_I64, I<OPCODE_AND, I64Op, I64Op, I64Op>> {
};
struct AND_V128 : Sequence<AND_V128, I<OPCODE_AND, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
EmitCommutativeBinaryXmmOp(e, i,
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
e.vpand(dest, src1, src2);
SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value),
e.DeduceSimdDomain(i.src2.value));
EmitCommutativeBinaryXmmOp(
e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
if (dom == SimdDomain::FLOATING) {
e.vandps(dest, src2, src1);
} else {
e.vpand(dest, src2, src1);
}
});
}
};
@ -2948,9 +2985,16 @@ struct AND_NOT_I64
struct AND_NOT_V128
: Sequence<AND_NOT_V128, I<OPCODE_AND_NOT, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
EmitCommutativeBinaryXmmOp(e, i,
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value),
e.DeduceSimdDomain(i.src2.value));
EmitCommutativeBinaryXmmOp(
e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
if (dom == SimdDomain::FLOATING) {
e.vandnps(dest, src2, src1);
} else {
e.vpandn(dest, src2, src1);
}
});
}
};
@ -2994,9 +3038,16 @@ struct OR_I64 : Sequence<OR_I64, I<OPCODE_OR, I64Op, I64Op, I64Op>> {
};
struct OR_V128 : Sequence<OR_V128, I<OPCODE_OR, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
EmitCommutativeBinaryXmmOp(e, i,
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value),
e.DeduceSimdDomain(i.src2.value));
EmitCommutativeBinaryXmmOp(
e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
if (dom == SimdDomain::FLOATING) {
e.vorps(dest, src1, src2);
} else {
e.vpor(dest, src1, src2);
}
});
}
};
@ -3039,9 +3090,16 @@ struct XOR_I64 : Sequence<XOR_I64, I<OPCODE_XOR, I64Op, I64Op, I64Op>> {
};
struct XOR_V128 : Sequence<XOR_V128, I<OPCODE_XOR, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
EmitCommutativeBinaryXmmOp(e, i,
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value),
e.DeduceSimdDomain(i.src2.value));
EmitCommutativeBinaryXmmOp(
e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
if (dom == SimdDomain::FLOATING) {
e.vxorps(dest, src1, src2);
} else {
e.vpxor(dest, src1, src2);
}
});
}
};
@ -3078,9 +3136,16 @@ struct NOT_I64 : Sequence<NOT_I64, I<OPCODE_NOT, I64Op, I64Op>> {
};
struct NOT_V128 : Sequence<NOT_V128, I<OPCODE_NOT, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
SimdDomain domain =
e.DeduceSimdDomain(i.src1.value);
if (domain == SimdDomain::FLOATING) {
e.vxorps(i.dest, i.src1, e.GetXmmConstPtr(XMMFFFF /* FF... */));
} else {
// dest = src ^ 0xFFFF...
e.vpxor(i.dest, i.src1, e.GetXmmConstPtr(XMMFFFF /* FF... */));
}
}
};
EMITTER_OPCODE_TABLE(OPCODE_NOT, NOT_I8, NOT_I16, NOT_I32, NOT_I64, NOT_V128);
@ -3217,7 +3282,7 @@ struct SHR_V128 : Sequence<SHR_V128, I<OPCODE_SHR, V128Op, V128Op, I8Op>> {
}
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(reinterpret_cast<void*>(EmulateShrV128));
e.vmovaps(i.dest, e.xmm0);
e.vmovdqa(i.dest, e.xmm0);
}
static __m128i EmulateShrV128(void*, __m128i src1, uint8_t src2) {
// Almost all instances are shamt = 1, but non-constant.

View File

@ -759,6 +759,18 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
i->Remove();
result = true;
}
else if (i->src2.value->IsConstantZero() && i->src3.value->IsConstantZero() &&
i->flags == INT8_TYPE /*probably safe for int16 too*/) {
/*
chrispy: hoisted this check here from x64_seq_vector: if src1 is not
constant but src2 and src3 are both zero, then we know the result will
always be zero
*/
v->set_zero(VEC128_TYPE);
i->Remove();
result = true;
}
break;
}
case OPCODE_INSERT:

View File

@ -9,6 +9,7 @@
#include "xenia/cpu/compiler/passes/simplification_pass.h"
#include <__msvc_int128.hpp>
#include "xenia/base/byte_order.h"
#include "xenia/base/profiling.h"
namespace xe {
@ -22,6 +23,52 @@ using namespace xe::cpu::hir;
using xe::cpu::hir::HIRBuilder;
using xe::cpu::hir::Instr;
using xe::cpu::hir::Value;
using vmask_portion_t = uint64_t;
template <uint32_t Ndwords>
struct Valuemask_t {
vmask_portion_t bits[Ndwords];
static Valuemask_t create_empty(vmask_portion_t fill = 0) {
Valuemask_t result;
for (uint32_t i = 0; i < Ndwords; ++i) {
result.bits[i] = fill;
}
return result;
}
template <typename TCallable>
Valuemask_t operate(TCallable&& oper) const {
Valuemask_t result = create_empty();
for (uint32_t i = 0; i < Ndwords; ++i) {
result.bits[i] = oper(bits[i]);
}
return result;
}
template <typename TCallable>
Valuemask_t operate(TCallable&& oper, Valuemask_t other) const {
Valuemask_t result = create_empty();
for (uint32_t i = 0; i < Ndwords; ++i) {
result.bits[i] = oper(bits[i], other.bits[i]);
}
return result;
}
Valuemask_t operator&(Valuemask_t other) const {
return operate([](vmask_portion_t x, vmask_portion_t y) { return x & y; },
other);
}
Valuemask_t operator|(Valuemask_t other) const {
return operate([](vmask_portion_t x, vmask_portion_t y) { return x | y; },
other);
}
Valuemask_t operator^(Valuemask_t other) const {
return operate([](vmask_portion_t x, vmask_portion_t y) { return x ^ y; },
other);
}
Valuemask_t operator~() const {
return operate([](vmask_portion_t x) { return ~x; });
}
};
SimplificationPass::SimplificationPass() : ConditionalGroupSubpass() {}
@ -36,6 +83,7 @@ bool SimplificationPass::Run(HIRBuilder* builder, bool& result) {
iter_result |= SimplifyBitArith(builder);
iter_result |= EliminateConversions(builder);
iter_result |= SimplifyAssignments(builder);
iter_result |= BackpropTruncations(builder);
result |= iter_result;
} while (iter_result);
return true;
@ -151,19 +199,88 @@ bool SimplificationPass::CheckOr(hir::Instr* i, hir::HIRBuilder* builder) {
}
return false;
}
bool SimplificationPass::CheckBooleanXor1(hir::Instr* i,
hir::HIRBuilder* builder,
hir::Value* xored) {
unsigned tunflags = MOVTUNNEL_ASSIGNS | MOVTUNNEL_MOVZX;
Instr* xordef = xored->GetDefTunnelMovs(&tunflags);
if (!xordef) {
return false;
}
Opcode xorop = xordef->opcode->num;
bool need_zx = (tunflags & MOVTUNNEL_MOVZX) != 0;
Value* new_value = nullptr;
if (xorop == OPCODE_IS_FALSE) {
new_value = builder->IsTrue(xordef->src1.value);
} else if (xorop == OPCODE_IS_TRUE) {
new_value = builder->IsFalse(xordef->src1.value);
} else if (xorop == OPCODE_COMPARE_EQ) {
new_value = builder->CompareNE(xordef->src1.value, xordef->src2.value);
} else if (xorop == OPCODE_COMPARE_NE) {
new_value = builder->CompareEQ(xordef->src1.value, xordef->src2.value);
} // todo: other conds
if (!new_value) {
return false;
}
new_value->def->MoveBefore(i);
i->Replace(need_zx ? &OPCODE_ZERO_EXTEND_info : &OPCODE_ASSIGN_info, 0);
i->set_src1(new_value);
return true;
}
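A tiny standalone check (not from the commit) of the identity CheckBooleanXor1 relies on: XOR with 1 inverts a 0/1-valued result, turning IS_FALSE into IS_TRUE and COMPARE_EQ into COMPARE_NE.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t values[] = {0ull, 1ull, 42ull, ~0ull};
  for (uint64_t a : values) {
    for (uint64_t b : values) {
      // IS_FALSE(a) ^ 1 == IS_TRUE(a)
      uint64_t is_false = (a == 0) ? 1ull : 0ull;
      assert((is_false ^ 1ull) == ((a != 0) ? 1ull : 0ull));
      // COMPARE_EQ(a, b) ^ 1 == COMPARE_NE(a, b)
      uint64_t eq = (a == b) ? 1ull : 0ull;
      assert((eq ^ 1ull) == ((a != b) ? 1ull : 0ull));
    }
  }
  return 0;
}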
bool SimplificationPass::CheckXorOfTwoBools(hir::Instr* i,
hir::HIRBuilder* builder,
hir::Value* b1, hir::Value* b2) {
// todo: implement
return false;
}
bool SimplificationPass::CheckXor(hir::Instr* i, hir::HIRBuilder* builder) {
if (CheckOrXorZero(i)) {
return true;
} else {
if (i->src1.value == i->src2.value) {
Value* src1 = i->src1.value;
Value* src2 = i->src2.value;
if (SameValueOrEqualConstant(src1, src2)) {
i->Replace(&OPCODE_ASSIGN_info, 0);
i->set_src1(builder->LoadZero(i->dest->type));
return true;
}
uint64_t type_mask = GetScalarTypeMask(i->dest->type);
auto [constant_value, variable_value] =
i->BinaryValueArrangeAsConstAndVar();
ScalarNZM nzm1 = GetScalarNZM(src1);
ScalarNZM nzm2 = GetScalarNZM(src2);
if ((nzm1 & nzm2) ==
0) { // no bits of the two sources overlap, this ought to be an OR
// cs:optimizing
/* i->Replace(&OPCODE_OR_info, 0);
i->set_src1(src1);
i->set_src2(src2);*/
i->opcode = &OPCODE_OR_info;
return true;
}
if (nzm1 == 1ULL && nzm2 == 1ULL) {
if (constant_value) {
return CheckBooleanXor1(i, builder, variable_value);
} else {
return CheckXorOfTwoBools(i, builder, src1, src2);
}
}
uint64_t type_mask = GetScalarTypeMask(i->dest->type);
if (!constant_value) return false;
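A tiny standalone check (not part of the commit) of the identity used here, and again by CheckAdd further down: when the nonzero masks of two values are disjoint, XOR, ADD, and OR all produce the same result.

#include <cassert>
#include <cstdint>

// With disjoint NZMs, no bit position can receive contributions from both
// operands, so there is nothing to cancel (XOR) and nothing to carry (ADD).
int main() {
  const uint64_t nzm_a = 0x00000000FFFF0000ull;  // a only ever uses these bits
  const uint64_t nzm_b = 0x000000000000FFFFull;  // b only ever uses these bits
  assert((nzm_a & nzm_b) == 0);
  const uint64_t a = 0x12340000ull & nzm_a;
  const uint64_t b = 0x00005678ull & nzm_b;
  assert((a ^ b) == (a | b));
  assert((a + b) == (a | b));
  return 0;
}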
@ -504,11 +621,12 @@ bool SimplificationPass::TryHandleANDROLORSHLSeq(hir::Instr* i,
}
bool SimplificationPass::CheckAnd(hir::Instr* i, hir::HIRBuilder* builder) {
retry_and_simplification:
auto [constant_value, variable_value] = i->BinaryValueArrangeAsConstAndVar();
if (!constant_value) {
// added this for srawi
uint64_t nzml = GetScalarNZM(i->src1.value);
uint64_t nzmr = GetScalarNZM(i->src2.value);
ScalarNZM nzml = GetScalarNZM(i->src1.value);
ScalarNZM nzmr = GetScalarNZM(i->src2.value);
if ((nzml & nzmr) == 0) {
i->Replace(&OPCODE_ASSIGN_info, 0);
@ -524,9 +642,15 @@ retry_and_simplification:
// todo: check if masking with mask that covers all of zero extension source
uint64_t type_mask = GetScalarTypeMask(i->dest->type);
// if masking with entire width, pointless instruction so become an assign
if (constant_value->AsUint64() == type_mask) {
ScalarNZM nzm = GetScalarNZM(variable_value);
// if masking with the entire width, the instruction is pointless, so become
// an assign
// chrispy: changed this to use the nzm instead; this optimizes away many AND
// instructions
// chrispy: changed this again: detect whether the nzm is a subset of the AND
// mask and, if so, eliminate the AND. e.g. (bool value) & 0xff == (bool
// value): the nzm is not equal to the mask, but it is a subset, so it can be
// eliminated
if ((constant_value->AsUint64() & nzm) == nzm) {
i->Replace(&OPCODE_ASSIGN_info, 0);
i->set_src1(variable_value);
return true;
@ -555,7 +679,7 @@ retry_and_simplification:
Value* or_left = true_variable_def->src1.value;
Value* or_right = true_variable_def->src2.value;
uint64_t left_nzm = GetScalarNZM(or_left);
ScalarNZM left_nzm = GetScalarNZM(or_left);
// use the other or input instead of the or output
if ((constant_value->AsUint64() & left_nzm) == 0) {
@ -565,7 +689,7 @@ retry_and_simplification:
return true;
}
uint64_t right_nzm = GetScalarNZM(or_right);
ScalarNZM right_nzm = GetScalarNZM(or_right);
if ((constant_value->AsUint64() & right_nzm) == 0) {
i->Replace(&OPCODE_AND_info, 0);
@ -593,6 +717,21 @@ retry_and_simplification:
return false;
}
bool SimplificationPass::CheckAdd(hir::Instr* i, hir::HIRBuilder* builder) {
Value* src1 = i->src1.value;
Value* src2 = i->src2.value;
ScalarNZM nzm1 = GetScalarNZM(src1);
ScalarNZM nzm2 = GetScalarNZM(src2);
if ((nzm1 & nzm2) == 0) { // no bits overlap, there will never be a carry
// from any bits to any others, make this an OR
/* i->Replace(&OPCODE_OR_info, 0);
i->set_src1(src1);
i->set_src2(src2);*/
i->opcode = &OPCODE_OR_info;
return true;
}
auto [definition, added_constant] =
i->BinaryValueArrangeByDefOpAndConstant(&OPCODE_NOT_info);
@ -645,7 +784,7 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
return false;
}
uint64_t nzm_for_var = GetScalarNZM(variable);
ScalarNZM nzm_for_var = GetScalarNZM(variable);
Opcode cmpop = i->opcode->num;
uint64_t constant_unpacked = constant_value->AsUint64();
uint64_t signbit_for_var = GetScalarSignbitMask(variable->type);
@ -670,6 +809,14 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
i->set_src1(variable);
return true;
}
if (cmpop == OPCODE_COMPARE_ULE &&
constant_unpacked ==
0) { // unsigned <= 0 is the same as == 0, i.e. IS_FALSE
i->Replace(&OPCODE_IS_FALSE_info, 0);
i->set_src1(variable);
return true;
}
// todo: OPCODE_COMPARE_NE too?
if (cmpop == OPCODE_COMPARE_EQ &&
def_opcode == OPCODE_NOT) { // i see this a lot around addic insns
@ -774,7 +921,7 @@ bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i,
return false;
}
uint64_t input_nzm = GetScalarNZM(input);
ScalarNZM input_nzm = GetScalarNZM(input);
if (istrue &&
input_nzm == 1) { // doing istrue on a value that's already a bool bitwise
@ -813,6 +960,98 @@ bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i,
input_def = input_def->GetDestDefSkipAssigns();*/
return false;
}
bool SimplificationPass::CheckSHRByConst(hir::Instr* i,
hir::HIRBuilder* builder,
hir::Value* variable,
unsigned int shift) {
if (shift >= 3 && shift <= 6) {
// possibly a shift of an lzcnt result, do some tunneling
unsigned int tflags = MOVTUNNEL_ASSIGNS | MOVTUNNEL_MOVZX |
MOVTUNNEL_TRUNCATE | MOVTUNNEL_MOVSX |
MOVTUNNEL_AND32FF;
Instr* vardef = variable->def;
hir::Instr* var_def = variable->GetDefTunnelMovs(&tflags);
if (var_def && var_def->opcode == &OPCODE_CNTLZ_info) {
Value* lz_input = var_def->src1.value;
TypeName type_of_lz_input = lz_input->type;
size_t shift_for_zero =
xe::log2_floor(GetTypeSize(type_of_lz_input) * CHAR_BIT);
if (shift == shift_for_zero) {
// we ought to be OPCODE_IS_FALSE!
/*
explanation: if the input to lzcnt is zero, the result will be the bit size
of the input type, which is always a power of two; any nonzero input gives a
result less than the bit size. So you can test for zero by doing, for
instance with a 32-bit value, lzcnt32(input) >> 5. This is a very common way
of testing for zero without branching on PPC, and the Xbox 360 PPC compiler
used it a lot. We optimize it away for simplicity and to enable further
optimizations, but this is actually also quite fast on modern x86 processors
as well; for instance, on Zen 2 the reciprocal throughput of lzcnt is 0.25,
meaning four can be executed in one cycle.
*/
if (variable->type != INT8_TYPE) {
Value* isfalsetest = builder->IsFalse(lz_input);
isfalsetest->def->MoveBefore(i);
i->Replace(&OPCODE_ZERO_EXTEND_info, 0);
i->set_src1(isfalsetest);
} else {
i->Replace(&OPCODE_IS_FALSE_info, 0);
i->set_src1(lz_input);
}
return true;
}
}
}
return false;
}
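A small standalone verification (not part of the commit) of the identity this rewrite relies on, using std::countl_zero from C++20 as a stand-in for the guest's count-leading-zeros.

#include <bit>      // std::countl_zero (C++20)
#include <cassert>
#include <cstdint>

// For a 32-bit value, (lzcnt32(x) >> 5) is 1 exactly when x == 0, because
// only a zero input produces a count of 32 (the bit width); 32 >> 5 == 1,
// while any count in [0, 31] >> 5 == 0.
int main() {
  const uint32_t samples[] = {0u, 1u, 2u, 0x80000000u, 0xFFFFFFFFu, 12345u};
  for (uint32_t x : samples) {
    uint32_t lz = static_cast<uint32_t>(std::countl_zero(x));
    assert((lz >> 5) == (x == 0 ? 1u : 0u));
  }
  return 0;
}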
bool SimplificationPass::CheckSHR(hir::Instr* i, hir::HIRBuilder* builder) {
Value* shr_lhs = i->src1.value;
Value* shr_rhs = i->src2.value;
if (!shr_lhs || !shr_rhs) return false;
if (shr_rhs->IsConstant()) {
return CheckSHRByConst(i, builder, shr_lhs, shr_rhs->AsUint32());
}
return false;
}
bool SimplificationPass::CheckSAR(hir::Instr* i, hir::HIRBuilder* builder) {
Value* l = i->src1.value;
Value* r = i->src2.value;
ScalarNZM l_nzm = GetScalarNZM(l);
uint64_t signbit_mask = GetScalarSignbitMask(l->type);
size_t typesize = GetTypeSize(l->type);
/*
todo: folding this requires the mask of constant bits
if (r->IsConstant()) {
uint32_t const_r = r->AsUint32();
if (const_r == (typesize * CHAR_BIT) - 1) { //the shift is being done to
fill the result with the signbit of the input.
}
}*/
if ((l_nzm & signbit_mask) == 0) { // signbit will never be set, might as
// well be an SHR. (this does happen)
i->opcode = &OPCODE_SHR_info;
return true;
}
return false;
}
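A standalone illustration (not from the commit) of the CheckSAR reasoning: when the sign bit of the left operand can never be set, an arithmetic right shift and a logical right shift are identical.

#include <cassert>
#include <cstdint>

int main() {
  // sample values whose sign bit (bit 31) is clear, i.e. covered by an NZM of
  // 0x7FFFFFFF
  const uint32_t samples[] = {0u, 1u, 0x12345678u, 0x7FFFFFFFu};
  for (uint32_t x : samples) {
    for (uint32_t sh = 0; sh < 32; ++sh) {
      uint32_t logical = x >> sh;
      uint32_t arithmetic =
          static_cast<uint32_t>(static_cast<int32_t>(x) >> sh);
      assert(logical == arithmetic);
    }
  }
  return 0;
}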
bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) {
bool result = false;
auto block = builder->first_block();
@ -822,19 +1061,24 @@ bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) {
// vector types use the same opcodes as scalar ones for AND/OR/XOR! we
// don't handle these in our simplifications, so skip
if (i->dest && IsScalarIntegralType(i->dest->type)) {
if (i->opcode == &OPCODE_OR_info) {
Opcode iop = i->opcode->num;
if (iop == OPCODE_OR) {
result |= CheckOr(i, builder);
} else if (i->opcode == &OPCODE_XOR_info) {
} else if (iop == OPCODE_XOR) {
result |= CheckXor(i, builder);
} else if (i->opcode == &OPCODE_AND_info) {
} else if (iop == OPCODE_AND) {
result |= CheckAnd(i, builder);
} else if (i->opcode == &OPCODE_ADD_info) {
} else if (iop == OPCODE_ADD) {
result |= CheckAdd(i, builder);
} else if (IsScalarBasicCmp(i->opcode->num)) {
} else if (IsScalarBasicCmp(iop)) {
result |= CheckScalarConstCmp(i, builder);
} else if (i->opcode == &OPCODE_IS_FALSE_info ||
i->opcode == &OPCODE_IS_TRUE_info) {
} else if (iop == OPCODE_IS_FALSE || iop == OPCODE_IS_TRUE) {
result |= CheckIsTrueIsFalse(i, builder);
} else if (iop == OPCODE_SHR) {
result |= CheckSHR(i, builder);
} else if (iop == OPCODE_SHA) {
result |= CheckSAR(i, builder);
}
}
@ -928,7 +1172,6 @@ bool SimplificationPass::CheckByteSwap(Instr* i) {
}
return false;
}
bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) {
// Run over the instructions and rename assigned variables:
// v1 = v0
@ -952,22 +1195,11 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) {
while (block) {
auto i = block->instr_head;
while (i) {
uint32_t signature = i->opcode->signature;
if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V) {
i->VisitValueOperands([&result, i, this](Value* value, uint32_t idx) {
bool modified = false;
i->set_src1(CheckValue(i->src1.value, modified));
i->set_srcN(CheckValue(value, modified), idx);
result |= modified;
}
if (GET_OPCODE_SIG_TYPE_SRC2(signature) == OPCODE_SIG_TYPE_V) {
bool modified = false;
i->set_src2(CheckValue(i->src2.value, modified));
result |= modified;
}
if (GET_OPCODE_SIG_TYPE_SRC3(signature) == OPCODE_SIG_TYPE_V) {
bool modified = false;
i->set_src3(CheckValue(i->src3.value, modified));
result |= modified;
}
});
i = i->next;
}
@ -976,6 +1208,71 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) {
return result;
}
struct TruncateSimplifier {
TypeName type_from, type_to;
uint32_t sizeof_from, sizeof_to;
uint32_t bit_sizeof_from, bit_sizeof_to;
uint64_t typemask_from, typemask_to;
hir::HIRBuilder* builder;
hir::Instr* truncate_instr;
hir::Value* truncated_value;
hir::Instr* truncated_value_def;
};
bool SimplificationPass::BackpropTruncations(hir::Instr* i,
hir::HIRBuilder* builder) {
if (i->opcode != &OPCODE_TRUNCATE_info) {
return false;
}
TypeName type_from = i->src1.value->type;
TypeName type_to = i->dest->type;
uint32_t sizeof_from = static_cast<uint32_t>(GetTypeSize(type_from));
uint32_t sizeof_to = static_cast<uint32_t>(GetTypeSize(type_to));
Instr* input_def = i->src1.value->GetDefSkipAssigns();
if (!input_def) {
return false;
}
Opcode input_opc = input_def->opcode->num;
if (input_opc == OPCODE_SHL && input_def->src2.value->IsConstant()) {
uint32_t src2_shift = input_def->src2.value->AsUint32();
if (src2_shift < (sizeof_to * CHAR_BIT)) {
Value* truncated_preshift =
builder->Truncate(input_def->src1.value, type_to);
truncated_preshift->def->MoveBefore(i);
i->Replace(&OPCODE_SHL_info, 0);
i->set_src1(truncated_preshift);
i->set_src2(input_def->src2.value);
return true;
}
}
if (input_opc == OPCODE_LOAD_CONTEXT) {
if (sizeof_from == 8 && sizeof_to == 4) {
Value* loadof = builder->LoadContext(input_def->src1.offset, INT32_TYPE);
loadof->def->MoveBefore(input_def);
i->Replace(&OPCODE_ASSIGN_info, 0);
i->set_src1(loadof);
return true;
}
}
return false;
}
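A standalone check (not part of the commit) of the identity behind the first rewrite: truncating a 64-bit shift result to 32 bits equals shifting the already-truncated input, provided the shift amount is below the destination width. The LOAD_CONTEXT rewrite additionally relies on the host context being little-endian, so the low 32 bits of a 64-bit field sit at the same offset.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t samples[] = {0ull, 1ull, 0x00000000DEADBEEFull,
                              0x123456789ABCDEF0ull, ~0ull};
  for (uint64_t x : samples) {
    for (uint32_t sh = 0; sh < 32; ++sh) {
      // truncate(shl(x, sh)) to 32 bits ...
      uint32_t truncated_after = static_cast<uint32_t>(x << sh);
      // ... equals shl(truncate(x), sh), because bits above 32 can never
      // shift down into the low half
      uint32_t truncated_before = static_cast<uint32_t>(x) << sh;
      assert(truncated_after == truncated_before);
    }
  }
  return 0;
}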
bool SimplificationPass::BackpropTruncations(hir::HIRBuilder* builder) {
bool result = false;
auto block = builder->first_block();
while (block) {
auto i = block->instr_head;
while (i) {
result |= BackpropTruncations(i, builder);
i = i->next;
}
block = block->next;
}
return result;
}
Value* SimplificationPass::CheckValue(Value* value, bool& result) {
auto def = value->def;
if (def && def->opcode == &OPCODE_ASSIGN_info) {

View File

@ -32,6 +32,8 @@ class SimplificationPass : public ConditionalGroupSubpass {
bool SimplifyAssignments(hir::HIRBuilder* builder);
hir::Value* CheckValue(hir::Value* value, bool& result);
bool SimplifyBitArith(hir::HIRBuilder* builder);
bool BackpropTruncations(hir::Instr* i, hir::HIRBuilder* builder);
bool BackpropTruncations(hir::HIRBuilder* builder);
// handle either or or xor with 0
bool CheckOrXorZero(hir::Instr* i);
bool CheckOr(hir::Instr* i, hir::HIRBuilder* builder);
@ -44,6 +46,17 @@ class SimplificationPass : public ConditionalGroupSubpass {
bool CheckSelect(hir::Instr* i, hir::HIRBuilder* builder);
bool CheckScalarConstCmp(hir::Instr* i, hir::HIRBuilder* builder);
bool CheckIsTrueIsFalse(hir::Instr* i, hir::HIRBuilder* builder);
bool CheckSHRByConst(hir::Instr* i, hir::HIRBuilder* builder,
hir::Value* variable, unsigned int shift);
bool CheckSHR(hir::Instr* i, hir::HIRBuilder* builder);
bool CheckSAR(hir::Instr* i, hir::HIRBuilder* builder);
// called by CheckXor, handles transforming a 1 bit value xored against 1
bool CheckBooleanXor1(hir::Instr* i, hir::HIRBuilder* builder,
hir::Value* xored);
bool CheckXorOfTwoBools(hir::Instr* i, hir::HIRBuilder* builder,
hir::Value* b1, hir::Value* b2);
// for rlwinm
bool TryHandleANDROLORSHLSeq(hir::Instr* i, hir::HIRBuilder* builder);
bool TransformANDROLORSHLSeq(

View File

@ -14,38 +14,15 @@
namespace xe {
namespace cpu {
namespace hir {
void Instr::set_src1(Value* value) {
if (src1.value == value) {
void Instr::set_srcN(Value* value, uint32_t idx) {
if (srcs[idx].value == value) {
return;
}
if (src1_use) {
src1.value->RemoveUse(src1_use);
if (srcs_use[idx]) {
srcs[idx].value->RemoveUse(srcs_use[idx]);
}
src1.value = value;
src1_use = value ? value->AddUse(block->arena, this) : NULL;
}
void Instr::set_src2(Value* value) {
if (src2.value == value) {
return;
}
if (src2_use) {
src2.value->RemoveUse(src2_use);
}
src2.value = value;
src2_use = value ? value->AddUse(block->arena, this) : NULL;
}
void Instr::set_src3(Value* value) {
if (src3.value == value) {
return;
}
if (src3_use) {
src3.value->RemoveUse(src3_use);
}
src3.value = value;
src3_use = value ? value->AddUse(block->arena, this) : NULL;
srcs[idx].value = value;
srcs_use[idx] = value ? value->AddUse(block->arena, this) : nullptr;
}
void Instr::MoveBefore(Instr* other) {
@ -128,6 +105,81 @@ Instr* Instr::GetDestDefSkipAssigns() {
}
return current_def;
}
Instr* Instr::GetDestDefTunnelMovs(unsigned int* tunnel_flags) {
unsigned int traversed_types = 0;
unsigned int in_flags = *tunnel_flags;
Instr* current_def = this;
while (true) {
Opcode op = current_def->opcode->num;
switch (op) {
case OPCODE_ASSIGN: {
if ((in_flags & MOVTUNNEL_ASSIGNS)) {
current_def = current_def->src1.value->def;
traversed_types |= MOVTUNNEL_ASSIGNS;
} else {
goto exit_loop;
}
break;
}
case OPCODE_ZERO_EXTEND: {
if ((in_flags & MOVTUNNEL_MOVZX)) {
current_def = current_def->src1.value->def;
traversed_types |= MOVTUNNEL_MOVZX;
} else {
goto exit_loop;
}
break;
}
case OPCODE_SIGN_EXTEND: {
if ((in_flags & MOVTUNNEL_MOVSX)) {
current_def = current_def->src1.value->def;
traversed_types |= MOVTUNNEL_MOVSX;
} else {
goto exit_loop;
}
break;
}
case OPCODE_TRUNCATE: {
if ((in_flags & MOVTUNNEL_TRUNCATE)) {
current_def = current_def->src1.value->def;
traversed_types |= MOVTUNNEL_TRUNCATE;
} else {
goto exit_loop;
}
break;
}
case OPCODE_AND: {
if ((in_flags & MOVTUNNEL_AND32FF)) {
auto [constant, nonconst] =
current_def->BinaryValueArrangeAsConstAndVar();
if (!constant || constant->AsUint64() != 0xFFFFFFFF) {
goto exit_loop;
}
current_def = nonconst->def;
traversed_types |= MOVTUNNEL_AND32FF;
} else {
goto exit_loop;
}
break;
}
default:
goto exit_loop;
}
if (!current_def) {
goto exit_loop;
}
}
exit_loop:
*tunnel_flags = traversed_types;
return current_def;
}
} // namespace hir
} // namespace cpu
} // namespace xe

View File

@ -25,6 +25,14 @@ namespace hir {
class Block;
class Label;
// todo: better name
enum MovTunnel {
MOVTUNNEL_ASSIGNS = 1,
MOVTUNNEL_MOVZX = 2,
MOVTUNNEL_MOVSX = 4,
MOVTUNNEL_TRUNCATE = 8,
MOVTUNNEL_AND32FF = 16, // tunnel through and with 0xFFFFFFFF
};
class Instr {
public:
@ -44,17 +52,28 @@ class Instr {
} Op;
Value* dest;
union {
struct {
Op src1;
Op src2;
Op src3;
};
Op srcs[3];
};
union {
struct {
Value::Use* src1_use;
Value::Use* src2_use;
Value::Use* src3_use;
};
Value::Use* srcs_use[3];
};
void set_srcN(Value* value, uint32_t idx);
void set_src1(Value* value) { set_srcN(value, 0); }
void set_src1(Value* value);
void set_src2(Value* value);
void set_src3(Value* value);
void set_src2(Value* value) { set_srcN(value, 1); }
void set_src3(Value* value) { set_srcN(value, 2); }
void MoveBefore(Instr* other);
void Replace(const OpcodeInfo* new_opcode, uint16_t new_flags);
@ -104,6 +123,8 @@ if both are constant, return nullptr, nullptr
}
Instr* GetDestDefSkipAssigns();
Instr* GetDestDefTunnelMovs(unsigned int* tunnel_flags);
// returns [def op, constant]
std::pair<Value*, Value*> BinaryValueArrangeByDefOpAndConstant(
const OpcodeInfo* op_ptr) {
@ -115,6 +136,28 @@ if both are constant, return nullptr, nullptr
}
return result;
}
/*
Invokes the provided lambda callback on each operand that is a Value. Callback
is invoked with Value*, uint32_t index
*/
template <typename TCallable>
void VisitValueOperands(TCallable&& call_for_values) {
uint32_t signature = opcode->signature;
OpcodeSignatureType t_dest, t_src1, t_src2, t_src3;
UnpackOpcodeSig(signature, t_dest, t_src1, t_src2, t_src3);
if (t_src1 == OPCODE_SIG_TYPE_V) {
call_for_values(src1.value, 0);
}
if (t_src2 == OPCODE_SIG_TYPE_V) {
call_for_values(src2.value, 1);
}
if (t_src3 == OPCODE_SIG_TYPE_V) {
call_for_values(src3.value, 2);
}
}
};
} // namespace hir

View File

@ -1798,6 +1798,13 @@ hir::Instr* Value::GetDefSkipAssigns() {
return nullptr;
}
}
hir::Instr* Value::GetDefTunnelMovs(unsigned int* tunnel_flags) {
if (def) {
return def->GetDestDefTunnelMovs(tunnel_flags);
} else {
return nullptr;
}
}
} // namespace hir
} // namespace cpu
} // namespace xe

View File

@ -598,6 +598,8 @@ class Value {
void CountLeadingZeros(const Value* other);
bool Compare(Opcode opcode, Value* other);
hir::Instr* GetDefSkipAssigns();
// tunnel_flags is updated to the kinds we actually traversed
hir::Instr* GetDefTunnelMovs(unsigned int* tunnel_flags);
private:
static bool CompareInt8(Opcode opcode, Value* a, Value* b);

View File

@ -246,30 +246,7 @@ enum class PPCRegister {
};
#pragma pack(push, 8)
typedef struct PPCContext_s {
// Must be stored at 0x0 for now.
// TODO(benvanik): find a nice way to describe this to the JIT.
ThreadState* thread_state; // 0x0
// TODO(benvanik): this is getting nasty. Must be here.
uint8_t* virtual_membase; // 0x8
// Most frequently used registers first.
uint64_t lr; // 0x10 Link register
uint64_t ctr; // 0x18 Count register
uint64_t r[32]; // 0x20 General purpose registers
double f[32]; // 0x120 Floating-point registers
vec128_t v[128]; // 0x220 VMX128 vector registers
// XER register:
// Split to make it easier to do individual updates.
uint8_t xer_ca; // 0xA20
uint8_t xer_ov; // 0xA21
uint8_t xer_so; // 0xA22
// Condition registers:
// These are split to make it easier to do DCE on unused stores.
uint64_t cr() const;
void set_cr(uint64_t value);
typedef struct alignas(64) PPCContext_s {
union {
uint32_t value;
struct {
@ -395,6 +372,25 @@ typedef struct PPCContext_s {
} bits;
} fpscr; // Floating-point status and control register
// Most frequently used registers first.
uint64_t r[32]; // General purpose registers
uint64_t ctr; // Count register
uint64_t lr; // Link register
double f[32]; // Floating-point registers
vec128_t v[128]; // VMX128 vector registers
// XER register:
// Split to make it easier to do individual updates.
uint8_t xer_ca;
uint8_t xer_ov;
uint8_t xer_so;
// Condition registers:
// These are split to make it easier to do DCE on unused stores.
uint64_t cr() const;
void set_cr(uint64_t value);
uint8_t vscr_sat;
// uint32_t get_fprf() {
@ -425,7 +421,8 @@ typedef struct PPCContext_s {
// Value of last reserved load
uint64_t reserved_val;
ThreadState* thread_state;
uint8_t* virtual_membase;
static std::string GetRegisterName(PPCRegister reg);
std::string GetStringFromValue(PPCRegister reg) const;
void SetValueFromString(PPCRegister reg, std::string value);

View File

@ -18,12 +18,50 @@
#include "xenia/cpu/processor.h"
#include "xenia/xbox.h"
namespace xe {
namespace cpu {
thread_local ThreadState* thread_state_ = nullptr;
static void* AllocateContext() {
size_t granularity = xe::memory::allocation_granularity();
for (unsigned pos32 = 0x40; pos32 < 8192; ++pos32) {
/*
we want the register which points to the context to have 0xE0000000 in
the low 32 bits, for checking whether we need the 4k offset. Also, by
allocating starting from the page before, we allow backends to index
negatively to get to their own backend-specific data, which makes full
use of the int8 displacement.
the downside is we waste most of one allocation granule, and probably a
fair bit of the one starting at 0xE0000000, by using a direct virtual
memory allocation instead of malloc
*/
uintptr_t context_pre =
((static_cast<uint64_t>(pos32) << 32) | 0xE0000000) - granularity;
void* p = memory::AllocFixed(
(void*)context_pre, granularity + sizeof(ppc::PPCContext),
memory::AllocationType::kReserveCommit, memory::PageAccess::kReadWrite);
if (p) {
return reinterpret_cast<char*>(p) +
granularity; // now we have a ctx ptr with the 0xE0000000 constant in its
// low 32 bits, and one granule allocated before it
}
}
assert_always("giving up on allocating context, likely leaking contexts");
return nullptr;
}
static void FreeContext(void* ctx) {
char* true_start_of_ctx = &reinterpret_cast<char*>(
ctx)[-static_cast<ptrdiff_t>(xe::memory::allocation_granularity())];
memory::DeallocFixed(true_start_of_ctx, 0,
memory::DeallocationType::kRelease);
}
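A standalone sketch (not part of the commit, no actual allocation performed) of the address arithmetic AllocateContext depends on; the 64 KB granularity is an assumption for illustration.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t granularity = 0x10000;  // assumed allocation granularity
  for (uint64_t pos32 = 0x40; pos32 < 8192; ++pos32) {
    uint64_t context_pre = ((pos32 << 32) | 0xE0000000ull) - granularity;
    uint64_t ctx = context_pre + granularity;  // what AllocateContext returns
    // the context pointer's low 32 bits are exactly 0xE0000000, which is what
    // the rewritten 0xE0000000 comparisons in the x64 backend depend on
    assert(static_cast<uint32_t>(ctx) == 0xE0000000u);
    // and a full granule [context_pre, ctx) sits below it for backend data
    assert(ctx - context_pre == granularity);
  }
  return 0;
}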
ThreadState::ThreadState(Processor* processor, uint32_t thread_id,
uint32_t stack_base, uint32_t pcr_address)
: processor_(processor),
@ -38,7 +76,9 @@ ThreadState::ThreadState(Processor* processor, uint32_t thread_id,
backend_data_ = processor->backend()->AllocThreadData();
// Allocate with 64b alignment.
context_ = memory::AlignedAlloc<ppc::PPCContext>(64);
context_ = reinterpret_cast<ppc::PPCContext*>(AllocateContext()); // memory::AlignedAlloc<ppc::PPCContext>(64);
processor->backend()->InitializeBackendContext(context_);
assert_true(((uint64_t)context_ & 0x3F) == 0);
std::memset(context_, 0, sizeof(ppc::PPCContext));
@ -62,8 +102,10 @@ ThreadState::~ThreadState() {
if (thread_state_ == this) {
thread_state_ = nullptr;
}
memory::AlignedFree(context_);
if (context_) {
FreeContext(reinterpret_cast<void*>(context_));
}
// memory::AlignedFree(context_);
}
void ThreadState::Bind(ThreadState* thread_state) {