Merge pull request #52 from chrisps/canary_experimental
Fix previous batch of CPU changes
This commit is contained in:
commit
3757580f45
|
@ -63,6 +63,10 @@ class Backend {
|
|||
virtual void InstallBreakpoint(Breakpoint* breakpoint) {}
|
||||
virtual void InstallBreakpoint(Breakpoint* breakpoint, Function* fn) {}
|
||||
virtual void UninstallBreakpoint(Breakpoint* breakpoint) {}
|
||||
// ctx points to the start of a PPCContext. The memory from
// (ctx - page_allocation_granularity) up to the start of ctx may be used by
// the backend to store whatever data it wants.
|
||||
virtual void InitializeBackendContext(void* ctx) {}
|
||||
|
||||
protected:
|
||||
Processor* processor_ = nullptr;
|
||||
|
|
|
@ -32,6 +32,9 @@
|
|||
#include "xenia/cpu/cpu_flags.h"
|
||||
#include "xenia/cpu/function.h"
|
||||
#include "xenia/cpu/function_debug_info.h"
|
||||
#include "xenia/cpu/hir/instr.h"
|
||||
#include "xenia/cpu/hir/opcodes.h"
|
||||
#include "xenia/cpu/hir/value.h"
|
||||
#include "xenia/cpu/processor.h"
|
||||
#include "xenia/cpu/symbol.h"
|
||||
#include "xenia/cpu/thread_state.h"
|
||||
|
@ -393,7 +396,8 @@ void X64Emitter::DebugBreak() {
|
|||
}
|
||||
|
||||
uint64_t TrapDebugPrint(void* raw_context, uint64_t address) {
|
||||
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
|
||||
auto thread_state =
|
||||
reinterpret_cast<ppc::PPCContext_s*>(raw_context)->thread_state;
|
||||
uint32_t str_ptr = uint32_t(thread_state->context()->r[3]);
|
||||
// uint16_t str_len = uint16_t(thread_state->context()->r[4]);
|
||||
auto str = thread_state->memory()->TranslateVirtual<const char*>(str_ptr);
|
||||
|
@ -408,7 +412,8 @@ uint64_t TrapDebugPrint(void* raw_context, uint64_t address) {
|
|||
}
|
||||
|
||||
uint64_t TrapDebugBreak(void* raw_context, uint64_t address) {
|
||||
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
|
||||
auto thread_state =
|
||||
reinterpret_cast<ppc::PPCContext_s*>(raw_context)->thread_state;
|
||||
XELOGE("tw/td forced trap hit! This should be a crash!");
|
||||
if (cvars::break_on_debugbreak) {
|
||||
xe::debugging::Break();
|
||||
|
@ -447,7 +452,8 @@ void X64Emitter::UnimplementedInstr(const hir::Instr* i) {
|
|||
|
||||
// This is used by the X64ThunkEmitter's ResolveFunctionThunk.
|
||||
uint64_t ResolveFunction(void* raw_context, uint64_t target_address) {
|
||||
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
|
||||
auto thread_state =
|
||||
reinterpret_cast<ppc::PPCContext_s*>(raw_context)->thread_state;
|
||||
|
||||
// TODO(benvanik): required?
|
||||
assert_not_zero(target_address);
|
||||
|
@ -1191,7 +1197,109 @@ Xbyak::Address X64Emitter::StashConstantXmm(int index, const vec128_t& v) {
|
|||
MovMem64(addr + 8, v.high);
|
||||
return ptr[addr];
|
||||
}
|
||||
static bool IsVectorCompare(const Instr* i) {
|
||||
hir::Opcode op = i->opcode->num;
|
||||
return op >= hir::OPCODE_VECTOR_COMPARE_EQ &&
|
||||
op <= hir::OPCODE_VECTOR_COMPARE_UGE;
|
||||
}
|
||||
|
||||
static bool IsFlaggedVectorOp(const Instr* i) {
|
||||
if (IsVectorCompare(i)) {
|
||||
return true;
|
||||
}
|
||||
hir::Opcode op = i->opcode->num;
|
||||
using namespace hir;
|
||||
switch (op) {
|
||||
case OPCODE_VECTOR_SUB:
|
||||
case OPCODE_VECTOR_ADD:
|
||||
case OPCODE_SWIZZLE:
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Maps the element type stored in |df|'s flags field onto a SIMD domain:
// integer element types select INTEGER, floating element types select
// FLOATING, and anything else yields DONTCARE.
static SimdDomain GetDomainForFlaggedVectorOp(const hir::Instr* df) {
  SimdDomain domain = SimdDomain::DONTCARE;
  switch (df->flags) {  // the datatype the operation was performed as
    case hir::INT8_TYPE:
    case hir::INT16_TYPE:
    case hir::INT32_TYPE:
    case hir::INT64_TYPE:
      domain = SimdDomain::INTEGER;
      break;
    case hir::FLOAT32_TYPE:
    case hir::FLOAT64_TYPE:  // float64 most likely never occurs with vectors;
                             // listed for completeness
      domain = SimdDomain::FLOATING;
      break;
    default:
      break;
  }
  return domain;
}
|
||||
// this list is incomplete
|
||||
static bool IsDefiniteIntegerDomainOpcode(hir::Opcode opc) {
|
||||
using namespace hir;
|
||||
switch (opc) {
|
||||
case OPCODE_LOAD_VECTOR_SHL:
|
||||
case OPCODE_LOAD_VECTOR_SHR:
|
||||
case OPCODE_VECTOR_CONVERT_F2I:
|
||||
case OPCODE_VECTOR_MIN: // there apparently is no FLOAT32_TYPE for min/maxs
|
||||
// flags
|
||||
case OPCODE_VECTOR_MAX:
|
||||
case OPCODE_VECTOR_SHL:
|
||||
case OPCODE_VECTOR_SHR:
|
||||
case OPCODE_VECTOR_SHA:
|
||||
case OPCODE_VECTOR_ROTATE_LEFT:
|
||||
case OPCODE_VECTOR_AVERAGE: // apparently no float32 type for this
|
||||
case OPCODE_EXTRACT:
|
||||
case OPCODE_INSERT: // apparently no f32 type for these two
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
// Reports whether |opc| is treated as always producing its result in the
// floating point SIMD domain.
static bool IsDefiniteFloatingDomainOpcode(hir::Opcode opc) {
  using namespace hir;
  switch (opc) {
    // Conversions and denormal handling.
    case OPCODE_VECTOR_CONVERT_I2F:
    case OPCODE_VECTOR_DENORMFLUSH:
    // Dot products.
    case OPCODE_DOT_PRODUCT_3:
    case OPCODE_DOT_PRODUCT_4:
    // Transcendental / rounding style operations.
    case OPCODE_LOG2:
    case OPCODE_POW2:
    case OPCODE_RECIP:
    case OPCODE_ROUND:
    case OPCODE_SQRT:
    // Multiplies and absolute value.
    case OPCODE_MUL:
    case OPCODE_MUL_SUB:
    case OPCODE_MUL_ADD:
    case OPCODE_ABS:
      return true;
    default:
      return false;
  }
}
|
||||
|
||||
// Guesses which SIMD domain (integer or floating point) |for_value| lives in
// by inspecting the instruction that defines it. Returns DONTCARE when the
// value has no defining instruction or the defining opcode is not recognised.
SimdDomain X64Emitter::DeduceSimdDomain(const hir::Value* for_value) {
  const hir::Instr* defining_instr = for_value->def;
  if (defining_instr == nullptr) {
    // todo: visit uses to figure out domain
    return SimdDomain::DONTCARE;
  }

  // Flagged vector ops carry their element type in the flags field.
  if (IsFlaggedVectorOp(defining_instr)) {
    return GetDomainForFlaggedVectorOp(defining_instr);
  }

  const hir::Opcode opcode_num = defining_instr->opcode->num;
  if (IsDefiniteIntegerDomainOpcode(opcode_num)) {
    return SimdDomain::INTEGER;
  }
  if (IsDefiniteFloatingDomainOpcode(opcode_num)) {
    return SimdDomain::FLOATING;
  }

  // todo: still dontcare here; visit uses of the value to figure it out
  return SimdDomain::DONTCARE;
}
|
||||
} // namespace x64
|
||||
} // namespace backend
|
||||
} // namespace cpu
|
||||
|
|
|
@ -44,7 +44,39 @@ enum RegisterFlags {
|
|||
REG_DEST = (1 << 0),
|
||||
REG_ABCD = (1 << 1),
|
||||
};
|
||||
/*
  SSE/AVX/AVX512 have separate move/shuffle instructions for float data and
  int data for a reason: most processors implement two distinct pipelines, one
  for the integer domain and one for the floating point domain. Currently,
  xenia makes no distinction between the two. Crossing domains is expensive.
  On Zen processors the penalty is one cycle each time you cross, plus the two
  pipelines need to synchronize. Often xenia will emit an integer instruction,
  then a floating instruction, then integer again; this effectively adds at
  least two cycles to the time taken. These values will in the future be used
  as tags on operations that tell them which domain to operate in, if it is at
  all possible to avoid crossing.
*/
|
||||
// Tag describing which SIMD execution domain a value or operation belongs to.
enum class SimdDomain : uint32_t {
  FLOATING,
  INTEGER,
  // The domain is simply not known.
  DONTCARE,
  // Special result of PickDomain2 only: the inputs are used in more than one
  // domain. Distinct from DONTCARE.
  CONFLICTING
};

// Combines the domains of two operands into a single domain.
// Equal domains are returned unchanged; DONTCARE yields to the other operand;
// two different known domains produce CONFLICTING.
static SimdDomain PickDomain2(SimdDomain dom1, SimdDomain dom2) {
  const bool first_unknown = (dom1 == SimdDomain::DONTCARE);
  const bool second_unknown = (dom2 == SimdDomain::DONTCARE);

  if (dom1 == dom2 || first_unknown) {
    // Either both agree, or only the second operand carries information.
    return dom2;
  }
  return second_unknown ? dom1 : SimdDomain::CONFLICTING;
}
|
||||
enum XmmConst {
|
||||
XMMZero = 0,
|
||||
XMMOne,
|
||||
|
@ -122,7 +154,7 @@ enum XmmConst {
|
|||
XMMLVSLTableBase,
|
||||
XMMLVSRTableBase,
|
||||
XMMSingleDenormalMask,
|
||||
XMMThreeFloatMask, //for clearing the fourth float prior to DOT_PRODUCT_3
|
||||
XMMThreeFloatMask, // for clearing the fourth float prior to DOT_PRODUCT_3
|
||||
XMMXenosF16ExtRangeStart
|
||||
};
|
||||
|
||||
|
@ -150,8 +182,9 @@ enum X64EmitterFeatureFlags {
|
|||
|
||||
kX64EmitAVX512Ortho = kX64EmitAVX512F | kX64EmitAVX512VL,
|
||||
kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ,
|
||||
kX64FastJrcx = 1 << 12, //jrcxz is as fast as any other jump ( >= Zen1)
|
||||
kX64FastLoop = 1 << 13, //loop/loope/loopne is as fast as any other jump ( >= Zen2)
|
||||
kX64FastJrcx = 1 << 12, // jrcxz is as fast as any other jump ( >= Zen1)
|
||||
kX64FastLoop =
|
||||
1 << 13, // loop/loope/loopne is as fast as any other jump ( >= Zen2)
|
||||
kX64EmitAVX512VBMI = 1 << 14
|
||||
};
|
||||
class ResolvableGuestCall {
|
||||
|
@ -259,6 +292,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
|
|||
FunctionDebugInfo* debug_info() const { return debug_info_; }
|
||||
|
||||
size_t stack_size() const { return stack_size_; }
|
||||
SimdDomain DeduceSimdDomain(const hir::Value* for_value);
|
||||
|
||||
protected:
|
||||
void* Emplace(const EmitFunctionInfo& func_info,
|
||||
|
|
|
@ -12,11 +12,11 @@
|
|||
#include <algorithm>
|
||||
#include <cstring>
|
||||
|
||||
#include "xenia/base/cvar.h"
|
||||
#include "xenia/base/memory.h"
|
||||
#include "xenia/cpu/backend/x64/x64_op.h"
|
||||
#include "xenia/cpu/backend/x64/x64_tracers.h"
|
||||
#include "xenia/cpu/ppc/ppc_context.h"
|
||||
#include "xenia/base/cvar.h"
|
||||
|
||||
DEFINE_bool(
|
||||
elide_e0_check, false,
|
||||
|
@ -83,11 +83,17 @@ RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest,
|
|||
!is_definitely_not_eo(guest)) {
|
||||
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
|
||||
// it via memory mapping.
|
||||
|
||||
// todo: do branching or use an alt membase and cmov
|
||||
e.xor_(e.eax, e.eax);
|
||||
e.cmp(guest.reg().cvt32(), 0xE0000000 - offset_const);
|
||||
e.lea(e.edx, e.ptr[guest.reg().cvt32() + offset_const]);
|
||||
|
||||
e.cmp(e.edx, e.GetContextReg().cvt32());
|
||||
e.setae(e.al);
|
||||
e.shl(e.eax, 12);
|
||||
e.add(e.eax, guest.reg().cvt32());
|
||||
e.add(e.eax, e.edx);
|
||||
return e.GetMembaseReg() + e.rax;
|
||||
|
||||
} else {
|
||||
// Clear the top 32 bits, as they are likely garbage.
|
||||
// TODO(benvanik): find a way to avoid doing this.
|
||||
|
@ -122,7 +128,7 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
|
|||
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
|
||||
// it via memory mapping.
|
||||
e.xor_(e.eax, e.eax);
|
||||
e.cmp(guest.reg().cvt32(), 0xE0000000);
|
||||
e.cmp(guest.reg().cvt32(), e.GetContextReg().cvt32());
|
||||
e.setae(e.al);
|
||||
e.shl(e.eax, 12);
|
||||
e.add(e.eax, guest.reg().cvt32());
|
||||
|
@ -208,7 +214,7 @@ struct ATOMIC_COMPARE_EXCHANGE_I32
|
|||
if (xe::memory::allocation_granularity() > 0x1000) {
|
||||
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
|
||||
// it via memory mapping.
|
||||
e.cmp(i.src1.reg().cvt32(), 0xE0000000);
|
||||
e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
|
||||
e.setae(e.cl);
|
||||
e.movzx(e.ecx, e.cl);
|
||||
e.shl(e.ecx, 12);
|
||||
|
@ -229,7 +235,7 @@ struct ATOMIC_COMPARE_EXCHANGE_I64
|
|||
if (xe::memory::allocation_granularity() > 0x1000) {
|
||||
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
|
||||
// it via memory mapping.
|
||||
e.cmp(i.src1.reg().cvt32(), 0xE0000000);
|
||||
e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
|
||||
e.setae(e.cl);
|
||||
e.movzx(e.ecx, e.cl);
|
||||
e.shl(e.ecx, 12);
|
||||
|
@ -1113,7 +1119,7 @@ struct CACHE_CONTROL
|
|||
if (xe::memory::allocation_granularity() > 0x1000) {
|
||||
// Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
|
||||
// it via memory mapping.
|
||||
e.cmp(i.src1.reg().cvt32(), 0xE0000000);
|
||||
e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32());
|
||||
e.setae(e.al);
|
||||
e.movzx(e.eax, e.al);
|
||||
e.shl(e.eax, 12);
|
||||
|
|
|
@ -1826,7 +1826,7 @@ struct PERMUTE_I32
|
|||
}
|
||||
}
|
||||
};
|
||||
//todo: use this on const src1
|
||||
// todo: use this on const src1
|
||||
static vec128_t FixupConstantShuf8(vec128_t input) {
|
||||
for (uint32_t i = 0; i < 16; ++i) {
|
||||
input.u8[i] ^= 0x03;
|
||||
|
@ -1984,7 +1984,11 @@ struct SWIZZLE
|
|||
} else {
|
||||
src1 = i.src1;
|
||||
}
|
||||
e.vpshufd(i.dest, src1, swizzle_mask);
|
||||
if (element_type == INT32_TYPE) {
|
||||
e.vpshufd(i.dest, src1, swizzle_mask);
|
||||
} else if (element_type == FLOAT32_TYPE) {
|
||||
e.vshufps(i.dest, src1, src1, swizzle_mask);
|
||||
}
|
||||
} else if (element_type == INT64_TYPE || element_type == FLOAT64_TYPE) {
|
||||
assert_always();
|
||||
} else {
|
||||
|
|
|
@ -717,6 +717,9 @@ struct SELECT_V128_I8
|
|||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
// TODO(benvanik): find a shorter sequence.
|
||||
// dest = src1 != 0 ? src2 : src3
|
||||
/*
|
||||
chrispy: this is dead code, this sequence is never emitted
|
||||
*/
|
||||
e.movzx(e.eax, i.src1);
|
||||
e.vmovd(e.xmm1, e.eax);
|
||||
e.vpbroadcastd(e.xmm1, e.xmm1);
|
||||
|
@ -737,11 +740,46 @@ struct SELECT_V128_I8
|
|||
e.vpor(i.dest, e.xmm1);
|
||||
}
|
||||
};
|
||||
|
||||
// Which blend instruction, if any, may stand in for the bitwise select
// sequence: Int8 selects vpblendvb, Ps selects vblendvps, and NotPermitted
// means no blend may be used.
enum class PermittedBlend : uint32_t { NotPermitted, Int8, Ps };
|
||||
static bool IsVectorCompare(const Instr* i) {
|
||||
Opcode op = i->opcode->num;
|
||||
return op >= OPCODE_VECTOR_COMPARE_EQ && op <= OPCODE_VECTOR_COMPARE_UGE;
|
||||
}
|
||||
/*
|
||||
OPCODE_SELECT does a bit by bit selection, however, if the selector is the
|
||||
result of a comparison or if each element may only be 0xff or 0 we may use a
|
||||
blend instruction instead
|
||||
*/
|
||||
static PermittedBlend GetPermittedBlendForSelectV128(const Value* src1v) {
|
||||
const Instr* df = src1v->def;
|
||||
if (!df) {
|
||||
return PermittedBlend::NotPermitted;
|
||||
} else {
|
||||
if (!IsVectorCompare(df)) {
|
||||
return PermittedBlend::NotPermitted; // todo: check ors, ands of
|
||||
// condition
|
||||
} else {
|
||||
switch (df->flags) { // check what datatype we compared as
|
||||
case INT16_TYPE:
|
||||
case INT32_TYPE:
|
||||
case INT8_TYPE:
|
||||
return PermittedBlend::Int8; // use vpblendvb
|
||||
case FLOAT32_TYPE:
|
||||
return PermittedBlend::Ps; // use vblendvps
|
||||
default: // unknown type! just ignore
|
||||
return PermittedBlend::NotPermitted;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
struct SELECT_V128_V128
|
||||
: Sequence<SELECT_V128_V128,
|
||||
I<OPCODE_SELECT, V128Op, V128Op, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
Xmm src1 = i.src1.is_constant ? e.xmm0 : i.src1;
|
||||
PermittedBlend mayblend = GetPermittedBlendForSelectV128(i.src1.value);
|
||||
//todo: detect whether src1 is only 0 or FFFF and use blends if so. currently we only detect cmps
|
||||
if (i.src1.is_constant) {
|
||||
e.LoadConstantXmm(src1, i.src1.constant());
|
||||
}
|
||||
|
@ -756,10 +794,16 @@ struct SELECT_V128_V128
|
|||
e.LoadConstantXmm(src3, i.src3.constant());
|
||||
}
|
||||
|
||||
// src1 ? src2 : src3;
|
||||
e.vpandn(e.xmm3, src1, src2);
|
||||
e.vpand(i.dest, src1, src3);
|
||||
e.vpor(i.dest, i.dest, e.xmm3);
|
||||
if (mayblend == PermittedBlend::Int8) {
|
||||
e.vpblendvb(i.dest, src2, src3, src1);
|
||||
} else if (mayblend == PermittedBlend::Ps) {
|
||||
e.vblendvps(i.dest, src2, src3, src1);
|
||||
} else {
|
||||
// src1 ? src2 : src3;
|
||||
e.vpandn(e.xmm3, src1, src2);
|
||||
e.vpand(i.dest, src1, src3);
|
||||
e.vpor(i.dest, i.dest, e.xmm3);
|
||||
}
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_SELECT, SELECT_I8, SELECT_I16, SELECT_I32,
|
||||
|
@ -2122,7 +2166,8 @@ struct MUL_ADD_V128
|
|||
// TODO(benvanik): the vfmadd sequence produces slightly different results
|
||||
// than vmul+vadd and it'd be nice to know why. Until we know, it's
|
||||
// disabled so tests pass.
|
||||
if (false && e.IsFeatureEnabled(kX64EmitFMA)) {
|
||||
// chrispy: reenabled, i have added the DAZ behavior that was missing
|
||||
if (true && e.IsFeatureEnabled(kX64EmitFMA)) {
|
||||
EmitCommutativeBinaryXmmOp(e, i,
|
||||
[&i](X64Emitter& e, const Xmm& dest,
|
||||
const Xmm& src1, const Xmm& src2) {
|
||||
|
@ -2139,7 +2184,11 @@ struct MUL_ADD_V128
|
|||
e.vfmadd231ps(i.dest, src1, src2);
|
||||
} else {
|
||||
// Dest not equal to anything
|
||||
e.vmovdqa(i.dest, src1);
|
||||
// e.vmovdqa(i.dest,
|
||||
// src1);
|
||||
// chrispy: vmovdqa was a domain pipeline
|
||||
// hazard
|
||||
e.vmovaps(i.dest, src1);
|
||||
e.vfmadd213ps(i.dest, src2, src3);
|
||||
}
|
||||
});
|
||||
|
@ -2152,7 +2201,8 @@ struct MUL_ADD_V128
|
|||
// If i.dest == i.src3, back up i.src3 so we don't overwrite it.
|
||||
src3 = i.src3;
|
||||
if (i.dest == i.src3) {
|
||||
e.vmovdqa(e.xmm1, i.src3);
|
||||
// e.vmovdqa(e.xmm1, i.src3);
|
||||
e.vmovaps(e.xmm1, i.src3);
|
||||
src3 = e.xmm1;
|
||||
}
|
||||
}
|
||||
|
@ -2384,17 +2434,17 @@ EMITTER_OPCODE_TABLE(OPCODE_NEG, NEG_I8, NEG_I16, NEG_I32, NEG_I64, NEG_F32,
|
|||
// ============================================================================
|
||||
// OPCODE_ABS for F32 operands: ANDs the source with the XMMAbsMaskPS constant.
struct ABS_F32 : Sequence<ABS_F32, I<OPCODE_ABS, F32Op, F32Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    // Only the floating-point-domain AND is emitted; the stale integer-domain
    // vpand that duplicated this operation has been dropped.
    e.vandps(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS));
  }
};
|
||||
// OPCODE_ABS for F64 operands: ANDs the source with the XMMAbsMaskPD constant.
struct ABS_F64 : Sequence<ABS_F64, I<OPCODE_ABS, F64Op, F64Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    // Only the floating-point-domain AND is emitted; the stale integer-domain
    // vpand that duplicated this operation has been dropped.
    e.vandpd(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPD));
  }
};
|
||||
// OPCODE_ABS for V128 operands: ANDs the source with the XMMAbsMaskPS
// constant.
struct ABS_V128 : Sequence<ABS_V128, I<OPCODE_ABS, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    // Only the floating-point-domain AND is emitted; the stale integer-domain
    // vpand that duplicated this operation has been dropped.
    e.vandps(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS));
  }
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_ABS, ABS_F32, ABS_F64, ABS_V128);
|
||||
|
@ -2634,6 +2684,8 @@ struct DOT_PRODUCT_3_V128
|
|||
*/
|
||||
e.vstmxcsr(mxcsr_storage);
|
||||
|
||||
e.vmovaps(e.xmm2, e.GetXmmConstPtr(XMMThreeFloatMask));
|
||||
|
||||
e.mov(e.eax, 8);
|
||||
|
||||
auto src1v = e.xmm0;
|
||||
|
@ -2655,8 +2707,8 @@ struct DOT_PRODUCT_3_V128
|
|||
// so that in the future this could be optimized away if the top is known to
|
||||
// be zero. Right now im not sure that happens often though and its
|
||||
// currently not worth it also, maybe pre-and if constant
|
||||
e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask));
|
||||
e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask));
|
||||
e.vandps(e.xmm3, src1v, e.xmm2);
|
||||
e.vandps(e.xmm2, src2v, e.xmm2);
|
||||
|
||||
e.and_(mxcsr_storage, e.eax);
|
||||
e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to
|
||||
|
@ -2682,8 +2734,7 @@ struct DOT_PRODUCT_3_V128
|
|||
Xbyak::Label ret_qnan;
|
||||
Xbyak::Label done;
|
||||
e.jnz(ret_qnan);
|
||||
// e.vshufps(i.dest, e.xmm1,e.xmm1, 0); // broadcast
|
||||
e.vbroadcastss(i.dest, e.xmm1);
|
||||
e.vshufps(i.dest, e.xmm1, e.xmm1, 0); // broadcast
|
||||
e.jmp(done);
|
||||
e.L(ret_qnan);
|
||||
e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN));
|
||||
|
@ -2728,27 +2779,7 @@ struct DOT_PRODUCT_4_V128
|
|||
|
||||
e.vcvtps2pd(e.ymm0, src1v);
|
||||
e.vcvtps2pd(e.ymm1, src2v);
|
||||
/*
|
||||
e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask));
|
||||
e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask));
|
||||
|
||||
e.and_(mxcsr_storage, e.eax);
|
||||
e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to
|
||||
// go
|
||||
|
||||
e.vcvtps2pd(e.ymm0, e.xmm3);
|
||||
e.vcvtps2pd(e.ymm1, e.xmm2);
|
||||
|
||||
|
||||
e.vmulpd(e.ymm5, e.ymm0, e.ymm1);
|
||||
e.vextractf128(e.xmm4, e.ymm5, 1);
|
||||
e.vunpckhpd(e.xmm3, e.xmm5, e.xmm5); // get element [1] in xmm3
|
||||
e.vaddsd(e.xmm5, e.xmm5, e.xmm4);
|
||||
e.not_(e.eax);
|
||||
e.vaddsd(e.xmm2, e.xmm5, e.xmm3);
|
||||
e.vcvtsd2ss(e.xmm1, e.xmm2);
|
||||
|
||||
*/
|
||||
e.vmulpd(e.ymm3, e.ymm0, e.ymm1);
|
||||
e.vextractf128(e.xmm2, e.ymm3, 1);
|
||||
e.vaddpd(e.xmm3, e.xmm3, e.xmm2);
|
||||
|
@ -2765,8 +2796,7 @@ struct DOT_PRODUCT_4_V128
|
|||
Xbyak::Label ret_qnan;
|
||||
Xbyak::Label done;
|
||||
e.jnz(ret_qnan); // reorder these jmps later, just want to get this fix in
|
||||
// e.vshufps(i.dest, e.xmm1, e.xmm1, 0);
|
||||
e.vbroadcastss(i.dest, e.xmm1);
|
||||
e.vshufps(i.dest, e.xmm1, e.xmm1, 0);
|
||||
e.jmp(done);
|
||||
e.L(ret_qnan);
|
||||
e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN));
|
||||
|
@ -2846,10 +2876,17 @@ struct AND_I64 : Sequence<AND_I64, I<OPCODE_AND, I64Op, I64Op, I64Op>> {
|
|||
};
|
||||
// OPCODE_AND for V128 operands. The instruction is emitted exactly once, using
// the floating-point form when both operands are deduced to live in the
// floating domain and the integer form otherwise.
struct AND_V128 : Sequence<AND_V128, I<OPCODE_AND, V128Op, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    // DONTCARE/CONFLICTING results fall through to the integer form.
    SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value),
                                 e.DeduceSimdDomain(i.src2.value));

    EmitCommutativeBinaryXmmOp(
        e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
          if (dom == SimdDomain::FLOATING) {
            e.vandps(dest, src2, src1);
          } else {
            e.vpand(dest, src2, src1);
          }
        });
  }
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_AND, AND_I8, AND_I16, AND_I32, AND_I64, AND_V128);
|
||||
|
@ -2948,10 +2985,17 @@ struct AND_NOT_I64
|
|||
struct AND_NOT_V128
|
||||
: Sequence<AND_NOT_V128, I<OPCODE_AND_NOT, V128Op, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
EmitCommutativeBinaryXmmOp(e, i,
|
||||
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
|
||||
e.vpandn(dest, src2, src1);
|
||||
});
|
||||
SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value),
|
||||
e.DeduceSimdDomain(i.src2.value));
|
||||
|
||||
EmitCommutativeBinaryXmmOp(
|
||||
e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
|
||||
if (dom == SimdDomain::FLOATING) {
|
||||
e.vandnps(dest, src2, src1);
|
||||
} else {
|
||||
e.vpandn(dest, src2, src1);
|
||||
}
|
||||
});
|
||||
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_AND_NOT, AND_NOT_I8, AND_NOT_I16, AND_NOT_I32,
|
||||
|
@ -2994,10 +3038,17 @@ struct OR_I64 : Sequence<OR_I64, I<OPCODE_OR, I64Op, I64Op, I64Op>> {
|
|||
};
|
||||
// OPCODE_OR for V128 operands. The instruction is emitted exactly once, using
// the floating-point form when both operands are deduced to live in the
// floating domain and the integer form otherwise.
struct OR_V128 : Sequence<OR_V128, I<OPCODE_OR, V128Op, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    // DONTCARE/CONFLICTING results fall through to the integer form.
    SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value),
                                 e.DeduceSimdDomain(i.src2.value));

    EmitCommutativeBinaryXmmOp(
        e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
          if (dom == SimdDomain::FLOATING) {
            e.vorps(dest, src1, src2);
          } else {
            e.vpor(dest, src1, src2);
          }
        });
  }
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_OR, OR_I8, OR_I16, OR_I32, OR_I64, OR_V128);
|
||||
|
@ -3039,10 +3090,17 @@ struct XOR_I64 : Sequence<XOR_I64, I<OPCODE_XOR, I64Op, I64Op, I64Op>> {
|
|||
};
|
||||
// OPCODE_XOR for V128 operands. The instruction is emitted exactly once, using
// the floating-point form when both operands are deduced to live in the
// floating domain and the integer form otherwise.
struct XOR_V128 : Sequence<XOR_V128, I<OPCODE_XOR, V128Op, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    // DONTCARE/CONFLICTING results fall through to the integer form.
    SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value),
                                 e.DeduceSimdDomain(i.src2.value));

    EmitCommutativeBinaryXmmOp(
        e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
          if (dom == SimdDomain::FLOATING) {
            e.vxorps(dest, src1, src2);
          } else {
            e.vpxor(dest, src1, src2);
          }
        });
  }
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_XOR, XOR_I8, XOR_I16, XOR_I32, XOR_I64, XOR_V128);
|
||||
|
@ -3078,8 +3136,15 @@ struct NOT_I64 : Sequence<NOT_I64, I<OPCODE_NOT, I64Op, I64Op>> {
|
|||
};
|
||||
// OPCODE_NOT for V128 operands: dest = src ^ 0xFFFF... The XOR is emitted
// exactly once, in the floating-point form when the source is deduced to live
// in the floating domain and in the integer form otherwise.
struct NOT_V128 : Sequence<NOT_V128, I<OPCODE_NOT, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    SimdDomain domain = e.DeduceSimdDomain(i.src1.value);
    if (domain == SimdDomain::FLOATING) {
      e.vxorps(i.dest, i.src1, e.GetXmmConstPtr(XMMFFFF /* FF... */));
    } else {
      // dest = src ^ 0xFFFF...
      e.vpxor(i.dest, i.src1, e.GetXmmConstPtr(XMMFFFF /* FF... */));
    }
  }
};
|
||||
EMITTER_OPCODE_TABLE(OPCODE_NOT, NOT_I8, NOT_I16, NOT_I32, NOT_I64, NOT_V128);
|
||||
|
@ -3217,7 +3282,7 @@ struct SHR_V128 : Sequence<SHR_V128, I<OPCODE_SHR, V128Op, V128Op, I8Op>> {
|
|||
}
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
e.CallNativeSafe(reinterpret_cast<void*>(EmulateShrV128));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
e.vmovdqa(i.dest, e.xmm0);
|
||||
}
|
||||
static __m128i EmulateShrV128(void*, __m128i src1, uint8_t src2) {
|
||||
// Almost all instances are shamt = 1, but non-constant.
|
||||
|
|
|
@ -759,6 +759,18 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
i->Remove();
|
||||
result = true;
|
||||
}
|
||||
|
||||
else if (i->src2.value->IsConstantZero() && i->src3.value->IsConstantZero() &&
|
||||
i->flags == INT8_TYPE /*probably safe for int16 too*/) {
|
||||
/*
|
||||
chrispy: hoisted this check here from x64_seq_vector where if src1 is not constant, but src2 and src3 are zero, then we know the result will always be zero
|
||||
*/
|
||||
|
||||
v->set_zero(VEC128_TYPE);
|
||||
i->Remove();
|
||||
result = true;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case OPCODE_INSERT:
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
|
||||
#include "xenia/cpu/compiler/passes/simplification_pass.h"
|
||||
|
||||
#include <__msvc_int128.hpp>
|
||||
#include "xenia/base/byte_order.h"
|
||||
#include "xenia/base/profiling.h"
|
||||
namespace xe {
|
||||
|
@ -22,6 +23,52 @@ using namespace xe::cpu::hir;
|
|||
using xe::cpu::hir::HIRBuilder;
|
||||
using xe::cpu::hir::Instr;
|
||||
using xe::cpu::hir::Value;
|
||||
// Storage unit of a Valuemask_t.
using vmask_portion_t = uint64_t;

// Fixed-size bit mask made of Ndwords portions of type vmask_portion_t
// (note: despite the parameter name, each portion is 64 bits wide).
// Supports element-wise &, |, ^ and ~.
template <uint32_t Ndwords>
struct Valuemask_t {
  vmask_portion_t bits[Ndwords];

  // Builds a mask whose every portion equals |fill| (all zero by default).
  static Valuemask_t create_empty(vmask_portion_t fill = 0) {
    Valuemask_t result;
    for (uint32_t i = 0; i < Ndwords; ++i) {
      result.bits[i] = fill;
    }
    return result;
  }

  // Applies the unary callable |oper| to each portion and returns the result.
  template <typename TCallable>
  Valuemask_t operate(TCallable&& oper) const {
    Valuemask_t result = create_empty();

    for (uint32_t i = 0; i < Ndwords; ++i) {
      result.bits[i] = oper(bits[i]);
    }
    return result;
  }

  // Applies the binary callable |oper| to matching portions of this mask and
  // |other| and returns the result.
  template <typename TCallable>
  Valuemask_t operate(TCallable&& oper, Valuemask_t other) const {
    Valuemask_t result = create_empty();

    for (uint32_t i = 0; i < Ndwords; ++i) {
      result.bits[i] = oper(bits[i], other.bits[i]);
    }
    return result;
  }

  // The binary operators take Valuemask_t; the previous parameter type
  // `ValueMask` named no declared type.
  Valuemask_t operator&(Valuemask_t other) const {
    return operate([](vmask_portion_t x, vmask_portion_t y) { return x & y; },
                   other);
  }
  Valuemask_t operator|(Valuemask_t other) const {
    return operate([](vmask_portion_t x, vmask_portion_t y) { return x | y; },
                   other);
  }
  Valuemask_t operator^(Valuemask_t other) const {
    return operate([](vmask_portion_t x, vmask_portion_t y) { return x ^ y; },
                   other);
  }
  // Unary complement: uses the single-argument operate(); there is no second
  // operand to pass.
  Valuemask_t operator~() const {
    return operate([](vmask_portion_t x) { return ~x; });
  }
};
|
||||
|
||||
SimplificationPass::SimplificationPass() : ConditionalGroupSubpass() {}
|
||||
|
||||
|
@ -36,6 +83,7 @@ bool SimplificationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
iter_result |= SimplifyBitArith(builder);
|
||||
iter_result |= EliminateConversions(builder);
|
||||
iter_result |= SimplifyAssignments(builder);
|
||||
iter_result |= BackpropTruncations(builder);
|
||||
result |= iter_result;
|
||||
} while (iter_result);
|
||||
return true;
|
||||
|
@ -151,19 +199,88 @@ bool SimplificationPass::CheckOr(hir::Instr* i, hir::HIRBuilder* builder) {
|
|||
}
|
||||
return false;
|
||||
}
|
||||
// Tries to simplify an XOR of a boolean with constant 1 (a logical negation).
// |xored| is the non-constant operand of |i|. If its definition (looking
// through assigns and zero extends) is IS_FALSE, IS_TRUE, COMPARE_EQ or
// COMPARE_NE, the opposite test is built, placed before |i|, and |i| becomes
// an ASSIGN (or ZERO_EXTEND) of it. Returns true if |i| was rewritten.
bool SimplificationPass::CheckBooleanXor1(hir::Instr* i,
                                          hir::HIRBuilder* builder,
                                          hir::Value* xored) {
  unsigned tunnel_flags = MOVTUNNEL_ASSIGNS | MOVTUNNEL_MOVZX;
  Instr* bool_def = xored->GetDefTunnelMovs(&tunnel_flags);
  if (bool_def == nullptr) {
    return false;
  }

  // NOTE(review): presumably GetDefTunnelMovs leaves MOVTUNNEL_MOVZX set only
  // when a zero extend was actually tunnelled through - confirm.
  const bool widen_result = (tunnel_flags & MOVTUNNEL_MOVZX) != 0;

  Value* inverted = nullptr;
  switch (bool_def->opcode->num) {
    case OPCODE_IS_FALSE:
      inverted = builder->IsTrue(bool_def->src1.value);
      break;
    case OPCODE_IS_TRUE:
      inverted = builder->IsFalse(bool_def->src1.value);
      break;
    case OPCODE_COMPARE_EQ:
      inverted =
          builder->CompareNE(bool_def->src1.value, bool_def->src2.value);
      break;
    case OPCODE_COMPARE_NE:
      inverted =
          builder->CompareEQ(bool_def->src1.value, bool_def->src2.value);
      break;
    default:  // todo: other conds
      break;
  }
  if (inverted == nullptr) {
    return false;
  }

  // The replacement must be defined ahead of the instruction consuming it.
  inverted->def->MoveBefore(i);

  const OpcodeInfo* replacement_info =
      widen_result ? &OPCODE_ZERO_EXTEND_info : &OPCODE_ASSIGN_info;
  i->Replace(replacement_info, 0);
  i->set_src1(inverted);
  return true;
}
|
||||
|
||||
// Placeholder for simplifying an XOR whose two operands |b1| and |b2| are
// both booleans. Not implemented yet: it never modifies |i| and always
// returns false.
bool SimplificationPass::CheckXorOfTwoBools(hir::Instr* i,
                                            hir::HIRBuilder* builder,
                                            hir::Value* b1, hir::Value* b2) {
  // todo: implement
  return false;
}
|
||||
bool SimplificationPass::CheckXor(hir::Instr* i, hir::HIRBuilder* builder) {
|
||||
if (CheckOrXorZero(i)) {
|
||||
return true;
|
||||
} else {
|
||||
if (i->src1.value == i->src2.value) {
|
||||
Value* src1 = i->src1.value;
|
||||
Value* src2 = i->src2.value;
|
||||
|
||||
if (SameValueOrEqualConstant(src1, src2)) {
|
||||
i->Replace(&OPCODE_ASSIGN_info, 0);
|
||||
i->set_src1(builder->LoadZero(i->dest->type));
|
||||
return true;
|
||||
}
|
||||
uint64_t type_mask = GetScalarTypeMask(i->dest->type);
|
||||
|
||||
auto [constant_value, variable_value] =
|
||||
i->BinaryValueArrangeAsConstAndVar();
|
||||
ScalarNZM nzm1 = GetScalarNZM(src1);
|
||||
ScalarNZM nzm2 = GetScalarNZM(src2);
|
||||
|
||||
if ((nzm1 & nzm2) ==
|
||||
0) { // no bits of the two sources overlap, this ought to be an OR
|
||||
// cs:optimizing
|
||||
/* i->Replace(&OPCODE_OR_info, 0);
|
||||
i->set_src1(src1);
|
||||
i->set_src2(src2);*/
|
||||
|
||||
i->opcode = &OPCODE_OR_info;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
if (nzm1 == 1ULL && nzm2 == 1ULL) {
|
||||
if (constant_value) {
|
||||
return CheckBooleanXor1(i, builder, variable_value);
|
||||
} else {
|
||||
return CheckXorOfTwoBools(i, builder, src1, src2);
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t type_mask = GetScalarTypeMask(i->dest->type);
|
||||
|
||||
if (!constant_value) return false;
|
||||
|
||||
|
@ -504,11 +621,12 @@ bool SimplificationPass::TryHandleANDROLORSHLSeq(hir::Instr* i,
|
|||
}
|
||||
bool SimplificationPass::CheckAnd(hir::Instr* i, hir::HIRBuilder* builder) {
|
||||
retry_and_simplification:
|
||||
|
||||
auto [constant_value, variable_value] = i->BinaryValueArrangeAsConstAndVar();
|
||||
if (!constant_value) {
|
||||
// added this for srawi
|
||||
uint64_t nzml = GetScalarNZM(i->src1.value);
|
||||
uint64_t nzmr = GetScalarNZM(i->src2.value);
|
||||
ScalarNZM nzml = GetScalarNZM(i->src1.value);
|
||||
ScalarNZM nzmr = GetScalarNZM(i->src2.value);
|
||||
|
||||
if ((nzml & nzmr) == 0) {
|
||||
i->Replace(&OPCODE_ASSIGN_info, 0);
|
||||
|
@ -524,9 +642,15 @@ retry_and_simplification:
|
|||
|
||||
// todo: check if masking with mask that covers all of zero extension source
|
||||
uint64_t type_mask = GetScalarTypeMask(i->dest->type);
|
||||
// if masking with entire width, pointless instruction so become an assign
|
||||
|
||||
if (constant_value->AsUint64() == type_mask) {
|
||||
ScalarNZM nzm = GetScalarNZM(variable_value);
|
||||
// if masking with entire width, pointless instruction so become an assign
|
||||
// chrispy: changed this to use the nzm instead, this optimizes away many and
|
||||
// instructions
|
||||
// chrispy: changed this again. detecting if nzm is a subset of and mask, if
|
||||
// so eliminate ex: (bool value) & 0xff = (bool value). the nzm is not equal
|
||||
// to the mask, but it is a subset so can be elimed
|
||||
if ((constant_value->AsUint64() & nzm) == nzm) {
|
||||
i->Replace(&OPCODE_ASSIGN_info, 0);
|
||||
i->set_src1(variable_value);
|
||||
return true;
|
||||
|
@ -555,7 +679,7 @@ retry_and_simplification:
|
|||
Value* or_left = true_variable_def->src1.value;
|
||||
Value* or_right = true_variable_def->src2.value;
|
||||
|
||||
uint64_t left_nzm = GetScalarNZM(or_left);
|
||||
ScalarNZM left_nzm = GetScalarNZM(or_left);
|
||||
|
||||
// use the other or input instead of the or output
|
||||
if ((constant_value->AsUint64() & left_nzm) == 0) {
|
||||
|
@ -565,7 +689,7 @@ retry_and_simplification:
|
|||
return true;
|
||||
}
|
||||
|
||||
uint64_t right_nzm = GetScalarNZM(or_right);
|
||||
ScalarNZM right_nzm = GetScalarNZM(or_right);
|
||||
|
||||
if ((constant_value->AsUint64() & right_nzm) == 0) {
|
||||
i->Replace(&OPCODE_AND_info, 0);
|
||||
|
@ -593,6 +717,21 @@ retry_and_simplification:
|
|||
return false;
|
||||
}
|
||||
bool SimplificationPass::CheckAdd(hir::Instr* i, hir::HIRBuilder* builder) {
|
||||
Value* src1 = i->src1.value;
|
||||
Value* src2 = i->src2.value;
|
||||
|
||||
ScalarNZM nzm1 = GetScalarNZM(src1);
|
||||
ScalarNZM nzm2 = GetScalarNZM(src2);
|
||||
if ((nzm1 & nzm2) == 0) { // no bits overlap, there will never be a carry
|
||||
// from any bits to any others, make this an OR
|
||||
|
||||
/* i->Replace(&OPCODE_OR_info, 0);
|
||||
i->set_src1(src1);
|
||||
i->set_src2(src2);*/
|
||||
i->opcode = &OPCODE_OR_info;
|
||||
return true;
|
||||
}
|
||||
|
||||
auto [definition, added_constant] =
|
||||
i->BinaryValueArrangeByDefOpAndConstant(&OPCODE_NOT_info);
|
||||
|
||||
|
@ -645,7 +784,7 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
|
|||
return false;
|
||||
}
|
||||
|
||||
uint64_t nzm_for_var = GetScalarNZM(variable);
|
||||
ScalarNZM nzm_for_var = GetScalarNZM(variable);
|
||||
Opcode cmpop = i->opcode->num;
|
||||
uint64_t constant_unpacked = constant_value->AsUint64();
|
||||
uint64_t signbit_for_var = GetScalarSignbitMask(variable->type);
|
||||
|
@ -670,6 +809,14 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
|
|||
i->set_src1(variable);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (cmpop == OPCODE_COMPARE_ULE &&
|
||||
constant_unpacked ==
|
||||
0) { // less than or equal to zero = (== 0) = IS_FALSE
|
||||
i->Replace(&OPCODE_IS_FALSE_info, 0);
|
||||
i->set_src1(variable);
|
||||
return true;
|
||||
}
|
||||
// todo: OPCODE_COMPARE_NE too?
|
||||
if (cmpop == OPCODE_COMPARE_EQ &&
|
||||
def_opcode == OPCODE_NOT) { // i see this a lot around addic insns
|
||||
|
@ -774,7 +921,7 @@ bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i,
|
|||
return false;
|
||||
}
|
||||
|
||||
uint64_t input_nzm = GetScalarNZM(input);
|
||||
ScalarNZM input_nzm = GetScalarNZM(input);
|
||||
|
||||
if (istrue &&
|
||||
input_nzm == 1) { // doing istrue on a value thats already a bool bitwise
|
||||
|
@ -813,6 +960,98 @@ bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i,
|
|||
input_def = input_def->GetDestDefSkipAssigns();*/
|
||||
return false;
|
||||
}
|
||||
// Tries to simplify a logical shift-right whose shift amount is a constant.
//
// The only pattern handled is the branchless zero test
//   cntlz(x) >> log2(bit width of x)
// which is rewritten into IS_FALSE(x) (zero-extended to the destination type
// when the shifted value is wider than INT8).
//
// i        - the SHR instruction being examined; rewritten in place on success
// builder  - used to create the replacement IS_FALSE value
// variable - the value being shifted (the SHR's first operand)
// shift    - the constant shift amount
//
// Returns true if `i` was rewritten, false if it was left untouched.
bool SimplificationPass::CheckSHRByConst(hir::Instr* i,
                                         hir::HIRBuilder* builder,
                                         hir::Value* variable,
                                         unsigned int shift) {
  // Only shifts in [3, 6] are candidates for a shifted cntlz result; the exact
  // amount is validated against the cntlz input type below.
  if (shift < 3 || shift > 6) {
    return false;
  }

  // Look through moves/extensions/truncations to find what really produced
  // the shifted value.
  unsigned int tflags = MOVTUNNEL_ASSIGNS | MOVTUNNEL_MOVZX |
                        MOVTUNNEL_TRUNCATE | MOVTUNNEL_MOVSX |
                        MOVTUNNEL_AND32FF;

  hir::Instr* var_def = variable->GetDefTunnelMovs(&tflags);
  if (!var_def || var_def->opcode != &OPCODE_CNTLZ_info) {
    return false;
  }

  Value* lz_input = var_def->src1.value;
  TypeName type_of_lz_input = lz_input->type;
  size_t shift_for_zero =
      xe::log2_floor(GetTypeSize(type_of_lz_input) * CHAR_BIT);

  if (shift != shift_for_zero) {
    return false;
  }

  /*
    We ought to be OPCODE_IS_FALSE!

    Explanation: if the input to lzcnt is zero, the result is the bit size of
    the input type, which is always a power of two. Any nonzero input gives a
    result smaller than the bit size, so you can test for zero by doing, for
    instance with a 32 bit value, lzcnt32(input) >> 5.

    This is a very common way of testing for zero without branching on ppc,
    and the xb360 ppc compiler used it a lot. We optimize it away for
    simplicity and to enable further optimizations, although (per the original
    author's note) the sequence is also quite fast on modern x86 processors:
    on zen 2 the reciprocal throughput of lzcnt is 0.25, meaning four can be
    executed in one cycle.
  */
  if (variable->type != INT8_TYPE) {
    Value* isfalsetest = builder->IsFalse(lz_input);

    // NOTE(review): the builder presumably returns a constant with no defining
    // instruction when lz_input is itself constant - confirm. Such a value
    // needs no placement, so only move a def that actually exists.
    if (Instr* isfalse_def = isfalsetest->def) {
      isfalse_def->MoveBefore(i);
    }
    i->Replace(&OPCODE_ZERO_EXTEND_info, 0);
    i->set_src1(isfalsetest);
  } else {
    i->Replace(&OPCODE_IS_FALSE_info, 0);
    i->set_src1(lz_input);
  }
  return true;
}
|
||||
// Entry point for SHR simplification. Only shifts by a constant amount are
// handled; those are forwarded to CheckSHRByConst. Returns true if `i` was
// rewritten.
bool SimplificationPass::CheckSHR(hir::Instr* i, hir::HIRBuilder* builder) {
  Value* shifted = i->src1.value;
  Value* amount = i->src2.value;

  // Both operands must be present and the amount must be a known constant.
  const bool has_constant_amount =
      shifted && amount && amount->IsConstant();
  if (!has_constant_amount) {
    return false;
  }

  return CheckSHRByConst(i, builder, shifted, amount->AsUint32());
}
|
||||
|
||||
// Simplifies an arithmetic shift right (OPCODE_SHA).
//
// If the nonzero-bit mask of the shifted value shows that its sign bit can
// never be set, the arithmetic shift behaves exactly like a logical one, so
// the opcode is switched to SHR in place (operands are left untouched).
//
// Returns true if `i` was changed.
bool SimplificationPass::CheckSAR(hir::Instr* i, hir::HIRBuilder* builder) {
  Value* l = i->src1.value;
  ScalarNZM l_nzm = GetScalarNZM(l);
  uint64_t signbit_mask = GetScalarSignbitMask(l->type);

  // TODO: when the shift amount is the constant (bit width - 1), the shift is
  // only being used to fill the result with the sign bit of the input.
  // Folding that case requires the mask of constant bits, which is not
  // available here yet.

  if ((l_nzm & signbit_mask) != 0) {
    // The sign bit may be set; the arithmetic shift has to stay.
    return false;
  }

  // Sign bit will never be set, might as well be an SHR. (This does happen.)
  i->opcode = &OPCODE_SHR_info;
  return true;
}
|
||||
bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) {
|
||||
bool result = false;
|
||||
auto block = builder->first_block();
|
||||
|
@ -822,19 +1061,24 @@ bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) {
|
|||
// vector types use the same opcodes as scalar ones for AND/OR/XOR! we
|
||||
// don't handle these in our simplifications, so skip
|
||||
if (i->dest && IsScalarIntegralType(i->dest->type)) {
|
||||
if (i->opcode == &OPCODE_OR_info) {
|
||||
Opcode iop = i->opcode->num;
|
||||
|
||||
if (iop == OPCODE_OR) {
|
||||
result |= CheckOr(i, builder);
|
||||
} else if (i->opcode == &OPCODE_XOR_info) {
|
||||
} else if (iop == OPCODE_XOR) {
|
||||
result |= CheckXor(i, builder);
|
||||
} else if (i->opcode == &OPCODE_AND_info) {
|
||||
} else if (iop == OPCODE_AND) {
|
||||
result |= CheckAnd(i, builder);
|
||||
} else if (i->opcode == &OPCODE_ADD_info) {
|
||||
} else if (iop == OPCODE_ADD) {
|
||||
result |= CheckAdd(i, builder);
|
||||
} else if (IsScalarBasicCmp(i->opcode->num)) {
|
||||
} else if (IsScalarBasicCmp(iop)) {
|
||||
result |= CheckScalarConstCmp(i, builder);
|
||||
} else if (i->opcode == &OPCODE_IS_FALSE_info ||
|
||||
i->opcode == &OPCODE_IS_TRUE_info) {
|
||||
} else if (iop == OPCODE_IS_FALSE || iop == OPCODE_IS_TRUE) {
|
||||
result |= CheckIsTrueIsFalse(i, builder);
|
||||
} else if (iop == OPCODE_SHR) {
|
||||
result |= CheckSHR(i, builder);
|
||||
} else if (iop == OPCODE_SHA) {
|
||||
result |= CheckSAR(i, builder);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -928,7 +1172,6 @@ bool SimplificationPass::CheckByteSwap(Instr* i) {
|
|||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) {
|
||||
// Run over the instructions and rename assigned variables:
|
||||
// v1 = v0
|
||||
|
@ -952,22 +1195,11 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) {
|
|||
while (block) {
|
||||
auto i = block->instr_head;
|
||||
while (i) {
|
||||
uint32_t signature = i->opcode->signature;
|
||||
if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V) {
|
||||
i->VisitValueOperands([&result, i, this](Value* value, uint32_t idx) {
|
||||
bool modified = false;
|
||||
i->set_src1(CheckValue(i->src1.value, modified));
|
||||
i->set_srcN(CheckValue(value, modified), idx);
|
||||
result |= modified;
|
||||
}
|
||||
if (GET_OPCODE_SIG_TYPE_SRC2(signature) == OPCODE_SIG_TYPE_V) {
|
||||
bool modified = false;
|
||||
i->set_src2(CheckValue(i->src2.value, modified));
|
||||
result |= modified;
|
||||
}
|
||||
if (GET_OPCODE_SIG_TYPE_SRC3(signature) == OPCODE_SIG_TYPE_V) {
|
||||
bool modified = false;
|
||||
i->set_src3(CheckValue(i->src3.value, modified));
|
||||
result |= modified;
|
||||
}
|
||||
});
|
||||
|
||||
i = i->next;
|
||||
}
|
||||
|
@ -976,6 +1208,71 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) {
|
|||
return result;
|
||||
}
|
||||
|
||||
struct TruncateSimplifier {
|
||||
TypeName type_from, type_to;
|
||||
uint32_t sizeof_from, sizeof_to;
|
||||
uint32_t bit_sizeof_from, bit_sizeof_to;
|
||||
uint64_t typemask_from, typemask_to;
|
||||
hir::HIRBuilder* builder;
|
||||
hir::Instr* truncate_instr;
|
||||
hir::Value* truncated_value;
|
||||
hir::Instr* truncated_value_def;
|
||||
};
|
||||
// Pushes a TRUNCATE backwards through the instruction that produced its
// input, so the narrower operation is performed directly.
//
// Two producers are handled:
//  * SHL by a constant smaller than the destination bit width:
//      truncate(x << c)  ->  truncate(x) << c
//  * LOAD_CONTEXT of an 8-byte value truncated to 4 bytes:
//      truncate(load_context.i64(off))  ->  load_context.i32(off)
//
// i       - candidate instruction; anything other than TRUNCATE is ignored
// builder - used to create the replacement values
//
// Returns true if `i` was rewritten.
bool SimplificationPass::BackpropTruncations(hir::Instr* i,
                                             hir::HIRBuilder* builder) {
  if (i->opcode != &OPCODE_TRUNCATE_info) {
    return false;
  }
  TypeName type_from = i->src1.value->type;
  TypeName type_to = i->dest->type;

  uint32_t sizeof_from = static_cast<uint32_t>(GetTypeSize(type_from));
  uint32_t sizeof_to = static_cast<uint32_t>(GetTypeSize(type_to));

  Instr* input_def = i->src1.value->GetDefSkipAssigns();
  if (!input_def) {
    return false;
  }
  Opcode input_opc = input_def->opcode->num;

  if (input_opc == OPCODE_SHL && input_def->src2.value->IsConstant()) {
    uint32_t src2_shift = input_def->src2.value->AsUint32();
    // Only valid while the shift stays inside the narrower type.
    if (src2_shift < (sizeof_to * CHAR_BIT)) {
      Value* truncated_preshift =
          builder->Truncate(input_def->src1.value, type_to);

      // NOTE(review): the builder presumably folds a constant input into a
      // value with no defining instruction - confirm. There is nothing to
      // reposition in that case, so only move a def that actually exists.
      if (Instr* preshift_def = truncated_preshift->def) {
        preshift_def->MoveBefore(i);
      }
      i->Replace(&OPCODE_SHL_info, 0);
      i->set_src1(truncated_preshift);
      i->set_src2(input_def->src2.value);
      return true;
    }
  }

  if (input_opc == OPCODE_LOAD_CONTEXT) {
    if (sizeof_from == 8 && sizeof_to == 4) {
      // Load the 32-bit value from the same context offset, placed right
      // before the original 64-bit load.
      Value* loadof = builder->LoadContext(input_def->src1.offset, INT32_TYPE);
      loadof->def->MoveBefore(input_def);
      i->Replace(&OPCODE_ASSIGN_info, 0);
      i->set_src1(loadof);
      return true;
    }
  }

  return false;
}
|
||||
// Runs the per-instruction BackpropTruncations over every instruction of
// every block in the function. Returns true if any instruction was rewritten.
bool SimplificationPass::BackpropTruncations(hir::HIRBuilder* builder) {
  bool changed = false;
  for (auto block = builder->first_block(); block; block = block->next) {
    for (auto instr = block->instr_head; instr; instr = instr->next) {
      changed |= BackpropTruncations(instr, builder);
    }
  }
  return changed;
}
|
||||
Value* SimplificationPass::CheckValue(Value* value, bool& result) {
|
||||
auto def = value->def;
|
||||
if (def && def->opcode == &OPCODE_ASSIGN_info) {
|
||||
|
|
|
@ -32,6 +32,8 @@ class SimplificationPass : public ConditionalGroupSubpass {
|
|||
bool SimplifyAssignments(hir::HIRBuilder* builder);
|
||||
hir::Value* CheckValue(hir::Value* value, bool& result);
|
||||
bool SimplifyBitArith(hir::HIRBuilder* builder);
|
||||
bool BackpropTruncations(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
bool BackpropTruncations(hir::HIRBuilder* builder);
|
||||
// handle either or or xor with 0
|
||||
bool CheckOrXorZero(hir::Instr* i);
|
||||
bool CheckOr(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
|
@ -44,6 +46,17 @@ class SimplificationPass : public ConditionalGroupSubpass {
|
|||
bool CheckSelect(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
bool CheckScalarConstCmp(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
bool CheckIsTrueIsFalse(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
bool CheckSHRByConst(hir::Instr* i, hir::HIRBuilder* builder,
|
||||
hir::Value* variable, unsigned int shift);
|
||||
|
||||
bool CheckSHR(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
bool CheckSAR(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
// called by CheckXor, handles transforming a 1 bit value xored against 1
|
||||
bool CheckBooleanXor1(hir::Instr* i, hir::HIRBuilder* builder,
|
||||
hir::Value* xored);
|
||||
bool CheckXorOfTwoBools(hir::Instr* i, hir::HIRBuilder* builder,
|
||||
hir::Value* b1, hir::Value* b2);
|
||||
|
||||
// for rlwinm
|
||||
bool TryHandleANDROLORSHLSeq(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
bool TransformANDROLORSHLSeq(
|
||||
|
|
|
@ -14,38 +14,15 @@
|
|||
namespace xe {
|
||||
namespace cpu {
|
||||
namespace hir {
|
||||
|
||||
void Instr::set_src1(Value* value) {
|
||||
if (src1.value == value) {
|
||||
void Instr::set_srcN(Value* value, uint32_t idx) {
|
||||
if (srcs[idx].value == value) {
|
||||
return;
|
||||
}
|
||||
if (src1_use) {
|
||||
src1.value->RemoveUse(src1_use);
|
||||
if (srcs_use[idx]) {
|
||||
srcs[idx].value->RemoveUse(srcs_use[idx]);
|
||||
}
|
||||
src1.value = value;
|
||||
src1_use = value ? value->AddUse(block->arena, this) : NULL;
|
||||
}
|
||||
|
||||
void Instr::set_src2(Value* value) {
|
||||
if (src2.value == value) {
|
||||
return;
|
||||
}
|
||||
if (src2_use) {
|
||||
src2.value->RemoveUse(src2_use);
|
||||
}
|
||||
src2.value = value;
|
||||
src2_use = value ? value->AddUse(block->arena, this) : NULL;
|
||||
}
|
||||
|
||||
void Instr::set_src3(Value* value) {
|
||||
if (src3.value == value) {
|
||||
return;
|
||||
}
|
||||
if (src3_use) {
|
||||
src3.value->RemoveUse(src3_use);
|
||||
}
|
||||
src3.value = value;
|
||||
src3_use = value ? value->AddUse(block->arena, this) : NULL;
|
||||
srcs[idx].value = value;
|
||||
srcs_use[idx] = value ? value->AddUse(block->arena, this) : nullptr;
|
||||
}
|
||||
|
||||
void Instr::MoveBefore(Instr* other) {
|
||||
|
@ -128,6 +105,81 @@ Instr* Instr::GetDestDefSkipAssigns() {
|
|||
}
|
||||
return current_def;
|
||||
}
|
||||
// Starting at this instruction, walks backwards through "move-like"
// instructions (assign, zero/sign extend, truncate, and with 0xFFFFFFFF) and
// returns the first instruction that is not one of the permitted kinds.
//
// tunnel_flags - in:  bitmask of MOVTUNNEL_* kinds the walk may pass through
//                out: bitmask of the kinds that were actually passed through
//
// Returns the instruction the walk stopped at, or nullptr if the walk reached
// a value that has no defining instruction.
Instr* Instr::GetDestDefTunnelMovs(unsigned int* tunnel_flags) {
  const unsigned int permitted = *tunnel_flags;
  unsigned int traversed = 0;
  Instr* cursor = this;

  while (cursor) {
    // Map the opcode to the tunnel kind that would allow stepping through it;
    // 0 means the opcode is never tunnelled.
    unsigned int kind = 0;
    switch (cursor->opcode->num) {
      case OPCODE_ASSIGN:
        kind = MOVTUNNEL_ASSIGNS;
        break;
      case OPCODE_ZERO_EXTEND:
        kind = MOVTUNNEL_MOVZX;
        break;
      case OPCODE_SIGN_EXTEND:
        kind = MOVTUNNEL_MOVSX;
        break;
      case OPCODE_TRUNCATE:
        kind = MOVTUNNEL_TRUNCATE;
        break;
      case OPCODE_AND:
        kind = MOVTUNNEL_AND32FF;
        break;
      default:
        break;
    }

    if ((permitted & kind) == 0) {
      // Not a move-like opcode, or the caller did not allow this kind.
      break;
    }

    Value* source = nullptr;
    if (kind == MOVTUNNEL_AND32FF) {
      // Only an AND against the constant 0xFFFFFFFF counts as a move.
      auto [constant, nonconst] = cursor->BinaryValueArrangeAsConstAndVar();
      if (!constant || constant->AsUint64() != 0xFFFFFFFF) {
        break;
      }
      source = nonconst;
    } else {
      source = cursor->src1.value;
    }

    // Step to whatever defined the source; the loop ends if it has no def.
    cursor = source->def;
    traversed |= kind;
  }

  *tunnel_flags = traversed;
  return cursor;
}
|
||||
} // namespace hir
|
||||
} // namespace cpu
|
||||
} // namespace xe
|
||||
|
|
|
@ -25,6 +25,14 @@ namespace hir {
|
|||
|
||||
class Block;
|
||||
class Label;
|
||||
// todo: better name
|
||||
// Bit flags selecting which move-like instructions a definition walk is
// allowed to pass through; combine with bitwise OR.
enum MovTunnel {
  // plain assignments
  MOVTUNNEL_ASSIGNS = 1 << 0,
  // zero extensions
  MOVTUNNEL_MOVZX = 1 << 1,
  // sign extensions
  MOVTUNNEL_MOVSX = 1 << 2,
  // truncations
  MOVTUNNEL_TRUNCATE = 1 << 3,
  // tunnel through and with 0xFFFFFFFF
  MOVTUNNEL_AND32FF = 1 << 4,
};
|
||||
|
||||
class Instr {
|
||||
public:
|
||||
|
@ -44,17 +52,28 @@ class Instr {
|
|||
} Op;
|
||||
|
||||
Value* dest;
|
||||
Op src1;
|
||||
Op src2;
|
||||
Op src3;
|
||||
union {
|
||||
struct {
|
||||
Op src1;
|
||||
Op src2;
|
||||
Op src3;
|
||||
};
|
||||
Op srcs[3];
|
||||
};
|
||||
union {
|
||||
struct {
|
||||
Value::Use* src1_use;
|
||||
Value::Use* src2_use;
|
||||
Value::Use* src3_use;
|
||||
};
|
||||
Value::Use* srcs_use[3];
|
||||
};
|
||||
void set_srcN(Value* value, uint32_t idx);
|
||||
void set_src1(Value* value) { set_srcN(value, 0); }
|
||||
|
||||
Value::Use* src1_use;
|
||||
Value::Use* src2_use;
|
||||
Value::Use* src3_use;
|
||||
void set_src2(Value* value) { set_srcN(value, 1); }
|
||||
|
||||
void set_src1(Value* value);
|
||||
void set_src2(Value* value);
|
||||
void set_src3(Value* value);
|
||||
void set_src3(Value* value) { set_srcN(value, 2); }
|
||||
|
||||
void MoveBefore(Instr* other);
|
||||
void Replace(const OpcodeInfo* new_opcode, uint16_t new_flags);
|
||||
|
@ -104,6 +123,8 @@ if both are constant, return nullptr, nullptr
|
|||
}
|
||||
|
||||
Instr* GetDestDefSkipAssigns();
|
||||
Instr* GetDestDefTunnelMovs(unsigned int* tunnel_flags);
|
||||
|
||||
// returns [def op, constant]
|
||||
std::pair<Value*, Value*> BinaryValueArrangeByDefOpAndConstant(
|
||||
const OpcodeInfo* op_ptr) {
|
||||
|
@ -115,6 +136,28 @@ if both are constant, return nullptr, nullptr
|
|||
}
|
||||
return result;
|
||||
}
|
||||
/*
|
||||
Invokes the provided lambda callback on each operand that is a Value. Callback
|
||||
is invoked with Value*, uint32_t index
|
||||
*/
|
||||
template <typename TCallable>
|
||||
void VisitValueOperands(TCallable&& call_for_values) {
|
||||
uint32_t signature = opcode->signature;
|
||||
|
||||
OpcodeSignatureType t_dest, t_src1, t_src2, t_src3;
|
||||
|
||||
UnpackOpcodeSig(signature, t_dest, t_src1, t_src2, t_src3);
|
||||
|
||||
if (t_src1 == OPCODE_SIG_TYPE_V) {
|
||||
call_for_values(src1.value, 0);
|
||||
}
|
||||
if (t_src2 == OPCODE_SIG_TYPE_V) {
|
||||
call_for_values(src2.value, 1);
|
||||
}
|
||||
if (t_src3 == OPCODE_SIG_TYPE_V) {
|
||||
call_for_values(src3.value, 2);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace hir
|
||||
|
|
|
@ -1798,6 +1798,13 @@ hir::Instr* Value::GetDefSkipAssigns() {
|
|||
return nullptr;
|
||||
}
|
||||
}
|
||||
// Finds the instruction that effectively produced this value, looking through
// the move-like instructions selected by `tunnel_flags` (MOVTUNNEL_* bits).
//
// tunnel_flags - in:  kinds of instruction the walk may pass through
//                out: kinds that were actually passed through
//
// Returns the instruction found, or nullptr if this value has no defining
// instruction (in which case nothing was traversed and *tunnel_flags is 0).
hir::Instr* Value::GetDefTunnelMovs(unsigned int* tunnel_flags) {
  if (!def) {
    // Keep the out-parameter contract: without a def nothing was traversed,
    // so do not leave the caller's request mask in place.
    *tunnel_flags = 0;
    return nullptr;
  }
  return def->GetDestDefTunnelMovs(tunnel_flags);
}
|
||||
} // namespace hir
|
||||
} // namespace cpu
|
||||
} // namespace xe
|
||||
|
|
|
@ -598,6 +598,8 @@ class Value {
|
|||
void CountLeadingZeros(const Value* other);
|
||||
bool Compare(Opcode opcode, Value* other);
|
||||
hir::Instr* GetDefSkipAssigns();
|
||||
// tunnel_flags is updated to the kinds we actually traversed
|
||||
hir::Instr* GetDefTunnelMovs(unsigned int* tunnel_flags);
|
||||
|
||||
private:
|
||||
static bool CompareInt8(Opcode opcode, Value* a, Value* b);
|
||||
|
|
|
@ -246,30 +246,7 @@ enum class PPCRegister {
|
|||
};
|
||||
|
||||
#pragma pack(push, 8)
|
||||
typedef struct PPCContext_s {
|
||||
// Must be stored at 0x0 for now.
|
||||
// TODO(benvanik): find a nice way to describe this to the JIT.
|
||||
ThreadState* thread_state; // 0x0
|
||||
// TODO(benvanik): this is getting nasty. Must be here.
|
||||
uint8_t* virtual_membase; // 0x8
|
||||
|
||||
// Most frequently used registers first.
|
||||
uint64_t lr; // 0x10 Link register
|
||||
uint64_t ctr; // 0x18 Count register
|
||||
uint64_t r[32]; // 0x20 General purpose registers
|
||||
double f[32]; // 0x120 Floating-point registers
|
||||
vec128_t v[128]; // 0x220 VMX128 vector registers
|
||||
|
||||
// XER register:
|
||||
// Split to make it easier to do individual updates.
|
||||
uint8_t xer_ca; // 0xA20
|
||||
uint8_t xer_ov; // 0xA21
|
||||
uint8_t xer_so; // 0xA22
|
||||
|
||||
// Condition registers:
|
||||
// These are split to make it easier to do DCE on unused stores.
|
||||
uint64_t cr() const;
|
||||
void set_cr(uint64_t value);
|
||||
typedef struct alignas(64) PPCContext_s {
|
||||
union {
|
||||
uint32_t value;
|
||||
struct {
|
||||
|
@ -395,6 +372,25 @@ typedef struct PPCContext_s {
|
|||
} bits;
|
||||
} fpscr; // Floating-point status and control register
|
||||
|
||||
// Most frequently used registers first.
|
||||
|
||||
uint64_t r[32]; // 0x20 General purpose registers
|
||||
uint64_t ctr; // 0x18 Count register
|
||||
uint64_t lr; // 0x10 Link register
|
||||
double f[32]; // 0x120 Floating-point registers
|
||||
vec128_t v[128]; // 0x220 VMX128 vector registers
|
||||
|
||||
// XER register:
|
||||
// Split to make it easier to do individual updates.
|
||||
uint8_t xer_ca;
|
||||
uint8_t xer_ov;
|
||||
uint8_t xer_so;
|
||||
|
||||
// Condition registers:
|
||||
// These are split to make it easier to do DCE on unused stores.
|
||||
uint64_t cr() const;
|
||||
void set_cr(uint64_t value);
|
||||
|
||||
uint8_t vscr_sat;
|
||||
|
||||
// uint32_t get_fprf() {
|
||||
|
@ -425,7 +421,8 @@ typedef struct PPCContext_s {
|
|||
|
||||
// Value of last reserved load
|
||||
uint64_t reserved_val;
|
||||
|
||||
ThreadState* thread_state;
|
||||
uint8_t* virtual_membase;
|
||||
static std::string GetRegisterName(PPCRegister reg);
|
||||
std::string GetStringFromValue(PPCRegister reg) const;
|
||||
void SetValueFromString(PPCRegister reg, std::string value);
|
||||
|
|
|
@ -18,12 +18,50 @@
|
|||
#include "xenia/cpu/processor.h"
|
||||
|
||||
#include "xenia/xbox.h"
|
||||
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
|
||||
thread_local ThreadState* thread_state_ = nullptr;
|
||||
|
||||
static void* AllocateContext() {
|
||||
size_t granularity = xe::memory::allocation_granularity();
|
||||
for (unsigned pos32 = 0x40; pos32 < 8192; ++pos32) {
|
||||
/*
|
||||
we want our register which points to the context to have 0xE0000000 in
|
||||
the low 32 bits, for checking for whether we need the 4k offset, but also
|
||||
if we allocate starting from the page before we allow backends to index
|
||||
negatively to get to their own backend specific data, which makes full
|
||||
use of int8 displacement
|
||||
|
||||
|
||||
the downside is we waste most of one granula and probably a fair bit of
|
||||
the one starting at 0xE0 by using a direct virtual memory allocation
|
||||
instead of malloc
|
||||
*/
|
||||
uintptr_t context_pre =
|
||||
((static_cast<uint64_t>(pos32) << 32) | 0xE0000000) - granularity;
|
||||
|
||||
void* p = memory::AllocFixed(
|
||||
(void*)context_pre, granularity + sizeof(ppc::PPCContext),
|
||||
memory::AllocationType::kReserveCommit, memory::PageAccess::kReadWrite);
|
||||
if (p) {
|
||||
return reinterpret_cast<char*>(p) +
|
||||
granularity; // now we have a ctx ptr with the e0 constant in low,
|
||||
// and one page allocated before it
|
||||
}
|
||||
}
|
||||
|
||||
assert_always("giving up on allocating context, likely leaking contexts");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
static void FreeContext(void* ctx) {
|
||||
char* true_start_of_ctx = &reinterpret_cast<char*>(
|
||||
ctx)[-static_cast<ptrdiff_t>(xe::memory::allocation_granularity())];
|
||||
memory::DeallocFixed(true_start_of_ctx, 0,
|
||||
memory::DeallocationType::kRelease);
|
||||
}
|
||||
|
||||
ThreadState::ThreadState(Processor* processor, uint32_t thread_id,
|
||||
uint32_t stack_base, uint32_t pcr_address)
|
||||
: processor_(processor),
|
||||
|
@ -38,7 +76,9 @@ ThreadState::ThreadState(Processor* processor, uint32_t thread_id,
|
|||
backend_data_ = processor->backend()->AllocThreadData();
|
||||
|
||||
// Allocate with 64b alignment.
|
||||
context_ = memory::AlignedAlloc<ppc::PPCContext>(64);
|
||||
|
||||
context_ = reinterpret_cast<ppc::PPCContext*>(AllocateContext()); // memory::AlignedAlloc<ppc::PPCContext>(64);
|
||||
processor->backend()->InitializeBackendContext(context_);
|
||||
assert_true(((uint64_t)context_ & 0x3F) == 0);
|
||||
std::memset(context_, 0, sizeof(ppc::PPCContext));
|
||||
|
||||
|
@ -62,8 +102,10 @@ ThreadState::~ThreadState() {
|
|||
if (thread_state_ == this) {
|
||||
thread_state_ = nullptr;
|
||||
}
|
||||
|
||||
memory::AlignedFree(context_);
|
||||
if (context_) {
|
||||
FreeContext(reinterpret_cast<void*>(context_));
|
||||
}
|
||||
// memory::AlignedFree(context_);
|
||||
}
|
||||
|
||||
void ThreadState::Bind(ThreadState* thread_state) {
|
||||
|
|
Loading…
Reference in New Issue