diff --git a/src/xenia/cpu/backend/backend.h b/src/xenia/cpu/backend/backend.h index 054d7e752..aa9097602 100644 --- a/src/xenia/cpu/backend/backend.h +++ b/src/xenia/cpu/backend/backend.h @@ -63,6 +63,10 @@ class Backend { virtual void InstallBreakpoint(Breakpoint* breakpoint) {} virtual void InstallBreakpoint(Breakpoint* breakpoint, Function* fn) {} virtual void UninstallBreakpoint(Breakpoint* breakpoint) {} + // ctx points to the start of a ppccontext, ctx - page_allocation_granularity + // up until the start of ctx may be used by the backend to store whatever data + // they want + virtual void InitializeBackendContext(void* ctx) {} protected: Processor* processor_ = nullptr; diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index c6f2d6180..6d5690c2f 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -32,6 +32,9 @@ #include "xenia/cpu/cpu_flags.h" #include "xenia/cpu/function.h" #include "xenia/cpu/function_debug_info.h" +#include "xenia/cpu/hir/instr.h" +#include "xenia/cpu/hir/opcodes.h" +#include "xenia/cpu/hir/value.h" #include "xenia/cpu/processor.h" #include "xenia/cpu/symbol.h" #include "xenia/cpu/thread_state.h" @@ -393,7 +396,8 @@ void X64Emitter::DebugBreak() { } uint64_t TrapDebugPrint(void* raw_context, uint64_t address) { - auto thread_state = *reinterpret_cast(raw_context); + auto thread_state = + reinterpret_cast(raw_context)->thread_state; uint32_t str_ptr = uint32_t(thread_state->context()->r[3]); // uint16_t str_len = uint16_t(thread_state->context()->r[4]); auto str = thread_state->memory()->TranslateVirtual(str_ptr); @@ -408,7 +412,8 @@ uint64_t TrapDebugPrint(void* raw_context, uint64_t address) { } uint64_t TrapDebugBreak(void* raw_context, uint64_t address) { - auto thread_state = *reinterpret_cast(raw_context); + auto thread_state = + reinterpret_cast(raw_context)->thread_state; XELOGE("tw/td forced trap hit! This should be a crash!"); if (cvars::break_on_debugbreak) { xe::debugging::Break(); @@ -447,7 +452,8 @@ void X64Emitter::UnimplementedInstr(const hir::Instr* i) { // This is used by the X64ThunkEmitter's ResolveFunctionThunk. uint64_t ResolveFunction(void* raw_context, uint64_t target_address) { - auto thread_state = *reinterpret_cast(raw_context); + auto thread_state = + reinterpret_cast(raw_context)->thread_state; // TODO(benvanik): required? assert_not_zero(target_address); @@ -1191,7 +1197,109 @@ Xbyak::Address X64Emitter::StashConstantXmm(int index, const vec128_t& v) { MovMem64(addr + 8, v.high); return ptr[addr]; } +static bool IsVectorCompare(const Instr* i) { + hir::Opcode op = i->opcode->num; + return op >= hir::OPCODE_VECTOR_COMPARE_EQ && + op <= hir::OPCODE_VECTOR_COMPARE_UGE; +} +static bool IsFlaggedVectorOp(const Instr* i) { + if (IsVectorCompare(i)) { + return true; + } + hir::Opcode op = i->opcode->num; + using namespace hir; + switch (op) { + case OPCODE_VECTOR_SUB: + case OPCODE_VECTOR_ADD: + case OPCODE_SWIZZLE: + return true; + } + return false; +} + +static SimdDomain GetDomainForFlaggedVectorOp(const hir::Instr* df) { + switch (df->flags) { // check what datatype we compared as + case hir::INT16_TYPE: + case hir::INT32_TYPE: + case hir::INT8_TYPE: + case hir::INT64_TYPE: + return SimdDomain::INTEGER; + case hir::FLOAT32_TYPE: + case hir::FLOAT64_TYPE: // pretty sure float64 doesnt occur with vectors. 
+ // here for completeness + return SimdDomain::FLOATING; + default: + return SimdDomain::DONTCARE; + } + return SimdDomain::DONTCARE; +} +// this list is incomplete +static bool IsDefiniteIntegerDomainOpcode(hir::Opcode opc) { + using namespace hir; + switch (opc) { + case OPCODE_LOAD_VECTOR_SHL: + case OPCODE_LOAD_VECTOR_SHR: + case OPCODE_VECTOR_CONVERT_F2I: + case OPCODE_VECTOR_MIN: // there apparently is no FLOAT32_TYPE for min/maxs + // flags + case OPCODE_VECTOR_MAX: + case OPCODE_VECTOR_SHL: + case OPCODE_VECTOR_SHR: + case OPCODE_VECTOR_SHA: + case OPCODE_VECTOR_ROTATE_LEFT: + case OPCODE_VECTOR_AVERAGE: // apparently no float32 type for this + case OPCODE_EXTRACT: + case OPCODE_INSERT: // apparently no f32 type for these two + return true; + } + return false; +} +static bool IsDefiniteFloatingDomainOpcode(hir::Opcode opc) { + using namespace hir; + switch (opc) { + case OPCODE_VECTOR_CONVERT_I2F: + case OPCODE_VECTOR_DENORMFLUSH: + case OPCODE_DOT_PRODUCT_3: + case OPCODE_DOT_PRODUCT_4: + case OPCODE_LOG2: + case OPCODE_POW2: + case OPCODE_RECIP: + case OPCODE_ROUND: + case OPCODE_SQRT: + case OPCODE_MUL: + case OPCODE_MUL_SUB: + case OPCODE_MUL_ADD: + case OPCODE_ABS: + return true; + } + return false; +} + +SimdDomain X64Emitter::DeduceSimdDomain(const hir::Value* for_value) { + hir::Instr* df = for_value->def; + if (!df) { + // todo: visit uses to figure out domain + return SimdDomain::DONTCARE; + + } else { + SimdDomain result = SimdDomain::DONTCARE; + + if (IsFlaggedVectorOp(df)) { + result = GetDomainForFlaggedVectorOp(df); + } else if (IsDefiniteIntegerDomainOpcode(df->opcode->num)) { + result = SimdDomain::INTEGER; + } else if (IsDefiniteFloatingDomainOpcode(df->opcode->num)) { + result = SimdDomain::FLOATING; + } + + // todo: check if still dontcare, if so, visit uses of the value to figure + // it out + return result; + } + + return SimdDomain::DONTCARE; +} } // namespace x64 } // namespace backend } // namespace cpu diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index d73d86fe1..519bc629a 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -44,7 +44,39 @@ enum RegisterFlags { REG_DEST = (1 << 0), REG_ABCD = (1 << 1), }; +/* + SSE/AVX/AVX512 has seperate move instructions/shuffle instructions for float + data and int data for a reason most processors implement two distinct + pipelines, one for the integer domain and one for the floating point domain + currently, xenia makes no distinction between the two. Crossing domains is + expensive. On Zen processors the penalty is one cycle each time you cross, + plus the two pipelines need to synchronize Often xenia will emit an integer + instruction, then a floating instruction, then integer again. 
this + effectively adds at least two cycles to the time taken These values will in + the future be used as tags to operations that tell them which domain to + operate in, if its at all possible to avoid crossing +*/ +enum class SimdDomain : uint32_t { + FLOATING, + INTEGER, + DONTCARE, + CONFLICTING // just used as a special result for PickDomain, different from + // dontcare (dontcare means we just dont know the domain, + // CONFLICTING means its used in multiple domains) +}; +static SimdDomain PickDomain2(SimdDomain dom1, SimdDomain dom2) { + if (dom1 == dom2) { + return dom1; + } + if (dom1 == SimdDomain::DONTCARE) { + return dom2; + } + if (dom2 == SimdDomain::DONTCARE) { + return dom1; + } + return SimdDomain::CONFLICTING; +} enum XmmConst { XMMZero = 0, XMMOne, @@ -122,7 +154,7 @@ enum XmmConst { XMMLVSLTableBase, XMMLVSRTableBase, XMMSingleDenormalMask, - XMMThreeFloatMask, //for clearing the fourth float prior to DOT_PRODUCT_3 + XMMThreeFloatMask, // for clearing the fourth float prior to DOT_PRODUCT_3 XMMXenosF16ExtRangeStart }; @@ -150,8 +182,9 @@ enum X64EmitterFeatureFlags { kX64EmitAVX512Ortho = kX64EmitAVX512F | kX64EmitAVX512VL, kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ, - kX64FastJrcx = 1 << 12, //jrcxz is as fast as any other jump ( >= Zen1) - kX64FastLoop = 1 << 13, //loop/loope/loopne is as fast as any other jump ( >= Zen2) + kX64FastJrcx = 1 << 12, // jrcxz is as fast as any other jump ( >= Zen1) + kX64FastLoop = + 1 << 13, // loop/loope/loopne is as fast as any other jump ( >= Zen2) kX64EmitAVX512VBMI = 1 << 14 }; class ResolvableGuestCall { @@ -259,6 +292,7 @@ class X64Emitter : public Xbyak::CodeGenerator { FunctionDebugInfo* debug_info() const { return debug_info_; } size_t stack_size() const { return stack_size_; } + SimdDomain DeduceSimdDomain(const hir::Value* for_value); protected: void* Emplace(const EmitFunctionInfo& func_info, diff --git a/src/xenia/cpu/backend/x64/x64_seq_memory.cc b/src/xenia/cpu/backend/x64/x64_seq_memory.cc index 0646fdb39..33919d466 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_memory.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_memory.cc @@ -12,11 +12,11 @@ #include #include +#include "xenia/base/cvar.h" #include "xenia/base/memory.h" #include "xenia/cpu/backend/x64/x64_op.h" #include "xenia/cpu/backend/x64/x64_tracers.h" #include "xenia/cpu/ppc/ppc_context.h" -#include "xenia/base/cvar.h" DEFINE_bool( elide_e0_check, false, @@ -83,11 +83,17 @@ RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest, !is_definitely_not_eo(guest)) { // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do // it via memory mapping. + + // todo: do branching or use an alt membase and cmov e.xor_(e.eax, e.eax); - e.cmp(guest.reg().cvt32(), 0xE0000000 - offset_const); + e.lea(e.edx, e.ptr[guest.reg().cvt32() + offset_const]); + + e.cmp(e.edx, e.GetContextReg().cvt32()); e.setae(e.al); e.shl(e.eax, 12); - e.add(e.eax, guest.reg().cvt32()); + e.add(e.eax, e.edx); + return e.GetMembaseReg() + e.rax; + } else { // Clear the top 32 bits, as they are likely garbage. // TODO(benvanik): find a way to avoid doing this. @@ -122,7 +128,7 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) { // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do // it via memory mapping. 
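    // GetContextReg().cvt32() works as the 0xE0000000 boundary check below
    // because AllocateContext() in thread_state.cc guarantees every PPCContext
    // is placed so that the low 32 bits of its address are exactly 0xE0000000.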
e.xor_(e.eax, e.eax); - e.cmp(guest.reg().cvt32(), 0xE0000000); + e.cmp(guest.reg().cvt32(), e.GetContextReg().cvt32()); e.setae(e.al); e.shl(e.eax, 12); e.add(e.eax, guest.reg().cvt32()); @@ -208,7 +214,7 @@ struct ATOMIC_COMPARE_EXCHANGE_I32 if (xe::memory::allocation_granularity() > 0x1000) { // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do // it via memory mapping. - e.cmp(i.src1.reg().cvt32(), 0xE0000000); + e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32()); e.setae(e.cl); e.movzx(e.ecx, e.cl); e.shl(e.ecx, 12); @@ -229,7 +235,7 @@ struct ATOMIC_COMPARE_EXCHANGE_I64 if (xe::memory::allocation_granularity() > 0x1000) { // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do // it via memory mapping. - e.cmp(i.src1.reg().cvt32(), 0xE0000000); + e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32()); e.setae(e.cl); e.movzx(e.ecx, e.cl); e.shl(e.ecx, 12); @@ -1113,7 +1119,7 @@ struct CACHE_CONTROL if (xe::memory::allocation_granularity() > 0x1000) { // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do // it via memory mapping. - e.cmp(i.src1.reg().cvt32(), 0xE0000000); + e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32()); e.setae(e.al); e.movzx(e.eax, e.al); e.shl(e.eax, 12); diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index 1cca6469f..7c55300db 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -1826,7 +1826,7 @@ struct PERMUTE_I32 } } }; -//todo: use this on const src1 +// todo: use this on const src1 static vec128_t FixupConstantShuf8(vec128_t input) { for (uint32_t i = 0; i < 16; ++i) { input.u8[i] ^= 0x03; @@ -1984,7 +1984,11 @@ struct SWIZZLE } else { src1 = i.src1; } - e.vpshufd(i.dest, src1, swizzle_mask); + if (element_type == INT32_TYPE) { + e.vpshufd(i.dest, src1, swizzle_mask); + } else if (element_type == FLOAT32_TYPE) { + e.vshufps(i.dest, src1, src1, swizzle_mask); + } } else if (element_type == INT64_TYPE || element_type == FLOAT64_TYPE) { assert_always(); } else { diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 5af242118..73e2d646b 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -717,6 +717,9 @@ struct SELECT_V128_I8 static void Emit(X64Emitter& e, const EmitArgType& i) { // TODO(benvanik): find a shorter sequence. // dest = src1 != 0 ? 
src2 : src3 + /* + chrispy: this is dead code, this sequence is never emitted + */ e.movzx(e.eax, i.src1); e.vmovd(e.xmm1, e.eax); e.vpbroadcastd(e.xmm1, e.xmm1); @@ -737,11 +740,46 @@ struct SELECT_V128_I8 e.vpor(i.dest, e.xmm1); } }; + +enum class PermittedBlend : uint32_t { NotPermitted, Int8, Ps }; +static bool IsVectorCompare(const Instr* i) { + Opcode op = i->opcode->num; + return op >= OPCODE_VECTOR_COMPARE_EQ && op <= OPCODE_VECTOR_COMPARE_UGE; +} +/* + OPCODE_SELECT does a bit by bit selection, however, if the selector is the + result of a comparison or if each element may only be 0xff or 0 we may use a + blend instruction instead +*/ +static PermittedBlend GetPermittedBlendForSelectV128(const Value* src1v) { + const Instr* df = src1v->def; + if (!df) { + return PermittedBlend::NotPermitted; + } else { + if (!IsVectorCompare(df)) { + return PermittedBlend::NotPermitted; // todo: check ors, ands of + // condition + } else { + switch (df->flags) { // check what datatype we compared as + case INT16_TYPE: + case INT32_TYPE: + case INT8_TYPE: + return PermittedBlend::Int8; // use vpblendvb + case FLOAT32_TYPE: + return PermittedBlend::Ps; // use vblendvps + default: // unknown type! just ignore + return PermittedBlend::NotPermitted; + } + } + } +} struct SELECT_V128_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { Xmm src1 = i.src1.is_constant ? e.xmm0 : i.src1; + PermittedBlend mayblend = GetPermittedBlendForSelectV128(i.src1.value); + //todo: detect whether src1 is only 0 or FFFF and use blends if so. currently we only detect cmps if (i.src1.is_constant) { e.LoadConstantXmm(src1, i.src1.constant()); } @@ -756,10 +794,16 @@ struct SELECT_V128_V128 e.LoadConstantXmm(src3, i.src3.constant()); } - // src1 ? src2 : src3; - e.vpandn(e.xmm3, src1, src2); - e.vpand(i.dest, src1, src3); - e.vpor(i.dest, i.dest, e.xmm3); + if (mayblend == PermittedBlend::Int8) { + e.vpblendvb(i.dest, src2, src3, src1); + } else if (mayblend == PermittedBlend::Ps) { + e.vblendvps(i.dest, src2, src3, src1); + } else { + // src1 ? src2 : src3; + e.vpandn(e.xmm3, src1, src2); + e.vpand(i.dest, src1, src3); + e.vpor(i.dest, i.dest, e.xmm3); + } } }; EMITTER_OPCODE_TABLE(OPCODE_SELECT, SELECT_I8, SELECT_I16, SELECT_I32, @@ -2122,7 +2166,8 @@ struct MUL_ADD_V128 // TODO(benvanik): the vfmadd sequence produces slightly different results // than vmul+vadd and it'd be nice to know why. Until we know, it's // disabled so tests pass. - if (false && e.IsFeatureEnabled(kX64EmitFMA)) { + // chrispy: reenabled, i have added the DAZ behavior that was missing + if (true && e.IsFeatureEnabled(kX64EmitFMA)) { EmitCommutativeBinaryXmmOp(e, i, [&i](X64Emitter& e, const Xmm& dest, const Xmm& src1, const Xmm& src2) { @@ -2139,7 +2184,11 @@ struct MUL_ADD_V128 e.vfmadd231ps(i.dest, src1, src2); } else { // Dest not equal to anything - e.vmovdqa(i.dest, src1); + // e.vmovdqa(i.dest, + // src1); + // chrispy: vmovdqa was a domain pipeline + // hazard + e.vmovaps(i.dest, src1); e.vfmadd213ps(i.dest, src2, src3); } }); @@ -2152,7 +2201,8 @@ struct MUL_ADD_V128 // If i.dest == i.src3, back up i.src3 so we don't overwrite it. 
src3 = i.src3; if (i.dest == i.src3) { - e.vmovdqa(e.xmm1, i.src3); + // e.vmovdqa(e.xmm1, i.src3); + e.vmovaps(e.xmm1, i.src3); src3 = e.xmm1; } } @@ -2384,17 +2434,17 @@ EMITTER_OPCODE_TABLE(OPCODE_NEG, NEG_I8, NEG_I16, NEG_I32, NEG_I64, NEG_F32, // ============================================================================ struct ABS_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS)); + e.vandps(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS)); } }; struct ABS_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPD)); + e.vandpd(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPD)); } }; struct ABS_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS)); + e.vandps(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS)); } }; EMITTER_OPCODE_TABLE(OPCODE_ABS, ABS_F32, ABS_F64, ABS_V128); @@ -2634,6 +2684,8 @@ struct DOT_PRODUCT_3_V128 */ e.vstmxcsr(mxcsr_storage); + e.vmovaps(e.xmm2, e.GetXmmConstPtr(XMMThreeFloatMask)); + e.mov(e.eax, 8); auto src1v = e.xmm0; @@ -2655,8 +2707,8 @@ struct DOT_PRODUCT_3_V128 // so that in the future this could be optimized away if the top is known to // be zero. Right now im not sure that happens often though and its // currently not worth it also, maybe pre-and if constant - e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask)); - e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask)); + e.vandps(e.xmm3, src1v, e.xmm2); + e.vandps(e.xmm2, src2v, e.xmm2); e.and_(mxcsr_storage, e.eax); e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to @@ -2682,8 +2734,7 @@ struct DOT_PRODUCT_3_V128 Xbyak::Label ret_qnan; Xbyak::Label done; e.jnz(ret_qnan); - // e.vshufps(i.dest, e.xmm1,e.xmm1, 0); // broadcast - e.vbroadcastss(i.dest, e.xmm1); + e.vshufps(i.dest, e.xmm1, e.xmm1, 0); // broadcast e.jmp(done); e.L(ret_qnan); e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN)); @@ -2728,27 +2779,7 @@ struct DOT_PRODUCT_4_V128 e.vcvtps2pd(e.ymm0, src1v); e.vcvtps2pd(e.ymm1, src2v); - /* - e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask)); - e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask)); - e.and_(mxcsr_storage, e.eax); - e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to - // go - - e.vcvtps2pd(e.ymm0, e.xmm3); - e.vcvtps2pd(e.ymm1, e.xmm2); - - - e.vmulpd(e.ymm5, e.ymm0, e.ymm1); - e.vextractf128(e.xmm4, e.ymm5, 1); - e.vunpckhpd(e.xmm3, e.xmm5, e.xmm5); // get element [1] in xmm3 - e.vaddsd(e.xmm5, e.xmm5, e.xmm4); - e.not_(e.eax); - e.vaddsd(e.xmm2, e.xmm5, e.xmm3); - e.vcvtsd2ss(e.xmm1, e.xmm2); - - */ e.vmulpd(e.ymm3, e.ymm0, e.ymm1); e.vextractf128(e.xmm2, e.ymm3, 1); e.vaddpd(e.xmm3, e.xmm3, e.xmm2); @@ -2765,8 +2796,7 @@ struct DOT_PRODUCT_4_V128 Xbyak::Label ret_qnan; Xbyak::Label done; e.jnz(ret_qnan); // reorder these jmps later, just want to get this fix in - // e.vshufps(i.dest, e.xmm1, e.xmm1, 0); - e.vbroadcastss(i.dest, e.xmm1); + e.vshufps(i.dest, e.xmm1, e.xmm1, 0); e.jmp(done); e.L(ret_qnan); e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN)); @@ -2846,10 +2876,17 @@ struct AND_I64 : Sequence> { }; struct AND_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vpand(dest, src1, src2); - }); + SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value), + 
e.DeduceSimdDomain(i.src2.value)); + + EmitCommutativeBinaryXmmOp( + e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + if (dom == SimdDomain::FLOATING) { + e.vandps(dest, src2, src1); + } else { + e.vpand(dest, src2, src1); + } + }); } }; EMITTER_OPCODE_TABLE(OPCODE_AND, AND_I8, AND_I16, AND_I32, AND_I64, AND_V128); @@ -2948,10 +2985,17 @@ struct AND_NOT_I64 struct AND_NOT_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vpandn(dest, src2, src1); - }); + SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value), + e.DeduceSimdDomain(i.src2.value)); + + EmitCommutativeBinaryXmmOp( + e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + if (dom == SimdDomain::FLOATING) { + e.vandnps(dest, src2, src1); + } else { + e.vpandn(dest, src2, src1); + } + }); } }; EMITTER_OPCODE_TABLE(OPCODE_AND_NOT, AND_NOT_I8, AND_NOT_I16, AND_NOT_I32, @@ -2994,10 +3038,17 @@ struct OR_I64 : Sequence> { }; struct OR_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vpor(dest, src1, src2); - }); + SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value), + e.DeduceSimdDomain(i.src2.value)); + + EmitCommutativeBinaryXmmOp( + e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + if (dom == SimdDomain::FLOATING) { + e.vorps(dest, src1, src2); + } else { + e.vpor(dest, src1, src2); + } + }); } }; EMITTER_OPCODE_TABLE(OPCODE_OR, OR_I8, OR_I16, OR_I32, OR_I64, OR_V128); @@ -3039,10 +3090,17 @@ struct XOR_I64 : Sequence> { }; struct XOR_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vpxor(dest, src1, src2); - }); + SimdDomain dom = PickDomain2(e.DeduceSimdDomain(i.src1.value), + e.DeduceSimdDomain(i.src2.value)); + + EmitCommutativeBinaryXmmOp( + e, i, [dom](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + if (dom == SimdDomain::FLOATING) { + e.vxorps(dest, src1, src2); + } else { + e.vpxor(dest, src1, src2); + } + }); } }; EMITTER_OPCODE_TABLE(OPCODE_XOR, XOR_I8, XOR_I16, XOR_I32, XOR_I64, XOR_V128); @@ -3078,8 +3136,15 @@ struct NOT_I64 : Sequence> { }; struct NOT_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - // dest = src ^ 0xFFFF... - e.vpxor(i.dest, i.src1, e.GetXmmConstPtr(XMMFFFF /* FF... */)); + + SimdDomain domain = + e.DeduceSimdDomain(i.src1.value); + if (domain == SimdDomain::FLOATING) { + e.vxorps(i.dest, i.src1, e.GetXmmConstPtr(XMMFFFF /* FF... */)); + } else { + // dest = src ^ 0xFFFF... + e.vpxor(i.dest, i.src1, e.GetXmmConstPtr(XMMFFFF /* FF... */)); + } } }; EMITTER_OPCODE_TABLE(OPCODE_NOT, NOT_I8, NOT_I16, NOT_I32, NOT_I64, NOT_V128); @@ -3217,7 +3282,7 @@ struct SHR_V128 : Sequence> { } e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateShrV128)); - e.vmovaps(i.dest, e.xmm0); + e.vmovdqa(i.dest, e.xmm0); } static __m128i EmulateShrV128(void*, __m128i src1, uint8_t src2) { // Almost all instances are shamt = 1, but non-constant. 
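The blend-based OPCODE_SELECT path above rests on the observation that a lane-wise blend and the bitwise vpandn/vpand/vpor select produce identical results whenever the selector comes out of a vector compare, i.e. every lane is all-ones or all-zeros. A small standalone intrinsics sketch of that equivalence — purely illustrative, not part of the patch — standing in for the emitted vpblendvb and the bitwise fallback with their SSE intrinsic counterparts:

#include <emmintrin.h>  // SSE2: compares, and/andnot/or
#include <smmintrin.h>  // SSE4.1: _mm_blendv_epi8
#include <cassert>
#include <cstring>

// Mirrors the vpandn/vpand/vpor fallback: dest = mask ? b : a, bit by bit.
static __m128i BitwiseSelect(__m128i mask, __m128i a, __m128i b) {
  return _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b));
}

int main() {
  __m128i x = _mm_set_epi32(1, 2, 3, 4);
  __m128i y = _mm_set_epi32(5, 2, 7, 4);
  // Compare result: each 32-bit lane is either 0 or 0xFFFFFFFF.
  __m128i mask = _mm_cmpeq_epi32(x, y);
  __m128i a = _mm_set_epi32(10, 20, 30, 40);
  __m128i b = _mm_set_epi32(50, 60, 70, 80);

  __m128i sel = BitwiseSelect(mask, a, b);
  // Blend picks b wherever the mask byte's MSB is set; with a full-lane
  // compare mask that is exactly the bitwise selection above.
  __m128i blend = _mm_blendv_epi8(a, b, mask);

  assert(std::memcmp(&sel, &blend, sizeof(sel)) == 0);
  return 0;
}

The same argument is why GetPermittedBlendForSelectV128 refuses the blend when it cannot prove the selector is a compare result: for an arbitrary bit pattern the two sequences diverge.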
diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc index 025b4114e..6a6a56330 100644 --- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc +++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc @@ -759,6 +759,18 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { i->Remove(); result = true; } + + else if (i->src2.value->IsConstantZero() && i->src3.value->IsConstantZero() && + i->flags == INT8_TYPE /*probably safe for int16 too*/) { + /* + chrispy: hoisted this check here from x64_seq_vector where if src1 is not constant, but src2 and src3 are zero, then we know the result will always be zero + */ + + v->set_zero(VEC128_TYPE); + i->Remove(); + result = true; + } + break; } case OPCODE_INSERT: diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.cc b/src/xenia/cpu/compiler/passes/simplification_pass.cc index 10862fd54..8c1cc18c2 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.cc +++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc @@ -9,6 +9,7 @@ #include "xenia/cpu/compiler/passes/simplification_pass.h" +#include <__msvc_int128.hpp> #include "xenia/base/byte_order.h" #include "xenia/base/profiling.h" namespace xe { @@ -22,6 +23,52 @@ using namespace xe::cpu::hir; using xe::cpu::hir::HIRBuilder; using xe::cpu::hir::Instr; using xe::cpu::hir::Value; +using vmask_portion_t = uint64_t; +template +struct Valuemask_t { + vmask_portion_t bits[Ndwords]; + + static Valuemask_t create_empty(vmask_portion_t fill = 0) { + Valuemask_t result; + for (uint32_t i = 0; i < Ndwords; ++i) { + result.bits[i] = fill; + } + return result; + } + template + Valuemask_t operate(TCallable&& oper) const { + Valuemask_t result = create_empty(); + + for (uint32_t i = 0; i < Ndwords; ++i) { + result.bits[i] = oper(bits[i]); + } + return result; + } + template + Valuemask_t operate(TCallable&& oper, Valuemask_t other) const { + Valuemask_t result = create_empty(); + + for (uint32_t i = 0; i < Ndwords; ++i) { + result.bits[i] = oper(bits[i], other.bits[i]); + } + return result; + } + Valuemask_t operator&(ValueMask other) const { + return operate([](vmask_portion_t x, vmask_portion_t y) { return x & y; }, + other); + } + Valuemask_t operator|(ValueMask other) const { + return operate([](vmask_portion_t x, vmask_portion_t y) { return x | y; }, + other); + } + Valuemask_t operator^(ValueMask other) const { + return operate([](vmask_portion_t x, vmask_portion_t y) { return x ^ y; }, + other); + } + Valuemask_t operator~() const { + return operate([](vmask_portion_t x) { return ~x; }, other); + } +}; SimplificationPass::SimplificationPass() : ConditionalGroupSubpass() {} @@ -36,6 +83,7 @@ bool SimplificationPass::Run(HIRBuilder* builder, bool& result) { iter_result |= SimplifyBitArith(builder); iter_result |= EliminateConversions(builder); iter_result |= SimplifyAssignments(builder); + iter_result |= BackpropTruncations(builder); result |= iter_result; } while (iter_result); return true; @@ -151,19 +199,88 @@ bool SimplificationPass::CheckOr(hir::Instr* i, hir::HIRBuilder* builder) { } return false; } +bool SimplificationPass::CheckBooleanXor1(hir::Instr* i, + hir::HIRBuilder* builder, + hir::Value* xored) { + unsigned tunflags = MOVTUNNEL_ASSIGNS | MOVTUNNEL_MOVZX; + + Instr* xordef = xored->GetDefTunnelMovs(&tunflags); + if (!xordef) { + return false; + } + + Opcode xorop = xordef->opcode->num; + bool need_zx = (tunflags & MOVTUNNEL_MOVZX) != 0; + + Value* new_value = 
nullptr; + if (xorop == OPCODE_IS_FALSE) { + new_value = builder->IsTrue(xordef->src1.value); + + } else if (xorop == OPCODE_IS_TRUE) { + new_value = builder->IsFalse(xordef->src1.value); + } else if (xorop == OPCODE_COMPARE_EQ) { + new_value = builder->CompareNE(xordef->src1.value, xordef->src2.value); + + } else if (xorop == OPCODE_COMPARE_NE) { + new_value = builder->CompareEQ(xordef->src1.value, xordef->src2.value); + } // todo: other conds + + if (!new_value) { + return false; + } + + new_value->def->MoveBefore(i); + + i->Replace(need_zx ? &OPCODE_ZERO_EXTEND_info : &OPCODE_ASSIGN_info, 0); + i->set_src1(new_value); + + return true; +} + +bool SimplificationPass::CheckXorOfTwoBools(hir::Instr* i, + hir::HIRBuilder* builder, + hir::Value* b1, hir::Value* b2) { + // todo: implement + return false; +} bool SimplificationPass::CheckXor(hir::Instr* i, hir::HIRBuilder* builder) { if (CheckOrXorZero(i)) { return true; } else { - if (i->src1.value == i->src2.value) { + Value* src1 = i->src1.value; + Value* src2 = i->src2.value; + + if (SameValueOrEqualConstant(src1, src2)) { i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(builder->LoadZero(i->dest->type)); return true; } - uint64_t type_mask = GetScalarTypeMask(i->dest->type); - auto [constant_value, variable_value] = i->BinaryValueArrangeAsConstAndVar(); + ScalarNZM nzm1 = GetScalarNZM(src1); + ScalarNZM nzm2 = GetScalarNZM(src2); + + if ((nzm1 & nzm2) == + 0) { // no bits of the two sources overlap, this ought to be an OR + // cs:optimizing + /* i->Replace(&OPCODE_OR_info, 0); + i->set_src1(src1); + i->set_src2(src2);*/ + + i->opcode = &OPCODE_OR_info; + + return true; + } + + if (nzm1 == 1ULL && nzm2 == 1ULL) { + if (constant_value) { + return CheckBooleanXor1(i, builder, variable_value); + } else { + return CheckXorOfTwoBools(i, builder, src1, src2); + } + } + + uint64_t type_mask = GetScalarTypeMask(i->dest->type); if (!constant_value) return false; @@ -504,11 +621,12 @@ bool SimplificationPass::TryHandleANDROLORSHLSeq(hir::Instr* i, } bool SimplificationPass::CheckAnd(hir::Instr* i, hir::HIRBuilder* builder) { retry_and_simplification: + auto [constant_value, variable_value] = i->BinaryValueArrangeAsConstAndVar(); if (!constant_value) { // added this for srawi - uint64_t nzml = GetScalarNZM(i->src1.value); - uint64_t nzmr = GetScalarNZM(i->src2.value); + ScalarNZM nzml = GetScalarNZM(i->src1.value); + ScalarNZM nzmr = GetScalarNZM(i->src2.value); if ((nzml & nzmr) == 0) { i->Replace(&OPCODE_ASSIGN_info, 0); @@ -524,9 +642,15 @@ retry_and_simplification: // todo: check if masking with mask that covers all of zero extension source uint64_t type_mask = GetScalarTypeMask(i->dest->type); - // if masking with entire width, pointless instruction so become an assign - if (constant_value->AsUint64() == type_mask) { + ScalarNZM nzm = GetScalarNZM(variable_value); + // if masking with entire width, pointless instruction so become an assign + // chrispy: changed this to use the nzm instead, this optimizes away many and + // instructions + // chrispy: changed this again. detecting if nzm is a subset of and mask, if + // so eliminate ex: (bool value) & 0xff = (bool value). 
the nzm is not equal + // to the mask, but it is a subset so can be elimed + if ((constant_value->AsUint64() & nzm) == nzm) { i->Replace(&OPCODE_ASSIGN_info, 0); i->set_src1(variable_value); return true; @@ -555,7 +679,7 @@ retry_and_simplification: Value* or_left = true_variable_def->src1.value; Value* or_right = true_variable_def->src2.value; - uint64_t left_nzm = GetScalarNZM(or_left); + ScalarNZM left_nzm = GetScalarNZM(or_left); // use the other or input instead of the or output if ((constant_value->AsUint64() & left_nzm) == 0) { @@ -565,7 +689,7 @@ retry_and_simplification: return true; } - uint64_t right_nzm = GetScalarNZM(or_right); + ScalarNZM right_nzm = GetScalarNZM(or_right); if ((constant_value->AsUint64() & right_nzm) == 0) { i->Replace(&OPCODE_AND_info, 0); @@ -593,6 +717,21 @@ retry_and_simplification: return false; } bool SimplificationPass::CheckAdd(hir::Instr* i, hir::HIRBuilder* builder) { + Value* src1 = i->src1.value; + Value* src2 = i->src2.value; + + ScalarNZM nzm1 = GetScalarNZM(src1); + ScalarNZM nzm2 = GetScalarNZM(src2); + if ((nzm1 & nzm2) == 0) { // no bits overlap, there will never be a carry + // from any bits to any others, make this an OR + + /* i->Replace(&OPCODE_OR_info, 0); + i->set_src1(src1); + i->set_src2(src2);*/ + i->opcode = &OPCODE_OR_info; + return true; + } + auto [definition, added_constant] = i->BinaryValueArrangeByDefOpAndConstant(&OPCODE_NOT_info); @@ -645,7 +784,7 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, return false; } - uint64_t nzm_for_var = GetScalarNZM(variable); + ScalarNZM nzm_for_var = GetScalarNZM(variable); Opcode cmpop = i->opcode->num; uint64_t constant_unpacked = constant_value->AsUint64(); uint64_t signbit_for_var = GetScalarSignbitMask(variable->type); @@ -670,6 +809,14 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, i->set_src1(variable); return true; } + + if (cmpop == OPCODE_COMPARE_ULE && + constant_unpacked == + 0) { // less than or equal to zero = (== 0) = IS_FALSE + i->Replace(&OPCODE_IS_FALSE_info, 0); + i->set_src1(variable); + return true; + } // todo: OPCODE_COMPARE_NE too? if (cmpop == OPCODE_COMPARE_EQ && def_opcode == OPCODE_NOT) { // i see this a lot around addic insns @@ -774,7 +921,7 @@ bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i, return false; } - uint64_t input_nzm = GetScalarNZM(input); + ScalarNZM input_nzm = GetScalarNZM(input); if (istrue && input_nzm == 1) { // doing istrue on a value thats already a bool bitwise @@ -813,6 +960,98 @@ bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i, input_def = input_def->GetDestDefSkipAssigns();*/ return false; } +bool SimplificationPass::CheckSHRByConst(hir::Instr* i, + hir::HIRBuilder* builder, + hir::Value* variable, + unsigned int shift) { + if (shift >= 3 && shift <= 6) { + // is possible shift of lzcnt res, do some tunneling + + unsigned int tflags = MOVTUNNEL_ASSIGNS | MOVTUNNEL_MOVZX | + MOVTUNNEL_TRUNCATE | MOVTUNNEL_MOVSX | + MOVTUNNEL_AND32FF; + + Instr* vardef = variable->def; + + hir::Instr* var_def = variable->GetDefTunnelMovs(&tflags); + + if (var_def && var_def->opcode == &OPCODE_CNTLZ_info) { + Value* lz_input = var_def->src1.value; + TypeName type_of_lz_input = lz_input->type; + size_t shift_for_zero = + xe::log2_floor(GetTypeSize(type_of_lz_input) * CHAR_BIT); + + if (shift == shift_for_zero) { + // we ought to be OPCODE_IS_FALSE! 
+ /* + explanation: if an input to lzcnt is zero, the result will be the + bit size of the input type, which is always a power of two any + nonzero result will be less than the bit size so you can test for + zero by doing, for instance with a 32 bit value, lzcnt32(input) >> 5 + this is a very common way of testing for zero without branching on + ppc, and the xb360 ppc compiler used it a lot we optimize this away + for simplicity and to enable further optimizations, but actually this + is also quite fast on modern x86 processors as well, for instance on + zen 2 the rcp through of lzcnt is 0.25, meaning four can be executed + in one cycle + + */ + + if (variable->type != INT8_TYPE) { + Value* isfalsetest = builder->IsFalse(lz_input); + + isfalsetest->def->MoveBefore(i); + i->Replace(&OPCODE_ZERO_EXTEND_info, 0); + i->set_src1(isfalsetest); + + } else { + i->Replace(&OPCODE_IS_FALSE_info, 0); + i->set_src1(lz_input); + } + return true; + } + } + } + return false; +} +bool SimplificationPass::CheckSHR(hir::Instr* i, hir::HIRBuilder* builder) { + Value* shr_lhs = i->src1.value; + Value* shr_rhs = i->src2.value; + if (!shr_lhs || !shr_rhs) return false; + if (shr_rhs->IsConstant()) { + return CheckSHRByConst(i, builder, shr_lhs, shr_rhs->AsUint32()); + } + + return false; +} + +bool SimplificationPass::CheckSAR(hir::Instr* i, hir::HIRBuilder* builder) { + Value* l = i->src1.value; + Value* r = i->src2.value; + ScalarNZM l_nzm = GetScalarNZM(l); + uint64_t signbit_mask = GetScalarSignbitMask(l->type); + size_t typesize = GetTypeSize(l->type); + + /* + todo: folding this requires the mask of constant bits + if (r->IsConstant()) { + uint32_t const_r = r->AsUint32(); + + if (const_r == (typesize * CHAR_BIT) - 1) { //the shift is being done to + fill the result with the signbit of the input. + + + } + }*/ + if ((l_nzm & signbit_mask) == 0) { // signbit will never be set, might as + // well be an SHR. (this does happen) + i->opcode = &OPCODE_SHR_info; + + return true; + } + + return false; +} bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) { bool result = false; auto block = builder->first_block(); @@ -822,19 +1061,24 @@ bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) { // vector types use the same opcodes as scalar ones for AND/OR/XOR! 
we // don't handle these in our simplifications, so skip if (i->dest && IsScalarIntegralType(i->dest->type)) { - if (i->opcode == &OPCODE_OR_info) { + Opcode iop = i->opcode->num; + + if (iop == OPCODE_OR) { result |= CheckOr(i, builder); - } else if (i->opcode == &OPCODE_XOR_info) { + } else if (iop == OPCODE_XOR) { result |= CheckXor(i, builder); - } else if (i->opcode == &OPCODE_AND_info) { + } else if (iop == OPCODE_AND) { result |= CheckAnd(i, builder); - } else if (i->opcode == &OPCODE_ADD_info) { + } else if (iop == OPCODE_ADD) { result |= CheckAdd(i, builder); - } else if (IsScalarBasicCmp(i->opcode->num)) { + } else if (IsScalarBasicCmp(iop)) { result |= CheckScalarConstCmp(i, builder); - } else if (i->opcode == &OPCODE_IS_FALSE_info || - i->opcode == &OPCODE_IS_TRUE_info) { + } else if (iop == OPCODE_IS_FALSE || iop == OPCODE_IS_TRUE) { result |= CheckIsTrueIsFalse(i, builder); + } else if (iop == OPCODE_SHR) { + result |= CheckSHR(i, builder); + } else if (iop == OPCODE_SHA) { + result |= CheckSAR(i, builder); } } @@ -928,7 +1172,6 @@ bool SimplificationPass::CheckByteSwap(Instr* i) { } return false; } - bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) { // Run over the instructions and rename assigned variables: // v1 = v0 @@ -952,22 +1195,11 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) { while (block) { auto i = block->instr_head; while (i) { - uint32_t signature = i->opcode->signature; - if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V) { + i->VisitValueOperands([&result, i, this](Value* value, uint32_t idx) { bool modified = false; - i->set_src1(CheckValue(i->src1.value, modified)); + i->set_srcN(CheckValue(value, modified), idx); result |= modified; - } - if (GET_OPCODE_SIG_TYPE_SRC2(signature) == OPCODE_SIG_TYPE_V) { - bool modified = false; - i->set_src2(CheckValue(i->src2.value, modified)); - result |= modified; - } - if (GET_OPCODE_SIG_TYPE_SRC3(signature) == OPCODE_SIG_TYPE_V) { - bool modified = false; - i->set_src3(CheckValue(i->src3.value, modified)); - result |= modified; - } + }); i = i->next; } @@ -976,6 +1208,71 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) { return result; } +struct TruncateSimplifier { + TypeName type_from, type_to; + uint32_t sizeof_from, sizeof_to; + uint32_t bit_sizeof_from, bit_sizeof_to; + uint64_t typemask_from, typemask_to; + hir::HIRBuilder* builder; + hir::Instr* truncate_instr; + hir::Value* truncated_value; + hir::Instr* truncated_value_def; +}; +bool SimplificationPass::BackpropTruncations(hir::Instr* i, + hir::HIRBuilder* builder) { + if (i->opcode != &OPCODE_TRUNCATE_info) { + return false; + } + TypeName type_from = i->src1.value->type; + TypeName type_to = i->dest->type; + + uint32_t sizeof_from = static_cast(GetTypeSize(type_from)); + uint32_t sizeof_to = static_cast(GetTypeSize(type_to)); + + Instr* input_def = i->src1.value->GetDefSkipAssigns(); + if (!input_def) { + return false; + } + Opcode input_opc = input_def->opcode->num; + + if (input_opc == OPCODE_SHL && input_def->src2.value->IsConstant()) { + uint32_t src2_shift = input_def->src2.value->AsUint32(); + if (src2_shift < (sizeof_to * CHAR_BIT)) { + Value* truncated_preshift = + builder->Truncate(input_def->src1.value, type_to); + + truncated_preshift->def->MoveBefore(i); + i->Replace(&OPCODE_SHL_info, 0); + i->set_src1(truncated_preshift); + i->set_src2(input_def->src2.value); + return true; + } + } + if (input_opc == OPCODE_LOAD_CONTEXT) { + if (sizeof_from == 8 && sizeof_to == 4) { + Value* 
loadof = builder->LoadContext(input_def->src1.offset, INT32_TYPE); + loadof->def->MoveBefore(input_def); + i->Replace(&OPCODE_ASSIGN_info, 0); + i->set_src1(loadof); + return true; + } + } + + return false; +} +bool SimplificationPass::BackpropTruncations(hir::HIRBuilder* builder) { + bool result = false; + auto block = builder->first_block(); + while (block) { + auto i = block->instr_head; + while (i) { + result |= BackpropTruncations(i, builder); + i = i->next; + } + block = block->next; + } + return result; +} Value* SimplificationPass::CheckValue(Value* value, bool& result) { auto def = value->def; if (def && def->opcode == &OPCODE_ASSIGN_info) { diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.h b/src/xenia/cpu/compiler/passes/simplification_pass.h index d805ea27c..fe8de8474 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.h +++ b/src/xenia/cpu/compiler/passes/simplification_pass.h @@ -32,6 +32,8 @@ class SimplificationPass : public ConditionalGroupSubpass { bool SimplifyAssignments(hir::HIRBuilder* builder); hir::Value* CheckValue(hir::Value* value, bool& result); bool SimplifyBitArith(hir::HIRBuilder* builder); + bool BackpropTruncations(hir::Instr* i, hir::HIRBuilder* builder); + bool BackpropTruncations(hir::HIRBuilder* builder); // handle either or or xor with 0 bool CheckOrXorZero(hir::Instr* i); bool CheckOr(hir::Instr* i, hir::HIRBuilder* builder); @@ -44,6 +46,17 @@ class SimplificationPass : public ConditionalGroupSubpass { bool CheckSelect(hir::Instr* i, hir::HIRBuilder* builder); bool CheckScalarConstCmp(hir::Instr* i, hir::HIRBuilder* builder); bool CheckIsTrueIsFalse(hir::Instr* i, hir::HIRBuilder* builder); + bool CheckSHRByConst(hir::Instr* i, hir::HIRBuilder* builder, + hir::Value* variable, unsigned int shift); + + bool CheckSHR(hir::Instr* i, hir::HIRBuilder* builder); + bool CheckSAR(hir::Instr* i, hir::HIRBuilder* builder); + // called by CheckXor, handles transforming a 1 bit value xored against 1 + bool CheckBooleanXor1(hir::Instr* i, hir::HIRBuilder* builder, + hir::Value* xored); + bool CheckXorOfTwoBools(hir::Instr* i, hir::HIRBuilder* builder, + hir::Value* b1, hir::Value* b2); + // for rlwinm bool TryHandleANDROLORSHLSeq(hir::Instr* i, hir::HIRBuilder* builder); bool TransformANDROLORSHLSeq( diff --git a/src/xenia/cpu/hir/instr.cc b/src/xenia/cpu/hir/instr.cc index 4096d8e4a..118895719 100644 --- a/src/xenia/cpu/hir/instr.cc +++ b/src/xenia/cpu/hir/instr.cc @@ -14,38 +14,15 @@ namespace xe { namespace cpu { namespace hir { - -void Instr::set_src1(Value* value) { - if (src1.value == value) { +void Instr::set_srcN(Value* value, uint32_t idx) { + if (srcs[idx].value == value) { return; } - if (src1_use) { - src1.value->RemoveUse(src1_use); + if (srcs_use[idx]) { + srcs[idx].value->RemoveUse(srcs_use[idx]); } - src1.value = value; - src1_use = value ? value->AddUse(block->arena, this) : NULL; -} - -void Instr::set_src2(Value* value) { - if (src2.value == value) { - return; - } - if (src2_use) { - src2.value->RemoveUse(src2_use); - } - src2.value = value; - src2_use = value ? value->AddUse(block->arena, this) : NULL; -} - -void Instr::set_src3(Value* value) { - if (src3.value == value) { - return; - } - if (src3_use) { - src3.value->RemoveUse(src3_use); - } - src3.value = value; - src3_use = value ? value->AddUse(block->arena, this) : NULL; + srcs[idx].value = value; + srcs_use[idx] = value ? 
value->AddUse(block->arena, this) : nullptr; } void Instr::MoveBefore(Instr* other) { @@ -128,6 +105,81 @@ Instr* Instr::GetDestDefSkipAssigns() { } return current_def; } +Instr* Instr::GetDestDefTunnelMovs(unsigned int* tunnel_flags) { + unsigned int traversed_types = 0; + unsigned int in_flags = *tunnel_flags; + Instr* current_def = this; + + while (true) { + Opcode op = current_def->opcode->num; + + switch (op) { + case OPCODE_ASSIGN: { + if ((in_flags & MOVTUNNEL_ASSIGNS)) { + current_def = current_def->src1.value->def; + traversed_types |= MOVTUNNEL_ASSIGNS; + + } else { + goto exit_loop; + } + break; + } + case OPCODE_ZERO_EXTEND: { + if ((in_flags & MOVTUNNEL_MOVZX)) { + current_def = current_def->src1.value->def; + traversed_types |= MOVTUNNEL_MOVZX; + + } else { + goto exit_loop; + } + break; + } + case OPCODE_SIGN_EXTEND: { + if ((in_flags & MOVTUNNEL_MOVSX)) { + current_def = current_def->src1.value->def; + traversed_types |= MOVTUNNEL_MOVSX; + + } else { + goto exit_loop; + } + break; + } + case OPCODE_TRUNCATE: { + if ((in_flags & MOVTUNNEL_TRUNCATE)) { + current_def = current_def->src1.value->def; + traversed_types |= MOVTUNNEL_TRUNCATE; + + } else { + goto exit_loop; + } + break; + } + case OPCODE_AND: { + if ((in_flags & MOVTUNNEL_AND32FF)) { + auto [constant, nonconst] = + current_def->BinaryValueArrangeAsConstAndVar(); + if (!constant || constant->AsUint64() != 0xFFFFFFFF) { + goto exit_loop; + } + current_def = nonconst->def; + traversed_types |= MOVTUNNEL_AND32FF; + + } else { + goto exit_loop; + } + break; + } + default: + goto exit_loop; + } + if (!current_def) { + goto exit_loop; + } + } +exit_loop: + *tunnel_flags = traversed_types; + return current_def; +} } // namespace hir } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/hir/instr.h b/src/xenia/cpu/hir/instr.h index 1f09ee341..db3c78922 100644 --- a/src/xenia/cpu/hir/instr.h +++ b/src/xenia/cpu/hir/instr.h @@ -25,6 +25,14 @@ namespace hir { class Block; class Label; +// todo: better name +enum MovTunnel { + MOVTUNNEL_ASSIGNS = 1, + MOVTUNNEL_MOVZX = 2, + MOVTUNNEL_MOVSX = 4, + MOVTUNNEL_TRUNCATE = 8, + MOVTUNNEL_AND32FF = 16, // tunnel through and with 0xFFFFFFFF +}; class Instr { public: @@ -44,17 +52,28 @@ class Instr { } Op; Value* dest; - Op src1; - Op src2; - Op src3; + union { + struct { + Op src1; + Op src2; + Op src3; + }; + Op srcs[3]; + }; + union { + struct { + Value::Use* src1_use; + Value::Use* src2_use; + Value::Use* src3_use; + }; + Value::Use* srcs_use[3]; + }; + void set_srcN(Value* value, uint32_t idx); + void set_src1(Value* value) { set_srcN(value, 0); } - Value::Use* src1_use; - Value::Use* src2_use; - Value::Use* src3_use; + void set_src2(Value* value) { set_srcN(value, 1); } - void set_src1(Value* value); - void set_src2(Value* value); - void set_src3(Value* value); + void set_src3(Value* value) { set_srcN(value, 2); } void MoveBefore(Instr* other); void Replace(const OpcodeInfo* new_opcode, uint16_t new_flags); @@ -104,6 +123,8 @@ if both are constant, return nullptr, nullptr } Instr* GetDestDefSkipAssigns(); + Instr* GetDestDefTunnelMovs(unsigned int* tunnel_flags); + // returns [def op, constant] std::pair BinaryValueArrangeByDefOpAndConstant( const OpcodeInfo* op_ptr) { @@ -115,6 +136,28 @@ if both are constant, return nullptr, nullptr } return result; } + /* + Invokes the provided lambda callback on each operand that is a Value. 
Callback + is invoked with Value*, uint32_t index +*/ + template + void VisitValueOperands(TCallable&& call_for_values) { + uint32_t signature = opcode->signature; + + OpcodeSignatureType t_dest, t_src1, t_src2, t_src3; + + UnpackOpcodeSig(signature, t_dest, t_src1, t_src2, t_src3); + + if (t_src1 == OPCODE_SIG_TYPE_V) { + call_for_values(src1.value, 0); + } + if (t_src2 == OPCODE_SIG_TYPE_V) { + call_for_values(src2.value, 1); + } + if (t_src3 == OPCODE_SIG_TYPE_V) { + call_for_values(src3.value, 2); + } + } }; } // namespace hir diff --git a/src/xenia/cpu/hir/value.cc b/src/xenia/cpu/hir/value.cc index 211cd18f9..c4ebdeb2c 100644 --- a/src/xenia/cpu/hir/value.cc +++ b/src/xenia/cpu/hir/value.cc @@ -1798,6 +1798,13 @@ hir::Instr* Value::GetDefSkipAssigns() { return nullptr; } } +hir::Instr* Value::GetDefTunnelMovs(unsigned int* tunnel_flags) { + if (def) { + return def->GetDestDefTunnelMovs(tunnel_flags); + } else { + return nullptr; + } +} } // namespace hir } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/hir/value.h b/src/xenia/cpu/hir/value.h index 1d8963b64..84d121a26 100644 --- a/src/xenia/cpu/hir/value.h +++ b/src/xenia/cpu/hir/value.h @@ -598,6 +598,8 @@ class Value { void CountLeadingZeros(const Value* other); bool Compare(Opcode opcode, Value* other); hir::Instr* GetDefSkipAssigns(); + // tunnel_flags is updated to the kinds we actually traversed + hir::Instr* GetDefTunnelMovs(unsigned int* tunnel_flags); private: static bool CompareInt8(Opcode opcode, Value* a, Value* b); diff --git a/src/xenia/cpu/ppc/ppc_context.h b/src/xenia/cpu/ppc/ppc_context.h index 4acdaed3c..777ef568a 100644 --- a/src/xenia/cpu/ppc/ppc_context.h +++ b/src/xenia/cpu/ppc/ppc_context.h @@ -246,30 +246,7 @@ enum class PPCRegister { }; #pragma pack(push, 8) -typedef struct PPCContext_s { - // Must be stored at 0x0 for now. - // TODO(benvanik): find a nice way to describe this to the JIT. - ThreadState* thread_state; // 0x0 - // TODO(benvanik): this is getting nasty. Must be here. - uint8_t* virtual_membase; // 0x8 - - // Most frequently used registers first. - uint64_t lr; // 0x10 Link register - uint64_t ctr; // 0x18 Count register - uint64_t r[32]; // 0x20 General purpose registers - double f[32]; // 0x120 Floating-point registers - vec128_t v[128]; // 0x220 VMX128 vector registers - - // XER register: - // Split to make it easier to do individual updates. - uint8_t xer_ca; // 0xA20 - uint8_t xer_ov; // 0xA21 - uint8_t xer_so; // 0xA22 - - // Condition registers: - // These are split to make it easier to do DCE on unused stores. - uint64_t cr() const; - void set_cr(uint64_t value); +typedef struct alignas(64) PPCContext_s { union { uint32_t value; struct { @@ -395,6 +372,25 @@ typedef struct PPCContext_s { } bits; } fpscr; // Floating-point status and control register + // Most frequently used registers first. + + uint64_t r[32]; // 0x20 General purpose registers + uint64_t ctr; // 0x18 Count register + uint64_t lr; // 0x10 Link register + double f[32]; // 0x120 Floating-point registers + vec128_t v[128]; // 0x220 VMX128 vector registers + + // XER register: + // Split to make it easier to do individual updates. + uint8_t xer_ca; + uint8_t xer_ov; + uint8_t xer_so; + + // Condition registers: + // These are split to make it easier to do DCE on unused stores. 
+ uint64_t cr() const; + void set_cr(uint64_t value); + uint8_t vscr_sat; // uint32_t get_fprf() { @@ -425,7 +421,8 @@ typedef struct PPCContext_s { // Value of last reserved load uint64_t reserved_val; - + ThreadState* thread_state; + uint8_t* virtual_membase; static std::string GetRegisterName(PPCRegister reg); std::string GetStringFromValue(PPCRegister reg) const; void SetValueFromString(PPCRegister reg, std::string value); diff --git a/src/xenia/cpu/thread_state.cc b/src/xenia/cpu/thread_state.cc index 3816446fc..1383646e1 100644 --- a/src/xenia/cpu/thread_state.cc +++ b/src/xenia/cpu/thread_state.cc @@ -18,12 +18,50 @@ #include "xenia/cpu/processor.h" #include "xenia/xbox.h" - namespace xe { namespace cpu { thread_local ThreadState* thread_state_ = nullptr; +static void* AllocateContext() { + size_t granularity = xe::memory::allocation_granularity(); + for (unsigned pos32 = 0x40; pos32 < 8192; ++pos32) { + /* + we want our register which points to the context to have 0xE0000000 in + the low 32 bits, for checking for whether we need the 4k offset, but also + if we allocate starting from the page before we allow backends to index + negatively to get to their own backend specific data, which makes full + use of int8 displacement + + + the downside is we waste most of one granula and probably a fair bit of + the one starting at 0xE0 by using a direct virtual memory allocation + instead of malloc + */ + uintptr_t context_pre = + ((static_cast(pos32) << 32) | 0xE0000000) - granularity; + + void* p = memory::AllocFixed( + (void*)context_pre, granularity + sizeof(ppc::PPCContext), + memory::AllocationType::kReserveCommit, memory::PageAccess::kReadWrite); + if (p) { + return reinterpret_cast(p) + + granularity; // now we have a ctx ptr with the e0 constant in low, + // and one page allocated before it + } + } + + assert_always("giving up on allocating context, likely leaking contexts"); + return nullptr; +} + +static void FreeContext(void* ctx) { + char* true_start_of_ctx = &reinterpret_cast( + ctx)[-static_cast(xe::memory::allocation_granularity())]; + memory::DeallocFixed(true_start_of_ctx, 0, + memory::DeallocationType::kRelease); +} + ThreadState::ThreadState(Processor* processor, uint32_t thread_id, uint32_t stack_base, uint32_t pcr_address) : processor_(processor), @@ -38,7 +76,9 @@ ThreadState::ThreadState(Processor* processor, uint32_t thread_id, backend_data_ = processor->backend()->AllocThreadData(); // Allocate with 64b alignment. - context_ = memory::AlignedAlloc(64); + + context_ = reinterpret_cast(AllocateContext()); // memory::AlignedAlloc(64); + processor->backend()->InitializeBackendContext(context_); assert_true(((uint64_t)context_ & 0x3F) == 0); std::memset(context_, 0, sizeof(ppc::PPCContext)); @@ -62,8 +102,10 @@ ThreadState::~ThreadState() { if (thread_state_ == this) { thread_state_ = nullptr; } - - memory::AlignedFree(context_); + if (context_) { + FreeContext(reinterpret_cast(context_)); + } + // memory::AlignedFree(context_); } void ThreadState::Bind(ThreadState* thread_state) {
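The allocator above gives every PPCContext two properties the rest of the patch leans on: the low 32 bits of the context address are always 0xE0000000, which is what lets the memory sequences compare guest addresses against GetContextReg().cvt32() instead of an immediate, and the allocation granule reserved directly below the context is backend-private storage reachable with small negative displacements through the new Backend::InitializeBackendContext() hook. A minimal sketch of how a backend might use that hook; the BackendScratch layout is purely illustrative and not something the patch defines:

#include <cassert>
#include <cstdint>
#include <cstring>

// Hypothetical backend-private data kept in the granule below the context.
struct BackendScratch {
  uint64_t saved_host_rsp;
  uint64_t scratch[7];
};

static BackendScratch* GetBackendScratch(void* ctx) {
  // ctx points at the start of the PPCContext; the bytes immediately below it
  // belong to the backend, so a small negative displacement reaches them.
  return reinterpret_cast<BackendScratch*>(static_cast<uint8_t*>(ctx) -
                                           sizeof(BackendScratch));
}

// What an InitializeBackendContext() override might look like.
void InitializeBackendContextSketch(void* ctx) {
  // Low 32 bits of the context address are 0xE0000000 by construction.
  assert((reinterpret_cast<uintptr_t>(ctx) & 0xFFFFFFFFull) == 0xE0000000ull);
  std::memset(GetBackendScratch(ctx), 0, sizeof(BackendScratch));
}

In emitted code the same storage would then be addressable as ptr[GetContextReg() - disp8], which is the point of reserving it immediately below the context rather than allocating it separately.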