diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index 1cf4dc416..3d9323af9 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -106,7 +106,16 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator) #undef TEST_EMIT_FEATURE - + /* + Workaround for an xbyak omission: AMD CPUs are never checked for LZCNT, so query CPUID leaf 0x80000001 directly. Fixed in the latest version of xbyak. +*/ + unsigned int data[4]; + Xbyak::util::Cpu::getCpuid(0x80000001, data); + if (data[2] & (1U << 5)) { + if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) { + feature_flags_ |= kX64EmitLZCNT; + } + } if (cpu_.has(Xbyak::util::Cpu::tAMD)) { bool is_zennish = cpu_.displayFamily >= 0x17; diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index b4b9a70e2..b647ff404 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -2749,11 +2749,17 @@ struct AND_I32 : Sequence> { }; struct AND_I64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitAndXX(e, i); + if (i.src2.is_constant && i.src2.constant() == 0xFFFFFFFF) { + // special case for rlwinm codegen: AND with 0xFFFFFFFF becomes a 32-bit mov + e.mov(((Reg64)i.dest).cvt32(), ((Reg64)i.src1).cvt32()); + } else { + EmitAndXX(e, i); + } } }; struct AND_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp(e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { e.vpand(dest, src1, src2); diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.cc b/src/xenia/cpu/compiler/passes/simplification_pass.cc index 3569887a4..3ab9276a6 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.cc +++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc @@ -9,8 +9,8 @@ #include "xenia/cpu/compiler/passes/simplification_pass.h" +#include "xenia/base/byte_order.h" #include "xenia/base/profiling.h" - namespace xe { namespace cpu { 
namespace compiler { @@ -29,11 +29,241 @@ SimplificationPass::~SimplificationPass() {} bool SimplificationPass::Run(HIRBuilder* builder, bool& result) { result = false; + result |= SimplifyBitArith(builder); result |= EliminateConversions(builder); result |= SimplifyAssignments(builder); return true; } +// simplifications that apply to both or and xor +bool SimplificationPass::CheckOrXorZero(hir::Instr* i) { + auto [constant_value, variable_value] = i->BinaryValueArrangeAsConstAndVar(); + if (constant_value && constant_value->IsConstantZero()) { + i->Replace(&OPCODE_ASSIGN_info, 0); + i->set_src1(variable_value); + return true; + } + return false; +} +bool SimplificationPass::CheckOr(hir::Instr* i) { return CheckOrXorZero(i); } +bool SimplificationPass::CheckXor(hir::Instr* i) { + if (CheckOrXorZero(i)) { + return true; + } else { + uint64_t type_mask = GetScalarTypeMask(i->dest->type); + + auto [constant_value, variable_value] = + i->BinaryValueArrangeAsConstAndVar(); + + if (!constant_value) return false; + + if (constant_value->AsUint64() == type_mask) { + i->Replace(&OPCODE_NOT_info, 0); + i->set_src1(variable_value); + return true; + } + } + return false; +} +bool SimplificationPass::Is1BitOpcode(hir::Opcode def_opcode) { + return def_opcode >= OPCODE_IS_TRUE && def_opcode <= OPCODE_DID_SATURATE; +} +uint64_t SimplificationPass::GetScalarNZM(hir::Value* value, hir::Instr* def, + + uint64_t typemask, + hir::Opcode def_opcode) { + if (def_opcode == OPCODE_SHL) { + hir::Value* shifted = def->src1.value; + hir::Value* shiftby = def->src2.value; + // todo: nzm shift + if (shiftby->IsConstant()) { + uint64_t shifted_nzm = GetScalarNZM(shifted); + return shifted_nzm << shiftby->AsUint64(); + } + } else if (def_opcode == OPCODE_SHR) { + hir::Value* shifted = def->src1.value; + hir::Value* shiftby = def->src2.value; + // todo: nzm shift + if (shiftby->IsConstant()) { + uint64_t shifted_nzm = GetScalarNZM(shifted); + return shifted_nzm >> shiftby->AsUint64(); + } + } + 
// todo : sha, check signbit + else if (def_opcode == OPCODE_ROTATE_LEFT) { + hir::Value* shifted = def->src1.value; + hir::Value* shiftby = def->src2.value; + // todo: nzm shift + if (shiftby->IsConstant()) { + uint64_t shifted_nzm = GetScalarNZM(shifted); + return xe::rotate_left(shifted_nzm, + static_cast(shiftby->AsUint64())); + } + } else if (def_opcode == OPCODE_XOR || def_opcode == OPCODE_OR) { + return GetScalarNZM(def->src1.value) | GetScalarNZM(def->src2.value); + } else if (def_opcode == OPCODE_NOT) { + return typemask; + } else if (def_opcode == OPCODE_ASSIGN) { + return GetScalarNZM(def->src1.value); + } else if (def_opcode == OPCODE_BYTE_SWAP) { + uint64_t input_nzm = GetScalarNZM(def->src1.value); + switch (GetTypeSize(def->dest->type)) { + case 1: + return input_nzm; + case 2: + return xe::byte_swap( + static_cast(input_nzm)); + + case 4: + return xe::byte_swap( + static_cast(input_nzm)); + case 8: + return xe::byte_swap(input_nzm); + default: + xenia_assert(0); + return typemask; + } + } else if (def_opcode == OPCODE_ZERO_EXTEND) { + return GetScalarNZM(def->src1.value); + } else if (def_opcode == OPCODE_TRUNCATE) { + return GetScalarNZM(def->src1.value); // caller will truncate by masking + } else if (def_opcode == OPCODE_AND) { + return GetScalarNZM(def->src1.value) & GetScalarNZM(def->src2.value); + } else if (def_opcode == OPCODE_SELECT) { + return GetScalarNZM(def->src2.value) | GetScalarNZM(def->src3.value); + } else if (def_opcode == OPCODE_MIN) { + /* + the nzm will be that of the narrowest operand, because if one value is + capable of being much larger than the other it can never actually reach + a value that is outside the range of the other values nzm, because that + would make it not the minimum of the two + + ahh, actually, we have to be careful about constants then.... 
for now, + just return or + */ + return GetScalarNZM(def->src2.value) | GetScalarNZM(def->src1.value); + } else if (def_opcode == OPCODE_MAX) { + return GetScalarNZM(def->src2.value) | GetScalarNZM(def->src1.value); + } else if (Is1BitOpcode(def_opcode)) { + return 1ULL; + } else if (def_opcode == OPCODE_CAST) { + return GetScalarNZM(def->src1.value); + } + + return typemask; +} +uint64_t SimplificationPass::GetScalarNZM(hir::Value* value) { + if (value->IsConstant()) { + return value->AsUint64(); + } + + uint64_t default_return = GetScalarTypeMask(value->type); + + hir::Instr* def = value->def; + if (!def) { + return default_return; + } + return GetScalarNZM(value, def, default_return, def->opcode->num) & + default_return; +} +bool SimplificationPass::CheckAnd(hir::Instr* i) { +retry_and_simplification: + auto [constant_value, variable_value] = i->BinaryValueArrangeAsConstAndVar(); + if (!constant_value) return false; + + // todo: check if masking with mask that covers all of zero extension source + uint64_t type_mask = GetScalarTypeMask(i->dest->type); + // if masking with entire width, pointless instruction so become an assign + + if (constant_value->AsUint64() == type_mask) { + i->Replace(&OPCODE_ASSIGN_info, 0); + i->set_src1(variable_value); + return true; + } + + auto variable_def = variable_value->def; + + if (variable_def) { + auto true_variable_def = variable_def->GetDestDefSkipAssigns(); + if (true_variable_def) { + if (true_variable_def->opcode == &OPCODE_AND_info) { + auto [variable_def_constant, variable_def_variable] = + true_variable_def->BinaryValueArrangeAsConstAndVar(); + + if (variable_def_constant) { + // todo: check if masked with mask that was a subset of the current + // one and elim if so + if (variable_def_constant->AsUint64() == constant_value->AsUint64()) { + // we already masked the input with the same mask + i->Replace(&OPCODE_ASSIGN_info, 0); + i->set_src1(variable_value); + return true; + } + } + } else if (true_variable_def->opcode 
== &OPCODE_OR_info) { + Value* or_left = true_variable_def->src1.value; + Value* or_right = true_variable_def->src2.value; + + uint64_t left_nzm = GetScalarNZM(or_left); + + // use the other or input instead of the or output + if ((constant_value->AsUint64() & left_nzm) == 0) { + i->Replace(&OPCODE_AND_info, 0); + i->set_src1(or_right); + i->set_src2(constant_value); + return true; + } + + uint64_t right_nzm = GetScalarNZM(or_right); + + if ((constant_value->AsUint64() & right_nzm) == 0) { + i->Replace(&OPCODE_AND_info, 0); + i->set_src1(or_left); + i->set_src2(constant_value); + return true; + } + } else if (true_variable_def->opcode == &OPCODE_ROTATE_LEFT_info) { + if (true_variable_def->src2.value->IsConstant()) { + if (((type_mask << true_variable_def->src2.value->AsUint64()) & + type_mask) == + constant_value->AsUint64()) { // rotated bits are unused, convert + // to shift if we are the only use + if (true_variable_def->dest->use_head->next == nullptr) { + // one use, convert to shift + true_variable_def->opcode = &OPCODE_SHL_info; + goto retry_and_simplification; + } + } + } + } + } + } + + return false; +} +bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) { + bool result = false; + auto block = builder->first_block(); + while (block) { + auto i = block->instr_head; + while (i) { + // vector types use the same opcodes as scalar ones for AND/OR/XOR! we + // don't handle these in our simplifications, so skip + if (i->dest && i->dest->type != VEC128_TYPE) { + if (i->opcode == &OPCODE_OR_info) { + result |= CheckOr(i); + } else if (i->opcode == &OPCODE_XOR_info) { + result |= CheckXor(i); + } else if (i->opcode == &OPCODE_AND_info) { + result |= CheckAnd(i); + } + } + i = i->next; + } + block = block->next; + } + return result; +} bool SimplificationPass::EliminateConversions(HIRBuilder* builder) { // First, we check for truncates/extensions that can be skipped. // This generates some assignments which then the second step will clean up. 
@@ -158,6 +388,7 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) { i->set_src3(CheckValue(i->src3.value, modified)); result |= modified; } + i = i->next; } block = block->next; diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.h b/src/xenia/cpu/compiler/passes/simplification_pass.h index 2ba6efad7..b2ef9bd95 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.h +++ b/src/xenia/cpu/compiler/passes/simplification_pass.h @@ -31,6 +31,19 @@ class SimplificationPass : public ConditionalGroupSubpass { bool SimplifyAssignments(hir::HIRBuilder* builder); hir::Value* CheckValue(hir::Value* value, bool& result); + bool SimplifyBitArith(hir::HIRBuilder* builder); + // handle either or or xor with 0 + bool CheckOrXorZero(hir::Instr* i); + bool CheckOr(hir::Instr* i); + bool CheckXor(hir::Instr* i); + bool CheckAnd(hir::Instr* i); + static bool Is1BitOpcode(hir::Opcode def_opcode); + static uint64_t GetScalarNZM(hir::Value* value, hir::Instr* def, + uint64_t typemask, hir::Opcode def_opcode); + // todo: use valuemask + // returns maybenonzeromask for value (mask of bits that may possibly hold + // information) + static uint64_t GetScalarNZM(hir::Value* value); }; } // namespace passes diff --git a/src/xenia/cpu/hir/instr.cc b/src/xenia/cpu/hir/instr.cc index 657dc5f53..4096d8e4a 100644 --- a/src/xenia/cpu/hir/instr.cc +++ b/src/xenia/cpu/hir/instr.cc @@ -114,7 +114,20 @@ void Instr::Remove() { block->instr_tail = prev; } } +Instr* Instr::GetDestDefSkipAssigns() { + Instr* current_def = this; + while (current_def->opcode == &OPCODE_ASSIGN_info) { + Instr* next_def = current_def->src1.value->def; + + if (!next_def) { + return nullptr; + } + + current_def = next_def; + } + return current_def; +} } // namespace hir } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/hir/instr.h b/src/xenia/cpu/hir/instr.h index 67fe47ede..035be44d8 100644 --- a/src/xenia/cpu/hir/instr.h +++ b/src/xenia/cpu/hir/instr.h @@ -59,6 +59,52 @@ class 
Instr { void MoveBefore(Instr* other); void Replace(const OpcodeInfo* new_opcode, uint16_t new_flags); void Remove(); + + template + std::pair BinaryValueArrangeByPredicateExclusive( + TPredicate&& pred) { + auto src1_value = src1.value; + auto src2_value = src2.value; + if (!src1_value || !src2_value) return {nullptr, nullptr}; + + if (!opcode) return {nullptr, nullptr}; // impossible! + + // check if binary opcode taking two values. we dont care if the dest is a + // value + + if (!IsOpcodeBinaryValue(opcode->signature)) return {nullptr, nullptr}; + + if (pred(src1_value)) { + if (pred(src2_value)) { + return {nullptr, nullptr}; + } else { + return {src1_value, src2_value}; + } + } else if (pred(src2_value)) { + return {src2_value, src1_value}; + } else { + return {nullptr, nullptr}; + } + } + + /* +if src1 is constant, and src2 is not, return [src1, src2] +if src2 is constant, and src1 is not, return [src2, src1] +if neither is constant, return nullptr, nullptr +if both are constant, return nullptr, nullptr +*/ + std::pair BinaryValueArrangeAsConstAndVar() { + return BinaryValueArrangeByPredicateExclusive( + [](Value* value) { return value->IsConstant(); }); + } + std::pair BinaryValueArrangeByDefiningOpcode( + const OpcodeInfo* op_ptr) { + return BinaryValueArrangeByPredicateExclusive([op_ptr](Value* value) { + return value->def && value->def->opcode == op_ptr; + }); + } + + Instr* GetDestDefSkipAssigns(); }; } // namespace hir diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h index 6f45bb8da..8e681c757 100644 --- a/src/xenia/cpu/hir/opcodes.h +++ b/src/xenia/cpu/hir/opcodes.h @@ -347,6 +347,10 @@ enum OpcodeSignature { #define GET_OPCODE_SIG_TYPE_SRC1(sig) (OpcodeSignatureType)((sig >> 3) & 0x7) #define GET_OPCODE_SIG_TYPE_SRC2(sig) (OpcodeSignatureType)((sig >> 6) & 0x7) #define GET_OPCODE_SIG_TYPE_SRC3(sig) (OpcodeSignatureType)((sig >> 9) & 0x7) +static bool IsOpcodeBinaryValue(uint32_t signature) { + return (signature & ~(0x7)) == + 
((OPCODE_SIG_TYPE_V << 3) | (OPCODE_SIG_TYPE_V << 6)); +} typedef struct { uint32_t flags; diff --git a/src/xenia/cpu/hir/value.h b/src/xenia/cpu/hir/value.h index e3426a816..94a8c5965 100644 --- a/src/xenia/cpu/hir/value.h +++ b/src/xenia/cpu/hir/value.h @@ -57,6 +57,15 @@ inline size_t GetTypeSize(TypeName type_name) { return 0; } } +inline uint64_t GetScalarTypeMask(TypeName type_name) { + size_t mask_width = GetTypeSize(type_name); + + if (mask_width == 8) { + return ~0ULL; + } else { + return (1ULL << (mask_width * CHAR_BIT)) - 1; + } +} enum ValueFlags { VALUE_IS_CONSTANT = (1 << 1), @@ -68,6 +77,23 @@ struct RegAssignment { int32_t index; }; +struct ValueMask { + uint64_t low; // low 64 bits, usually for scalar values + uint64_t high; // high 64 bits, only used for vector types + + ValueMask(uint64_t _low, uint64_t _high) : low(_low), high(_high) {} + + ValueMask operator&(ValueMask other) const { + return ValueMask{low & other.low, high & other.high}; + } + ValueMask operator|(ValueMask other) const { + return ValueMask{low | other.low, high | other.high}; + } + ValueMask operator^(ValueMask other) const { + return ValueMask{low ^ other.low, high ^ other.high}; + } +}; + class Value { public: typedef struct Use_s { diff --git a/src/xenia/cpu/ppc/ppc_emit_alu.cc b/src/xenia/cpu/ppc/ppc_emit_alu.cc index 81cd2a05b..7139e4c50 100644 --- a/src/xenia/cpu/ppc/ppc_emit_alu.cc +++ b/src/xenia/cpu/ppc/ppc_emit_alu.cc @@ -1023,6 +1023,17 @@ int InstrEmit_rlwimix(PPCHIRBuilder& f, const InstrData& i) { } return 0; } +static bool InstrCheck_rlx_only_needs_low(unsigned rotation, uint64_t mask) { + uint32_t mask32 = static_cast(mask); + if (static_cast(mask32) != mask) { + return false; + } + uint32_t all_ones_32 = ~0U; + all_ones_32 <<= rotation; + + return all_ones_32 == mask32; // mask is only 32 bits and all bits from the + // rotation are discarded +} int InstrEmit_rlwinmx(PPCHIRBuilder& f, const InstrData& i) { // n <- SH @@ -1031,23 +1042,47 @@ int 
InstrEmit_rlwinmx(PPCHIRBuilder& f, const InstrData& i) { // RA <- r & m Value* v = f.LoadGPR(i.M.RT); - // (x||x) - v = f.Or(f.Shl(v, 32), f.ZeroExtend(f.Truncate(v, INT32_TYPE), INT64_TYPE)); + unsigned rotation = i.M.SH; - // TODO(benvanik): optimize srwi - // TODO(benvanik): optimize slwi - // The compiler will generate a bunch of these for the special case of SH=0. - // Which seems to just select some bits and set cr0 for use with a branch. - // We can detect this and do less work. - if (i.M.SH) { - v = f.RotateLeft(v, f.LoadConstantInt8(i.M.SH)); - } - // Compiler sometimes masks with 0xFFFFFFFF (identity) - avoid the work here - // as our truncation/zero-extend does it for us. uint64_t m = XEMASK(i.M.MB + 32, i.M.ME + 32); - if (m != 0xFFFFFFFFFFFFFFFFull) { + + // mask fits in 32 bits and rotation is 0: no concat/truncate/zero-extend + if (m < (1ULL << 32) && (rotation == 0)) { v = f.And(v, f.LoadConstantUint64(m)); } + // the mask clears every bit that the rotation wraps around, so a plain + // shift + and is enough. the and with 0xFFFFFFFF is used instead of a + // truncate/zero-extend because the emitters special-case it into a single + // instruction (mov reg32, low half of reg64); otherwise significantly more + // code is generated to set up the operands of the truncate/zero-extend + else if (InstrCheck_rlx_only_needs_low(rotation, m)) { + // this path is taken for roughly 90% of all rlwinm instructions + v = f.And(f.Shl(v, rotation), f.LoadConstantUint64(0xFFFFFFFF)); + } + + else { + // (x||x) + // cs: mask with UINT32_MAX instead of doing the truncate/zero-extend; this + // generates better code in the backend and is easier for later passes + // to analyze + v = f.And(v, f.LoadConstantUint64(0xFFFFFFFF)); + + v = f.Or(f.Shl(v, 32), v); + + // TODO(benvanik): optimize srwi + // TODO(benvanik): optimize slwi + // The compiler will generate a bunch of these for the special case of SH=0. + // Which seems to just select some bits and set cr0 for use with a branch. 
+ // We can detect this and do less work. + if (i.M.SH) { + v = f.RotateLeft(v, f.LoadConstantInt8(rotation)); + } + // An all-ones 64-bit mask is the identity, so skip the AND entirely in that + // case; any other mask (including 0xFFFFFFFF) is still applied. + if (m != 0xFFFFFFFFFFFFFFFFull) { + v = f.And(v, f.LoadConstantUint64(m)); + } + } f.StoreGPR(i.M.RA, v); if (i.M.Rc) { f.UpdateCR(0, v);