Merge pull request #47 from chrisps/canary_experimental
drastically reduce size of final generated code for rlwinm by adding …
This commit is contained in:
commit f8f6a20569
@@ -106,7 +106,16 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
#undef TEST_EMIT_FEATURE
  /*
    fix for an xbyak bug/omission: AMD CPUs are never checked for lzcnt.
    Fixed in the latest version of xbyak.
  */
  unsigned int data[4];
  Xbyak::util::Cpu::getCpuid(0x80000001, data);
  if (data[2] & (1U << 5)) {
    if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
      feature_flags_ |= kX64EmitLZCNT;
    }
  }
  if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
    bool is_zennish = cpu_.displayFamily >= 0x17;
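For readers unfamiliar with this CPUID leaf: bit 5 of ECX in leaf 0x80000001 is AMD's ABM/LZCNT feature bit, which is what the raw getCpuid call above tests. A minimal standalone sketch of the same query (x86-only; CpuHasLzcnt is a hypothetical helper, and the MSVC/GCC intrinsics differ as shown):

#include <cstdint>
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <cpuid.h>
#endif

// Hypothetical helper: returns true if the CPU reports LZCNT/ABM support via
// extended leaf 0x80000001, ECX bit 5 (the same bit checked above).
static bool CpuHasLzcnt() {
  unsigned int regs[4] = {0, 0, 0, 0};
#if defined(_MSC_VER)
  __cpuid(reinterpret_cast<int*>(regs), 0x80000001);
#else
  __get_cpuid(0x80000001, &regs[0], &regs[1], &regs[2], &regs[3]);
#endif
  return (regs[2] & (1U << 5)) != 0;  // regs[2] holds ECX
}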
@@ -2749,11 +2749,17 @@ struct AND_I32 : Sequence<AND_I32, I<OPCODE_AND, I32Op, I32Op, I32Op>> {
};
struct AND_I64 : Sequence<AND_I64, I<OPCODE_AND, I64Op, I64Op, I64Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    if (i.src2.is_constant && i.src2.constant() == 0xFFFFFFFF) {
      // special case for rlwinm codegen: AND with 0xFFFFFFFF is just a 32-bit
      // mov, which implicitly zeroes the upper 32 bits
      e.mov(((Reg64)i.dest).cvt32(), ((Reg64)i.src1).cvt32());
    } else {
      EmitAndXX<AND_I64, Reg64>(e, i);
    }
  }
};
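A quick aside on why that special case is a single instruction (my note, not part of the diff): on x86-64, writing a 32-bit register zeroes the upper 32 bits of the full register, so AND-ing with 0xFFFFFFFF reduces to a 32-bit mov. A trivial standalone check of the equivalent C semantics:

#include <cassert>
#include <cstdint>

// Equivalent C semantics of the emitted `mov dest32, src32`:
// the upper 32 bits of the result are zero.
static uint64_t and_low32(uint64_t src) {
  return src & 0xFFFFFFFFull;  // compilers emit a plain 32-bit mov for this
}

int main() {
  assert(and_low32(0x123456789ABCDEF0ull) == 0x9ABCDEF0ull);
  return 0;
}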
struct AND_V128 : Sequence<AND_V128, I<OPCODE_AND, V128Op, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    EmitCommutativeBinaryXmmOp(e, i,
                               [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
                                 e.vpand(dest, src1, src2);
@@ -9,8 +9,8 @@

#include "xenia/cpu/compiler/passes/simplification_pass.h"

#include "xenia/base/byte_order.h"
#include "xenia/base/profiling.h"

namespace xe {
namespace cpu {
namespace compiler {
@@ -29,11 +29,241 @@ SimplificationPass::~SimplificationPass() {}

bool SimplificationPass::Run(HIRBuilder* builder, bool& result) {
  result = false;
  result |= SimplifyBitArith(builder);
  result |= EliminateConversions(builder);
  result |= SimplifyAssignments(builder);
  return true;
}
// simplifications that apply to both OR and XOR
bool SimplificationPass::CheckOrXorZero(hir::Instr* i) {
  auto [constant_value, variable_value] = i->BinaryValueArrangeAsConstAndVar();

  if (constant_value && constant_value->IsConstantZero()) {
    i->Replace(&OPCODE_ASSIGN_info, 0);
    i->set_src1(variable_value);
    return true;
  }
  return false;
}
bool SimplificationPass::CheckOr(hir::Instr* i) { return CheckOrXorZero(i); }
bool SimplificationPass::CheckXor(hir::Instr* i) {
  if (CheckOrXorZero(i)) {
    return true;
  } else {
    uint64_t type_mask = GetScalarTypeMask(i->dest->type);

    auto [constant_value, variable_value] =
        i->BinaryValueArrangeAsConstAndVar();

    if (!constant_value) return false;

    if (constant_value->AsUint64() == type_mask) {
      i->Replace(&OPCODE_NOT_info, 0);
      i->set_src1(variable_value);
      return true;
    }
  }
  return false;
}
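A tiny illustration of the XOR rewrite above (mine, not from the commit): XOR-ing a value with the full type mask is the same as bitwise NOT, which is why the instruction is replaced with OPCODE_NOT:

#include <cstdint>

// x ^ all_ones == ~x for any fixed-width unsigned type.
static_assert((uint8_t)(0x5Au ^ 0xFFu) == (uint8_t)~0x5Au,
              "xor with the 8-bit type mask is NOT");
static_assert((0x12345678u ^ 0xFFFFFFFFu) == ~0x12345678u,
              "xor with the 32-bit type mask is NOT");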
bool SimplificationPass::Is1BitOpcode(hir::Opcode def_opcode) {
  return def_opcode >= OPCODE_IS_TRUE && def_opcode <= OPCODE_DID_SATURATE;
}
uint64_t SimplificationPass::GetScalarNZM(hir::Value* value, hir::Instr* def,
                                          uint64_t typemask,
                                          hir::Opcode def_opcode) {
  if (def_opcode == OPCODE_SHL) {
    hir::Value* shifted = def->src1.value;
    hir::Value* shiftby = def->src2.value;
    // todo: nzm shift
    if (shiftby->IsConstant()) {
      uint64_t shifted_nzm = GetScalarNZM(shifted);
      return shifted_nzm << shiftby->AsUint64();
    }
  } else if (def_opcode == OPCODE_SHR) {
    hir::Value* shifted = def->src1.value;
    hir::Value* shiftby = def->src2.value;
    // todo: nzm shift
    if (shiftby->IsConstant()) {
      uint64_t shifted_nzm = GetScalarNZM(shifted);
      return shifted_nzm >> shiftby->AsUint64();
    }
  }
  // todo: sha, check signbit
  else if (def_opcode == OPCODE_ROTATE_LEFT) {
    hir::Value* shifted = def->src1.value;
    hir::Value* shiftby = def->src2.value;
    // todo: nzm shift
    if (shiftby->IsConstant()) {
      uint64_t shifted_nzm = GetScalarNZM(shifted);
      return xe::rotate_left(shifted_nzm,
                             static_cast<uint8_t>(shiftby->AsUint64()));
    }
  } else if (def_opcode == OPCODE_XOR || def_opcode == OPCODE_OR) {
    return GetScalarNZM(def->src1.value) | GetScalarNZM(def->src2.value);
  } else if (def_opcode == OPCODE_NOT) {
    return typemask;
  } else if (def_opcode == OPCODE_ASSIGN) {
    return GetScalarNZM(def->src1.value);
  } else if (def_opcode == OPCODE_BYTE_SWAP) {
    uint64_t input_nzm = GetScalarNZM(def->src1.value);
    switch (GetTypeSize(def->dest->type)) {
      case 1:
        return input_nzm;
      case 2:
        return xe::byte_swap<unsigned short>(
            static_cast<unsigned short>(input_nzm));
      case 4:
        return xe::byte_swap<unsigned int>(
            static_cast<unsigned int>(input_nzm));
      case 8:
        return xe::byte_swap<unsigned long long>(input_nzm);
      default:
        xenia_assert(0);
        return typemask;
    }
  } else if (def_opcode == OPCODE_ZERO_EXTEND) {
    return GetScalarNZM(def->src1.value);
  } else if (def_opcode == OPCODE_TRUNCATE) {
    return GetScalarNZM(def->src1.value);  // caller will truncate by masking
  } else if (def_opcode == OPCODE_AND) {
    return GetScalarNZM(def->src1.value) & GetScalarNZM(def->src2.value);
  } else if (def_opcode == OPCODE_SELECT) {
    return GetScalarNZM(def->src2.value) | GetScalarNZM(def->src3.value);
  } else if (def_opcode == OPCODE_MIN) {
    /*
      the nzm would be that of the narrower operand, because if one value is
      capable of being much larger than the other it can never actually reach
      a value that is outside the range of the other value's nzm; that would
      make it not the minimum of the two.

      ahh, actually, we have to be careful about constants then... for now,
      just return the OR of both.
    */
    return GetScalarNZM(def->src2.value) | GetScalarNZM(def->src1.value);
  } else if (def_opcode == OPCODE_MAX) {
    return GetScalarNZM(def->src2.value) | GetScalarNZM(def->src1.value);
  } else if (Is1BitOpcode(def_opcode)) {
    return 1ULL;
  } else if (def_opcode == OPCODE_CAST) {
    return GetScalarNZM(def->src1.value);
  }

  return typemask;
}
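To make the NZM ("maybe non-zero mask") rules above concrete, here is a small standalone sketch using plain integers as stand-ins for hir::Value nodes (helper names are mine): SHL shifts the mask, AND intersects it, OR unions it, which is exactly the information CheckAnd exploits below.

#include <cassert>
#include <cstdint>

// Simplified stand-ins for the propagation rules above, operating directly on
// masks instead of hir::Value nodes.
static uint64_t NzmShl(uint64_t src_nzm, uint64_t amount) { return src_nzm << amount; }
static uint64_t NzmAnd(uint64_t a_nzm, uint64_t b_nzm) { return a_nzm & b_nzm; }
static uint64_t NzmOr(uint64_t a_nzm, uint64_t b_nzm) { return a_nzm | b_nzm; }

int main() {
  // An 8-bit value zero-extended to 64 bits can only have bits 0..7 set.
  uint64_t byte_nzm = 0xFFull;
  // Shifting it left by 8 moves the possibly-set bits to 8..15.
  uint64_t shifted_nzm = NzmShl(byte_nzm, 8);
  assert(shifted_nzm == 0xFF00ull);
  // AND narrows the mask, OR unions it.
  assert(NzmAnd(shifted_nzm, 0xF0F0ull) == 0xF000ull);
  assert(NzmOr(shifted_nzm, byte_nzm) == 0xFFFFull);
  return 0;
}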
uint64_t SimplificationPass::GetScalarNZM(hir::Value* value) {
  if (value->IsConstant()) {
    return value->AsUint64();
  }

  uint64_t default_return = GetScalarTypeMask(value->type);

  hir::Instr* def = value->def;
  if (!def) {
    return default_return;
  }
  return GetScalarNZM(value, def, default_return, def->opcode->num) &
         default_return;
}
bool SimplificationPass::CheckAnd(hir::Instr* i) {
retry_and_simplification:
  auto [constant_value, variable_value] = i->BinaryValueArrangeAsConstAndVar();
  if (!constant_value) return false;

  // todo: check if masking with a mask that covers all of the zero-extension
  // source
  uint64_t type_mask = GetScalarTypeMask(i->dest->type);
  // if masking with the entire width, the instruction is pointless, so turn it
  // into an assign

  if (constant_value->AsUint64() == type_mask) {
    i->Replace(&OPCODE_ASSIGN_info, 0);
    i->set_src1(variable_value);
    return true;
  }

  auto variable_def = variable_value->def;

  if (variable_def) {
    auto true_variable_def = variable_def->GetDestDefSkipAssigns();
    if (true_variable_def) {
      if (true_variable_def->opcode == &OPCODE_AND_info) {
        auto [variable_def_constant, variable_def_variable] =
            true_variable_def->BinaryValueArrangeAsConstAndVar();

        if (variable_def_constant) {
          // todo: check if masked with a mask that was a subset of the current
          // one and eliminate if so
          if (variable_def_constant->AsUint64() == constant_value->AsUint64()) {
            // we already masked the input with the same mask
            i->Replace(&OPCODE_ASSIGN_info, 0);
            i->set_src1(variable_value);
            return true;
          }
        }
      } else if (true_variable_def->opcode == &OPCODE_OR_info) {
        Value* or_left = true_variable_def->src1.value;
        Value* or_right = true_variable_def->src2.value;

        uint64_t left_nzm = GetScalarNZM(or_left);

        // use the other OR input instead of the OR output
        if ((constant_value->AsUint64() & left_nzm) == 0) {
          i->Replace(&OPCODE_AND_info, 0);
          i->set_src1(or_right);
          i->set_src2(constant_value);
          return true;
        }

        uint64_t right_nzm = GetScalarNZM(or_right);

        if ((constant_value->AsUint64() & right_nzm) == 0) {
          i->Replace(&OPCODE_AND_info, 0);
          i->set_src1(or_left);
          i->set_src2(constant_value);
          return true;
        }
      } else if (true_variable_def->opcode == &OPCODE_ROTATE_LEFT_info) {
        if (true_variable_def->src2.value->IsConstant()) {
          if (((type_mask << true_variable_def->src2.value->AsUint64()) &
               type_mask) ==
              constant_value->AsUint64()) {  // rotated bits are unused; convert
                                             // to shift if we are the only use
            if (true_variable_def->dest->use_head->next == nullptr) {
              // one use, convert to shift
              true_variable_def->opcode = &OPCODE_SHL_info;
              goto retry_and_simplification;
            }
          }
        }
      }
    }
  }

  return false;
}
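Two of the CheckAnd rewrites deserve a worked example (my own illustration, not emulator code): the OR-operand rule drops an OR input whose NZM does not intersect the AND mask, and the rotate rule treats a masked rotate-left as a plain shift when every wrapped-in bit is masked away.

#include <cassert>
#include <cstdint>

int main() {
  // OR-operand rule: if the AND mask shares no bits with one OR input's NZM,
  // that input cannot affect the result.
  uint64_t hi = 0xAB00ull;    // NZM 0xFF00: only bits 8..15 can be set
  uint64_t lo = 0x00CDull;    // NZM 0x00FF: only bits 0..7 can be set
  uint64_t mask = 0x00FFull;  // mask & 0xFF00 == 0, so `hi` is irrelevant
  assert(((hi | lo) & mask) == (lo & mask));

  // Rotate-left rule: if the mask discards every bit wrapped in from the top,
  // the rotate behaves exactly like a shift left.
  uint32_t x = 0xDEADBEEFu;
  uint32_t rot = (x << 8) | (x >> 24);  // rotate left by 8
  uint32_t rl_mask = 0xFFFFFF00u;       // == (type_mask << 8) & type_mask
  assert((rot & rl_mask) == ((x << 8) & rl_mask));
  return 0;
}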
bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) {
  bool result = false;
  auto block = builder->first_block();
  while (block) {
    auto i = block->instr_head;
    while (i) {
      // vector types use the same opcodes as scalar ones for AND/OR/XOR! we
      // don't handle these in our simplifications, so skip
      if (i->dest && i->dest->type != VEC128_TYPE) {
        if (i->opcode == &OPCODE_OR_info) {
          result |= CheckOr(i);
        } else if (i->opcode == &OPCODE_XOR_info) {
          result |= CheckXor(i);
        } else if (i->opcode == &OPCODE_AND_info) {
          result |= CheckAnd(i);
        }
      }
      i = i->next;
    }
    block = block->next;
  }
  return result;
}
bool SimplificationPass::EliminateConversions(HIRBuilder* builder) {
  // First, we check for truncates/extensions that can be skipped.
  // This generates some assignments which the second step then cleans up.
@@ -158,6 +388,7 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) {
        i->set_src3(CheckValue(i->src3.value, modified));
        result |= modified;
      }

      i = i->next;
    }
    block = block->next;
@@ -31,6 +31,19 @@ class SimplificationPass : public ConditionalGroupSubpass {

  bool SimplifyAssignments(hir::HIRBuilder* builder);
  hir::Value* CheckValue(hir::Value* value, bool& result);
  bool SimplifyBitArith(hir::HIRBuilder* builder);
  // handles either OR or XOR with 0
  bool CheckOrXorZero(hir::Instr* i);
  bool CheckOr(hir::Instr* i);
  bool CheckXor(hir::Instr* i);
  bool CheckAnd(hir::Instr* i);
  static bool Is1BitOpcode(hir::Opcode def_opcode);
  static uint64_t GetScalarNZM(hir::Value* value, hir::Instr* def,
                               uint64_t typemask, hir::Opcode def_opcode);
  // todo: use ValueMask
  // returns the maybe-nonzero mask for value (the mask of bits that may
  // possibly hold information)
  static uint64_t GetScalarNZM(hir::Value* value);
};

}  // namespace passes
@@ -114,7 +114,20 @@ void Instr::Remove() {
    block->instr_tail = prev;
  }
}
Instr* Instr::GetDestDefSkipAssigns() {
  Instr* current_def = this;

  while (current_def->opcode == &OPCODE_ASSIGN_info) {
    Instr* next_def = current_def->src1.value->def;

    if (!next_def) {
      return nullptr;
    }

    current_def = next_def;
  }
  return current_def;
}
}  // namespace hir
}  // namespace cpu
}  // namespace xe
@@ -59,6 +59,52 @@ class Instr {
  void MoveBefore(Instr* other);
  void Replace(const OpcodeInfo* new_opcode, uint16_t new_flags);
  void Remove();

  template <typename TPredicate>
  std::pair<Value*, Value*> BinaryValueArrangeByPredicateExclusive(
      TPredicate&& pred) {
    auto src1_value = src1.value;
    auto src2_value = src2.value;
    if (!src1_value || !src2_value) return {nullptr, nullptr};

    if (!opcode) return {nullptr, nullptr};  // impossible!

    // check that this is a binary opcode taking two values; we don't care
    // whether the dest is a value
    if (!IsOpcodeBinaryValue(opcode->signature)) return {nullptr, nullptr};

    if (pred(src1_value)) {
      if (pred(src2_value)) {
        return {nullptr, nullptr};
      } else {
        return {src1_value, src2_value};
      }
    } else if (pred(src2_value)) {
      return {src2_value, src1_value};
    } else {
      return {nullptr, nullptr};
    }
  }

  /*
    if src1 is constant and src2 is not, return [src1, src2]
    if src2 is constant and src1 is not, return [src2, src1]
    if neither is constant, return {nullptr, nullptr}
    if both are constant, return {nullptr, nullptr}
  */
  std::pair<Value*, Value*> BinaryValueArrangeAsConstAndVar() {
    return BinaryValueArrangeByPredicateExclusive(
        [](Value* value) { return value->IsConstant(); });
  }
  std::pair<Value*, Value*> BinaryValueArrangeByDefiningOpcode(
      const OpcodeInfo* op_ptr) {
    return BinaryValueArrangeByPredicateExclusive([op_ptr](Value* value) {
      return value->def && value->def->opcode == op_ptr;
    });
  }

  Instr* GetDestDefSkipAssigns();
};

}  // namespace hir
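The "exclusive" arrangement above is a small generic pattern: return the operands ordered as (matching, other) only when exactly one of them satisfies the predicate. A standalone sketch of the same idea on plain values (names are mine, not part of the header):

#include <cassert>
#include <optional>
#include <utility>

// Returns {matching, other} if exactly one of a, b satisfies pred; otherwise nothing.
template <typename T, typename TPredicate>
std::optional<std::pair<T, T>> ArrangeByPredicateExclusive(T a, T b, TPredicate pred) {
  bool pa = pred(a), pb = pred(b);
  if (pa == pb) return std::nullopt;  // none or both match: ambiguous, bail out
  return pa ? std::make_pair(a, b) : std::make_pair(b, a);
}

int main() {
  auto is_even = [](int v) { return v % 2 == 0; };
  assert(ArrangeByPredicateExclusive(3, 8, is_even)->first == 8);   // 8 matches, so it comes first
  assert(!ArrangeByPredicateExclusive(2, 8, is_even).has_value());  // both match: rejected
  return 0;
}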
@@ -347,6 +347,10 @@ enum OpcodeSignature {
#define GET_OPCODE_SIG_TYPE_SRC1(sig) (OpcodeSignatureType)((sig >> 3) & 0x7)
#define GET_OPCODE_SIG_TYPE_SRC2(sig) (OpcodeSignatureType)((sig >> 6) & 0x7)
#define GET_OPCODE_SIG_TYPE_SRC3(sig) (OpcodeSignatureType)((sig >> 9) & 0x7)
static bool IsOpcodeBinaryValue(uint32_t signature) {
  return (signature & ~(0x7)) ==
         ((OPCODE_SIG_TYPE_V << 3) | (OPCODE_SIG_TYPE_V << 6));
}

typedef struct {
  uint32_t flags;
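Per the GET_OPCODE_SIG_TYPE_* macros above, the signature packs the dest type in bits 0..2 and the source types in bits 3..5, 6..8 and 9..11, so masking off the low 3 bits and comparing against two value fields means "two value sources, no third source, any dest". A quick standalone check of that bit arithmetic (the enum values here are stand-ins, not the real OpcodeSignatureType):

#include <cassert>
#include <cstdint>

// Assumed stand-ins; the real OpcodeSignatureType lives in opcodes.h.
enum SigType : uint32_t { SIG_TYPE_X = 0, SIG_TYPE_V = 4 };

constexpr uint32_t MakeSig(uint32_t dest, uint32_t src1, uint32_t src2, uint32_t src3) {
  return dest | (src1 << 3) | (src2 << 6) | (src3 << 9);
}

int main() {
  // "Binary value" means: src1 and src2 are values, src3 is unused; the dest
  // field (low 3 bits) is ignored by the comparison.
  uint32_t sig = MakeSig(SIG_TYPE_V, SIG_TYPE_V, SIG_TYPE_V, SIG_TYPE_X);
  assert((sig & ~0x7u) == ((SIG_TYPE_V << 3) | (SIG_TYPE_V << 6)));
  return 0;
}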
@@ -57,6 +57,15 @@ inline size_t GetTypeSize(TypeName type_name) {
      return 0;
  }
}
inline uint64_t GetScalarTypeMask(TypeName type_name) {
  size_t mask_width = GetTypeSize(type_name);

  if (mask_width == 8) {
    return ~0ULL;
  } else {
    return (1ULL << (mask_width * CHAR_BIT)) - 1;
  }
}

enum ValueFlags {
  VALUE_IS_CONSTANT = (1 << 1),
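One note on the 8-byte special case (my comment, not the commit's): shifting a 64-bit integer by 64 is undefined behavior in C++, hence the separate ~0ULL branch. The resulting masks, for illustration:

#include <cstdint>

// mask_width in bytes -> type mask, same arithmetic as GetScalarTypeMask.
static_assert(((1ULL << (1 * 8)) - 1) == 0xFFull, "1-byte mask");
static_assert(((1ULL << (2 * 8)) - 1) == 0xFFFFull, "2-byte mask");
static_assert(((1ULL << (4 * 8)) - 1) == 0xFFFFFFFFull, "4-byte mask");
// 8-byte types take the ~0ULL branch: 1ULL << 64 would be undefined behavior.
static_assert(~0ULL == 0xFFFFFFFFFFFFFFFFull, "8-byte mask");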
@@ -68,6 +77,23 @@ struct RegAssignment {
  int32_t index;
};

struct ValueMask {
  uint64_t low;   // low 64 bits, usually for scalar values
  uint64_t high;  // high 64 bits, only used for vector types

  ValueMask(uint64_t _low, uint64_t _high) : low(_low), high(_high) {}

  ValueMask operator&(ValueMask other) const {
    return ValueMask{low & other.low, high & other.high};
  }
  ValueMask operator|(ValueMask other) const {
    return ValueMask{low | other.low, high | other.high};
  }
  ValueMask operator^(ValueMask other) const {
    return ValueMask{low ^ other.low, high ^ other.high};
  }
};

class Value {
 public:
  typedef struct Use_s {
@@ -1023,6 +1023,17 @@ int InstrEmit_rlwimix(PPCHIRBuilder& f, const InstrData& i) {
  }
  return 0;
}
static bool InstrCheck_rlx_only_needs_low(unsigned rotation, uint64_t mask) {
  uint32_t mask32 = static_cast<uint32_t>(mask);
  if (static_cast<uint64_t>(mask32) != mask) {
    return false;
  }
  uint32_t all_ones_32 = ~0U;
  all_ones_32 <<= rotation;

  return all_ones_32 == mask32;  // mask fits in 32 bits and discards every bit
                                 // that the rotation wraps into the low end
}

int InstrEmit_rlwinmx(PPCHIRBuilder& f, const InstrData& i) {
  // n <- SH
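To spell out what that predicate accepts (a worked example of mine): for rotation = 8, ~0u << 8 is 0xFFFFFF00, so a 32-bit mask equal to 0xFFFFFF00 passes, because the mask discards exactly the bits the rotation wraps into the low end; masks that keep wrapped bits or extend past 32 bits do not.

#include <cassert>
#include <cstdint>

// Same arithmetic as InstrCheck_rlx_only_needs_low, written inline:
// accept only masks that fit in 32 bits and equal ~0u << rotation.
static bool OnlyNeedsLow(unsigned rotation, uint64_t mask) {
  uint32_t mask32 = static_cast<uint32_t>(mask);
  if (static_cast<uint64_t>(mask32) != mask) return false;
  return (~0u << rotation) == mask32;
}

int main() {
  assert(OnlyNeedsLow(8, 0xFFFFFF00ull));     // e.g. rlwinm rd, rs, 8, 0, 23 (slwi-like)
  assert(!OnlyNeedsLow(8, 0x0000FF00ull));    // keeps only some of the shifted bits
  assert(!OnlyNeedsLow(8, 0xFFFFFF00FFull));  // mask does not fit in 32 bits
  return 0;
}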
@@ -1031,23 +1042,47 @@ int InstrEmit_rlwinmx(PPCHIRBuilder& f, const InstrData& i) {
  // RA <- r & m
  Value* v = f.LoadGPR(i.M.RT);
  unsigned rotation = i.M.SH;
  uint64_t m = XEMASK(i.M.MB + 32, i.M.ME + 32);

  // in uint32 range (so no register concat/truncate/zx needed) and no rotation
  if (m < (1ULL << 32) && (rotation == 0)) {
    v = f.And(v, f.LoadConstantUint64(m));
  }
  // masks out all the bits that are rotated in from the right, so just do a
  // shift + and. the and with 0xFFFFFFFF is done instead of a truncate/zx
  // because we have a special case for it in the emitters that will just do a
  // single insn (mov reg32, lowpartofreg64); otherwise we generate
  // significantly more code from setting up the opnds of the truncate/zx
  else if (InstrCheck_rlx_only_needs_low(rotation, m)) {
    // this path is taken for like 90% of all rlwinms
    v = f.And(f.Shl(v, rotation), f.LoadConstantUint64(0xFFFFFFFF));
  } else {
    // (x||x)
    // cs: changed this to mask with UINT32_MAX instead of doing the
    // truncate/extend; this generates better code in the backend and is easier
    // to do analysis on
    v = f.And(v, f.LoadConstantUint64(0xFFFFFFFF));
    v = f.Or(f.Shl(v, 32), v);

    // TODO(benvanik): optimize srwi
    // TODO(benvanik): optimize slwi
    // The compiler will generate a bunch of these for the special case of SH=0,
    // which seems to just select some bits and set cr0 for use with a branch.
    // We can detect this and do less work.
    if (i.M.SH) {
      v = f.RotateLeft(v, f.LoadConstantInt8(rotation));
    }
    // Compiler sometimes masks with 0xFFFFFFFF (identity) - avoid the work here
    // as our truncation/zero-extend does it for us.
    if (m != 0xFFFFFFFFFFFFFFFFull) {
      v = f.And(v, f.LoadConstantUint64(m));
    }
  }
  f.StoreGPR(i.M.RA, v);
  if (i.M.Rc) {
    f.UpdateCR(0, v);
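For readers unfamiliar with rlwinm: it rotates the low 32 bits of RS left by SH and ANDs the result with MASK(MB+32, ME+32). The (x||x) doubling in the fallback path makes a 64-bit rotate reproduce the 32-bit rotate in the low word; a small standalone check of that equivalence (not emulator code):

#include <cassert>
#include <cstdint>

static uint32_t rotl32(uint32_t x, unsigned n) {
  return n ? (x << n) | (x >> (32 - n)) : x;
}
static uint64_t rotl64(uint64_t x, unsigned n) {
  return n ? (x << n) | (x >> (64 - n)) : x;
}

int main() {
  uint64_t gpr = 0x0123456789ABCDEFull;
  unsigned sh = 5;
  // Duplicate the low word into the high word; a 64-bit rotate then keeps the
  // 32-bit rotate's result in the low 32 bits.
  uint32_t low = static_cast<uint32_t>(gpr);
  uint64_t doubled = (static_cast<uint64_t>(low) << 32) | low;
  assert(static_cast<uint32_t>(rotl64(doubled, sh)) == rotl32(low, sh));
  return 0;
}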