Drastically reduce the size of the final generated code for rlwinm by adding special paths for rotations of 0 and for masks that discard the rotated-in bits, and by using an AND with UINT_MAX (0xFFFFFFFF) instead of a truncate/zero-extend.
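As a rough illustration (not part of the patch; the helper name and standalone setup are mine, mirroring InstrCheck_rlx_only_needs_low from the diff below), the mask test that decides when a 32-bit rotate-and-mask can be lowered to a plain shift looks like this:

#include <cstdint>
#include <cstdio>

// True when `mask` fits in 32 bits and clears every bit that a left rotation
// by `rotation` would bring in from the top, so rol+and behaves like shl+and.
static bool RotatedBitsAreDiscarded(unsigned rotation, uint64_t mask) {
  uint32_t mask32 = static_cast<uint32_t>(mask);
  if (static_cast<uint64_t>(mask32) != mask) {
    return false;  // mask touches the upper 32 bits
  }
  return (~0u << rotation) == mask32;
}

int main() {
  // rlwinm rA, rS, 2, 0, 29 has mask 0xFFFFFFFC: the two rotated-in bits are
  // masked off, so a shift left by 2 plus the mask is enough.
  std::printf("%d\n", RotatedBitsAreDiscarded(2, 0xFFFFFFFCull));  // 1
  // A mask of 0x0000FFFF keeps the rotated-in bits, so the rotate is needed.
  std::printf("%d\n", RotatedBitsAreDiscarded(2, 0x0000FFFFull));  // 0
  return 0;
}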

Add a special case to INT64's EmitAnd for a UINT_MAX (0xFFFFFFFF) mask: when it is detected, emit a 32-bit to 32-bit mov to take advantage of implicit zero extension and register renaming.
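A minimal sketch of the equivalence being exploited (the assembly in the comments is what mainstream x86-64 compilers typically emit for these functions, not output from this emitter):

#include <cstdint>

// On x86-64, any write to a 32-bit register implicitly zeroes the upper 32
// bits of the full 64-bit register, so masking with 0xFFFFFFFF needs no
// explicit and/movzx at all.
uint64_t mask_low32_with_and(uint64_t value) {
  return value & 0xFFFFFFFFull;  // typically: mov eax, edi
}

uint64_t mask_low32_with_truncate_extend(uint64_t value) {
  return static_cast<uint64_t>(static_cast<uint32_t>(value));  // same: mov eax, edi
}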

Add a helper function to Instr for skipping assignment defs (GetDestDefSkipAssigns).
Add a helper function for checking whether an opcode's signature is a binary value operation (IsOpcodeBinaryValue).
Add several new optimizations to SimplificationPass, plus weak NZM (maybe-nonzero mask) calculation code (a better full evaluation of known zero/nonzero bits will be done later).
 List of optimizations (a small illustrative sketch follows this list):
  If a value is ANDed with a bitmask it was already masked against, reuse the previously masked value (this cuts out most of the FPSCR-update garbage, although it does cause a local variable to be allocated for the masked FPSCR, and it still repeatedly stores the masked value to the context).
  If masking a value that was ORed with another, check whether the mask only covers bits from one of the OR inputs; if so, replace the operand with the OR input that actually matters.
  If the only use of a rotate-left's output is an AND against a mask that discards the bits that were rotated in, change the opcode to SHIFT_LEFT.
  If masking against all ones, become an assign.
  If XOR or OR against 0, become an assign (additional FPSCR codegen cleanup).
  If XOR against all ones, become a NOT.
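The sketch below (standalone, with illustrative names rather than the pass's actual API) shows the NZM reasoning behind the OR-narrowing rule: if the AND mask shares no bits with one OR input's maybe-nonzero mask, that input cannot affect the result, so the AND can read the other input directly.

#include <cstdint>
#include <cstdio>

// If (and_mask & operand_nzm) == 0, the operand cannot contribute any set
// bits to (a | b) & and_mask, so the AND may use the other operand directly.
static bool OperandIsDeadUnderMask(uint64_t and_mask, uint64_t operand_nzm) {
  return (and_mask & operand_nzm) == 0;
}

int main() {
  // For (a | (b << 32)) & 0xFFFFFFFF, the shifted term can only have bits
  // 32..63 set, so the expression simplifies to a & 0xFFFFFFFF.
  uint64_t nzm_b_shl_32 = 0xFFFFFFFF00000000ull;
  std::printf("%d\n", OperandIsDeadUnderMask(0xFFFFFFFFull, nzm_b_shl_32));  // 1
  return 0;
}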
Add a direct CPUID check for LZCNT to x64_emitter: the version of xbyak we are using skips the LZCNT check on all non-Intel CPUs, which meant we were generating the much slower bit-scan path on AMD CPUs.
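For reference, a minimal standalone sketch of the same detection outside the emitter; LZCNT (AMD's ABM) is reported via CPUID leaf 0x80000001, ECX bit 5, and the intrinsic selection below is an assumption about the host compiler, not code from this patch.

#include <cstdio>
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <cpuid.h>
#endif

// LZCNT is advertised in CPUID.80000001H:ECX bit 5 on both AMD and Intel
// parts, independent of the vendor string.
static bool CpuHasLzcnt() {
#if defined(_MSC_VER)
  int regs[4];
  __cpuid(regs, 0x80000001);
  unsigned int ecx = static_cast<unsigned int>(regs[2]);
#else
  unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
  if (!__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx)) {
    return false;  // extended leaf not available
  }
#endif
  return (ecx & (1u << 5)) != 0;
}

int main() {
  std::printf("lzcnt supported: %s\n", CpuHasLzcnt() ? "yes" : "no");
  return 0;
}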
chss95cs@gmail.com 2022-06-25 09:58:13 -07:00
parent 2b3686f0e9
commit 327cc9eff5
9 changed files with 399 additions and 16 deletions


@@ -106,7 +106,16 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
#undef TEST_EMIT_FEATURE
/*
fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in latest version of xbyak
*/
unsigned int data[4];
Xbyak::util::Cpu::getCpuid(0x80000001, data);
if (data[2] & (1U << 5)) {
if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
feature_flags_ |= kX64EmitLZCNT;
}
}
if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
bool is_zennish = cpu_.displayFamily >= 0x17;


@@ -2749,11 +2749,17 @@ struct AND_I32 : Sequence<AND_I32, I<OPCODE_AND, I32Op, I32Op, I32Op>> {
};
struct AND_I64 : Sequence<AND_I64, I<OPCODE_AND, I64Op, I64Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (i.src2.is_constant && i.src2.constant() == 0xFFFFFFFF) {
// special case for rlwinm codegen
e.mov(((Reg64)i.dest).cvt32(), ((Reg64)i.src1).cvt32());
} else {
EmitAndXX<AND_I64, Reg64>(e, i);
}
}
};
struct AND_V128 : Sequence<AND_V128, I<OPCODE_AND, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
EmitCommutativeBinaryXmmOp(e, i,
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
e.vpand(dest, src1, src2);


@@ -9,8 +9,8 @@
#include "xenia/cpu/compiler/passes/simplification_pass.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/profiling.h" #include "xenia/base/profiling.h"
namespace xe { namespace xe {
namespace cpu { namespace cpu {
namespace compiler { namespace compiler {
@ -29,11 +29,241 @@ SimplificationPass::~SimplificationPass() {}
bool SimplificationPass::Run(HIRBuilder* builder, bool& result) { bool SimplificationPass::Run(HIRBuilder* builder, bool& result) {
result = false; result = false;
result |= SimplifyBitArith(builder);
result |= EliminateConversions(builder);
result |= SimplifyAssignments(builder);
return true;
}
// simplifications that apply to both or and xor
bool SimplificationPass::CheckOrXorZero(hir::Instr* i) {
auto [constant_value, variable_value] = i->BinaryValueArrangeAsConstAndVar();
if (constant_value && constant_value->IsConstantZero()) {
i->Replace(&OPCODE_ASSIGN_info, 0);
i->set_src1(variable_value);
return true;
}
return false;
}
bool SimplificationPass::CheckOr(hir::Instr* i) { return CheckOrXorZero(i); }
bool SimplificationPass::CheckXor(hir::Instr* i) {
if (CheckOrXorZero(i)) {
return true;
} else {
uint64_t type_mask = GetScalarTypeMask(i->dest->type);
auto [constant_value, variable_value] =
i->BinaryValueArrangeAsConstAndVar();
if (!constant_value) return false;
if (constant_value->AsUint64() == type_mask) {
i->Replace(&OPCODE_NOT_info, 0);
i->set_src1(variable_value);
return true;
}
}
return false;
}
bool SimplificationPass::Is1BitOpcode(hir::Opcode def_opcode) {
return def_opcode >= OPCODE_IS_TRUE && def_opcode <= OPCODE_DID_SATURATE;
}
uint64_t SimplificationPass::GetScalarNZM(hir::Value* value, hir::Instr* def,
uint64_t typemask,
hir::Opcode def_opcode) {
if (def_opcode == OPCODE_SHL) {
hir::Value* shifted = def->src1.value;
hir::Value* shiftby = def->src2.value;
// todo: nzm shift
if (shiftby->IsConstant()) {
uint64_t shifted_nzm = GetScalarNZM(shifted);
return shifted_nzm << shiftby->AsUint64();
}
} else if (def_opcode == OPCODE_SHR) {
hir::Value* shifted = def->src1.value;
hir::Value* shiftby = def->src2.value;
// todo: nzm shift
if (shiftby->IsConstant()) {
uint64_t shifted_nzm = GetScalarNZM(shifted);
return shifted_nzm >> shiftby->AsUint64();
}
}
// todo : sha, check signbit
else if (def_opcode == OPCODE_ROTATE_LEFT) {
hir::Value* shifted = def->src1.value;
hir::Value* shiftby = def->src2.value;
// todo: nzm shift
if (shiftby->IsConstant()) {
uint64_t shifted_nzm = GetScalarNZM(shifted);
return xe::rotate_left(shifted_nzm,
static_cast<uint8_t>(shiftby->AsUint64()));
}
} else if (def_opcode == OPCODE_XOR || def_opcode == OPCODE_OR) {
return GetScalarNZM(def->src1.value) | GetScalarNZM(def->src2.value);
} else if (def_opcode == OPCODE_NOT) {
return ~GetScalarNZM(def->src1.value);
} else if (def_opcode == OPCODE_ASSIGN) {
return GetScalarNZM(def->src1.value);
} else if (def_opcode == OPCODE_BYTE_SWAP) {
uint64_t input_nzm = GetScalarNZM(def->src1.value);
switch (GetTypeSize(def->dest->type)) {
case 1:
return input_nzm;
case 2:
return xe::byte_swap<unsigned short>(
static_cast<unsigned short>(input_nzm));
case 4:
return xe::byte_swap<unsigned int>(
static_cast<unsigned int>(input_nzm));
case 8:
return xe::byte_swap<unsigned long long>(input_nzm);
default:
xenia_assert(0);
return typemask;
}
} else if (def_opcode == OPCODE_ZERO_EXTEND) {
return GetScalarNZM(def->src1.value);
} else if (def_opcode == OPCODE_TRUNCATE) {
return GetScalarNZM(def->src1.value); // caller will truncate by masking
} else if (def_opcode == OPCODE_AND) {
return GetScalarNZM(def->src1.value) & GetScalarNZM(def->src2.value);
} else if (def_opcode == OPCODE_SELECT) {
return GetScalarNZM(def->src2.value) | GetScalarNZM(def->src3.value);
} else if (def_opcode == OPCODE_MIN) {
/*
the nzm will be that of the narrowest operand, because if one value is
capable of being much larger than the other it can never actually reach
a value that is outside the range of the other values nzm, because that
would make it not the minimum of the two
ahh, actually, we have to be careful about constants then.... for now,
just return or
*/
return GetScalarNZM(def->src2.value) | GetScalarNZM(def->src1.value);
} else if (def_opcode == OPCODE_MAX) {
return GetScalarNZM(def->src2.value) | GetScalarNZM(def->src1.value);
} else if (Is1BitOpcode(def_opcode)) {
return 1ULL;
} else if (def_opcode == OPCODE_CAST) {
return GetScalarNZM(def->src1.value);
}
return typemask;
}
uint64_t SimplificationPass::GetScalarNZM(hir::Value* value) {
if (value->IsConstant()) {
return value->AsUint64();
}
uint64_t default_return = GetScalarTypeMask(value->type);
hir::Instr* def = value->def;
if (!def) {
return default_return;
}
return GetScalarNZM(value, def, default_return, def->opcode->num) &
default_return;
}
bool SimplificationPass::CheckAnd(hir::Instr* i) {
retry_and_simplification:
auto [constant_value, variable_value] = i->BinaryValueArrangeAsConstAndVar();
if (!constant_value) return false;
// todo: check if masking with mask that covers all of zero extension source
uint64_t type_mask = GetScalarTypeMask(i->dest->type);
// if masking with entire width, pointless instruction so become an assign
if (constant_value->AsUint64() == type_mask) {
i->Replace(&OPCODE_ASSIGN_info, 0);
i->set_src1(variable_value);
return true;
}
auto variable_def = variable_value->def;
if (variable_def) {
auto true_variable_def = variable_def->GetDestDefSkipAssigns();
if (true_variable_def) {
if (true_variable_def->opcode == &OPCODE_AND_info) {
auto [variable_def_constant, variable_def_variable] =
true_variable_def->BinaryValueArrangeAsConstAndVar();
if (variable_def_constant) {
// todo: check if masked with mask that was a subset of the current
// one and elim if so
if (variable_def_constant->AsUint64() == constant_value->AsUint64()) {
// we already masked the input with the same mask
i->Replace(&OPCODE_ASSIGN_info, 0);
i->set_src1(variable_value);
return true;
}
}
} else if (true_variable_def->opcode == &OPCODE_OR_info) {
Value* or_left = true_variable_def->src1.value;
Value* or_right = true_variable_def->src2.value;
uint64_t left_nzm = GetScalarNZM(or_left);
// use the other or input instead of the or output
if ((constant_value->AsUint64() & left_nzm) == 0) {
i->Replace(&OPCODE_AND_info, 0);
i->set_src1(or_right);
i->set_src2(constant_value);
return true;
}
uint64_t right_nzm = GetScalarNZM(or_right);
if ((constant_value->AsUint64() & right_nzm) == 0) {
i->Replace(&OPCODE_AND_info, 0);
i->set_src1(or_left);
i->set_src2(constant_value);
return true;
}
} else if (true_variable_def->opcode == &OPCODE_ROTATE_LEFT_info) {
if (true_variable_def->src2.value->IsConstant()) {
if (((type_mask << true_variable_def->src2.value->AsUint64()) &
type_mask) ==
constant_value->AsUint64()) { // rotated bits are unused, convert
// to shift if we are the only use
if (true_variable_def->dest->use_head->next == nullptr) {
// one use, convert to shift
true_variable_def->opcode = &OPCODE_SHL_info;
goto retry_and_simplification;
}
}
}
}
}
}
return false;
}
bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) {
bool result = false;
auto block = builder->first_block();
while (block) {
auto i = block->instr_head;
while (i) {
// vector types use the same opcodes as scalar ones for AND/OR/XOR! we
// don't handle these in our simplifications, so skip
if (i->dest && i->dest->type != VEC128_TYPE) {
if (i->opcode == &OPCODE_OR_info) {
result |= CheckOr(i);
} else if (i->opcode == &OPCODE_XOR_info) {
result |= CheckXor(i);
} else if (i->opcode == &OPCODE_AND_info) {
result |= CheckAnd(i);
}
}
i = i->next;
}
block = block->next;
}
return result;
}
bool SimplificationPass::EliminateConversions(HIRBuilder* builder) {
// First, we check for truncates/extensions that can be skipped.
// This generates some assignments which then the second step will clean up.
@@ -158,6 +388,7 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) {
i->set_src3(CheckValue(i->src3.value, modified));
result |= modified;
}
i = i->next;
}
block = block->next;


@@ -31,6 +31,19 @@ class SimplificationPass : public ConditionalGroupSubpass {
bool SimplifyAssignments(hir::HIRBuilder* builder);
hir::Value* CheckValue(hir::Value* value, bool& result);
bool SimplifyBitArith(hir::HIRBuilder* builder);
// handle either or or xor with 0
bool CheckOrXorZero(hir::Instr* i);
bool CheckOr(hir::Instr* i);
bool CheckXor(hir::Instr* i);
bool CheckAnd(hir::Instr* i);
static bool Is1BitOpcode(hir::Opcode def_opcode);
static uint64_t GetScalarNZM(hir::Value* value, hir::Instr* def,
uint64_t typemask, hir::Opcode def_opcode);
// todo: use valuemask
// returns maybenonzeromask for value (mask of bits that may possibly hold
// information)
static uint64_t GetScalarNZM(hir::Value* value);
};
} // namespace passes


@@ -114,7 +114,20 @@ void Instr::Remove() {
block->instr_tail = prev;
}
}
Instr* Instr::GetDestDefSkipAssigns() {
Instr* current_def = this;
while (current_def->opcode == &OPCODE_ASSIGN_info) {
Instr* next_def = current_def->src1.value->def;
if (!next_def) {
return nullptr;
}
current_def = next_def;
}
return current_def;
}
} // namespace hir
} // namespace cpu
} // namespace xe


@@ -59,6 +59,52 @@ class Instr {
void MoveBefore(Instr* other);
void Replace(const OpcodeInfo* new_opcode, uint16_t new_flags);
void Remove();
template <typename TPredicate>
std::pair<Value*, Value*> BinaryValueArrangeByPredicateExclusive(
TPredicate&& pred) {
auto src1_value = src1.value;
auto src2_value = src2.value;
if (!src1_value || !src2_value) return {nullptr, nullptr};
if (!opcode) return {nullptr, nullptr}; // impossible!
// check if binary opcode taking two values. we dont care if the dest is a
// value
if (!IsOpcodeBinaryValue(opcode->signature)) return {nullptr, nullptr};
if (pred(src1_value)) {
if (pred(src2_value)) {
return {nullptr, nullptr};
} else {
return {src1_value, src2_value};
}
} else if (pred(src2_value)) {
return {src2_value, src1_value};
} else {
return {nullptr, nullptr};
}
}
/*
if src1 is constant, and src2 is not, return [src1, src2]
if src2 is constant, and src1 is not, return [src2, src1]
if neither is constant, return nullptr, nullptr
if both are constant, return nullptr, nullptr
*/
std::pair<Value*, Value*> BinaryValueArrangeAsConstAndVar() {
return BinaryValueArrangeByPredicateExclusive(
[](Value* value) { return value->IsConstant(); });
}
std::pair<Value*, Value*> BinaryValueArrangeByDefiningOpcode(
const OpcodeInfo* op_ptr) {
return BinaryValueArrangeByPredicateExclusive([op_ptr](Value* value) {
return value->def && value->def->opcode == op_ptr;
});
}
Instr* GetDestDefSkipAssigns();
};
} // namespace hir


@@ -347,6 +347,10 @@ enum OpcodeSignature {
#define GET_OPCODE_SIG_TYPE_SRC1(sig) (OpcodeSignatureType)((sig >> 3) & 0x7)
#define GET_OPCODE_SIG_TYPE_SRC2(sig) (OpcodeSignatureType)((sig >> 6) & 0x7)
#define GET_OPCODE_SIG_TYPE_SRC3(sig) (OpcodeSignatureType)((sig >> 9) & 0x7)
static bool IsOpcodeBinaryValue(uint32_t signature) {
return (signature & ~(0x7)) ==
((OPCODE_SIG_TYPE_V << 3) | (OPCODE_SIG_TYPE_V << 6));
}
typedef struct {
uint32_t flags;


@@ -57,6 +57,15 @@ inline size_t GetTypeSize(TypeName type_name) {
return 0;
}
}
inline uint64_t GetScalarTypeMask(TypeName type_name) {
size_t mask_width = GetTypeSize(type_name);
if (mask_width == 8) {
return ~0ULL;
} else {
return (1ULL << (mask_width * CHAR_BIT)) - 1;
}
}
enum ValueFlags {
VALUE_IS_CONSTANT = (1 << 1),
@@ -68,6 +77,23 @@ struct RegAssignment {
int32_t index;
};
struct ValueMask {
uint64_t low; // low 64 bits, usually for scalar values
uint64_t high; // high 64 bits, only used for vector types
ValueMask(uint64_t _low, uint64_t _high) : low(_low), high(_high) {}
ValueMask operator&(ValueMask other) const {
return ValueMask{low & other.low, high & other.high};
}
ValueMask operator|(ValueMask other) const {
return ValueMask{low | other.low, high | other.high};
}
ValueMask operator^(ValueMask other) const {
return ValueMask{low ^ other.low, high ^ other.high};
}
};
class Value {
public:
typedef struct Use_s {


@@ -1023,6 +1023,17 @@ int InstrEmit_rlwimix(PPCHIRBuilder& f, const InstrData& i) {
}
return 0;
}
static bool InstrCheck_rlx_only_needs_low(unsigned rotation, uint64_t mask) {
uint32_t mask32 = static_cast<uint32_t>(mask);
if (static_cast<uint64_t>(mask32) != mask) {
return false;
}
uint32_t all_ones_32 = ~0U;
all_ones_32 <<= rotation;
return all_ones_32 == mask32; // mask is only 32 bits and all bits from the
// rotation are discarded
}
int InstrEmit_rlwinmx(PPCHIRBuilder& f, const InstrData& i) {
// n <- SH
@@ -1031,8 +1042,32 @@ int InstrEmit_rlwinmx(PPCHIRBuilder& f, const InstrData& i) {
// RA <- r & m
Value* v = f.LoadGPR(i.M.RT);
unsigned rotation = i.M.SH;
uint64_t m = XEMASK(i.M.MB + 32, i.M.ME + 32);
// in uint32 range (so no register concat/truncate/zx needed) and no rotation
if (m < (1ULL << 32) && (rotation == 0)) {
v = f.And(v, f.LoadConstantUint64(m));
}
// masks out all the bits that are rotated in from the right, so just do a
// shift + and. the and with 0xFFFFFFFF is done instead of a truncate/zx
// because we have a special case for it in the emitters that will just do a
// single insn (mov reg32, lowpartofreg64), otherwise we generate
// significantly more code from setting up the opnds of the truncate/zx
else if (InstrCheck_rlx_only_needs_low(rotation, m)) {
// this path is taken for like 90% of all rlwinms
v = f.And(f.Shl(v, rotation), f.LoadConstantUint64(0xFFFFFFFF));
}
else {
// (x||x)
// cs: changed this to mask with UINT32_MAX instead of doing the
// truncate/extend, this generates better code in the backend and is easier
// to do analysis on
v = f.And(v, f.LoadConstantUint64(0xFFFFFFFF));
v = f.Or(f.Shl(v, 32), v);
// TODO(benvanik): optimize srwi
// TODO(benvanik): optimize slwi
@@ -1040,14 +1075,14 @@
// Which seems to just select some bits and set cr0 for use with a branch.
// We can detect this and do less work.
if (i.M.SH) {
v = f.RotateLeft(v, f.LoadConstantInt8(rotation));
}
// Compiler sometimes masks with 0xFFFFFFFF (identity) - avoid the work here
// as our truncation/zero-extend does it for us.
if (m != 0xFFFFFFFFFFFFFFFFull) {
v = f.And(v, f.LoadConstantUint64(m));
}
}
f.StoreGPR(i.M.RA, v);
if (i.M.Rc) {
f.UpdateCR(0, v);