From 71c5f8f0fae29c417f5eac3e61ffc5002ba574a2 Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com" <chss95cs@gmail.com>
Date: Thu, 14 Jul 2022 14:32:14 -0700
Subject: [PATCH] Optimized GetScalarNZM, add limit to how far it can recurse.
 Add rlwinm elimination rule

---
 .../compiler/passes/simplification_pass.cc    | 398 ++++++++++++++----
 .../cpu/compiler/passes/simplification_pass.h |  23 +-
 src/xenia/cpu/hir/value.cc                    |   9 +-
 src/xenia/cpu/hir/value.h                     |   1 +
 4 files changed, 341 insertions(+), 90 deletions(-)

diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.cc b/src/xenia/cpu/compiler/passes/simplification_pass.cc
index 5ac7a17de..10862fd54 100644
--- a/src/xenia/cpu/compiler/passes/simplification_pass.cc
+++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc
@@ -178,103 +178,329 @@ bool SimplificationPass::CheckXor(hir::Instr* i, hir::HIRBuilder* builder) {
 bool SimplificationPass::Is1BitOpcode(hir::Opcode def_opcode) {
   return def_opcode >= OPCODE_IS_TRUE && def_opcode <= OPCODE_DID_SATURATE;
 }
-uint64_t SimplificationPass::GetScalarNZM(hir::Value* value, hir::Instr* def,
-                                          uint64_t typemask,
-                                          hir::Opcode def_opcode) {
-  if (def_opcode == OPCODE_SHL) {
-    hir::Value* shifted = def->src1.value;
-    hir::Value* shiftby = def->src2.value;
-    // todo: nzm shift
-    if (shiftby->IsConstant()) {
-      uint64_t shifted_nzm = GetScalarNZM(shifted);
-      return shifted_nzm << shiftby->AsUint64();
-    }
-  } else if (def_opcode == OPCODE_SHR) {
-    hir::Value* shifted = def->src1.value;
-    hir::Value* shiftby = def->src2.value;
-    // todo: nzm shift
-    if (shiftby->IsConstant()) {
-      uint64_t shifted_nzm = GetScalarNZM(shifted);
-      return shifted_nzm >> shiftby->AsUint64();
-    }
+inline static uint64_t RotateOfSize(ScalarNZM nzm, unsigned rotation,
+                                    TypeName typ) {
+  unsigned mask = (static_cast<unsigned>(GetTypeSize(typ)) * CHAR_BIT) - 1;
+
+  rotation &= mask;
+  return (nzm << rotation) |
+         (nzm >>
+          ((static_cast<unsigned>(-static_cast<int>(rotation))) & mask));
+}
+
+static inline uint64_t BswapOfSize(ScalarNZM nzm, TypeName typ) {
+  // should work for all sizes, including uint8
+
+  return xe::byte_swap(nzm) >> (64 - (GetTypeSize(typ) * CHAR_BIT));
+}
+
+ScalarNZM SimplificationPass::GetScalarNZM(const hir::Value* value,
+                                           unsigned depth) {
+redo:
+
+  uint64_t typemask = GetScalarTypeMask(value->type);
+  if (value->IsConstant()) {
+    return value->constant.u64 & typemask;
   }
-  // todo : sha, check signbit
-  else if (def_opcode == OPCODE_ROTATE_LEFT) {
-    hir::Value* shifted = def->src1.value;
-    hir::Value* shiftby = def->src2.value;
-    // todo: nzm shift
-    if (shiftby->IsConstant()) {
-      uint64_t shifted_nzm = GetScalarNZM(shifted);
-      return xe::rotate_left(shifted_nzm,
-                             static_cast<uint8_t>(shiftby->AsUint64()));
+  if (depth >= MAX_NZM_DEPTH) {
+    return typemask;
+  }
+  const hir::Instr* def = value->def;
+  if (!def) {
+    return typemask;
+  }
+
+  const hir::Value *def_src1 = def->src1.value, *def_src2 = def->src2.value,
+                   *def_src3 = def->src3.value;
+
+  const hir::Opcode def_opcode = def->opcode->num;
+  auto RecurseScalarNZM = [depth](const hir::Value* value) {
+    return GetScalarNZM(value, depth + 1);
+  };
+
+  switch (def_opcode) {
+    case OPCODE_SHL: {
+      const hir::Value* shifted = def_src1;
+      const hir::Value* shiftby = def_src2;
+      // todo: nzm shift
+      if (shiftby->IsConstant()) {
+        ScalarNZM shifted_nzm = RecurseScalarNZM(shifted);
+        return (shifted_nzm << shiftby->constant.u8) & typemask;
+      }
+      break;
+    }
-  } else if (def_opcode == OPCODE_XOR || def_opcode == OPCODE_OR) {
-    return GetScalarNZM(def->src1.value) | GetScalarNZM(def->src2.value);
-  } else if
(def_opcode == OPCODE_NOT) { - return typemask; //~GetScalarNZM(def->src1.value); - } else if (def_opcode == OPCODE_ASSIGN) { - return GetScalarNZM(def->src1.value); - } else if (def_opcode == OPCODE_BYTE_SWAP) { - uint64_t input_nzm = GetScalarNZM(def->src1.value); - switch (GetTypeSize(def->dest->type)) { - case 1: + case OPCODE_SHR: { + const hir::Value* shifted = def_src1; + const hir::Value* shiftby = def_src2; + // todo: nzm shift + if (shiftby->IsConstant()) { + ScalarNZM shifted_nzm = RecurseScalarNZM(shifted); + return shifted_nzm >> shiftby->constant.u8; + } + break; + } + case OPCODE_SHA: { + uint64_t signmask = GetScalarSignbitMask(value->type); + const hir::Value* shifted = def_src1; + const hir::Value* shiftby = def_src2; + + if (shiftby->IsConstant()) { + ScalarNZM shifted_nzm = RecurseScalarNZM(shifted); + uint32_t shift_amnt = shiftby->constant.u8; + if ((shifted_nzm & signmask) == 0) { + return shifted_nzm >> shift_amnt; + } else { + uint64_t sign_shifted_in = typemask >> shift_amnt; // vacate + sign_shifted_in = (~sign_shifted_in) & + typemask; // generate duplication of the signbit + + return ((shifted_nzm >> shift_amnt) | sign_shifted_in) & + typemask; // todo: mask with typemask might not be needed + } + } + break; + } + case OPCODE_ROTATE_LEFT: { + const hir::Value* shifted = def_src1; + const hir::Value* shiftby = def_src2; + // todo: nzm shift + if (shiftby->IsConstant()) { + ScalarNZM rotated_nzm = RecurseScalarNZM(shifted); + return RotateOfSize(rotated_nzm, shiftby->constant.u8, value->type); + } + break; + } + + case OPCODE_OR: + case OPCODE_XOR: + + return RecurseScalarNZM(def_src1) | RecurseScalarNZM(def_src2); + case OPCODE_NOT: + return typemask; //~GetScalarNZM(def->src1.value); + case OPCODE_ASSIGN: + value = def_src1; + goto redo; + case OPCODE_BYTE_SWAP: + return BswapOfSize(RecurseScalarNZM(def_src1), value->type); + case OPCODE_ZERO_EXTEND: + value = def_src1; + goto redo; + case OPCODE_TRUNCATE: + return RecurseScalarNZM(def_src1) & typemask; + + case OPCODE_AND: + return RecurseScalarNZM(def_src1) & RecurseScalarNZM(def_src2); + + case OPCODE_MIN: + /* + * for min: + the nzm will be that of the narrowest operand, because if one value + is capable of being much larger than the other it can never actually + reach a value that is outside the range of the other values nzm, + because that would make it not the minimum of the two + + ahh, actually, we have to be careful about constants then.... 
+          for now, just return or
+       */
+    case OPCODE_MAX:
+      return RecurseScalarNZM(def_src1) | RecurseScalarNZM(def_src2);
+    case OPCODE_SELECT:
+      return RecurseScalarNZM(def_src2) | RecurseScalarNZM(def_src3);
+
+    case OPCODE_CNTLZ: {
+      size_t type_bit_size = (CHAR_BIT * GetTypeSize(def_src1->type));
+      // if 0, ==type_bit_size
+      return type_bit_size | ((type_bit_size)-1);
+    }
+    case OPCODE_SIGN_EXTEND: {
+      ScalarNZM input_nzm = RecurseScalarNZM(def_src1);
+      if ((input_nzm & GetScalarSignbitMask(def_src1->type)) == 0) {
         return input_nzm;
-      case 2:
-        return xe::byte_swap(
-            static_cast<uint16_t>(input_nzm));
-      case 4:
-        return xe::byte_swap(
-            static_cast<uint32_t>(input_nzm));
-      case 8:
-        return xe::byte_swap(input_nzm);
-      default:
-        xenia_assert(0);
-        return typemask;
+        // return input_nzm;
+      } else {
+        uint64_t from_mask = GetScalarTypeMask(def_src1->type);
+        uint64_t to_mask = typemask;
+        to_mask &= ~from_mask;
+        to_mask |= input_nzm;
+        return to_mask & typemask;
+      }
+      break;
+    }
-  } else if (def_opcode == OPCODE_ZERO_EXTEND) {
-    return GetScalarNZM(def->src1.value);
-  } else if (def_opcode == OPCODE_TRUNCATE) {
-    return GetScalarNZM(def->src1.value);  // caller will truncate by masking
-  } else if (def_opcode == OPCODE_AND) {
-    return GetScalarNZM(def->src1.value) & GetScalarNZM(def->src2.value);
-  } else if (def_opcode == OPCODE_SELECT) {
-    return GetScalarNZM(def->src2.value) | GetScalarNZM(def->src3.value);
-  } else if (def_opcode == OPCODE_MIN) {
-    /*
-        the nzm will be that of the narrowest operand, because if one value is
-        capable of being much larger than the other it can never actually reach
-        a value that is outside the range of the other values nzm, because that
-        would make it not the minimum of the two
-
-        ahh, actually, we have to be careful about constants then.... for now,
-        just return or
-    */
-    return GetScalarNZM(def->src2.value) | GetScalarNZM(def->src1.value);
-  } else if (def_opcode == OPCODE_MAX) {
-    return GetScalarNZM(def->src2.value) | GetScalarNZM(def->src1.value);
-  } else if (Is1BitOpcode(def_opcode)) {
-    return 1ULL;
-  } else if (def_opcode == OPCODE_CAST) {
-    return GetScalarNZM(def->src1.value);
   }
+  if (Is1BitOpcode(def_opcode)) {
+    return 1ULL;
+  }
   return typemask;
 }
-uint64_t SimplificationPass::GetScalarNZM(hir::Value* value) {
-  if (value->IsConstant()) {
-    return value->AsUint64();
+
+bool SimplificationPass::TransformANDROLORSHLSeq(
+    hir::Instr* i,  // the final instr in the rlwinm sequence (the AND)
+    hir::HIRBuilder* builder,
+    ScalarNZM input_nzm,  // the NZM of the value being rot'ed/masked (input)
+    uint64_t mask,        // mask after rotation
+    uint64_t rotation,    // rot amount prior to masking
+    hir::Instr* input_def,  // the defining instr of the input var
+    hir::Value* input       // the input variable to the rlwinm
+) {
+  /*
+
+     Usually you perform a rotation at the source level by decomposing it into
+     two shifts, one left and one right, and an or of those two.
+     your compiler then recognizes that pattern and generates the machine
+     rotate instruction. for a 32 bit int, you would do
+     (value<<LEFT_ROT_AMNT) | (value>>( (-LEFT_ROT_AMNT)&31 )).
+     in order to reason about the bits actually used by our rotate and mask
+     operation we decompose our rotate into these two shifts and check them
+     against our mask to determine whether the masking is useless, or whether
+     it only covers bits from the left or the right shift. if it is a right
+     shift, and the mask exactly covers all bits that were not vacated right,
+     we eliminate the mask.
+  */
+  uint32_t right_shift_for_rotate =
+      (static_cast<uint32_t>(-static_cast<int32_t>(rotation)) & 31);
+  uint32_t left_shift_for_rotate = (rotation & 31);
+
+  uint64_t rotation_shiftmask_right =
+      static_cast<uint64_t>((~0U) >> right_shift_for_rotate);
+  uint64_t rotation_shiftmask_left =
+      static_cast<uint64_t>((~0U) << left_shift_for_rotate);
+
+  if (rotation_shiftmask_left == mask) {
+    Value* rotation_fix = builder->Shl(input, left_shift_for_rotate);
+    rotation_fix->def->MoveBefore(i);
+    i->Replace(&OPCODE_AND_info, 0);
+    i->set_src1(rotation_fix);
+    i->set_src2(builder->LoadConstantUint64(0xFFFFFFFFULL));
+    return true;
+  } else if (rotation_shiftmask_right == mask) {
+    // no need to AND at all, the bits were vacated
+
+    i->Replace(&OPCODE_SHR_info, 0);
+    i->set_src1(input);
+    i->set_src2(
+        builder->LoadConstantInt8(static_cast<int8_t>(right_shift_for_rotate)));
+    return true;
+  } else if ((rotation_shiftmask_right & mask) ==
+             mask) {  // only uses bits from right
+    Value* rotation_fix = builder->Shr(input, right_shift_for_rotate);
+    rotation_fix->def->MoveBefore(i);
+    i->set_src1(rotation_fix);
+    return true;
+
+  } else if ((rotation_shiftmask_left & mask) == mask) {
+    Value* rotation_fix = builder->Shl(input, left_shift_for_rotate);
+    rotation_fix->def->MoveBefore(i);
+    i->set_src1(rotation_fix);
+    return true;
+  }
+  return false;
+}
+/*
+  todo: investigate breaking this down into several different optimization
+  rules:
+  one for rotates that can be made into shifts, one for 64 bit rotates that
+  can become 32 bit, and probably one or two other rules would need to be
+  added for this.
+*/
+bool SimplificationPass::TryHandleANDROLORSHLSeq(hir::Instr* i,
+                                                 hir::HIRBuilder* builder) {
+  // todo: could probably make this work for all types without much trouble,
+  // but not sure if it would benefit any of them
+  if (i->dest->type != INT64_TYPE) {
+    return false;
+  }
+  auto [defined_by_rotate, mask] =
+      i->BinaryValueArrangeByDefOpAndConstant(&OPCODE_ROTATE_LEFT_info);
+
+  if (!defined_by_rotate) {
+    return false;
   }
-  uint64_t default_return = GetScalarTypeMask(value->type);
+  uint64_t mask_value = mask->AsUint64();
-  hir::Instr* def = value->def;
-  if (!def) {
-    return default_return;
+  if (static_cast<uint64_t>(static_cast<uint32_t>(mask_value)) != mask_value) {
+    return false;
   }
-  return GetScalarNZM(value, def, default_return, def->opcode->num) &
-         default_return;
+
+  Instr* rotinsn = defined_by_rotate->def;
+
+  Value* rotated_variable = rotinsn->src1.value;
+  Value* rotation_amount = rotinsn->src2.value;
+  if (!rotation_amount->IsConstant()) {
+    return false;
+  }
+
+  uint32_t rotation_value = rotation_amount->constant.u8;
+  if (rotation_value > 31) {
+    return false;
+  }
+
+  if (rotated_variable->IsConstant()) {
+    return false;
+  }
+  Instr* rotated_variable_def = rotated_variable->GetDefSkipAssigns();
+
+  if (!rotated_variable_def) {
+    return false;
+  }
+  if (rotated_variable_def->opcode != &OPCODE_OR_info) {
+    return false;
+  }
+  // now we have our or that concatenates the var with itself, find the shift
+  // by 32
+  auto [shl_insn_defined, expected_input] =
+      rotated_variable_def->BinaryValueArrangeByPredicateExclusive(
+          [](Value* shl_check) {
+            Instr* shl_def = shl_check->GetDefSkipAssigns();
+
+            if (!shl_def) {
+              return false;
+            }
+
+            Value* shl_left = shl_def->src1.value;
+            Value* shl_right = shl_def->src2.value;
+            if (!shl_left || !shl_right) {
+              return false;
+            }
+            if (shl_def->opcode != &OPCODE_SHL_info) {
+              return false;
+            }
+            return !shl_left->IsConstant() && shl_right->IsConstant() &&
+                   shl_right->constant.u8 == 32;
+          });
+
+  if (!shl_insn_defined) {
+    return false;
+  }
+
+  Instr* shl_insn = shl_insn_defined->GetDefSkipAssigns();
+
+  Instr* shl_left_definition = shl_insn->src1.value->GetDefSkipAssigns();
+  Instr* expected_input_definition = expected_input->GetDefSkipAssigns();
+
+  // we expect (x << 32) | x
+
+  if (shl_left_definition != expected_input_definition) {
+    return false;
+  }
+  // next check the nzm for the input; if the high 32 bits are clear we know
+  // we can optimize. the input ought to have been cleared by an AND with ~0U
+  // or a truncate/zero_extend if it's a rlwinx
+
+  // todo: it shouldn't have to search very far for the nzm, we don't need the
+  // whole depth of the use-def chain traversed, add a maxdef var? or do
+  // additional opts here depending on the nzm
+  ScalarNZM input_nzm = GetScalarNZM(expected_input);
+
+  if (static_cast<uint64_t>(static_cast<uint32_t>(input_nzm)) != input_nzm) {
+    return false;  // bits from the or may overlap!!
+ } + + return TransformANDROLORSHLSeq(i, builder, input_nzm, mask_value, + rotation_value, expected_input_definition, + expected_input); } bool SimplificationPass::CheckAnd(hir::Instr* i, hir::HIRBuilder* builder) { retry_and_simplification: @@ -292,6 +518,10 @@ retry_and_simplification: return false; } + if (TryHandleANDROLORSHLSeq(i, builder)) { + return true; + } + // todo: check if masking with mask that covers all of zero extension source uint64_t type_mask = GetScalarTypeMask(i->dest->type); // if masking with entire width, pointless instruction so become an assign @@ -591,7 +821,7 @@ bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) { while (i) { // vector types use the same opcodes as scalar ones for AND/OR/XOR! we // don't handle these in our simplifications, so skip - if (i->dest && IsScalarIntegralType( i->dest->type) ) { + if (i->dest && IsScalarIntegralType(i->dest->type)) { if (i->opcode == &OPCODE_OR_info) { result |= CheckOr(i, builder); } else if (i->opcode == &OPCODE_XOR_info) { diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.h b/src/xenia/cpu/compiler/passes/simplification_pass.h index 6a7f3c672..d805ea27c 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.h +++ b/src/xenia/cpu/compiler/passes/simplification_pass.h @@ -16,7 +16,7 @@ namespace xe { namespace cpu { namespace compiler { namespace passes { - +using ScalarNZM = uint64_t; class SimplificationPass : public ConditionalGroupSubpass { public: SimplificationPass(); @@ -44,14 +44,27 @@ class SimplificationPass : public ConditionalGroupSubpass { bool CheckSelect(hir::Instr* i, hir::HIRBuilder* builder); bool CheckScalarConstCmp(hir::Instr* i, hir::HIRBuilder* builder); bool CheckIsTrueIsFalse(hir::Instr* i, hir::HIRBuilder* builder); - + // for rlwinm + bool TryHandleANDROLORSHLSeq(hir::Instr* i, hir::HIRBuilder* builder); + bool TransformANDROLORSHLSeq( + hir::Instr* i, // the final instr in the rlwinm sequence (the AND) + hir::HIRBuilder* builder, + ScalarNZM input_nzm, // the NZM of the value being rot'ed/masked (input) + uint64_t mask, // mask after rotation + uint64_t rotation, // rot amount prior to masking + hir::Instr* input_def, // the defining instr of the input var + hir::Value* input // the input variable to the rlwinm + ); static bool Is1BitOpcode(hir::Opcode def_opcode); - static uint64_t GetScalarNZM(hir::Value* value, hir::Instr* def, - uint64_t typemask, hir::Opcode def_opcode); + + /* we currently have no caching, and although it is rare some games + may have massive (in the hundreds) chains of ands + ors + shifts in + a single unbroken basic block (exsploderman web of shadows) */ + static constexpr unsigned MAX_NZM_DEPTH = 16; // todo: use valuemask // returns maybenonzeromask for value (mask of bits that may possibly hold // information) - static uint64_t GetScalarNZM(hir::Value* value); + static ScalarNZM GetScalarNZM(const hir::Value* value, unsigned depth = 0); }; } // namespace passes diff --git a/src/xenia/cpu/hir/value.cc b/src/xenia/cpu/hir/value.cc index 999182e35..2bf52a05d 100644 --- a/src/xenia/cpu/hir/value.cc +++ b/src/xenia/cpu/hir/value.cc @@ -8,6 +8,7 @@ */ #include "xenia/cpu/hir/value.h" +#include "xenia/cpu/hir/instr.h" #include #include @@ -1680,7 +1681,13 @@ bool Value::CompareInt64(Opcode opcode, Value* a, Value* b) { return false; } } - +hir::Instr* Value::GetDefSkipAssigns() { + if (def) { + return def->GetDestDefSkipAssigns(); + } else { + return nullptr; + } +} } // namespace hir } // namespace cpu } // namespace xe diff --git 
a/src/xenia/cpu/hir/value.h b/src/xenia/cpu/hir/value.h index 742ea8add..f0312d424 100644 --- a/src/xenia/cpu/hir/value.h +++ b/src/xenia/cpu/hir/value.h @@ -577,6 +577,7 @@ class Value { void ByteSwap(); void CountLeadingZeros(const Value* other); bool Compare(Opcode opcode, Value* other); + hir::Instr* GetDefSkipAssigns(); private: static bool CompareInt8(Opcode opcode, Value* a, Value* b);
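
For reference, below is a minimal standalone sketch of the mask reasoning in TransformANDROLORSHLSeq. It is not part of the patch and uses no Xenia APIs; the names ClassifyRlwinm and RlwinmRewrite are invented for illustration. It decomposes the 32-bit rotate into its two component shifts and compares each shift's producible bits against the AND mask, in the same order the pass does.

// Standalone sketch (assumed/invented names, not Xenia code): classifies which
// rewrite the pass would apply to AND(ROTATE_LEFT(OR(SHL(x, 32), x), r), mask)
// once x is known, via its NZM, to occupy only the low 32 bits.
#include <cstdint>
#include <cstdio>

enum class RlwinmRewrite {
  kShlAndMask32,  // mask == bits the left shift can produce: shl, then AND 0xFFFFFFFF
  kShrOnly,       // mask == bits not vacated by the right shift: single shr, no AND
  kShrKeepMask,   // mask only touches right-shift bits: shr, keep the AND
  kShlKeepMask,   // mask only touches left-shift bits: shl, keep the AND
  kKeepRotate     // mask mixes bits from both halves: leave the rotate alone
};

static RlwinmRewrite ClassifyRlwinm(uint32_t rotation, uint64_t mask) {
  // Decompose the 32-bit rotate into its two component shifts, as the pass does:
  //   rotl32(x, r) == (x << (r & 31)) | (x >> ((-r) & 31))
  const uint32_t right_shift = (0u - rotation) & 31;
  const uint32_t left_shift = rotation & 31;
  const uint64_t right_bits = static_cast<uint64_t>(~0u >> right_shift);
  const uint64_t left_bits = static_cast<uint64_t>(~0u << left_shift);

  if (left_bits == mask) return RlwinmRewrite::kShlAndMask32;
  if (right_bits == mask) return RlwinmRewrite::kShrOnly;
  if ((right_bits & mask) == mask) return RlwinmRewrite::kShrKeepMask;
  if ((left_bits & mask) == mask) return RlwinmRewrite::kShlKeepMask;
  return RlwinmRewrite::kKeepRotate;
}

int main() {
  // slwi-style rlwinm: rotate left by 3, keep only bits the left shift could
  // have produced, so the rotate collapses to a plain shift left.
  printf("rot=3  mask=0xFFFFFFF8 -> %d\n",
         static_cast<int>(ClassifyRlwinm(3, 0xFFFFFFF8ull)));
  // srwi-style rlwinm: rotate left by 29 (i.e. shift right by 3) with a mask
  // that keeps exactly the non-vacated bits, so the AND disappears entirely.
  printf("rot=29 mask=0x1FFFFFFF -> %d\n",
         static_cast<int>(ClassifyRlwinm(29, 0x1FFFFFFFull)));
  return 0;
}

When the guest rlwinm is really an slwi or srwi, the first two cases fire and the rotate plus mask collapses into a single shift, which is the point of the new rule.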