Drastically reduce the size of the final generated code for rlwinm by adding special paths for rotations of 0 and for masks that discard the rotated-in bits, and by using an AND with UINT_MAX (0xFFFFFFFF) instead of a truncate/zero-extend.
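As a rough illustration (not part of the patch; the helper name and standalone setup are mine, mirroring InstrCheck_rlx_only_needs_low from the diff below), the mask test that decides when a 32-bit rotate-and-mask can be lowered to a plain shift looks like this:

#include <cstdint>
#include <cstdio>

// True when `mask` fits in 32 bits and clears every bit that a left rotation
// by `rotation` would bring in from the top, so rol+and behaves like shl+and.
static bool RotatedBitsAreDiscarded(unsigned rotation, uint64_t mask) {
  uint32_t mask32 = static_cast<uint32_t>(mask);
  if (static_cast<uint64_t>(mask32) != mask) {
    return false;  // mask touches the upper 32 bits
  }
  return (~0u << rotation) == mask32;
}

int main() {
  // rlwinm rA, rS, 2, 0, 29 has mask 0xFFFFFFFC: the two rotated-in bits are
  // masked off, so a shift left by 2 plus the mask is enough.
  std::printf("%d\n", RotatedBitsAreDiscarded(2, 0xFFFFFFFCull));  // 1
  // A mask of 0x0000FFFF keeps the rotated-in bits, so the rotate is needed.
  std::printf("%d\n", RotatedBitsAreDiscarded(2, 0x0000FFFFull));  // 0
  return 0;
}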

Add a special case to INT64's EmitAnd for a UINT_MAX (0xFFFFFFFF) mask: when it is detected, emit a 32-bit to 32-bit mov to take advantage of implicit zero extension and register renaming.
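A minimal sketch of the equivalence being exploited (the assembly in the comments is what mainstream x86-64 compilers typically emit for these functions, not output from this emitter):

#include <cstdint>

// On x86-64, any write to a 32-bit register implicitly zeroes the upper 32
// bits of the full 64-bit register, so masking with 0xFFFFFFFF needs no
// explicit and/movzx at all.
uint64_t mask_low32_with_and(uint64_t value) {
  return value & 0xFFFFFFFFull;  // typically: mov eax, edi
}

uint64_t mask_low32_with_truncate_extend(uint64_t value) {
  return static_cast<uint64_t>(static_cast<uint32_t>(value));  // same: mov eax, edi
}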

Add a helper function to Instr for skipping assignment defs (GetDestDefSkipAssigns).
Add a helper function for checking whether an opcode's signature is a binary value operation (IsOpcodeBinaryValue).
Add several new optimizations to SimplificationPass, plus weak NZM (maybe-nonzero mask) calculation code (a better full evaluation of known zero/nonzero bits will be done later).
 List of optimizations (a small illustrative sketch follows this list):
  If a value is ANDed with a bitmask it was already masked against, reuse the previously masked value (this cuts out most of the FPSCR-update garbage, although it does cause a local variable to be allocated for the masked FPSCR, and it still repeatedly stores the masked value to the context).
  If masking a value that was ORed with another, check whether the mask only covers bits from one of the OR inputs; if so, replace the operand with the OR input that actually matters.
  If the only use of a rotate-left's output is an AND against a mask that discards the bits that were rotated in, change the opcode to SHIFT_LEFT.
  If masking against all ones, become an assign.
  If XOR or OR against 0, become an assign (additional FPSCR codegen cleanup).
  If XOR against all ones, become a NOT.
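The sketch below (standalone, with illustrative names rather than the pass's actual API) shows the NZM reasoning behind the OR-narrowing rule: if the AND mask shares no bits with one OR input's maybe-nonzero mask, that input cannot affect the result, so the AND can read the other input directly.

#include <cstdint>
#include <cstdio>

// If (and_mask & operand_nzm) == 0, the operand cannot contribute any set
// bits to (a | b) & and_mask, so the AND may use the other operand directly.
static bool OperandIsDeadUnderMask(uint64_t and_mask, uint64_t operand_nzm) {
  return (and_mask & operand_nzm) == 0;
}

int main() {
  // For (a | (b << 32)) & 0xFFFFFFFF, the shifted term can only have bits
  // 32..63 set, so the expression simplifies to a & 0xFFFFFFFF.
  uint64_t nzm_b_shl_32 = 0xFFFFFFFF00000000ull;
  std::printf("%d\n", OperandIsDeadUnderMask(0xFFFFFFFFull, nzm_b_shl_32));  // 1
  return 0;
}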
Add a direct CPUID check for LZCNT to x64_emitter: the version of xbyak we are using skips the LZCNT check on all non-Intel CPUs, which meant we were generating the much slower bit-scan path on AMD CPUs.
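For reference, a minimal standalone sketch of the same detection outside the emitter; LZCNT (AMD's ABM) is reported via CPUID leaf 0x80000001, ECX bit 5, and the intrinsic selection below is an assumption about the host compiler, not code from this patch.

#include <cstdio>
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <cpuid.h>
#endif

// LZCNT is advertised in CPUID.80000001H:ECX bit 5 on both AMD and Intel
// parts, independent of the vendor string.
static bool CpuHasLzcnt() {
#if defined(_MSC_VER)
  int regs[4];
  __cpuid(regs, 0x80000001);
  unsigned int ecx = static_cast<unsigned int>(regs[2]);
#else
  unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
  if (!__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx)) {
    return false;  // extended leaf not available
  }
#endif
  return (ecx & (1u << 5)) != 0;
}

int main() {
  std::printf("lzcnt supported: %s\n", CpuHasLzcnt() ? "yes" : "no");
  return 0;
}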
chss95cs@gmail.com 2022-06-25 09:58:13 -07:00
parent 2b3686f0e9
commit 327cc9eff5
9 changed files with 399 additions and 16 deletions


@@ -106,7 +106,16 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
#undef TEST_EMIT_FEATURE
/*
fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in latest version of xbyak
*/
unsigned int data[4];
Xbyak::util::Cpu::getCpuid(0x80000001, data);
if (data[2] & (1U << 5)) {
if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
feature_flags_ |= kX64EmitLZCNT;
}
}
if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
bool is_zennish = cpu_.displayFamily >= 0x17;


@@ -2749,11 +2749,17 @@ struct AND_I32 : Sequence<AND_I32, I<OPCODE_AND, I32Op, I32Op, I32Op>> {
};
struct AND_I64 : Sequence<AND_I64, I<OPCODE_AND, I64Op, I64Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (i.src2.is_constant && i.src2.constant() == 0xFFFFFFFF) {
// special case for rlwinm codegen
e.mov(((Reg64)i.dest).cvt32(), ((Reg64)i.src1).cvt32());
} else {
EmitAndXX<AND_I64, Reg64>(e, i);
}
}
};
struct AND_V128 : Sequence<AND_V128, I<OPCODE_AND, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
EmitCommutativeBinaryXmmOp(e, i,
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
e.vpand(dest, src1, src2);


@@ -9,8 +9,8 @@
#include "xenia/cpu/compiler/passes/simplification_pass.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/profiling.h" #include "xenia/base/profiling.h"
namespace xe { namespace xe {
namespace cpu { namespace cpu {
namespace compiler { namespace compiler {
@ -29,11 +29,241 @@ SimplificationPass::~SimplificationPass() {}
bool SimplificationPass::Run(HIRBuilder* builder, bool& result) { bool SimplificationPass::Run(HIRBuilder* builder, bool& result) {
result = false; result = false;
result |= SimplifyBitArith(builder);
result |= EliminateConversions(builder);
result |= SimplifyAssignments(builder);
return true;
}
// simplifications that apply to both or and xor
bool SimplificationPass::CheckOrXorZero(hir::Instr* i) {
auto [constant_value, variable_value] = i->BinaryValueArrangeAsConstAndVar();
if (constant_value && constant_value->IsConstantZero()) {
i->Replace(&OPCODE_ASSIGN_info, 0);
i->set_src1(variable_value);
return true;
}
return false;
}
bool SimplificationPass::CheckOr(hir::Instr* i) { return CheckOrXorZero(i); }
bool SimplificationPass::CheckXor(hir::Instr* i) {
if (CheckOrXorZero(i)) {
return true;
} else {
uint64_t type_mask = GetScalarTypeMask(i->dest->type);
auto [constant_value, variable_value] =
i->BinaryValueArrangeAsConstAndVar();
if (!constant_value) return false;
if (constant_value->AsUint64() == type_mask) {
i->Replace(&OPCODE_NOT_info, 0);
i->set_src1(variable_value);
return true;
}
}
return false;
}
bool SimplificationPass::Is1BitOpcode(hir::Opcode def_opcode) {
return def_opcode >= OPCODE_IS_TRUE && def_opcode <= OPCODE_DID_SATURATE;
}
uint64_t SimplificationPass::GetScalarNZM(hir::Value* value, hir::Instr* def,
uint64_t typemask,
hir::Opcode def_opcode) {
if (def_opcode == OPCODE_SHL) {
hir::Value* shifted = def->src1.value;
hir::Value* shiftby = def->src2.value;
// todo: nzm shift
if (shiftby->IsConstant()) {
uint64_t shifted_nzm = GetScalarNZM(shifted);
return shifted_nzm << shiftby->AsUint64();
}
} else if (def_opcode == OPCODE_SHR) {
hir::Value* shifted = def->src1.value;
hir::Value* shiftby = def->src2.value;
// todo: nzm shift
if (shiftby->IsConstant()) {
uint64_t shifted_nzm = GetScalarNZM(shifted);
return shifted_nzm >> shiftby->AsUint64();
}
}
// todo : sha, check signbit
else if (def_opcode == OPCODE_ROTATE_LEFT) {
hir::Value* shifted = def->src1.value;
hir::Value* shiftby = def->src2.value;
// todo: nzm shift
if (shiftby->IsConstant()) {
uint64_t shifted_nzm = GetScalarNZM(shifted);
return xe::rotate_left(shifted_nzm,
static_cast<uint8_t>(shiftby->AsUint64()));
}
} else if (def_opcode == OPCODE_XOR || def_opcode == OPCODE_OR) {
return GetScalarNZM(def->src1.value) | GetScalarNZM(def->src2.value);
} else if (def_opcode == OPCODE_NOT) {
return ~GetScalarNZM(def->src1.value);
} else if (def_opcode == OPCODE_ASSIGN) {
return GetScalarNZM(def->src1.value);
} else if (def_opcode == OPCODE_BYTE_SWAP) {
uint64_t input_nzm = GetScalarNZM(def->src1.value);
switch (GetTypeSize(def->dest->type)) {
case 1:
return input_nzm;
case 2:
return xe::byte_swap<unsigned short>(
static_cast<unsigned short>(input_nzm));
case 4:
return xe::byte_swap<unsigned int>(
static_cast<unsigned int>(input_nzm));
case 8:
return xe::byte_swap<unsigned long long>(input_nzm);
default:
xenia_assert(0);
return typemask;
}
} else if (def_opcode == OPCODE_ZERO_EXTEND) {
return GetScalarNZM(def->src1.value);
} else if (def_opcode == OPCODE_TRUNCATE) {
return GetScalarNZM(def->src1.value); // caller will truncate by masking
} else if (def_opcode == OPCODE_AND) {
return GetScalarNZM(def->src1.value) & GetScalarNZM(def->src2.value);
} else if (def_opcode == OPCODE_SELECT) {
return GetScalarNZM(def->src2.value) | GetScalarNZM(def->src3.value);
} else if (def_opcode == OPCODE_MIN) {
/*
the nzm will be that of the narrowest operand, because if one value is
capable of being much larger than the other it can never actually reach
a value that is outside the range of the other values nzm, because that
would make it not the minimum of the two
ahh, actually, we have to be careful about constants then.... for now,
just return or
*/
return GetScalarNZM(def->src2.value) | GetScalarNZM(def->src1.value);
} else if (def_opcode == OPCODE_MAX) {
return GetScalarNZM(def->src2.value) | GetScalarNZM(def->src1.value);
} else if (Is1BitOpcode(def_opcode)) {
return 1ULL;
} else if (def_opcode == OPCODE_CAST) {
return GetScalarNZM(def->src1.value);
}
return typemask;
}
uint64_t SimplificationPass::GetScalarNZM(hir::Value* value) {
if (value->IsConstant()) {
return value->AsUint64();
}
uint64_t default_return = GetScalarTypeMask(value->type);
hir::Instr* def = value->def;
if (!def) {
return default_return;
}
return GetScalarNZM(value, def, default_return, def->opcode->num) &
default_return;
}
bool SimplificationPass::CheckAnd(hir::Instr* i) {
retry_and_simplification:
auto [constant_value, variable_value] = i->BinaryValueArrangeAsConstAndVar();
if (!constant_value) return false;
// todo: check if masking with mask that covers all of zero extension source
uint64_t type_mask = GetScalarTypeMask(i->dest->type);
// if masking with entire width, pointless instruction so become an assign
if (constant_value->AsUint64() == type_mask) {
i->Replace(&OPCODE_ASSIGN_info, 0);
i->set_src1(variable_value);
return true;
}
auto variable_def = variable_value->def;
if (variable_def) {
auto true_variable_def = variable_def->GetDestDefSkipAssigns();
if (true_variable_def) {
if (true_variable_def->opcode == &OPCODE_AND_info) {
auto [variable_def_constant, variable_def_variable] =
true_variable_def->BinaryValueArrangeAsConstAndVar();
if (variable_def_constant) {
// todo: check if masked with mask that was a subset of the current
// one and elim if so
if (variable_def_constant->AsUint64() == constant_value->AsUint64()) {
// we already masked the input with the same mask
i->Replace(&OPCODE_ASSIGN_info, 0);
i->set_src1(variable_value);
return true;
}
}
} else if (true_variable_def->opcode == &OPCODE_OR_info) {
Value* or_left = true_variable_def->src1.value;
Value* or_right = true_variable_def->src2.value;
uint64_t left_nzm = GetScalarNZM(or_left);
// use the other or input instead of the or output
if ((constant_value->AsUint64() & left_nzm) == 0) {
i->Replace(&OPCODE_AND_info, 0);
i->set_src1(or_right);
i->set_src2(constant_value);
return true;
}
uint64_t right_nzm = GetScalarNZM(or_right);
if ((constant_value->AsUint64() & right_nzm) == 0) {
i->Replace(&OPCODE_AND_info, 0);
i->set_src1(or_left);
i->set_src2(constant_value);
return true;
}
} else if (true_variable_def->opcode == &OPCODE_ROTATE_LEFT_info) {
if (true_variable_def->src2.value->IsConstant()) {
if (((type_mask << true_variable_def->src2.value->AsUint64()) &
type_mask) ==
constant_value->AsUint64()) { // rotated bits are unused, convert
// to shift if we are the only use
if (true_variable_def->dest->use_head->next == nullptr) {
// one use, convert to shift
true_variable_def->opcode = &OPCODE_SHL_info;
goto retry_and_simplification;
}
}
}
}
}
}
return false;
}
bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) {
bool result = false;
auto block = builder->first_block();
while (block) {
auto i = block->instr_head;
while (i) {
// vector types use the same opcodes as scalar ones for AND/OR/XOR! we
// don't handle these in our simplifications, so skip
if (i->dest && i->dest->type != VEC128_TYPE) {
if (i->opcode == &OPCODE_OR_info) {
result |= CheckOr(i);
} else if (i->opcode == &OPCODE_XOR_info) {
result |= CheckXor(i);
} else if (i->opcode == &OPCODE_AND_info) {
result |= CheckAnd(i);
}
}
i = i->next;
}
block = block->next;
}
return result;
}
bool SimplificationPass::EliminateConversions(HIRBuilder* builder) {
// First, we check for truncates/extensions that can be skipped.
// This generates some assignments which then the second step will clean up.
@@ -158,6 +388,7 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) {
i->set_src3(CheckValue(i->src3.value, modified));
result |= modified;
}
i = i->next;
}
block = block->next;


@@ -31,6 +31,19 @@ class SimplificationPass : public ConditionalGroupSubpass {
bool SimplifyAssignments(hir::HIRBuilder* builder);
hir::Value* CheckValue(hir::Value* value, bool& result);
bool SimplifyBitArith(hir::HIRBuilder* builder);
// handle either or or xor with 0
bool CheckOrXorZero(hir::Instr* i);
bool CheckOr(hir::Instr* i);
bool CheckXor(hir::Instr* i);
bool CheckAnd(hir::Instr* i);
static bool Is1BitOpcode(hir::Opcode def_opcode);
static uint64_t GetScalarNZM(hir::Value* value, hir::Instr* def,
uint64_t typemask, hir::Opcode def_opcode);
// todo: use valuemask
// returns maybenonzeromask for value (mask of bits that may possibly hold
// information)
static uint64_t GetScalarNZM(hir::Value* value);
};
} // namespace passes


@@ -114,7 +114,20 @@ void Instr::Remove() {
block->instr_tail = prev;
}
}
Instr* Instr::GetDestDefSkipAssigns() {
Instr* current_def = this;
while (current_def->opcode == &OPCODE_ASSIGN_info) {
Instr* next_def = current_def->src1.value->def;
if (!next_def) {
return nullptr;
}
current_def = next_def;
}
return current_def;
}
} // namespace hir
} // namespace cpu
} // namespace xe


@@ -59,6 +59,52 @@ class Instr {
void MoveBefore(Instr* other);
void Replace(const OpcodeInfo* new_opcode, uint16_t new_flags);
void Remove();
template <typename TPredicate>
std::pair<Value*, Value*> BinaryValueArrangeByPredicateExclusive(
TPredicate&& pred) {
auto src1_value = src1.value;
auto src2_value = src2.value;
if (!src1_value || !src2_value) return {nullptr, nullptr};
if (!opcode) return {nullptr, nullptr}; // impossible!
// check if binary opcode taking two values. we dont care if the dest is a
// value
if (!IsOpcodeBinaryValue(opcode->signature)) return {nullptr, nullptr};
if (pred(src1_value)) {
if (pred(src2_value)) {
return {nullptr, nullptr};
} else {
return {src1_value, src2_value};
}
} else if (pred(src2_value)) {
return {src2_value, src1_value};
} else {
return {nullptr, nullptr};
}
}
/*
if src1 is constant, and src2 is not, return [src1, src2]
if src2 is constant, and src1 is not, return [src2, src1]
if neither is constant, return nullptr, nullptr
if both are constant, return nullptr, nullptr
*/
std::pair<Value*, Value*> BinaryValueArrangeAsConstAndVar() {
return BinaryValueArrangeByPredicateExclusive(
[](Value* value) { return value->IsConstant(); });
}
std::pair<Value*, Value*> BinaryValueArrangeByDefiningOpcode(
const OpcodeInfo* op_ptr) {
return BinaryValueArrangeByPredicateExclusive([op_ptr](Value* value) {
return value->def && value->def->opcode == op_ptr;
});
}
Instr* GetDestDefSkipAssigns();
};
} // namespace hir


@@ -347,6 +347,10 @@ enum OpcodeSignature {
#define GET_OPCODE_SIG_TYPE_SRC1(sig) (OpcodeSignatureType)((sig >> 3) & 0x7)
#define GET_OPCODE_SIG_TYPE_SRC2(sig) (OpcodeSignatureType)((sig >> 6) & 0x7)
#define GET_OPCODE_SIG_TYPE_SRC3(sig) (OpcodeSignatureType)((sig >> 9) & 0x7)
static bool IsOpcodeBinaryValue(uint32_t signature) {
return (signature & ~(0x7)) ==
((OPCODE_SIG_TYPE_V << 3) | (OPCODE_SIG_TYPE_V << 6));
}
typedef struct {
uint32_t flags;


@@ -57,6 +57,15 @@ inline size_t GetTypeSize(TypeName type_name) {
return 0;
}
}
inline uint64_t GetScalarTypeMask(TypeName type_name) {
size_t mask_width = GetTypeSize(type_name);
if (mask_width == 8) {
return ~0ULL;
} else {
return (1ULL << (mask_width * CHAR_BIT)) - 1;
}
}
enum ValueFlags {
VALUE_IS_CONSTANT = (1 << 1),
@@ -68,6 +77,23 @@ struct RegAssignment {
int32_t index;
};
struct ValueMask {
uint64_t low; // low 64 bits, usually for scalar values
uint64_t high; // high 64 bits, only used for vector types
ValueMask(uint64_t _low, uint64_t _high) : low(_low), high(_high) {}
ValueMask operator&(ValueMask other) const {
return ValueMask{low & other.low, high & other.high};
}
ValueMask operator|(ValueMask other) const {
return ValueMask{low | other.low, high | other.high};
}
ValueMask operator^(ValueMask other) const {
return ValueMask{low ^ other.low, high ^ other.high};
}
};
class Value {
public:
typedef struct Use_s {


@@ -1023,6 +1023,17 @@ int InstrEmit_rlwimix(PPCHIRBuilder& f, const InstrData& i) {
}
return 0;
}
static bool InstrCheck_rlx_only_needs_low(unsigned rotation, uint64_t mask) {
uint32_t mask32 = static_cast<uint32_t>(mask);
if (static_cast<uint64_t>(mask32) != mask) {
return false;
}
uint32_t all_ones_32 = ~0U;
all_ones_32 <<= rotation;
return all_ones_32 == mask32; // mask is only 32 bits and all bits from the
// rotation are discarded
}
int InstrEmit_rlwinmx(PPCHIRBuilder& f, const InstrData& i) {
// n <- SH
@@ -1031,8 +1042,32 @@ int InstrEmit_rlwinmx(PPCHIRBuilder& f, const InstrData& i) {
// RA <- r & m
Value* v = f.LoadGPR(i.M.RT);
unsigned rotation = i.M.SH;
uint64_t m = XEMASK(i.M.MB + 32, i.M.ME + 32);
// in uint32 range (so no register concat/truncate/zx needed) and no rotation
if (m < (1ULL << 32) && (rotation == 0)) {
v = f.And(v, f.LoadConstantUint64(m));
}
// masks out all the bits that are rotated in from the right, so just do a
// shift + and. the and with 0xFFFFFFFF is done instead of a truncate/zx
// because we have a special case for it in the emitters that will just do a
// single insn (mov reg32, lowpartofreg64), otherwise we generate
// significantly more code from setting up the opnds of the truncate/zx
else if (InstrCheck_rlx_only_needs_low(rotation, m)) {
// this path is taken for like 90% of all rlwinms
v = f.And(f.Shl(v, rotation), f.LoadConstantUint64(0xFFFFFFFF));
}
else {
// (x||x)
// cs: changed this to mask with UINT32_MAX instead of doing the
// truncate/extend, this generates better code in the backend and is easier
// to do analysis on
v = f.And(v, f.LoadConstantUint64(0xFFFFFFFF));
v = f.Or(f.Shl(v, 32), v);
// TODO(benvanik): optimize srwi
// TODO(benvanik): optimize slwi
@@ -1040,14 +1075,14 @@
// Which seems to just select some bits and set cr0 for use with a branch.
// We can detect this and do less work.
if (i.M.SH) {
v = f.RotateLeft(v, f.LoadConstantInt8(rotation));
}
// Compiler sometimes masks with 0xFFFFFFFF (identity) - avoid the work here
// as our truncation/zero-extend does it for us.
if (m != 0xFFFFFFFFFFFFFFFFull) {
v = f.And(v, f.LoadConstantUint64(m));
}
}
f.StoreGPR(i.M.RA, v);
if (i.M.Rc) {
f.UpdateCR(0, v);