Add initial XOP code paths; still need to finish the rest of the compares, and then do shifts, rotates, and PERMUTE

Add vector simplification pass; so far it only recognizes when a VECTOR_DENORMFLUSH is useless and optimizes it away
Tag restgplr/savegplr/restvmx/savevmx/restfpr/savefpr with useful information; I intend to inline them (they tend to be the most heavily called guest functions)
This commit is contained in:
chss95cs@gmail.com 2022-08-21 08:55:42 -07:00
parent 0b013fdc6b
commit 0ebc109d4d
8 changed files with 574 additions and 78 deletions

View File

@ -0,0 +1,334 @@
#ifndef XENIA_CPU_BACKEND_X64_X64_AMDFX_EXTENSIONS_H_
#define XENIA_CPU_BACKEND_X64_X64_AMDFX_EXTENSIONS_H_
#include <stdio.h>
#include <string.h>
#include <string>
namespace xe {
namespace cpu {
namespace backend {
namespace x64 {
namespace amdfx {
// Opcode-map selector values. xop_set_fouroperand_form() stores one of these
// in xop_byte1_t::opcode_map_select, and xop_t::AssembledSize() uses the
// stored value to decide whether the trailing imm8 byte is emitted.
enum xopcodemap_e : unsigned char {
  XOPCODE_HAS_IMMBYTE = 0x08,  // encoding ends with an imm8 byte
  XOPCODE_NO_IMMBYTE = 0x09,   // encoding has no imm8 byte
};
// XOP opcode byte values, grouped by mnemonic family. Several enumerators
// share a numeric value (for example xop_VPMACSWD / xop_VPSHLD); the helpers
// in this file pair each opcode with an opcode map (see xopcodemap_e).
enum xopcode_e : unsigned char {
  // VFRCZ*
  xop_VFRCZPS = 0x80,
  xop_VFRCZPD = 0x81,
  xop_VFRCZSS = 0x82,
  xop_VFRCZSD = 0x83,

  // VPCMOV / VPPERM / VPERMIL2*
  xop_VPCMOV = 0xA2,
  xop_VPPERM = 0xA3,
  xop_VPERMIL2PS = 0x48,
  xop_VPERMIL2PD = 0x49,

  // VPCOM{B,W,D,Q}
  xop_VPCOMB = 0xCC,
  xop_VPCOMW = 0xCD,
  xop_VPCOMD = 0xCE,
  xop_VPCOMQ = 0xCF,

  // VPCOMU{B,W,D,Q}
  xop_VPCOMUB = 0xEC,
  xop_VPCOMUW = 0xED,
  xop_VPCOMUD = 0xEE,
  xop_VPCOMUQ = 0xEF,

  // VPHADD*
  xop_VPHADDBW = 0xC1,
  xop_VPHADDBD = 0xC2,
  xop_VPHADDBQ = 0xC3,
  xop_VPHADDWD = 0xC6,
  xop_VPHADDWQ = 0xC7,
  xop_VPHADDDQ = 0xCB,

  // VPHADDU*
  xop_VPHADDUBW = 0xD1,
  xop_VPHADDUBD = 0xD2,
  xop_VPHADDUBQ = 0xD3,
  xop_VPHADDUWD = 0xD6,
  xop_VPHADDUWQ = 0xD7,
  xop_VPHADDUDQ = 0xDB,

  // VPHSUB*
  xop_VPHSUBBW = 0xE1,
  xop_VPHSUBWD = 0xE2,
  xop_VPHSUBDQ = 0xE3,

  // VPMACS* / VPMADCS*
  xop_VPMACSSWW = 0x85,
  xop_VPMACSSWD = 0x86,
  xop_VPMACSSDQL = 0x87,
  xop_VPMACSSDD = 0x8E,
  xop_VPMACSSDQH = 0x8F,
  xop_VPMACSWW = 0x95,
  xop_VPMACSWD = 0x96,
  xop_VPMACSDQL = 0x97,
  xop_VPMACSDD = 0x9E,
  xop_VPMACSDQH = 0x9F,
  xop_VPMADCSSWD = 0xA6,
  xop_VPMADCSWD = 0xB6,

  // VPROT{B,W,D,Q}; the *I enumerators are the imm versions
  xop_VPROTB = 0x90,
  xop_VPROTW = 0x91,
  xop_VPROTD = 0x92,
  xop_VPROTQ = 0x93,
  xop_VPROTBI = 0xC0,
  xop_VPROTWI = 0xC1,
  xop_VPROTDI = 0xC2,
  xop_VPROTQI = 0xC3,

  // VPSHL{B,W,D,Q}
  xop_VPSHLB = 0x94,
  xop_VPSHLW = 0x95,
  xop_VPSHLD = 0x96,
  xop_VPSHLQ = 0x97,

  // VPSHA{B,W,D,Q}
  xop_VPSHAB = 0x98,
  xop_VPSHAW = 0x99,
  xop_VPSHAD = 0x9A,
  xop_VPSHAQ = 0x9B,
};
// Integer element-size selector; used as the type of the 2-bit
// xop_opcode_byte_t::int_datatype field.
enum xop_iop_e : unsigned char {
  XOP_BYTE = 0,
  XOP_WORD,
  XOP_DOUBLEWORD,
  XOP_QUADWORD,
};
// Floating-point operand-kind selector; used as the type of the 2-bit
// xop_opcode_byte_t::float_datatype field.
enum xop_fop_e : unsigned char {
  XOP_PS = 0,
  XOP_PD,
  XOP_SS,
  XOP_SD,
};
// Byte emitted directly after the 0x8F escape byte of an XOP instruction (see
// xop_t). The union exposes the same 8 bits three ways: with descriptive
// field names, with the AMD manual's names, and as the raw byte `encoded`.
// NOTE(review): the code that fills this in (encoded = 0xe8 followed by field
// writes) assumes the compiler allocates bitfields starting at the least
// significant bit - confirm for every supported toolchain.
class xop_byte1_t {
public:
union {
// informative names
struct {
/*
A five bit field encoding a one- or two-byte opcode prefix.
Written with an xopcodemap_e value by xop_set_fouroperand_form.
*/
unsigned char opcode_map_select : 5;
/*
This bit provides a one-bit extension of either the ModRM.r/m
field to specify a GPR or XMM register or to the SIB base field to
specify a GPR. This permits access to 16 registers. In 32-bit protected
and compatibility modes, this bit is ignored. This bit is the
bit-inverted equivalent of the REX.B bit and is available only in the
3-byte prefix format.
*/
unsigned char inv_1bit_ext_modrm_or_sib : 1;
/*
This bit provides a one bit extension of the SIB.index field in
64-bit mode, permitting access to 16 YMM/XMM and GPR registers. In
32-bit protected and compatibility modes, this bit must be set to 1.
This bit is the bit-inverted equivalent of the REX.X bit
*/
unsigned char inv_1bit_ext_sib_index : 1;
/*
This bit provides a one bit extension of the ModRM.reg field in
64-bit mode, permitting access to all 16 YMM/XMM and GPR registers. In
32-bit protected and compatibility modes, this bit must be set to 1.
This bit is the bit-inverted equivalent of the REX.R bit.
*/
unsigned char inv_1bit_ext_modrm_reg_field : 1;
};
// amd manual names (same bits as the struct above, in the same order)
struct {
unsigned char mmmmm : 5;
unsigned char B : 1;
unsigned char X : 1;
unsigned char R : 1;
};
// the whole byte, as it is emitted
unsigned char encoded;
};
};
// Byte emitted after xop_byte1_t in an XOP instruction (see xop_t). The union
// exposes the same 8 bits with descriptive names, with the AMD manual's
// names, and as the raw byte `encoded`.
class xop_byte2_t {
public:
union {
// informative names
struct {
unsigned char
implied_66f2f3_ext : 2; // 0 = no implied, 1 = 66, 2 = F3, 3 = F2
unsigned char vector_length : 1;
// stored inverted: xop_set_fouroperand_form writes ~register_index here
unsigned char source_or_dest_reg_specifier_inverted_1s_compl : 4;
unsigned char scalar_reg_size_override_special : 1;
};
// amd manual names
struct {
unsigned char pp : 2; // 0 = no implied, 1 = 66, 2 = F3, 3 = F2 (same bits
// as implied_66f2f3_ext above)
unsigned char L : 1;
unsigned char vvvv : 4; // src1 for four operand form
unsigned char W : 1;
};
// the whole byte, as it is emitted
unsigned char encoded;
};
};
// The opcode byte of an XOP instruction (see xop_t). xop_set_fouroperand_form
// writes a complete xopcode_e value through `encoded`; the bitfield views
// split the same byte into a 2-bit field and a 6-bit field.
// NOTE(review): the 2-bit field is presumably the low two bits (the xopcode_e
// values for B/W/D/Q and PS/PD/SS/SD variants differ only there) - this
// depends on the compiler's bitfield allocation order; confirm.
class xop_opcode_byte_t {
public:
union {
// 2-bit field viewed as a floating-point operand kind
struct {
xop_fop_e float_datatype : 2;
unsigned char __unused0 : 6;
};
// 2-bit field viewed as an integer element size
struct {
xop_iop_e int_datatype : 2;
unsigned char __unused1 : 6;
};
// untyped split: 2-bit selector plus the remaining 6 opcode bits
struct {
unsigned char oes : 2;
unsigned char opcode : 6;
};
// the whole byte, as it is emitted
unsigned char encoded;
};
};
// ModRM byte of an XOP instruction (see xop_t), split into only two fields.
// Note that `mod` is 5 bits wide, so it covers everything above the 3-bit rm
// field: xop_set_fouroperand_form first ORs 0xC0 into `encoded` and then ORs
// the low three bits of the destination register index into `mod`.
class modrm_byte_t {
public:
union {
struct {
unsigned char rm : 3; // receives (src2 register index & 7)
unsigned char mod : 5; // 4 opnd form dest reg
};
// the whole byte, as it is emitted
unsigned char encoded;
};
};
#pragma pack(push, 1)
// One XOP-encoded instruction in register-only form, held as its individual
// bytes in emission order: 0x8F, byte1, byte2, opcode, ModRM and an optional
// trailing imm8.
class xop_t {
 public:
  unsigned char imm_8F;  // always 0x8F
  xop_byte1_t byte1;
  xop_byte2_t byte2;
  xop_opcode_byte_t opcode;
  modrm_byte_t modrm;
  unsigned char imm8;

  // Zeroes every byte except the leading 0x8F. imm8 is zeroed as well: a
  // freshly constructed object reports AssembledSize() == 6, so ForeachByte()
  // reads imm8 and it must not be left indeterminate.
  xop_t() : imm_8F(0x8F), imm8(0) {
    byte1.encoded = 0;
    byte2.encoded = 0;
    opcode.encoded = 0;
    modrm.encoded = 0;
  }

  // Number of bytes ForeachByte() will visit: 5 when the selected opcode map
  // is XOPCODE_NO_IMMBYTE, otherwise 6 (the imm8 byte is included).
  unsigned AssembledSize() const {
    if (byte1.opcode_map_select == XOPCODE_NO_IMMBYTE) {
      return 5;
    } else {
      return 6;
    }
  }

  // Invokes cb once per encoded byte, in emission order. The imm8 byte is
  // passed only when AssembledSize() is 6.
  template <typename TCall>
  void ForeachByte(TCall&& cb) {
    cb(imm_8F);
    cb(byte1.encoded);
    cb(byte2.encoded);
    cb(opcode.encoded);
    cb(modrm.encoded);
    if (AssembledSize() == 6) {
      cb(imm8);
    }
  }
};
#pragma pack(pop)
// Fills in `xop` for a register-only encoding.
//   xmmidx_dest -> low 3 bits into modrm.mod, bit 3 (inverted) into byte1 R
//   xmmidx_src1 -> byte2.vvvv, stored inverted
//   xmmidx_src2 -> low 3 bits into modrm.rm, bit 3 (inverted) into byte1 B
//   xmmidx_src3 -> upper nibble of imm8
//   opcode      -> the opcode byte
//   has_immbyte -> selects XOPCODE_HAS_IMMBYTE or XOPCODE_NO_IMMBYTE as the
//                  opcode map, which in turn decides whether imm8 is emitted
// modrm.encoded is OR-ed into rather than overwritten, so `xop` is expected to
// come fresh from the xop_t constructor.
static void xop_set_fouroperand_form(xop_t& xop, unsigned xmmidx_dest,
                                     unsigned xmmidx_src1, unsigned xmmidx_src2,
                                     unsigned xmmidx_src3, xopcode_e opcode,
                                     bool has_immbyte = true) {
  // First prefix byte: start from 0xe8, then overwrite the individual fields.
  xop.byte1.encoded = 0xe8;
  xop.byte1.opcode_map_select =
      has_immbyte ? XOPCODE_HAS_IMMBYTE : XOPCODE_NO_IMMBYTE;
  xop.byte1.inv_1bit_ext_modrm_reg_field = (xmmidx_dest >> 3) ^ 1;
  xop.byte1.inv_1bit_ext_modrm_or_sib = (xmmidx_src2 >> 3) ^ 1;

  // Second prefix byte: only vvvv is touched here.
  xop.byte2.vvvv = ~xmmidx_src1;

  xop.opcode.encoded = opcode;

  // ModRM: rm gets src2, the top two bits are forced on, dest goes in between.
  xop.modrm.rm = xmmidx_src2 & 0b111;
  xop.modrm.encoded |= 0xC0;
  xop.modrm.mod |= xmmidx_dest & 0b111;

  xop.imm8 = xmmidx_src3 << 4;
}
// Comparison selector for the vpcom* helpers; the COMPAREFUNC-generated
// functions store the value in the instruction's imm8 byte.
enum class xopcompare_e : uint32_t {
  LT = 0,
  LTE = 1,
  GT = 2,
  GTE = 3,
  EQ = 4,
  NEQ = 5,
  // there doesnt seem to be much in the way of documentation for these two
  FALSEY = 6,
  TRUTHEY = 7,
};
namespace operations {
// Generates `xop_t funcname(dest, src1, src2, src3)`: a thin wrapper that
// builds the imm8-carrying encoding of `opcode` through
// xop_set_fouroperand_form, passing all four register indices straight on.
#define SIMPLE_FOUROPERAND(funcname, opcode)                                 \
  static xop_t funcname(unsigned destidx, unsigned src1idx,                  \
                        unsigned src2idx, unsigned src3idx) {                \
    xop_t encoded_op{};                                                      \
    xop_set_fouroperand_form(encoded_op, destidx, src1idx, src2idx, src3idx, \
                             opcode, /*has_immbyte=*/true);                  \
    return encoded_op;                                                       \
  }

SIMPLE_FOUROPERAND(vpcmov, xop_VPCMOV)
SIMPLE_FOUROPERAND(vpperm, xop_VPPERM)
// Generates `xop_t name(dst, src1, src2, imm8)`: builds the imm8-carrying
// encoding of `opcode` with register index 0 in the src3 slot, then replaces
// the whole imm8 byte with the numeric value of the comparison selector.
#define COMPAREFUNC(name, opcode)                                          \
  static xop_t name(unsigned dst, unsigned src1, unsigned src2,            \
                    xopcompare_e imm8) {                                   \
    xop_t encoded_op{};                                                    \
    xop_set_fouroperand_form(encoded_op, dst, src1, src2, 0, opcode,       \
                             /*has_immbyte=*/true);                        \
    encoded_op.imm8 = static_cast<uint8_t>(imm8);                          \
    return encoded_op;                                                     \
  }

// byte elements
COMPAREFUNC(vpcomb, xop_VPCOMB)
COMPAREFUNC(vpcomub, xop_VPCOMUB)
// word elements
COMPAREFUNC(vpcomw, xop_VPCOMW)
COMPAREFUNC(vpcomuw, xop_VPCOMUW)
// doubleword elements
COMPAREFUNC(vpcomd, xop_VPCOMD)
COMPAREFUNC(vpcomud, xop_VPCOMUD)
// quadword elements
COMPAREFUNC(vpcomq, xop_VPCOMQ)
COMPAREFUNC(vpcomuq, xop_VPCOMUQ)
// Generates `xop_t funcname(dest, src1, src2)`: builds the encoding of
// `opcode` without an imm8 byte (opcode map XOPCODE_NO_IMMBYTE); register
// index 0 is passed for the unused src3 slot.
// As encoded by xop_set_fouroperand_form, src1idx lands in byte2.vvvv and
// src2idx in modrm.rm, with byte2.W left at 0.
// NOTE(review): for the rotate/shift instructions the AMD manual appears to
// put the per-element count in vvvv and the data operand in ModRM.rm when
// XOP.W = 0 - confirm the intended meaning of src1/src2 before adding callers.
#define SIMPLE_THREEOPERAND(funcname, opcode)                              \
  static xop_t funcname(unsigned destidx, unsigned src1idx,                \
                        unsigned src2idx) {                                \
    xop_t encoded_op{};                                                    \
    xop_set_fouroperand_form(encoded_op, destidx, src1idx, src2idx, 0,     \
                             opcode, /*has_immbyte=*/false);               \
    return encoded_op;                                                     \
  }

// VPROT*
SIMPLE_THREEOPERAND(vprotb, xop_VPROTB)
SIMPLE_THREEOPERAND(vprotw, xop_VPROTW)
SIMPLE_THREEOPERAND(vprotd, xop_VPROTD)
SIMPLE_THREEOPERAND(vprotq, xop_VPROTQ)

// VPSHA*
SIMPLE_THREEOPERAND(vpshab, xop_VPSHAB)
SIMPLE_THREEOPERAND(vpshaw, xop_VPSHAW)
SIMPLE_THREEOPERAND(vpshad, xop_VPSHAD)
SIMPLE_THREEOPERAND(vpshaq, xop_VPSHAQ)

// VPSHL*
SIMPLE_THREEOPERAND(vpshlb, xop_VPSHLB)
SIMPLE_THREEOPERAND(vpshlw, xop_VPSHLW)
SIMPLE_THREEOPERAND(vpshld, xop_VPSHLD)
SIMPLE_THREEOPERAND(vpshlq, xop_VPSHLQ)
#undef SIMPLE_THREEOPERAND
#undef SIMPLE_FOUROPERAND
#undef COMPAREFUNC
} // namespace operations
} // namespace amdfx
} // namespace x64
} // namespace backend
} // namespace cpu
} // namespace xe
#endif // XENIA_CPU_BACKEND_X64_X64_AMDFX_EXTENSIONS_H_

View File

@ -23,7 +23,7 @@
// NOTE: must be included last as it expects windows.h to already be included.
#include "third_party/xbyak/xbyak/xbyak.h"
#include "third_party/xbyak/xbyak/xbyak_util.h"
#include "x64_amdfx_extensions.h"
namespace xe {
namespace cpu {
class Processor;
@ -169,6 +169,8 @@ enum XmmConst {
XMMF16PackLCPI5,
XMMF16PackLCPI6
};
using amdfx::xopcompare_e;
using Xbyak::Xmm;
// X64Backend specific Instr->runtime_flags
enum : uint32_t {
INSTR_X64_FLAGS_ELIMINATED =
@ -351,6 +353,37 @@ class X64Emitter : public Xbyak::CodeGenerator {
void EmitProfilerEpilogue();
// Appends the bytes of an already-encoded XOP instruction to the code stream,
// one db() call per byte, in the order ForeachByte() yields them.
void EmitXOP(amdfx::xop_t xoperation) {
  auto emit_byte = [this](uint8_t value) { db(value); };
  xoperation.ForeachByte(emit_byte);
}
// Emits the encoding produced by amdfx::operations::vpcmov for the register
// indices of dest, src1, src2 and selector (passed on in that order).
void vpcmov(Xmm dest, Xmm src1, Xmm src2, Xmm selector) {
  const unsigned dest_idx = dest.getIdx();
  const unsigned src1_idx = src1.getIdx();
  const unsigned src2_idx = src2.getIdx();
  const unsigned selector_idx = selector.getIdx();
  EmitXOP(
      amdfx::operations::vpcmov(dest_idx, src1_idx, src2_idx, selector_idx));
}
// Emits the encoding produced by amdfx::operations::vpperm for the register
// indices of dest, src1, src2 and selector (passed on in that order).
void vpperm(Xmm dest, Xmm src1, Xmm src2, Xmm selector) {
  const unsigned dest_idx = dest.getIdx();
  const unsigned src1_idx = src1.getIdx();
  const unsigned src2_idx = src2.getIdx();
  const unsigned selector_idx = selector.getIdx();
  EmitXOP(
      amdfx::operations::vpperm(dest_idx, src1_idx, src2_idx, selector_idx));
}
// Generates an emitter method `name(dest, src1, src2, compareop)` that
// forwards the register indices and the comparison selector to the
// same-named builder in amdfx::operations and emits the resulting bytes.
#define DEFINECOMPARE(name)                                               \
  void name(Xmm dest, Xmm src1, Xmm src2, xopcompare_e compareop) {       \
    EmitXOP(amdfx::operations::name(dest.getIdx(), src1.getIdx(),         \
                                    src2.getIdx(), compareop));           \
  }

// byte elements
DEFINECOMPARE(vpcomb)
DEFINECOMPARE(vpcomub)
// word elements
DEFINECOMPARE(vpcomw)
DEFINECOMPARE(vpcomuw)
// doubleword elements
DEFINECOMPARE(vpcomd)
DEFINECOMPARE(vpcomud)
// quadword elements
DEFINECOMPARE(vpcomq)
DEFINECOMPARE(vpcomuq)
#undef DEFINECOMPARE
protected:
void* Emplace(const EmitFunctionInfo& func_info,
GuestFunction* function = nullptr);

View File

@ -406,26 +406,44 @@ struct VECTOR_COMPARE_SGE_V128
static void Emit(X64Emitter& e, const EmitArgType& i) {
EmitAssociativeBinaryXmmOp(
e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
switch (i.instr->flags) {
case INT8_TYPE:
e.vpcmpeqb(e.xmm0, src1, src2);
e.vpcmpgtb(dest, src1, src2);
e.vpor(dest, e.xmm0);
break;
case INT16_TYPE:
e.vpcmpeqw(e.xmm0, src1, src2);
e.vpcmpgtw(dest, src1, src2);
e.vpor(dest, e.xmm0);
break;
case INT32_TYPE:
e.vpcmpeqd(e.xmm0, src1, src2);
e.vpcmpgtd(dest, src1, src2);
e.vpor(dest, e.xmm0);
break;
case FLOAT32_TYPE:
e.ChangeMxcsrMode(MXCSRMode::Vmx);
e.vcmpgeps(dest, src1, src2);
break;
if (e.IsFeatureEnabled(kX64EmitXOP)) {
switch (i.instr->flags) {
case INT8_TYPE:
e.vpcomb(dest, src1, src2, xopcompare_e::GTE);
break;
case INT16_TYPE:
e.vpcomw(dest, src1, src2, xopcompare_e::GTE);
break;
case INT32_TYPE:
e.vpcomd(dest, src1, src2, xopcompare_e::GTE);
break;
case FLOAT32_TYPE:
e.ChangeMxcsrMode(MXCSRMode::Vmx);
e.vcmpgeps(dest, src1, src2);
break;
}
} else {
switch (i.instr->flags) {
case INT8_TYPE:
e.vpcmpeqb(e.xmm0, src1, src2);
e.vpcmpgtb(dest, src1, src2);
e.vpor(dest, e.xmm0);
break;
case INT16_TYPE:
e.vpcmpeqw(e.xmm0, src1, src2);
e.vpcmpgtw(dest, src1, src2);
e.vpor(dest, e.xmm0);
break;
case INT32_TYPE:
e.vpcmpeqd(e.xmm0, src1, src2);
e.vpcmpgtd(dest, src1, src2);
e.vpor(dest, e.xmm0);
break;
case FLOAT32_TYPE:
e.ChangeMxcsrMode(MXCSRMode::Vmx);
e.vcmpgeps(dest, src1, src2);
break;
}
}
});
}
@ -439,52 +457,68 @@ struct VECTOR_COMPARE_UGT_V128
: Sequence<VECTOR_COMPARE_UGT_V128,
I<OPCODE_VECTOR_COMPARE_UGT, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
Xbyak::Address sign_addr = e.ptr[e.rax]; // dummy
switch (i.instr->flags) {
case INT8_TYPE:
sign_addr = e.GetXmmConstPtr(XMMSignMaskI8);
break;
case INT16_TYPE:
sign_addr = e.GetXmmConstPtr(XMMSignMaskI16);
break;
case INT32_TYPE:
sign_addr = e.GetXmmConstPtr(XMMSignMaskI32);
break;
case FLOAT32_TYPE:
e.ChangeMxcsrMode(MXCSRMode::Vmx);
sign_addr = e.GetXmmConstPtr(XMMSignMaskF32);
break;
default:
assert_always();
break;
}
if (i.src1.is_constant) {
// TODO(benvanik): make this constant.
e.LoadConstantXmm(e.xmm0, i.src1.constant());
e.vpxor(e.xmm0, sign_addr);
if (i.instr->flags != FLOAT32_TYPE && e.IsFeatureEnabled(kX64EmitXOP)) {
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
switch (i.instr->flags) {
case INT8_TYPE:
e.vpcomub(i.dest, src1, src2, xopcompare_e::GT);
break;
case INT16_TYPE:
e.vpcomuw(i.dest, src1, src2, xopcompare_e::GT);
break;
case INT32_TYPE:
e.vpcomud(i.dest, src1, src2, xopcompare_e::GT);
break;
}
} else {
e.vpxor(e.xmm0, i.src1, sign_addr);
}
if (i.src2.is_constant) {
// TODO(benvanik): make this constant.
e.LoadConstantXmm(e.xmm1, i.src2.constant());
e.vpxor(e.xmm1, sign_addr);
} else {
e.vpxor(e.xmm1, i.src2, sign_addr);
}
switch (i.instr->flags) {
case INT8_TYPE:
e.vpcmpgtb(i.dest, e.xmm0, e.xmm1);
break;
case INT16_TYPE:
e.vpcmpgtw(i.dest, e.xmm0, e.xmm1);
break;
case INT32_TYPE:
e.vpcmpgtd(i.dest, e.xmm0, e.xmm1);
break;
case FLOAT32_TYPE:
e.vcmpgtps(i.dest, e.xmm0, e.xmm1);
break;
Xbyak::Address sign_addr = e.ptr[e.rax]; // dummy
switch (i.instr->flags) {
case INT8_TYPE:
sign_addr = e.GetXmmConstPtr(XMMSignMaskI8);
break;
case INT16_TYPE:
sign_addr = e.GetXmmConstPtr(XMMSignMaskI16);
break;
case INT32_TYPE:
sign_addr = e.GetXmmConstPtr(XMMSignMaskI32);
break;
case FLOAT32_TYPE:
e.ChangeMxcsrMode(MXCSRMode::Vmx);
sign_addr = e.GetXmmConstPtr(XMMSignMaskF32);
break;
default:
assert_always();
break;
}
if (i.src1.is_constant) {
// TODO(benvanik): make this constant.
e.LoadConstantXmm(e.xmm0, i.src1.constant());
e.vpxor(e.xmm0, sign_addr);
} else {
e.vpxor(e.xmm0, i.src1, sign_addr);
}
if (i.src2.is_constant) {
// TODO(benvanik): make this constant.
e.LoadConstantXmm(e.xmm1, i.src2.constant());
e.vpxor(e.xmm1, sign_addr);
} else {
e.vpxor(e.xmm1, i.src2, sign_addr);
}
switch (i.instr->flags) {
case INT8_TYPE:
e.vpcmpgtb(i.dest, e.xmm0, e.xmm1);
break;
case INT16_TYPE:
e.vpcmpgtw(i.dest, e.xmm0, e.xmm1);
break;
case INT32_TYPE:
e.vpcmpgtd(i.dest, e.xmm0, e.xmm1);
break;
case FLOAT32_TYPE:
e.vcmpgtps(i.dest, e.xmm0, e.xmm1);
break;
}
}
}
};

View File

@ -50,10 +50,10 @@ DEFINE_bool(no_round_to_single, false,
"Not for users, breaks games. Skip rounding double values to "
"single precision and back",
"CPU");
DEFINE_bool(
inline_loadclock, false,
"Directly read cached guest clock without calling the LoadClock method (it gets repeatedly updated by calls from other threads)",
"CPU");
DEFINE_bool(inline_loadclock, false,
"Directly read cached guest clock without calling the LoadClock "
"method (it gets repeatedly updated by calls from other threads)",
"CPU");
namespace xe {
namespace cpu {
namespace backend {
@ -549,7 +549,7 @@ struct MAX_F64 : Sequence<MAX_F64, I<OPCODE_MAX, F64Op, F64Op, F64Op>> {
struct MAX_V128 : Sequence<MAX_V128, I<OPCODE_MAX, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Vmx);
//if 0 and -0, return 0! opposite of minfp
// if 0 and -0, return 0! opposite of minfp
auto src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
auto src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
e.vmaxps(e.xmm2, src1, src2);
@ -781,11 +781,15 @@ struct SELECT_V128_V128
} else if (mayblend == PermittedBlend::Ps) {
e.vblendvps(i.dest, src2, src3, src1);
} else {
//ideally we would have an xop path here...
// src1 ? src2 : src3;
e.vpandn(e.xmm3, src1, src2);
e.vpand(i.dest, src1, src3);
e.vpor(i.dest, i.dest, e.xmm3);
if (e.IsFeatureEnabled(kX64EmitXOP)) {
XELOGCPU("Doing vpcmov!!");
e.vpcmov(i.dest, src2, src3, src1);
} else {
// src1 ? src2 : src3;
e.vpandn(e.xmm3, src1, src2);
e.vpand(i.dest, src1, src3);
e.vpor(i.dest, i.dest, e.xmm3);
}
}
}
};

View File

@ -84,7 +84,7 @@ bool SimplificationPass::Run(HIRBuilder* builder, bool& result) {
iter_result |= EliminateConversions(builder);
iter_result |= SimplifyAssignments(builder);
iter_result |= SimplifyBasicArith(builder);
iter_result |= SimplifyVectorOps(builder);
result |= iter_result;
} while (iter_result);
return true;
@ -1393,6 +1393,65 @@ bool SimplificationPass::SimplifyBasicArith(hir::HIRBuilder* builder) {
return result;
}
// Conservative check used when eliminating redundant denormal flushes:
// returns false only for a null instruction or for the few opcodes listed
// below, and true for everything else.
static bool CouldEverProduceDenormal(hir::Instr* i) {
  if (i == nullptr) {
    return false;
  }

  const Opcode producer_opcode = i->GetOpcodeNum();

  // An already-flushed value and an int->float conversion are both treated
  // as never yielding a denormal.
  if (producer_opcode == OPCODE_VECTOR_DENORMFLUSH ||
      producer_opcode == OPCODE_VECTOR_CONVERT_I2F) {
    return false;
  }

  if (producer_opcode == OPCODE_UNPACK) {
    // todo: more unpack operations likely cannot produce denormals
    const bool is_half_float_unpack =
        i->flags == PACK_TYPE_FLOAT16_4 || i->flags == PACK_TYPE_FLOAT16_2;
    if (is_half_float_unpack) {
      // xenos half float format does not support denormals
      return false;
    }
  }

  // todo: recurse, check values for min/max, abs, and others
  return true;
}
// Per-instruction vector simplification. Currently handles a single case: a
// VECTOR_DENORMFLUSH whose input (looking through assigns) is defined by an
// instruction that CouldEverProduceDenormal() rules out is pointless, so it
// is turned into a plain ASSIGN. Returns true when the instruction changed.
bool SimplificationPass::SimplifyVectorOps(hir::Instr* i,
                                           hir::HIRBuilder* builder) {
  if (i->GetOpcodeNum() != OPCODE_VECTOR_DENORMFLUSH) {
    return false;
  }

  hir::Instr* flushed_input_def = i->src1.value->GetDefSkipAssigns();
  // No defining instruction, or one that might still yield a denormal: the
  // flush has to stay.
  if (!flushed_input_def || CouldEverProduceDenormal(flushed_input_def)) {
    return false;
  }

  i->opcode = &OPCODE_ASSIGN_info;
  return true;
}
// Applies the per-instruction vector simplification to every instruction of
// every block reachable from builder->first_block(). Returns true if at least
// one instruction was rewritten.
bool SimplificationPass::SimplifyVectorOps(hir::HIRBuilder* builder) {
  bool result = false;
  for (auto block = builder->first_block(); block; block = block->next) {
    for (auto i = block->instr_head; i; i = i->next) {
      // The per-instruction overload filters on the opcode itself. The
      // operand-type scan that used to run here computed a flag that was
      // never read, so it has been dropped.
      result |= SimplifyVectorOps(i, builder);
    }
  }
  return result;
}
/*
todo: add load-store simplification pass

View File

@ -35,6 +35,9 @@ class SimplificationPass : public ConditionalGroupSubpass {
// handles simple multiplication/addition rules
bool SimplifyBasicArith(hir::HIRBuilder* builder);
bool SimplifyVectorOps(hir::HIRBuilder* builder);
bool SimplifyVectorOps(hir::Instr* i, hir::HIRBuilder* builder);
bool SimplifyBasicArith(hir::Instr* i, hir::HIRBuilder* builder);
bool SimplifyAddWithSHL(hir::Instr* i, hir::HIRBuilder* builder);
bool SimplifyAddToSelf(hir::Instr* i, hir::HIRBuilder* builder);

View File

@ -31,10 +31,11 @@ struct SourceMapEntry {
uint32_t hir_offset; // Block ordinal (16b) | Instr ordinal (16b)
uint32_t code_offset; // Offset from emitted code start.
};
// Category recorded by Function::SetSaverest(); NONE is the default for a
// function that has not been tagged.
enum class SaveRestoreType : uint8_t {
  NONE,
  GPR,
  VMX,
  FPR,
};
class Function : public Symbol {
public:
enum class Behavior {
enum class Behavior : uint8_t {
kDefault = 0,
kProlog,
kEpilog,
@ -53,6 +54,20 @@ class Function : public Symbol {
void set_behavior(Behavior value) { behavior_ = value; }
bool is_guest() const { return behavior_ != Behavior::kBuiltin; }
// Tags this function as a save/restore helper: records its category, whether
// it is the restore (true) or save (false) variant, and the caller-supplied
// index.
void SetSaverest(SaveRestoreType type, bool is_rest, uint8_t index) {
  saverest_index_ = index;
  is_restore_ = is_rest;
  saverest_type_ = type;
}
// True once SetSaverest() has stored a category other than NONE.
bool IsSaverest() const { return SaveRestoreType::NONE != saverest_type_; }
// Category stored by SetSaverest() (NONE when untagged).
SaveRestoreType SaverestType() const { return saverest_type_; }
// Index stored by SetSaverest().
unsigned SaverestIndex() const { return saverest_index_; }
// True for a tagged function that was marked as the save variant.
bool IsSave() const { return IsSaverest() && !is_restore_; }
// True for a tagged function that was marked as the restore variant.
bool IsRestore() const { return IsSaverest() && is_restore_ != 0; }
bool ContainsAddress(uint32_t address) const {
if (!address_ || !end_address_) {
return false;
@ -71,7 +86,11 @@ class Function : public Symbol {
Function(Module* module, uint32_t address);
uint32_t end_address_ = 0;
Behavior behavior_ = Behavior::kDefault;
// Category set through SetSaverest(); NONE means the function is not tagged
// as a save/restore helper (see IsSaverest()).
SaveRestoreType saverest_type_ = SaveRestoreType::NONE;
// Nonzero when SetSaverest() marked this as the restore variant, zero for the
// save variant.
uint8_t is_restore_ = 0;
// Index passed to SetSaverest(); returned by SaverestIndex().
uint8_t saverest_index_ = 0;
};
class BuiltinFunction : public Function {

View File

@ -1598,6 +1598,8 @@ bool XexModule::FindSaveRest() {
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveGprLr;
function->set_behavior(Function::Behavior::kProlog);
function->set_status(Symbol::Status::kDeclared);
function->SetSaverest(cpu::SaveRestoreType::GPR, false, n);
address += 4;
}
address = gplr_start + 20 * 4;
@ -1612,6 +1614,7 @@ bool XexModule::FindSaveRest() {
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestGprLr;
function->set_behavior(Function::Behavior::kEpilogReturn);
function->set_status(Symbol::Status::kDeclared);
function->SetSaverest(cpu::SaveRestoreType::GPR, true, n);
address += 4;
}
}
@ -1628,6 +1631,8 @@ bool XexModule::FindSaveRest() {
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveFpr;
function->set_behavior(Function::Behavior::kProlog);
function->set_status(Symbol::Status::kDeclared);
function->SetSaverest(cpu::SaveRestoreType::FPR, false, n);
address += 4;
}
address = fpr_start + (18 * 4) + (1 * 4);
@ -1642,6 +1647,7 @@ bool XexModule::FindSaveRest() {
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestFpr;
function->set_behavior(Function::Behavior::kEpilog);
function->set_status(Symbol::Status::kDeclared);
function->SetSaverest(cpu::SaveRestoreType::FPR, true, n);
address += 4;
}
}
@ -1662,6 +1668,7 @@ bool XexModule::FindSaveRest() {
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveVmx;
function->set_behavior(Function::Behavior::kProlog);
function->set_status(Symbol::Status::kDeclared);
function->SetSaverest(cpu::SaveRestoreType::VMX, false, n);
address += 2 * 4;
}
address += 4;
@ -1675,6 +1682,7 @@ bool XexModule::FindSaveRest() {
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveVmx;
function->set_behavior(Function::Behavior::kProlog);
function->set_status(Symbol::Status::kDeclared);
function->SetSaverest(cpu::SaveRestoreType::VMX, false, n);
address += 2 * 4;
}
address = vmx_start + (18 * 2 * 4) + (1 * 4) + (64 * 2 * 4) + (1 * 4);
@ -1688,6 +1696,7 @@ bool XexModule::FindSaveRest() {
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestVmx;
function->set_behavior(Function::Behavior::kEpilog);
function->set_status(Symbol::Status::kDeclared);
function->SetSaverest(cpu::SaveRestoreType::VMX, true, n);
address += 2 * 4;
}
address += 4;
@ -1701,6 +1710,7 @@ bool XexModule::FindSaveRest() {
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestVmx;
function->set_behavior(Function::Behavior::kEpilog);
function->set_status(Symbol::Status::kDeclared);
function->SetSaverest(cpu::SaveRestoreType::VMX, true, n);
address += 2 * 4;
}
}