Merge pull request #62 from chrisps/canary_experimental
Minor correctness/constant folding fixes, guest code optimizations for pre-ryzen amd processors
This commit is contained in:
commit
9006b309af
|
@ -0,0 +1,334 @@
|
|||
#ifndef XENIA_CPU_BACKEND_X64_X64_AMDFX_EXTENSIONS_H_
|
||||
#define XENIA_CPU_BACKEND_X64_X64_AMDFX_EXTENSIONS_H_
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
namespace backend {
|
||||
namespace x64 {
|
||||
namespace amdfx {
|
||||
// XOP opcode-map selectors (values of the 5-bit mmmmm field in XOP byte 1).
// Map 0x8 contains the instructions that carry a trailing imm8 byte; map 0x9
// contains the ones without an immediate.
enum xopcodemap_e : unsigned char {
  XOPCODE_HAS_IMMBYTE = 0x8,
  XOPCODE_NO_IMMBYTE = 0x9
};
|
||||
|
||||
// base opcodes, without their size specified
|
||||
// Base XOP opcodes, without their operand size encoded. Note that several
// values repeat (e.g. xop_VPROTDI == xop_VPHADDBD); those instructions live
// in different opcode maps (see xopcodemap_e), so the byte values may collide.
enum xopcode_e : unsigned char {
  // Fraction-extract.
  xop_VFRCZPD = 0x81,
  xop_VFRCZPS = 0x80,
  xop_VFRCZSD = 0x83,
  xop_VFRCZSS = 0x82,
  // Conditional move / compares.
  xop_VPCMOV = 0xA2,
  xop_VPCOMB = 0xCC,
  xop_VPCOMD = 0xCE,
  xop_VPCOMQ = 0xCF,
  xop_VPCOMUB = 0xEC,
  xop_VPCOMUD = 0xEE,
  xop_VPCOMUQ = 0xEF,
  xop_VPCOMUW = 0xED,
  xop_VPCOMW = 0xCD,
  // Two-source permutes.
  xop_VPERMIL2PD = 0x49,
  xop_VPERMIL2PS = 0x48,
  // Horizontal adds.
  xop_VPHADDBD = 0xC2,
  xop_VPHADDBQ = 0xC3,
  xop_VPHADDBW = 0xC1,
  xop_VPHADDDQ = 0xCB,
  xop_VPHADDUBD = 0xD2,
  xop_VPHADDUBQ = 0xD3,
  xop_VPHADDUBW = 0xD1,
  xop_VPHADDUDQ = 0xDB,
  xop_VPHADDUWD = 0xD6,
  xop_VPHADDUWQ = 0xD7,
  xop_VPHADDWD = 0xC6,
  xop_VPHADDWQ = 0xC7,
  // Horizontal subtracts.
  xop_VPHSUBBW = 0xE1,
  xop_VPHSUBDQ = 0xE3,
  xop_VPHSUBWD = 0xE2,
  // Multiply-accumulate.
  xop_VPMACSDD = 0x9E,
  xop_VPMACSDQH = 0x9F,
  xop_VPMACSDQL = 0x97,
  xop_VPMACSSDD = 0x8E,
  xop_VPMACSSDQH = 0x8F,
  xop_VPMACSSDQL = 0x87,
  xop_VPMACSSWD = 0x86,
  xop_VPMACSSWW = 0x85,
  xop_VPMACSWD = 0x96,
  xop_VPMACSWW = 0x95,
  xop_VPMADCSSWD = 0xA6,
  xop_VPMADCSWD = 0xB6,
  // Byte permute.
  xop_VPPERM = 0xA3,
  // Rotates (the *I variants take an immediate rotate count).
  xop_VPROTB = 0x90,
  xop_VPROTBI = 0xC0,  // imm version
  xop_VPROTD = 0x92,
  xop_VPROTDI = 0xC2,
  xop_VPROTQ = 0x93,
  xop_VPROTQI = 0xC3,
  xop_VPROTW = 0x91,
  xop_VPROTWI = 0xC1,
  // Arithmetic shifts.
  xop_VPSHAB = 0x98,
  xop_VPSHAD = 0x9A,
  xop_VPSHAQ = 0x9B,
  xop_VPSHAW = 0x99,
  // Logical shifts.
  xop_VPSHLB = 0x94,
  xop_VPSHLD = 0x96,
  xop_VPSHLQ = 0x97,
  xop_VPSHLW = 0x95,

};
|
||||
|
||||
// Integer element-size selector encoded in the low bits of the opcode byte.
enum xop_iop_e : unsigned char {
  XOP_BYTE = 0,
  XOP_WORD = 1,
  XOP_DOUBLEWORD = 2,
  XOP_QUADWORD = 3
};
|
||||
|
||||
// Floating-point element-size/scalarness selector encoded in the low bits of
// the opcode byte.
enum xop_fop_e : unsigned char {
  XOP_PS = 0,
  XOP_PD = 1,
  XOP_SS = 2,
  XOP_SD = 3
};
|
||||
// First payload byte of the XOP prefix (the byte following the 0x8F escape).
// Bit layout, LSB first: mmmmm[5] | B | X | R.
class xop_byte1_t {
 public:
  union {
    // Descriptive field names.
    struct {
      // Five-bit opcode map selector (see xopcodemap_e).
      unsigned char opcode_map_select : 5;
      // One-bit extension of either ModRM.r/m (GPR/XMM register) or the SIB
      // base field, giving access to 16 registers. Ignored in 32-bit
      // protected/compatibility modes. Stored inverted: this is the
      // bit-inverted equivalent of REX.B, available only in the 3-byte
      // prefix format.
      unsigned char inv_1bit_ext_modrm_or_sib : 1;
      // One-bit extension of SIB.index in 64-bit mode, giving access to 16
      // YMM/XMM and GPR registers. Must be 1 in 32-bit protected and
      // compatibility modes. Bit-inverted equivalent of REX.X.
      unsigned char inv_1bit_ext_sib_index : 1;
      // One-bit extension of ModRM.reg in 64-bit mode, giving access to all
      // 16 YMM/XMM and GPR registers. Must be 1 in 32-bit protected and
      // compatibility modes. Bit-inverted equivalent of REX.R.
      unsigned char inv_1bit_ext_modrm_reg_field : 1;
    };
    // The same bits under the AMD manual's names.
    struct {
      unsigned char mmmmm : 5;
      unsigned char B : 1;
      unsigned char X : 1;
      unsigned char R : 1;
    };
    // Raw byte view.
    unsigned char encoded;
  };
};
|
||||
|
||||
// Second payload byte of the XOP prefix.
// Bit layout, LSB first: pp[2] | L | vvvv[4] | W.
class xop_byte2_t {
 public:
  union {
    // Descriptive field names.
    struct {
      // Implied legacy prefix: 0 = none, 1 = 66, 2 = F3, 3 = F2.
      unsigned char
          implied_66f2f3_ext : 2;
      unsigned char vector_length : 1;
      // Additional source/dest register index, stored ones-complement
      // (inverted).
      unsigned char source_or_dest_reg_specifier_inverted_1s_compl : 4;
      unsigned char scalar_reg_size_override_special : 1;
    };
    // The same bits under the AMD manual's names.

    struct {
      // 0 = none, 1 = 66, 2 = F3, 3 = F2 (matches the descriptive struct and
      // the AMD manual's VEX/XOP pp encoding).
      unsigned char pp : 2;
      unsigned char L : 1;
      unsigned char vvvv : 4;  // src1 for four operand form
      unsigned char W : 1;
    };
    unsigned char encoded;
  };
};
|
||||
|
||||
class xop_opcode_byte_t {
|
||||
public:
|
||||
union {
|
||||
struct {
|
||||
xop_fop_e float_datatype : 2;
|
||||
unsigned char __unused0 : 6;
|
||||
};
|
||||
|
||||
struct {
|
||||
xop_iop_e int_datatype : 2;
|
||||
unsigned char __unused1 : 6;
|
||||
};
|
||||
|
||||
struct {
|
||||
unsigned char oes : 2;
|
||||
unsigned char opcode : 6;
|
||||
};
|
||||
unsigned char encoded;
|
||||
};
|
||||
};
|
||||
|
||||
// ModRM byte. Note the unconventional split: "mod" here is a 5-bit field
// covering both the classic reg field (bits 3..5) and mod field (bits 6..7),
// which the encoder exploits to store the destination register of the
// four-operand form after OR-ing in mod=11.
class modrm_byte_t {
 public:
  union {
    struct {
      unsigned char rm : 3;
      unsigned char mod : 5;  // 4 opnd form dest reg
    };
    // Raw byte view.
    unsigned char encoded;
  };
};
|
||||
|
||||
#pragma pack(push, 1)
|
||||
class xop_t {
|
||||
public:
|
||||
unsigned char imm_8F; // always 0x8F
|
||||
xop_byte1_t byte1;
|
||||
xop_byte2_t byte2;
|
||||
xop_opcode_byte_t opcode;
|
||||
modrm_byte_t modrm;
|
||||
unsigned char imm8;
|
||||
|
||||
xop_t() : imm_8F(0x8F) {
|
||||
byte1.encoded = 0;
|
||||
byte2.encoded = 0;
|
||||
opcode.encoded = 0;
|
||||
modrm.encoded = 0;
|
||||
}
|
||||
|
||||
unsigned AssembledSize() const {
|
||||
if (byte1.opcode_map_select == XOPCODE_NO_IMMBYTE) {
|
||||
return 5;
|
||||
} else {
|
||||
return 6;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TCall>
|
||||
void ForeachByte(TCall&& cb) {
|
||||
cb(imm_8F);
|
||||
cb(byte1.encoded);
|
||||
cb(byte2.encoded);
|
||||
cb(opcode.encoded);
|
||||
cb(modrm.encoded);
|
||||
if (AssembledSize() == 6) {
|
||||
cb(imm8);
|
||||
}
|
||||
}
|
||||
};
|
||||
#pragma pack(pop)
|
||||
|
||||
// Encodes the common "xmm, xmm, xmm, xmm" XOP operand form into xop:
// dest goes into ModRM.reg, src1 into vvvv (inverted), src2 into ModRM.r/m,
// and src3 (when the instruction has an immediate) into the high nibble of
// imm8.
static void xop_set_fouroperand_form(xop_t& xop, unsigned xmmidx_dest,
                                     unsigned xmmidx_src1, unsigned xmmidx_src2,
                                     unsigned xmmidx_src3, xopcode_e opcode,
                                     bool has_immbyte = true) {
  xop.opcode.encoded = opcode;
  // 0xE8 = R/X/B all set (no high-register extension yet); the map bits are
  // overwritten just below.
  xop.byte1.encoded = 0xe8;
  xop.byte1.opcode_map_select =
      has_immbyte ? XOPCODE_HAS_IMMBYTE : XOPCODE_NO_IMMBYTE;
  // Fourth register operand lives in imm8's upper nibble.
  xop.imm8 = xmmidx_src3 << 4;

  xop.modrm.rm = xmmidx_src2 & 0b111;
  // High register bits are stored inverted (REX-style ones complement).
  xop.byte1.inv_1bit_ext_modrm_reg_field = (xmmidx_dest >> 3) ^ 1;
  xop.byte1.inv_1bit_ext_modrm_or_sib = (xmmidx_src2 >> 3) ^ 1;
  xop.byte2.vvvv = ~xmmidx_src1;  // bitfield keeps only the low 4 bits
  xop.modrm.encoded |= 0xC0;      // mod = 11: register-direct addressing
  // The 5-bit "mod" field spans reg+mod, so OR-ing dest here fills reg.
  xop.modrm.mod |= xmmidx_dest & 0b111;
}
|
||||
|
||||
// Comparison predicates for the VPCOM* family, encoded in imm8.
enum class xopcompare_e : uint32_t {
  LT = 0b000,
  LTE = 0b001,
  GT = 0b010,
  GTE = 0b011,
  EQ = 0b100,
  NEQ = 0b101,
  FALSEY = 0b110,  // there doesnt seem to be much in the way of documentation
                   // for these two
  TRUTHEY = 0b111
};
|
||||
|
||||
namespace operations {
|
||||
#define SIMPLE_FOUROPERAND(funcname, opcode) \
|
||||
static xop_t funcname(unsigned destidx, unsigned src1idx, unsigned src2idx, \
|
||||
unsigned src3idx) { \
|
||||
xop_t result{}; \
|
||||
xop_set_fouroperand_form(result, destidx, src1idx, src2idx, src3idx, \
|
||||
opcode, true); \
|
||||
return result; \
|
||||
}
|
||||
|
||||
SIMPLE_FOUROPERAND(vpcmov, xop_VPCMOV)
|
||||
|
||||
SIMPLE_FOUROPERAND(vpperm, xop_VPPERM)
|
||||
|
||||
#define COMPAREFUNC(name, opcode) \
|
||||
static xop_t name(unsigned dst, unsigned src1, unsigned src2, \
|
||||
xopcompare_e imm8) { \
|
||||
xop_t xop; \
|
||||
xop_set_fouroperand_form(xop, dst, src1, src2, 0, opcode, true); \
|
||||
xop.imm8 = static_cast<uint8_t>(static_cast<uint32_t>(imm8)); \
|
||||
return xop; \
|
||||
}
|
||||
|
||||
COMPAREFUNC(vpcomb, xop_VPCOMB)
|
||||
COMPAREFUNC(vpcomub, xop_VPCOMUB)
|
||||
COMPAREFUNC(vpcomw, xop_VPCOMW)
|
||||
COMPAREFUNC(vpcomuw, xop_VPCOMUW)
|
||||
COMPAREFUNC(vpcomd, xop_VPCOMD)
|
||||
COMPAREFUNC(vpcomud, xop_VPCOMUD)
|
||||
COMPAREFUNC(vpcomq, xop_VPCOMQ)
|
||||
COMPAREFUNC(vpcomuq, xop_VPCOMUQ)
|
||||
|
||||
#define SIMPLE_THREEOPERAND(funcname, opcode) \
|
||||
static xop_t funcname(unsigned destidx, unsigned src1idx, \
|
||||
unsigned src2idx) { \
|
||||
xop_t result{}; \
|
||||
xop_set_fouroperand_form(result, destidx, src1idx, src2idx, 0, opcode, \
|
||||
false); \
|
||||
return result; \
|
||||
}
|
||||
|
||||
SIMPLE_THREEOPERAND(vprotb, xop_VPROTB)
|
||||
SIMPLE_THREEOPERAND(vprotw, xop_VPROTW)
|
||||
SIMPLE_THREEOPERAND(vprotd, xop_VPROTD)
|
||||
SIMPLE_THREEOPERAND(vprotq, xop_VPROTQ)
|
||||
|
||||
SIMPLE_THREEOPERAND(vpshab, xop_VPSHAB)
|
||||
SIMPLE_THREEOPERAND(vpshaw, xop_VPSHAW)
|
||||
SIMPLE_THREEOPERAND(vpshad, xop_VPSHAD)
|
||||
SIMPLE_THREEOPERAND(vpshaq, xop_VPSHAQ)
|
||||
|
||||
|
||||
SIMPLE_THREEOPERAND(vpshlb, xop_VPSHLB)
|
||||
SIMPLE_THREEOPERAND(vpshlw, xop_VPSHLW)
|
||||
SIMPLE_THREEOPERAND(vpshld, xop_VPSHLD)
|
||||
SIMPLE_THREEOPERAND(vpshlq, xop_VPSHLQ)
|
||||
|
||||
#undef SIMPLE_THREEOPERAND
|
||||
#undef SIMPLE_FOUROPERAND
|
||||
#undef COMPAREFUNC
|
||||
} // namespace operations
|
||||
|
||||
} // namespace amdfx
|
||||
} // namespace x64
|
||||
} // namespace backend
|
||||
} // namespace cpu
|
||||
} // namespace xe
|
||||
|
||||
#endif // XENIA_CPU_BACKEND_X64_X64_AMDFX_EXTENSIONS_H_
|
|
@ -143,6 +143,12 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
|
|||
feature_flags_ |= kX64EmitTBM;
|
||||
}
|
||||
}
|
||||
if (amd_flags & (1U << 11)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
|
||||
feature_flags_ |= kX64EmitXOP;
|
||||
XELOGCPU("Cpu support XOP!\n\n");
|
||||
}
|
||||
}
|
||||
if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
|
||||
bool is_zennish = cpu_.displayFamily >= 0x17;
|
||||
/*
|
||||
|
@ -1024,8 +1030,13 @@ static const vec128_t xmm_consts[] = {
|
|||
/*
|
||||
XMMF16PackLCPI6
|
||||
*/
|
||||
vec128i(0x8000)
|
||||
|
||||
vec128i(0x8000),
|
||||
/* XMMXOPByteShiftMask,*/
|
||||
vec128b(7),
|
||||
/*XMMXOPWordShiftMask*/
|
||||
vec128s(15),
|
||||
/*XMMXOPDwordShiftMask*/
|
||||
vec128i(31)
|
||||
};
|
||||
|
||||
void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
|
||||
|
|
|
@ -23,7 +23,7 @@
|
|||
// NOTE: must be included last as it expects windows.h to already be included.
|
||||
#include "third_party/xbyak/xbyak/xbyak.h"
|
||||
#include "third_party/xbyak/xbyak/xbyak_util.h"
|
||||
|
||||
#include "x64_amdfx_extensions.h"
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
class Processor;
|
||||
|
@ -167,8 +167,14 @@ enum XmmConst {
|
|||
XMMF16PackLCPI3,
|
||||
XMMF16PackLCPI4,
|
||||
XMMF16PackLCPI5,
|
||||
XMMF16PackLCPI6
|
||||
XMMF16PackLCPI6,
|
||||
XMMXOPByteShiftMask,
|
||||
XMMXOPWordShiftMask,
|
||||
XMMXOPDwordShiftMask,
|
||||
|
||||
};
|
||||
using amdfx::xopcompare_e;
|
||||
using Xbyak::Xmm;
|
||||
// X64Backend specific Instr->runtime_flags
|
||||
enum : uint32_t {
|
||||
INSTR_X64_FLAGS_ELIMINATED =
|
||||
|
@ -351,6 +357,60 @@ class X64Emitter : public Xbyak::CodeGenerator {
|
|||
|
||||
void EmitProfilerEpilogue();
|
||||
|
||||
void EmitXOP(amdfx::xop_t xoperation) {
|
||||
xoperation.ForeachByte([this](uint8_t b) { this->db(b); });
|
||||
}
|
||||
|
||||
void vpcmov(Xmm dest, Xmm src1, Xmm src2, Xmm selector) {
|
||||
auto xop_bytes = amdfx::operations::vpcmov(
|
||||
dest.getIdx(), src1.getIdx(), src2.getIdx(), selector.getIdx());
|
||||
EmitXOP(xop_bytes);
|
||||
}
|
||||
|
||||
void vpperm(Xmm dest, Xmm src1, Xmm src2, Xmm selector) {
|
||||
auto xop_bytes = amdfx::operations::vpperm(
|
||||
dest.getIdx(), src1.getIdx(), src2.getIdx(), selector.getIdx());
|
||||
EmitXOP(xop_bytes);
|
||||
}
|
||||
|
||||
#define DEFINECOMPARE(name) \
|
||||
void name(Xmm dest, Xmm src1, Xmm src2, xopcompare_e compareop) { \
|
||||
auto xop_bytes = amdfx::operations::name(dest.getIdx(), src1.getIdx(), \
|
||||
src2.getIdx(), compareop); \
|
||||
EmitXOP(xop_bytes); \
|
||||
}
|
||||
DEFINECOMPARE(vpcomb);
|
||||
DEFINECOMPARE(vpcomub);
|
||||
DEFINECOMPARE(vpcomw);
|
||||
DEFINECOMPARE(vpcomuw);
|
||||
DEFINECOMPARE(vpcomd);
|
||||
DEFINECOMPARE(vpcomud);
|
||||
DEFINECOMPARE(vpcomq);
|
||||
DEFINECOMPARE(vpcomuq);
|
||||
#undef DEFINECOMPARE
|
||||
|
||||
#define DEFINESHIFTER(name) \
|
||||
void name(Xmm dest, Xmm src1, Xmm src2) { \
|
||||
auto xop_bytes = \
|
||||
amdfx::operations::name(dest.getIdx(), src1.getIdx(), src2.getIdx()); \
|
||||
EmitXOP(xop_bytes); \
|
||||
}
|
||||
|
||||
DEFINESHIFTER(vprotb)
|
||||
DEFINESHIFTER(vprotw)
|
||||
DEFINESHIFTER(vprotd)
|
||||
DEFINESHIFTER(vprotq)
|
||||
|
||||
DEFINESHIFTER(vpshab)
|
||||
DEFINESHIFTER(vpshaw)
|
||||
DEFINESHIFTER(vpshad)
|
||||
DEFINESHIFTER(vpshaq)
|
||||
|
||||
DEFINESHIFTER(vpshlb)
|
||||
DEFINESHIFTER(vpshlw)
|
||||
DEFINESHIFTER(vpshld)
|
||||
DEFINESHIFTER(vpshlq)
|
||||
|
||||
protected:
|
||||
void* Emplace(const EmitFunctionInfo& func_info,
|
||||
GuestFunction* function = nullptr);
|
||||
|
|
|
@ -19,6 +19,16 @@
|
|||
#include "xenia/base/cvar.h"
|
||||
#include "xenia/cpu/backend/x64/x64_stack_layout.h"
|
||||
|
||||
DEFINE_bool(xop_rotates, false, "rotate via xop", "X64");
|
||||
|
||||
DEFINE_bool(xop_left_shifts, false, "shl via xop", "X64");
|
||||
|
||||
DEFINE_bool(xop_right_shifts, false, "shr via xop", "X64");
|
||||
|
||||
DEFINE_bool(xop_arithmetic_right_shifts, false, "sar via xop", "X64");
|
||||
|
||||
DEFINE_bool(xop_compares, true, "compare via xop", "X64");
|
||||
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
namespace backend {
|
||||
|
@ -143,6 +153,7 @@ struct VECTOR_DENORMFLUSH
|
|||
e.vandps(e.xmm0, i.src1,
|
||||
e.GetXmmConstPtr(XMMSingleDenormalMask)); // 0.25 P0123
|
||||
e.vcmpneqps(e.xmm2, e.xmm0, e.xmm1); // 0.5 P01
|
||||
// todo: xop vpcmov here
|
||||
e.vandps(e.xmm1, i.src1,
|
||||
e.GetXmmConstPtr(XMMSignMaskF32)); // 0.5 P0123 take signs, zeros
|
||||
// must keep their signs
|
||||
|
@ -406,26 +417,44 @@ struct VECTOR_COMPARE_SGE_V128
|
|||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
EmitAssociativeBinaryXmmOp(
|
||||
e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
|
||||
switch (i.instr->flags) {
|
||||
case INT8_TYPE:
|
||||
e.vpcmpeqb(e.xmm0, src1, src2);
|
||||
e.vpcmpgtb(dest, src1, src2);
|
||||
e.vpor(dest, e.xmm0);
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
e.vpcmpeqw(e.xmm0, src1, src2);
|
||||
e.vpcmpgtw(dest, src1, src2);
|
||||
e.vpor(dest, e.xmm0);
|
||||
break;
|
||||
case INT32_TYPE:
|
||||
e.vpcmpeqd(e.xmm0, src1, src2);
|
||||
e.vpcmpgtd(dest, src1, src2);
|
||||
e.vpor(dest, e.xmm0);
|
||||
break;
|
||||
case FLOAT32_TYPE:
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
e.vcmpgeps(dest, src1, src2);
|
||||
break;
|
||||
if (cvars::xop_compares && e.IsFeatureEnabled(kX64EmitXOP)) {
|
||||
switch (i.instr->flags) {
|
||||
case INT8_TYPE:
|
||||
e.vpcomb(dest, src1, src2, xopcompare_e::GTE);
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
e.vpcomw(dest, src1, src2, xopcompare_e::GTE);
|
||||
break;
|
||||
case INT32_TYPE:
|
||||
e.vpcomd(dest, src1, src2, xopcompare_e::GTE);
|
||||
break;
|
||||
case FLOAT32_TYPE:
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
e.vcmpgeps(dest, src1, src2);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
switch (i.instr->flags) {
|
||||
case INT8_TYPE:
|
||||
e.vpcmpeqb(e.xmm0, src1, src2);
|
||||
e.vpcmpgtb(dest, src1, src2);
|
||||
e.vpor(dest, e.xmm0);
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
e.vpcmpeqw(e.xmm0, src1, src2);
|
||||
e.vpcmpgtw(dest, src1, src2);
|
||||
e.vpor(dest, e.xmm0);
|
||||
break;
|
||||
case INT32_TYPE:
|
||||
e.vpcmpeqd(e.xmm0, src1, src2);
|
||||
e.vpcmpgtd(dest, src1, src2);
|
||||
e.vpor(dest, e.xmm0);
|
||||
break;
|
||||
case FLOAT32_TYPE:
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
e.vcmpgeps(dest, src1, src2);
|
||||
break;
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
@ -600,6 +629,7 @@ struct VECTOR_ADD
|
|||
// overflowed (only need to check one input)
|
||||
// if (src1 > res) then overflowed
|
||||
// http://locklessinc.com/articles/sat_arithmetic/
|
||||
// chrispy: todo - add xop stuff here
|
||||
e.vpxor(e.xmm2, src1, e.GetXmmConstPtr(XMMSignMaskI32));
|
||||
e.vpxor(e.xmm0, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32));
|
||||
e.vpcmpgtd(e.xmm0, e.xmm2, e.xmm0);
|
||||
|
@ -755,23 +785,52 @@ static __m128i EmulateVectorShl(void*, __m128i src1, __m128i src2) {
|
|||
// Store result and return it.
|
||||
return _mm_load_si128(reinterpret_cast<__m128i*>(value));
|
||||
}
|
||||
|
||||
// Selects the per-lane shift-amount mask constant matching the element width
// of the vector operation (byte/word lanes get 7/15; everything else gets the
// dword mask of 31).
static XmmConst GetShiftmaskForType(unsigned typ) {
  switch (typ) {
    case INT8_TYPE:
      return XMMXOPByteShiftMask;
    case INT16_TYPE:
      return XMMXOPWordShiftMask;
    default:
      return XMMXOPDwordShiftMask;
  }
}
|
||||
struct VECTOR_SHL_V128
|
||||
: Sequence<VECTOR_SHL_V128, I<OPCODE_VECTOR_SHL, V128Op, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
switch (i.instr->flags) {
|
||||
case INT8_TYPE:
|
||||
EmitInt8(e, i);
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
EmitInt16(e, i);
|
||||
break;
|
||||
case INT32_TYPE:
|
||||
EmitInt32(e, i);
|
||||
break;
|
||||
default:
|
||||
assert_always();
|
||||
break;
|
||||
if (cvars::xop_left_shifts && e.IsFeatureEnabled(kX64EmitXOP)) {
|
||||
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
|
||||
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
|
||||
|
||||
e.vpand(e.xmm2, src2,
|
||||
e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
|
||||
|
||||
switch (i.instr->flags) {
|
||||
case INT8_TYPE:
|
||||
e.vpshlb(i.dest, src1, e.xmm2);
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
e.vpshlw(i.dest, src1, e.xmm2);
|
||||
break;
|
||||
case INT32_TYPE:
|
||||
e.vpshld(i.dest, src1, e.xmm2);
|
||||
break;
|
||||
}
|
||||
|
||||
} else {
|
||||
switch (i.instr->flags) {
|
||||
case INT8_TYPE:
|
||||
EmitInt8(e, i);
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
EmitInt16(e, i);
|
||||
break;
|
||||
case INT32_TYPE:
|
||||
EmitInt32(e, i);
|
||||
break;
|
||||
default:
|
||||
assert_always();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1041,19 +1100,45 @@ static __m128i EmulateVectorShr(void*, __m128i src1, __m128i src2) {
|
|||
struct VECTOR_SHR_V128
|
||||
: Sequence<VECTOR_SHR_V128, I<OPCODE_VECTOR_SHR, V128Op, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
switch (i.instr->flags) {
|
||||
case INT8_TYPE:
|
||||
EmitInt8(e, i);
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
EmitInt16(e, i);
|
||||
break;
|
||||
case INT32_TYPE:
|
||||
EmitInt32(e, i);
|
||||
break;
|
||||
default:
|
||||
assert_always();
|
||||
break;
|
||||
if (cvars::xop_right_shifts && e.IsFeatureEnabled(kX64EmitXOP)) {
|
||||
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
|
||||
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
|
||||
|
||||
e.vpand(e.xmm2, src2,
|
||||
e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
|
||||
|
||||
e.vpcmpeqb(e.xmm3, e.xmm3);
|
||||
|
||||
switch (i.instr->flags) {
|
||||
case INT8_TYPE:
|
||||
e.vpsignb(e.xmm2, e.xmm3);
|
||||
e.vpshlb(i.dest, src1, e.xmm2);
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
e.vpsignw(e.xmm2, e.xmm3);
|
||||
e.vpshlw(i.dest, src1, e.xmm2);
|
||||
break;
|
||||
case INT32_TYPE:
|
||||
e.vpsignd(e.xmm2, e.xmm3);
|
||||
e.vpshld(i.dest, src1, e.xmm2);
|
||||
break;
|
||||
}
|
||||
|
||||
} else {
|
||||
switch (i.instr->flags) {
|
||||
case INT8_TYPE:
|
||||
EmitInt8(e, i);
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
EmitInt16(e, i);
|
||||
break;
|
||||
case INT32_TYPE:
|
||||
EmitInt32(e, i);
|
||||
break;
|
||||
default:
|
||||
assert_always();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1224,19 +1309,45 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR, VECTOR_SHR_V128);
|
|||
struct VECTOR_SHA_V128
|
||||
: Sequence<VECTOR_SHA_V128, I<OPCODE_VECTOR_SHA, V128Op, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
switch (i.instr->flags) {
|
||||
case INT8_TYPE:
|
||||
EmitInt8(e, i);
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
EmitInt16(e, i);
|
||||
break;
|
||||
case INT32_TYPE:
|
||||
EmitInt32(e, i);
|
||||
break;
|
||||
default:
|
||||
assert_always();
|
||||
break;
|
||||
if (cvars::xop_arithmetic_right_shifts && e.IsFeatureEnabled(kX64EmitXOP)) {
|
||||
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
|
||||
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
|
||||
|
||||
e.vpand(e.xmm2, src2,
|
||||
e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
|
||||
|
||||
e.vpcmpeqb(e.xmm3, e.xmm3);
|
||||
|
||||
switch (i.instr->flags) {
|
||||
case INT8_TYPE:
|
||||
e.vpsignb(e.xmm2, e.xmm3);
|
||||
e.vpshab(i.dest, src1, e.xmm2);
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
e.vpsignw(e.xmm2, e.xmm3);
|
||||
e.vpshaw(i.dest, src1, e.xmm2);
|
||||
break;
|
||||
case INT32_TYPE:
|
||||
e.vpsignd(e.xmm2, e.xmm3);
|
||||
e.vpshad(i.dest, src1, e.xmm2);
|
||||
break;
|
||||
}
|
||||
|
||||
} else {
|
||||
switch (i.instr->flags) {
|
||||
case INT8_TYPE:
|
||||
EmitInt8(e, i);
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
EmitInt16(e, i);
|
||||
break;
|
||||
case INT32_TYPE:
|
||||
EmitInt32(e, i);
|
||||
break;
|
||||
default:
|
||||
assert_always();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1412,55 +1523,29 @@ struct VECTOR_ROTATE_LEFT_V128
|
|||
: Sequence<VECTOR_ROTATE_LEFT_V128,
|
||||
I<OPCODE_VECTOR_ROTATE_LEFT, V128Op, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
switch (i.instr->flags) {
|
||||
case INT8_TYPE:
|
||||
// TODO(benvanik): native version (with shift magic).
|
||||
if (i.src2.is_constant) {
|
||||
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
|
||||
}
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
e.CallNativeSafe(
|
||||
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint8_t>));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
// TODO(benvanik): native version (with shift magic).
|
||||
if (i.src2.is_constant) {
|
||||
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
|
||||
}
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
e.CallNativeSafe(
|
||||
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint16_t>));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
break;
|
||||
case INT32_TYPE: {
|
||||
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
|
||||
e.vprolvd(i.dest, i.src1, i.src2);
|
||||
} else if (e.IsFeatureEnabled(kX64EmitAVX2)) {
|
||||
Xmm temp = i.dest;
|
||||
if (i.dest == i.src1 || i.dest == i.src2) {
|
||||
temp = e.xmm2;
|
||||
}
|
||||
// Shift left (to get high bits):
|
||||
if (i.src2.is_constant) {
|
||||
e.LoadConstantXmm(temp, i.src2.constant());
|
||||
e.vpand(e.xmm0, temp, e.GetXmmConstPtr(XMMShiftMaskPS));
|
||||
} else {
|
||||
e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
|
||||
}
|
||||
e.vpsllvd(e.xmm1, i.src1, e.xmm0);
|
||||
// Shift right (to get low bits):
|
||||
e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32));
|
||||
e.vpsubd(temp, e.xmm0);
|
||||
e.vpsrlvd(i.dest, i.src1, temp);
|
||||
// Merge:
|
||||
e.vpor(i.dest, e.xmm1);
|
||||
} else {
|
||||
// TODO(benvanik): non-AVX2 native version.
|
||||
if (cvars::xop_rotates && e.IsFeatureEnabled(kX64EmitXOP)) {
|
||||
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
|
||||
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
|
||||
|
||||
e.vpand(e.xmm2, src2,
|
||||
e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
|
||||
|
||||
switch (i.instr->flags) {
|
||||
case INT8_TYPE:
|
||||
e.vprotb(i.dest, src1, e.xmm2);
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
e.vprotw(i.dest, src1, e.xmm2);
|
||||
break;
|
||||
case INT32_TYPE:
|
||||
e.vprotd(i.dest, src1, e.xmm2);
|
||||
break;
|
||||
}
|
||||
|
||||
} else {
|
||||
switch (i.instr->flags) {
|
||||
case INT8_TYPE:
|
||||
// TODO(benvanik): native version (with shift magic).
|
||||
if (i.src2.is_constant) {
|
||||
e.lea(e.GetNativeParam(1),
|
||||
e.StashConstantXmm(1, i.src2.constant()));
|
||||
|
@ -1469,14 +1554,63 @@ struct VECTOR_ROTATE_LEFT_V128
|
|||
}
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
e.CallNativeSafe(
|
||||
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint32_t>));
|
||||
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint8_t>));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
// TODO(benvanik): native version (with shift magic).
|
||||
if (i.src2.is_constant) {
|
||||
e.lea(e.GetNativeParam(1),
|
||||
e.StashConstantXmm(1, i.src2.constant()));
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
|
||||
}
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
e.CallNativeSafe(
|
||||
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint16_t>));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
break;
|
||||
case INT32_TYPE: {
|
||||
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
|
||||
e.vprolvd(i.dest, i.src1, i.src2);
|
||||
} else if (e.IsFeatureEnabled(kX64EmitAVX2)) {
|
||||
Xmm temp = i.dest;
|
||||
if (i.dest == i.src1 || i.dest == i.src2) {
|
||||
temp = e.xmm2;
|
||||
}
|
||||
// Shift left (to get high bits):
|
||||
if (i.src2.is_constant) {
|
||||
e.LoadConstantXmm(temp, i.src2.constant());
|
||||
e.vpand(e.xmm0, temp, e.GetXmmConstPtr(XMMShiftMaskPS));
|
||||
} else {
|
||||
e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
|
||||
}
|
||||
e.vpsllvd(e.xmm1, i.src1, e.xmm0);
|
||||
// Shift right (to get low bits):
|
||||
e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32));
|
||||
e.vpsubd(temp, e.xmm0);
|
||||
e.vpsrlvd(i.dest, i.src1, temp);
|
||||
// Merge:
|
||||
e.vpor(i.dest, e.xmm1);
|
||||
} else {
|
||||
// TODO(benvanik): non-AVX2 native version.
|
||||
if (i.src2.is_constant) {
|
||||
e.lea(e.GetNativeParam(1),
|
||||
e.StashConstantXmm(1, i.src2.constant()));
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
|
||||
}
|
||||
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
|
||||
e.CallNativeSafe(
|
||||
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint32_t>));
|
||||
e.vmovaps(i.dest, e.xmm0);
|
||||
}
|
||||
break;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
assert_always();
|
||||
break;
|
||||
}
|
||||
default:
|
||||
assert_always();
|
||||
break;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
|
|
@ -50,10 +50,10 @@ DEFINE_bool(no_round_to_single, false,
|
|||
"Not for users, breaks games. Skip rounding double values to "
|
||||
"single precision and back",
|
||||
"CPU");
|
||||
DEFINE_bool(
|
||||
inline_loadclock, false,
|
||||
"Directly read cached guest clock without calling the LoadClock method (it gets repeatedly updated by calls from other threads)",
|
||||
"CPU");
|
||||
DEFINE_bool(inline_loadclock, false,
|
||||
"Directly read cached guest clock without calling the LoadClock "
|
||||
"method (it gets repeatedly updated by calls from other threads)",
|
||||
"CPU");
|
||||
namespace xe {
|
||||
namespace cpu {
|
||||
namespace backend {
|
||||
|
@ -549,7 +549,7 @@ struct MAX_F64 : Sequence<MAX_F64, I<OPCODE_MAX, F64Op, F64Op, F64Op>> {
|
|||
struct MAX_V128 : Sequence<MAX_V128, I<OPCODE_MAX, V128Op, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
e.ChangeMxcsrMode(MXCSRMode::Vmx);
|
||||
//if 0 and -0, return 0! opposite of minfp
|
||||
// if 0 and -0, return 0! opposite of minfp
|
||||
auto src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
|
||||
auto src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
|
||||
e.vmaxps(e.xmm2, src1, src2);
|
||||
|
@ -781,11 +781,15 @@ struct SELECT_V128_V128
|
|||
} else if (mayblend == PermittedBlend::Ps) {
|
||||
e.vblendvps(i.dest, src2, src3, src1);
|
||||
} else {
|
||||
//ideally we would have an xop path here...
|
||||
// src1 ? src2 : src3;
|
||||
e.vpandn(e.xmm3, src1, src2);
|
||||
e.vpand(i.dest, src1, src3);
|
||||
e.vpor(i.dest, i.dest, e.xmm3);
|
||||
if (e.IsFeatureEnabled(kX64EmitXOP)) {
|
||||
e.vpcmov(i.dest, src3, src2, src1);
|
||||
} else {
|
||||
// src1 ? src2 : src3;
|
||||
|
||||
e.vpandn(e.xmm3, src1, src2);
|
||||
e.vpand(i.dest, src1, src3);
|
||||
e.vpor(i.dest, i.dest, e.xmm3);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
|
|
@ -84,7 +84,7 @@ bool SimplificationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
iter_result |= EliminateConversions(builder);
|
||||
iter_result |= SimplifyAssignments(builder);
|
||||
iter_result |= SimplifyBasicArith(builder);
|
||||
|
||||
iter_result |= SimplifyVectorOps(builder);
|
||||
result |= iter_result;
|
||||
} while (iter_result);
|
||||
return true;
|
||||
|
@ -1393,6 +1393,65 @@ bool SimplificationPass::SimplifyBasicArith(hir::HIRBuilder* builder) {
|
|||
return result;
|
||||
}
|
||||
|
||||
// Conservatively reports whether instruction i could emit a denormal float.
// Returns false only for opcodes known to never produce denormals; anything
// unrecognized (or null) defaults as shown below.
static bool CouldEverProduceDenormal(hir::Instr* i) {
  if (!i) {
    return false;
  }
  const Opcode denflushed_opcode = i->GetOpcodeNum();

  switch (denflushed_opcode) {
    case OPCODE_VECTOR_DENORMFLUSH:
      // Output was already flushed; it cannot be denormal.
      return false;
    case OPCODE_VECTOR_CONVERT_I2F:
      // Integer-to-float conversion results never land in denormal range.
      return false;
    case OPCODE_UNPACK:
      // todo: more unpack operations likely cannot produce denormals
      if (i->flags == PACK_TYPE_FLOAT16_4 || i->flags == PACK_TYPE_FLOAT16_2) {
        return false;  // xenos half float format does not support denormals
      }
      break;
    default:
      break;
  }
  return true;  // todo: recurse, check values for min/max, abs, and others
}
|
||||
|
||||
// Per-instruction vector simplification. Returns true if i was rewritten.
bool SimplificationPass::SimplifyVectorOps(hir::Instr* i,
                                           hir::HIRBuilder* builder) {
  /*
      if the input to an unconditional denormal flush is an output of an
      unconditional denormal flush, it is a pointless instruction and should
      be elimed
  */
  if (i->GetOpcodeNum() == OPCODE_VECTOR_DENORMFLUSH) {
    hir::Instr* producer = i->src1.value->GetDefSkipAssigns();
    // Demote the redundant flush to a plain assign.
    if (producer && !CouldEverProduceDenormal(producer)) {
      i->opcode = &OPCODE_ASSIGN_info;
      return true;
    }
  }
  return false;
}
|
||||
// Walks every instruction of every block and applies the per-instruction
// vector simplifications; returns true if anything changed.
// Note: the previous version computed a "looks_vectory" flag per instruction
// via VisitValueOperands but never read it — that dead scan is removed here.
bool SimplificationPass::SimplifyVectorOps(hir::HIRBuilder* builder) {
  bool result = false;
  auto block = builder->first_block();
  while (block) {
    auto i = block->instr_head;
    while (i) {
      result |= SimplifyVectorOps(i, builder);
      i = i->next;
    }
    block = block->next;
  }
  return result;
}
|
||||
/*
|
||||
todo: add load-store simplification pass
|
||||
|
||||
|
|
|
@ -35,6 +35,9 @@ class SimplificationPass : public ConditionalGroupSubpass {
|
|||
|
||||
// handles simple multiplication/addition rules
|
||||
bool SimplifyBasicArith(hir::HIRBuilder* builder);
|
||||
|
||||
bool SimplifyVectorOps(hir::HIRBuilder* builder);
|
||||
bool SimplifyVectorOps(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
bool SimplifyBasicArith(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
bool SimplifyAddWithSHL(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
bool SimplifyAddToSelf(hir::Instr* i, hir::HIRBuilder* builder);
|
||||
|
|
|
@ -31,10 +31,11 @@ struct SourceMapEntry {
|
|||
uint32_t hir_offset; // Block ordinal (16b) | Instr ordinal (16b)
|
||||
uint32_t code_offset; // Offset from emitted code start.
|
||||
};
|
||||
enum class SaveRestoreType : uint8_t { NONE, GPR, VMX, FPR };
|
||||
|
||||
class Function : public Symbol {
|
||||
public:
|
||||
enum class Behavior {
|
||||
enum class Behavior : uint8_t {
|
||||
kDefault = 0,
|
||||
kProlog,
|
||||
kEpilog,
|
||||
|
@ -53,6 +54,20 @@ class Function : public Symbol {
|
|||
void set_behavior(Behavior value) { behavior_ = value; }
|
||||
bool is_guest() const { return behavior_ != Behavior::kBuiltin; }
|
||||
|
||||
void SetSaverest(SaveRestoreType type, bool is_rest, uint8_t index) {
|
||||
saverest_type_ = type;
|
||||
is_restore_ = is_rest;
|
||||
saverest_index_ = index;
|
||||
}
|
||||
|
||||
bool IsSaverest() const { return saverest_type_ != SaveRestoreType::NONE; }
|
||||
|
||||
SaveRestoreType SaverestType() const { return saverest_type_; }
|
||||
unsigned SaverestIndex() const { return saverest_index_; }
|
||||
|
||||
bool IsSave() const { return IsSaverest() && is_restore_ == 0; }
|
||||
bool IsRestore() const { return IsSaverest() && is_restore_; }
|
||||
|
||||
bool ContainsAddress(uint32_t address) const {
|
||||
if (!address_ || !end_address_) {
|
||||
return false;
|
||||
|
@ -71,7 +86,11 @@ class Function : public Symbol {
|
|||
Function(Module* module, uint32_t address);
|
||||
|
||||
uint32_t end_address_ = 0;
|
||||
|
||||
Behavior behavior_ = Behavior::kDefault;
|
||||
SaveRestoreType saverest_type_ = SaveRestoreType::NONE;
|
||||
uint8_t is_restore_ = 0;
|
||||
uint8_t saverest_index_ = 0;
|
||||
};
|
||||
|
||||
class BuiltinFunction : public Function {
|
||||
|
|
|
@ -1023,13 +1023,6 @@ Value* HIRBuilder::Truncate(Value* value, TypeName target_type) {
|
|||
|
||||
Value* HIRBuilder::Convert(Value* value, TypeName target_type,
|
||||
RoundMode round_mode) {
|
||||
if (value->type == target_type) {
|
||||
return value;
|
||||
} else if (value->IsConstant()) {
|
||||
Value* dest = CloneValue(value);
|
||||
dest->Convert(target_type, round_mode);
|
||||
return dest;
|
||||
}
|
||||
|
||||
Instr* i =
|
||||
AppendInstr(OPCODE_CONVERT_info, round_mode, AllocValue(target_type));
|
||||
|
@ -1041,11 +1034,6 @@ Value* HIRBuilder::Convert(Value* value, TypeName target_type,
|
|||
Value* HIRBuilder::Round(Value* value, RoundMode round_mode) {
|
||||
ASSERT_FLOAT_OR_VECTOR_TYPE(value);
|
||||
|
||||
if (value->IsConstant()) {
|
||||
Value* dest = CloneValue(value);
|
||||
dest->Round(round_mode);
|
||||
return dest;
|
||||
}
|
||||
|
||||
Instr* i =
|
||||
AppendInstr(OPCODE_ROUND_info, round_mode, AllocValue(value->type));
|
||||
|
@ -1295,7 +1283,7 @@ void HIRBuilder::SetNJM(Value* value) {
|
|||
Value* HIRBuilder::Max(Value* value1, Value* value2) {
|
||||
ASSERT_TYPES_EQUAL(value1, value2);
|
||||
|
||||
if (value1->type != VEC128_TYPE && value1->IsConstant() &&
|
||||
if (IsScalarIntegralType( value1->type) && value1->IsConstant() &&
|
||||
value2->IsConstant()) {
|
||||
return value1->Compare(OPCODE_COMPARE_SLT, value2) ? value2 : value1;
|
||||
}
|
||||
|
@ -1323,7 +1311,7 @@ Value* HIRBuilder::VectorMax(Value* value1, Value* value2, TypeName part_type,
|
|||
Value* HIRBuilder::Min(Value* value1, Value* value2) {
|
||||
ASSERT_TYPES_EQUAL(value1, value2);
|
||||
|
||||
if (value1->type != VEC128_TYPE && value1->IsConstant() &&
|
||||
if (IsScalarIntegralType(value1->type) && value1->IsConstant() &&
|
||||
value2->IsConstant()) {
|
||||
return value1->Compare(OPCODE_COMPARE_SLT, value2) ? value1 : value2;
|
||||
}
|
||||
|
@ -1351,8 +1339,9 @@ Value* HIRBuilder::VectorMin(Value* value1, Value* value2, TypeName part_type,
|
|||
Value* HIRBuilder::Select(Value* cond, Value* value1, Value* value2) {
|
||||
assert_true(cond->type == INT8_TYPE || cond->type == VEC128_TYPE); // for now
|
||||
ASSERT_TYPES_EQUAL(value1, value2);
|
||||
|
||||
if (cond->IsConstant()) {
|
||||
// chrispy: this was being done with V128, which was breaking stuff obviously
|
||||
// because that should be an element by element select
|
||||
if (cond->IsConstant() && IsScalarIntegralType(cond->type)) {
|
||||
return cond->IsConstantTrue() ? value1 : value2;
|
||||
}
|
||||
|
||||
|
@ -1518,7 +1507,8 @@ Value* HIRBuilder::Add(Value* value1, Value* value2,
|
|||
ASSERT_TYPES_EQUAL(value1, value2);
|
||||
|
||||
// TODO(benvanik): optimize when flags set.
|
||||
if (!arithmetic_flags) {
|
||||
|
||||
if (!arithmetic_flags && IsScalarIntegralType(value1->type)) {
|
||||
if (value1->IsConstantZero()) {
|
||||
return value2;
|
||||
} else if (value2->IsConstantZero()) {
|
||||
|
|
|
@ -442,7 +442,18 @@ int InstrEmit_fabsx(PPCHIRBuilder& f, const InstrData& i) {
|
|||
// frD <- abs(frB)
|
||||
Value* v = f.Abs(f.LoadFPR(i.X.RB));
|
||||
f.StoreFPR(i.X.RT, v);
|
||||
f.UpdateFPSCR(v, i.X.Rc);
|
||||
/*
|
||||
The contents of frB with bit 0 cleared are placed into frD.
|
||||
Note that the fabs instruction treats NaNs just like any other kind of value. That is, the sign
|
||||
bit of a NaN may be altered by fabs. This instruction does not alter the FPSCR.
|
||||
Other registers altered:
|
||||
• Condition Register (CR1 field):
|
||||
Affected: FX, FEX, VX, OX (if Rc = 1)
|
||||
*/
|
||||
// f.UpdateFPSCR(v, i.X.Rc);
|
||||
if (i.X.Rc) {
|
||||
// todo
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -458,7 +469,10 @@ int InstrEmit_fnabsx(PPCHIRBuilder& f, const InstrData& i) {
|
|||
// frD <- !abs(frB)
|
||||
Value* v = f.Neg(f.Abs(f.LoadFPR(i.X.RB)));
|
||||
f.StoreFPR(i.X.RT, v);
|
||||
f.UpdateFPSCR(v, i.X.Rc);
|
||||
//f.UpdateFPSCR(v, i.X.Rc);
|
||||
if (i.X.Rc) {
|
||||
//todo
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -466,7 +480,10 @@ int InstrEmit_fnegx(PPCHIRBuilder& f, const InstrData& i) {
|
|||
// frD <- ¬ frB[0] || frB[1-63]
|
||||
Value* v = f.Neg(f.LoadFPR(i.X.RB));
|
||||
f.StoreFPR(i.X.RT, v);
|
||||
f.UpdateFPSCR(v, i.X.Rc);
|
||||
//f.UpdateFPSCR(v, i.X.Rc);
|
||||
if (i.X.Rc) {
|
||||
//todo
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -1598,6 +1598,8 @@ bool XexModule::FindSaveRest() {
|
|||
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveGprLr;
|
||||
function->set_behavior(Function::Behavior::kProlog);
|
||||
function->set_status(Symbol::Status::kDeclared);
|
||||
function->SetSaverest(cpu::SaveRestoreType::GPR, false, n);
|
||||
|
||||
address += 4;
|
||||
}
|
||||
address = gplr_start + 20 * 4;
|
||||
|
@ -1612,6 +1614,7 @@ bool XexModule::FindSaveRest() {
|
|||
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestGprLr;
|
||||
function->set_behavior(Function::Behavior::kEpilogReturn);
|
||||
function->set_status(Symbol::Status::kDeclared);
|
||||
function->SetSaverest(cpu::SaveRestoreType::GPR, true, n);
|
||||
address += 4;
|
||||
}
|
||||
}
|
||||
|
@ -1628,6 +1631,8 @@ bool XexModule::FindSaveRest() {
|
|||
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveFpr;
|
||||
function->set_behavior(Function::Behavior::kProlog);
|
||||
function->set_status(Symbol::Status::kDeclared);
|
||||
|
||||
function->SetSaverest(cpu::SaveRestoreType::FPR, false, n);
|
||||
address += 4;
|
||||
}
|
||||
address = fpr_start + (18 * 4) + (1 * 4);
|
||||
|
@ -1642,6 +1647,7 @@ bool XexModule::FindSaveRest() {
|
|||
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestFpr;
|
||||
function->set_behavior(Function::Behavior::kEpilog);
|
||||
function->set_status(Symbol::Status::kDeclared);
|
||||
function->SetSaverest(cpu::SaveRestoreType::FPR, true, n);
|
||||
address += 4;
|
||||
}
|
||||
}
|
||||
|
@ -1662,6 +1668,7 @@ bool XexModule::FindSaveRest() {
|
|||
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveVmx;
|
||||
function->set_behavior(Function::Behavior::kProlog);
|
||||
function->set_status(Symbol::Status::kDeclared);
|
||||
function->SetSaverest(cpu::SaveRestoreType::VMX, false, n);
|
||||
address += 2 * 4;
|
||||
}
|
||||
address += 4;
|
||||
|
@ -1675,6 +1682,7 @@ bool XexModule::FindSaveRest() {
|
|||
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveVmx;
|
||||
function->set_behavior(Function::Behavior::kProlog);
|
||||
function->set_status(Symbol::Status::kDeclared);
|
||||
function->SetSaverest(cpu::SaveRestoreType::VMX, false, n);
|
||||
address += 2 * 4;
|
||||
}
|
||||
address = vmx_start + (18 * 2 * 4) + (1 * 4) + (64 * 2 * 4) + (1 * 4);
|
||||
|
@ -1688,6 +1696,7 @@ bool XexModule::FindSaveRest() {
|
|||
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestVmx;
|
||||
function->set_behavior(Function::Behavior::kEpilog);
|
||||
function->set_status(Symbol::Status::kDeclared);
|
||||
function->SetSaverest(cpu::SaveRestoreType::VMX, true, n);
|
||||
address += 2 * 4;
|
||||
}
|
||||
address += 4;
|
||||
|
@ -1701,6 +1710,7 @@ bool XexModule::FindSaveRest() {
|
|||
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestVmx;
|
||||
function->set_behavior(Function::Behavior::kEpilog);
|
||||
function->set_status(Symbol::Status::kDeclared);
|
||||
function->SetSaverest(cpu::SaveRestoreType::VMX, true, n);
|
||||
address += 2 * 4;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue