Merge pull request #62 from chrisps/canary_experimental

Minor correctness/constant-folding fixes; guest-code optimizations for pre-Ryzen AMD processors
Radosław Gliński 2022-08-23 00:01:24 +02:00 committed by GitHub
commit 9006b309af
11 changed files with 791 additions and 150 deletions

View File

@@ -0,0 +1,334 @@
#ifndef XENIA_CPU_BACKEND_X64_X64_AMDFX_EXTENSIONS_H_
#define XENIA_CPU_BACKEND_X64_X64_AMDFX_EXTENSIONS_H_
#include <stdio.h>
#include <string.h>
#include <string>
namespace xe {
namespace cpu {
namespace backend {
namespace x64 {
namespace amdfx {
enum xopcodemap_e : unsigned char {
XOPCODE_HAS_IMMBYTE = 0x8,
XOPCODE_NO_IMMBYTE = 0x9
};
// base opcodes, without their size specified
enum xopcode_e : unsigned char {
xop_VFRCZPD = 0x81,
xop_VFRCZPS = 0x80,
xop_VFRCZSD = 0x83,
xop_VFRCZSS = 0x82,
xop_VPCMOV = 0xA2,
xop_VPCOMB = 0xCC,
xop_VPCOMD = 0xCE,
xop_VPCOMQ = 0xCF,
xop_VPCOMUB = 0xEC,
xop_VPCOMUD = 0xEE,
xop_VPCOMUQ = 0xEF,
xop_VPCOMUW = 0xED,
xop_VPCOMW = 0xCD,
xop_VPERMIL2PD = 0x49,
xop_VPERMIL2PS = 0x48,
xop_VPHADDBD = 0xC2,
xop_VPHADDBQ = 0xC3,
xop_VPHADDBW = 0xC1,
xop_VPHADDDQ = 0xCB,
xop_VPHADDUBD = 0xD2,
xop_VPHADDUBQ = 0xD3,
xop_VPHADDUBW = 0xD1,
xop_VPHADDUDQ = 0xDB,
xop_VPHADDUWD = 0xD6,
xop_VPHADDUWQ = 0xD7,
xop_VPHADDWD = 0xC6,
xop_VPHADDWQ = 0xC7,
xop_VPHSUBBW = 0xE1,
xop_VPHSUBDQ = 0xE3,
xop_VPHSUBWD = 0xE2,
xop_VPMACSDD = 0x9E,
xop_VPMACSDQH = 0x9F,
xop_VPMACSDQL = 0x97,
xop_VPMACSSDD = 0x8E,
xop_VPMACSSDQH = 0x8F,
xop_VPMACSSDQL = 0x87,
xop_VPMACSSWD = 0x86,
xop_VPMACSSWW = 0x85,
xop_VPMACSWD = 0x96,
xop_VPMACSWW = 0x95,
xop_VPMADCSSWD = 0xA6,
xop_VPMADCSWD = 0xB6,
xop_VPPERM = 0xA3,
xop_VPROTB = 0x90,
xop_VPROTBI = 0xC0, // imm version
xop_VPROTD = 0x92,
xop_VPROTDI = 0xC2,
xop_VPROTQ = 0x93,
xop_VPROTQI = 0xC3,
xop_VPROTW = 0x91,
xop_VPROTWI = 0xC1,
xop_VPSHAB = 0x98,
xop_VPSHAD = 0x9A,
xop_VPSHAQ = 0x9B,
xop_VPSHAW = 0x99,
xop_VPSHLB = 0x94,
xop_VPSHLD = 0x96,
xop_VPSHLQ = 0x97,
xop_VPSHLW = 0x95,
};
enum xop_iop_e : unsigned char {
XOP_BYTE = 0,
XOP_WORD = 1,
XOP_DOUBLEWORD = 2,
XOP_QUADWORD = 3
};
enum xop_fop_e : unsigned char {
XOP_PS = 0,
XOP_PD = 1,
XOP_SS = 2,
XOP_SD = 3
};
class xop_byte1_t {
public:
union {
// informative names
struct {
/*
A five bit field encoding a one- or two-byte opcode prefix.
*/
unsigned char opcode_map_select : 5;
/*
This bit provides a one-bit extension of either the ModRM.r/m
field to specify a GPR or XMM register or to the SIB base field to
specify a GPR. This permits access to 16 registers. In 32-bit protected
and compatibility modes, this bit is ignored. This bit is the
bit-inverted equivalent of the REX.B bit and is available only in the
3-byte prefix format.
*/
unsigned char inv_1bit_ext_modrm_or_sib : 1;
/*
This bit provides a one bit extension of the SIB.index field in
64-bit mode, permitting access to 16 YMM/XMM and GPR registers. In
32-bit protected and compatibility modes, this bit must be set to 1.
This bit is the bit-inverted equivalent of the REX.X bit
*/
unsigned char inv_1bit_ext_sib_index : 1;
/*
This bit provides a one bit extension of the ModRM.reg field in
64-bit mode, permitting access to all 16 YMM/XMM and GPR registers. In
32-bit protected and compatibility modes, this bit must be set to 1.
This bit is the bit-inverted equivalent of the REX.R bit.
*/
unsigned char inv_1bit_ext_modrm_reg_field : 1;
};
// amd manual names
struct {
unsigned char mmmmm : 5;
unsigned char B : 1;
unsigned char X : 1;
unsigned char R : 1;
};
unsigned char encoded;
};
};
class xop_byte2_t {
public:
union {
struct {
unsigned char
implied_66f2f3_ext : 2; // 0 = no implied, 1 = 66, 2 = F3, 3 = F2
unsigned char vector_length : 1;
unsigned char source_or_dest_reg_specifier_inverted_1s_compl : 4;
unsigned char scalar_reg_size_override_special : 1;
};
// amd manual names
struct {
unsigned char pp : 2; // 0 = no implied prefix, 1 = 66, 2 = F3, 3 = F2
unsigned char L : 1;
unsigned char vvvv : 4; // src1 for four operand form
unsigned char W : 1;
};
unsigned char encoded;
};
};
class xop_opcode_byte_t {
public:
union {
struct {
xop_fop_e float_datatype : 2;
unsigned char __unused0 : 6;
};
struct {
xop_iop_e int_datatype : 2;
unsigned char __unused1 : 6;
};
struct {
unsigned char oes : 2;
unsigned char opcode : 6;
};
unsigned char encoded;
};
};
class modrm_byte_t {
public:
union {
struct {
unsigned char rm : 3;
unsigned char mod : 5; // ModRM.reg (dest reg in the 4-operand form) plus the two mod bits
};
unsigned char encoded;
};
};
#pragma pack(push, 1)
class xop_t {
public:
unsigned char imm_8F; // always 0x8F
xop_byte1_t byte1;
xop_byte2_t byte2;
xop_opcode_byte_t opcode;
modrm_byte_t modrm;
unsigned char imm8;
xop_t() : imm_8F(0x8F) {
byte1.encoded = 0;
byte2.encoded = 0;
opcode.encoded = 0;
modrm.encoded = 0;
}
unsigned AssembledSize() const {
if (byte1.opcode_map_select == XOPCODE_NO_IMMBYTE) {
return 5;
} else {
return 6;
}
}
template <typename TCall>
void ForeachByte(TCall&& cb) {
cb(imm_8F);
cb(byte1.encoded);
cb(byte2.encoded);
cb(opcode.encoded);
cb(modrm.encoded);
if (AssembledSize() == 6) {
cb(imm8);
}
}
};
#pragma pack(pop)
static void xop_set_fouroperand_form(xop_t& xop, unsigned xmmidx_dest,
unsigned xmmidx_src1, unsigned xmmidx_src2,
unsigned xmmidx_src3, xopcode_e opcode,
bool has_immbyte = true) {
xop.opcode.encoded = opcode;
xop.byte1.encoded = 0xe8;
if (has_immbyte) {
xop.byte1.opcode_map_select = XOPCODE_HAS_IMMBYTE;
} else {
xop.byte1.opcode_map_select = XOPCODE_NO_IMMBYTE;
}
xop.imm8 = xmmidx_src3 << 4;
xop.modrm.rm = xmmidx_src2 & 0b111;
xop.byte1.inv_1bit_ext_modrm_reg_field = (xmmidx_dest >> 3) ^ 1;
xop.byte1.inv_1bit_ext_modrm_or_sib = (xmmidx_src2 >> 3) ^ 1;
xop.byte2.vvvv = ~xmmidx_src1;
xop.modrm.encoded |= 0xC0;
xop.modrm.mod |= xmmidx_dest & 0b111;
}
enum class xopcompare_e : uint32_t {
LT = 0b000,
LTE = 0b001,
GT = 0b010,
GTE = 0b011,
EQ = 0b100,
NEQ = 0b101,
FALSEY = 0b110, // there doesn't seem to be much in the way of documentation
// for these two
TRUTHEY = 0b111
};
namespace operations {
#define SIMPLE_FOUROPERAND(funcname, opcode) \
static xop_t funcname(unsigned destidx, unsigned src1idx, unsigned src2idx, \
unsigned src3idx) { \
xop_t result{}; \
xop_set_fouroperand_form(result, destidx, src1idx, src2idx, src3idx, \
opcode, true); \
return result; \
}
SIMPLE_FOUROPERAND(vpcmov, xop_VPCMOV)
SIMPLE_FOUROPERAND(vpperm, xop_VPPERM)
#define COMPAREFUNC(name, opcode) \
static xop_t name(unsigned dst, unsigned src1, unsigned src2, \
xopcompare_e imm8) { \
xop_t xop; \
xop_set_fouroperand_form(xop, dst, src1, src2, 0, opcode, true); \
xop.imm8 = static_cast<uint8_t>(static_cast<uint32_t>(imm8)); \
return xop; \
}
COMPAREFUNC(vpcomb, xop_VPCOMB)
COMPAREFUNC(vpcomub, xop_VPCOMUB)
COMPAREFUNC(vpcomw, xop_VPCOMW)
COMPAREFUNC(vpcomuw, xop_VPCOMUW)
COMPAREFUNC(vpcomd, xop_VPCOMD)
COMPAREFUNC(vpcomud, xop_VPCOMUD)
COMPAREFUNC(vpcomq, xop_VPCOMQ)
COMPAREFUNC(vpcomuq, xop_VPCOMUQ)
#define SIMPLE_THREEOPERAND(funcname, opcode) \
static xop_t funcname(unsigned destidx, unsigned src1idx, \
unsigned src2idx) { \
xop_t result{}; \
xop_set_fouroperand_form(result, destidx, src1idx, src2idx, 0, opcode, \
false); \
return result; \
}
SIMPLE_THREEOPERAND(vprotb, xop_VPROTB)
SIMPLE_THREEOPERAND(vprotw, xop_VPROTW)
SIMPLE_THREEOPERAND(vprotd, xop_VPROTD)
SIMPLE_THREEOPERAND(vprotq, xop_VPROTQ)
SIMPLE_THREEOPERAND(vpshab, xop_VPSHAB)
SIMPLE_THREEOPERAND(vpshaw, xop_VPSHAW)
SIMPLE_THREEOPERAND(vpshad, xop_VPSHAD)
SIMPLE_THREEOPERAND(vpshaq, xop_VPSHAQ)
SIMPLE_THREEOPERAND(vpshlb, xop_VPSHLB)
SIMPLE_THREEOPERAND(vpshlw, xop_VPSHLW)
SIMPLE_THREEOPERAND(vpshld, xop_VPSHLD)
SIMPLE_THREEOPERAND(vpshlq, xop_VPSHLQ)
#undef SIMPLE_THREEOPERAND
#undef SIMPLE_FOUROPERAND
#undef COMPAREFUNC
} // namespace operations
} // namespace amdfx
} // namespace x64
} // namespace backend
} // namespace cpu
} // namespace xe
#endif // XENIA_CPU_BACKEND_X64_X64_AMDFX_EXTENSIONS_H_
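
For reference, the four-operand XOP form assembled by xop_set_fouroperand_form packs six bytes: the 0x8F escape, the RXB.mmmmm byte, the W.vvvv.L.pp byte, the opcode, ModRM, and (on opcode map 8) an immediate whose high nibble names the fourth register. A minimal sketch of exercising the header standalone; the include path and hosted main() are assumptions, not part of the commit:

#include <cstdio>

#include "x64_amdfx_extensions.h"  // include path assumed

int main() {
  using namespace xe::cpu::backend::x64::amdfx;
  // Assemble VPCMOV xmm1, xmm2, xmm3, xmm4 (dest, src1, src2, selector).
  xop_t op = operations::vpcmov(1, 2, 3, 4);
  // Dump the encoded bytes; ForeachByte visits 5 or 6 bytes depending on
  // whether the opcode map carries an immediate.
  op.ForeachByte([](unsigned char b) { std::printf("%02X ", b); });
  std::printf("\n");
  return 0;
}

Working the fields by hand for these operand indices gives 8F E8 68 A2 CB 40: vvvv holds the one's-complement of src1 and the trailing immediate holds src3 << 4.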

View File

@@ -143,6 +143,12 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
feature_flags_ |= kX64EmitTBM;
}
}
if (amd_flags & (1U << 11)) {
if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
feature_flags_ |= kX64EmitXOP;
XELOGCPU("CPU supports XOP!");
}
}
if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
bool is_zennish = cpu_.displayFamily >= 0x17;
/*
@@ -1024,8 +1030,13 @@ static const vec128_t xmm_consts[] = {
/*
XMMF16PackLCPI6
*/
vec128i(0x8000)
vec128i(0x8000),
/* XMMXOPByteShiftMask,*/
vec128b(7),
/*XMMXOPWordShiftMask*/
vec128s(15),
/*XMMXOPDwordShiftMask*/
vec128i(31)
};
void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
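
The amd_flags & (1U << 11) test in the hunk above corresponds to CPUID Fn8000_0001_ECX bit 11, the XOP feature flag: set on the Bulldozer-family cores this PR targets, absent on Zen, which dropped XOP. A minimal detection sketch, assuming MSVC's <intrin.h>; how amd_flags is actually populated is outside this hunk:

#include <intrin.h>

#include <cstdio>

int main() {
  int regs[4];  // EAX, EBX, ECX, EDX
  __cpuid(regs, 0x80000001);           // extended feature leaf
  bool has_xop = (regs[2] >> 11) & 1;  // ECX bit 11 = XOP
  std::printf("XOP %s\n", has_xop ? "supported" : "not supported");
  return 0;
}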

View File

@@ -23,7 +23,7 @@
// NOTE: must be included last as it expects windows.h to already be included.
#include "third_party/xbyak/xbyak/xbyak.h"
#include "third_party/xbyak/xbyak/xbyak_util.h"
#include "x64_amdfx_extensions.h"
namespace xe {
namespace cpu {
class Processor;
@@ -167,8 +167,14 @@ enum XmmConst {
XMMF16PackLCPI3,
XMMF16PackLCPI4,
XMMF16PackLCPI5,
XMMF16PackLCPI6
XMMF16PackLCPI6,
XMMXOPByteShiftMask,
XMMXOPWordShiftMask,
XMMXOPDwordShiftMask,
};
using amdfx::xopcompare_e;
using Xbyak::Xmm;
// X64Backend specific Instr->runtime_flags
enum : uint32_t {
INSTR_X64_FLAGS_ELIMINATED =
@@ -351,6 +357,60 @@ class X64Emitter : public Xbyak::CodeGenerator {
void EmitProfilerEpilogue();
void EmitXOP(amdfx::xop_t xoperation) {
xoperation.ForeachByte([this](uint8_t b) { this->db(b); });
}
void vpcmov(Xmm dest, Xmm src1, Xmm src2, Xmm selector) {
auto xop_bytes = amdfx::operations::vpcmov(
dest.getIdx(), src1.getIdx(), src2.getIdx(), selector.getIdx());
EmitXOP(xop_bytes);
}
void vpperm(Xmm dest, Xmm src1, Xmm src2, Xmm selector) {
auto xop_bytes = amdfx::operations::vpperm(
dest.getIdx(), src1.getIdx(), src2.getIdx(), selector.getIdx());
EmitXOP(xop_bytes);
}
#define DEFINECOMPARE(name) \
void name(Xmm dest, Xmm src1, Xmm src2, xopcompare_e compareop) { \
auto xop_bytes = amdfx::operations::name(dest.getIdx(), src1.getIdx(), \
src2.getIdx(), compareop); \
EmitXOP(xop_bytes); \
}
DEFINECOMPARE(vpcomb);
DEFINECOMPARE(vpcomub);
DEFINECOMPARE(vpcomw);
DEFINECOMPARE(vpcomuw);
DEFINECOMPARE(vpcomd);
DEFINECOMPARE(vpcomud);
DEFINECOMPARE(vpcomq);
DEFINECOMPARE(vpcomuq);
#undef DEFINECOMPARE
#define DEFINESHIFTER(name) \
void name(Xmm dest, Xmm src1, Xmm src2) { \
auto xop_bytes = \
amdfx::operations::name(dest.getIdx(), src1.getIdx(), src2.getIdx()); \
EmitXOP(xop_bytes); \
}
DEFINESHIFTER(vprotb)
DEFINESHIFTER(vprotw)
DEFINESHIFTER(vprotd)
DEFINESHIFTER(vprotq)
DEFINESHIFTER(vpshab)
DEFINESHIFTER(vpshaw)
DEFINESHIFTER(vpshad)
DEFINESHIFTER(vpshaq)
DEFINESHIFTER(vpshlb)
DEFINESHIFTER(vpshlw)
DEFINESHIFTER(vpshld)
DEFINESHIFTER(vpshlq)
protected:
void* Emplace(const EmitFunctionInfo& func_info,
GuestFunction* function = nullptr);
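
Written out by hand, each DEFINESHIFTER line above expands to a thin wrapper that assembles the XOP bytes and feeds them to Xbyak one at a time; vprotb, for example, becomes:

void vprotb(Xmm dest, Xmm src1, Xmm src2) {
  auto xop_bytes =
      amdfx::operations::vprotb(dest.getIdx(), src1.getIdx(), src2.getIdx());
  EmitXOP(xop_bytes);  // ForeachByte hands each byte to CodeGenerator::db()
}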

View File

@@ -19,6 +19,16 @@
#include "xenia/base/cvar.h"
#include "xenia/cpu/backend/x64/x64_stack_layout.h"
DEFINE_bool(xop_rotates, false, "rotate via xop", "X64");
DEFINE_bool(xop_left_shifts, false, "shl via xop", "X64");
DEFINE_bool(xop_right_shifts, false, "shr via xop", "X64");
DEFINE_bool(xop_arithmetic_right_shifts, false, "sar via xop", "X64");
DEFINE_bool(xop_compares, true, "compare via xop", "X64");
namespace xe {
namespace cpu {
namespace backend {
@@ -143,6 +153,7 @@ struct VECTOR_DENORMFLUSH
e.vandps(e.xmm0, i.src1,
e.GetXmmConstPtr(XMMSingleDenormalMask)); // 0.25 P0123
e.vcmpneqps(e.xmm2, e.xmm0, e.xmm1); // 0.5 P01
// todo: xop vpcmov here
e.vandps(e.xmm1, i.src1,
e.GetXmmConstPtr(XMMSignMaskF32)); // 0.5 P0123 take signs, zeros
// must keep their signs
@@ -406,26 +417,44 @@ struct VECTOR_COMPARE_SGE_V128
static void Emit(X64Emitter& e, const EmitArgType& i) {
EmitAssociativeBinaryXmmOp(
e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
switch (i.instr->flags) {
case INT8_TYPE:
e.vpcmpeqb(e.xmm0, src1, src2);
e.vpcmpgtb(dest, src1, src2);
e.vpor(dest, e.xmm0);
break;
case INT16_TYPE:
e.vpcmpeqw(e.xmm0, src1, src2);
e.vpcmpgtw(dest, src1, src2);
e.vpor(dest, e.xmm0);
break;
case INT32_TYPE:
e.vpcmpeqd(e.xmm0, src1, src2);
e.vpcmpgtd(dest, src1, src2);
e.vpor(dest, e.xmm0);
break;
case FLOAT32_TYPE:
e.ChangeMxcsrMode(MXCSRMode::Vmx);
e.vcmpgeps(dest, src1, src2);
break;
if (cvars::xop_compares && e.IsFeatureEnabled(kX64EmitXOP)) {
switch (i.instr->flags) {
case INT8_TYPE:
e.vpcomb(dest, src1, src2, xopcompare_e::GTE);
break;
case INT16_TYPE:
e.vpcomw(dest, src1, src2, xopcompare_e::GTE);
break;
case INT32_TYPE:
e.vpcomd(dest, src1, src2, xopcompare_e::GTE);
break;
case FLOAT32_TYPE:
e.ChangeMxcsrMode(MXCSRMode::Vmx);
e.vcmpgeps(dest, src1, src2);
break;
}
} else {
switch (i.instr->flags) {
case INT8_TYPE:
e.vpcmpeqb(e.xmm0, src1, src2);
e.vpcmpgtb(dest, src1, src2);
e.vpor(dest, e.xmm0);
break;
case INT16_TYPE:
e.vpcmpeqw(e.xmm0, src1, src2);
e.vpcmpgtw(dest, src1, src2);
e.vpor(dest, e.xmm0);
break;
case INT32_TYPE:
e.vpcmpeqd(e.xmm0, src1, src2);
e.vpcmpgtd(dest, src1, src2);
e.vpor(dest, e.xmm0);
break;
case FLOAT32_TYPE:
e.ChangeMxcsrMode(MXCSRMode::Vmx);
e.vcmpgeps(dest, src1, src2);
break;
}
}
});
}
@@ -600,6 +629,7 @@ struct VECTOR_ADD
// overflowed (only need to check one input)
// if (src1 > res) then overflowed
// http://locklessinc.com/articles/sat_arithmetic/
// chrispy: todo - add xop stuff here
e.vpxor(e.xmm2, src1, e.GetXmmConstPtr(XMMSignMaskI32));
e.vpxor(e.xmm0, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32));
e.vpcmpgtd(e.xmm0, e.xmm2, e.xmm0);
@@ -755,23 +785,52 @@ static __m128i EmulateVectorShl(void*, __m128i src1, __m128i src2) {
// Store result and return it.
return _mm_load_si128(reinterpret_cast<__m128i*>(value));
}
static XmmConst GetShiftmaskForType(unsigned typ) {
if (typ == INT8_TYPE) {
return XMMXOPByteShiftMask;
} else if (typ == INT16_TYPE) {
return XMMXOPWordShiftMask;
} else {
return XMMXOPDwordShiftMask;
}
}
struct VECTOR_SHL_V128
: Sequence<VECTOR_SHL_V128, I<OPCODE_VECTOR_SHL, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
switch (i.instr->flags) {
case INT8_TYPE:
EmitInt8(e, i);
break;
case INT16_TYPE:
EmitInt16(e, i);
break;
case INT32_TYPE:
EmitInt32(e, i);
break;
default:
assert_always();
break;
if (cvars::xop_left_shifts && e.IsFeatureEnabled(kX64EmitXOP)) {
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
e.vpand(e.xmm2, src2,
e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
switch (i.instr->flags) {
case INT8_TYPE:
e.vpshlb(i.dest, src1, e.xmm2);
break;
case INT16_TYPE:
e.vpshlw(i.dest, src1, e.xmm2);
break;
case INT32_TYPE:
e.vpshld(i.dest, src1, e.xmm2);
break;
}
} else {
switch (i.instr->flags) {
case INT8_TYPE:
EmitInt8(e, i);
break;
case INT16_TYPE:
EmitInt16(e, i);
break;
case INT32_TYPE:
EmitInt32(e, i);
break;
default:
assert_always();
break;
}
}
}
@@ -1041,19 +1100,45 @@ static __m128i EmulateVectorShr(void*, __m128i src1, __m128i src2) {
struct VECTOR_SHR_V128
: Sequence<VECTOR_SHR_V128, I<OPCODE_VECTOR_SHR, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
switch (i.instr->flags) {
case INT8_TYPE:
EmitInt8(e, i);
break;
case INT16_TYPE:
EmitInt16(e, i);
break;
case INT32_TYPE:
EmitInt32(e, i);
break;
default:
assert_always();
break;
if (cvars::xop_right_shifts && e.IsFeatureEnabled(kX64EmitXOP)) {
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
e.vpand(e.xmm2, src2,
e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
e.vpcmpeqb(e.xmm3, e.xmm3);
switch (i.instr->flags) {
case INT8_TYPE:
e.vpsignb(e.xmm2, e.xmm3);
e.vpshlb(i.dest, src1, e.xmm2);
break;
case INT16_TYPE:
e.vpsignw(e.xmm2, e.xmm3);
e.vpshlw(i.dest, src1, e.xmm2);
break;
case INT32_TYPE:
e.vpsignd(e.xmm2, e.xmm3);
e.vpshld(i.dest, src1, e.xmm2);
break;
}
} else {
switch (i.instr->flags) {
case INT8_TYPE:
EmitInt8(e, i);
break;
case INT16_TYPE:
EmitInt16(e, i);
break;
case INT32_TYPE:
EmitInt32(e, i);
break;
default:
assert_always();
break;
}
}
}
@@ -1224,19 +1309,45 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR, VECTOR_SHR_V128);
struct VECTOR_SHA_V128
: Sequence<VECTOR_SHA_V128, I<OPCODE_VECTOR_SHA, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
switch (i.instr->flags) {
case INT8_TYPE:
EmitInt8(e, i);
break;
case INT16_TYPE:
EmitInt16(e, i);
break;
case INT32_TYPE:
EmitInt32(e, i);
break;
default:
assert_always();
break;
if (cvars::xop_arithmetic_right_shifts && e.IsFeatureEnabled(kX64EmitXOP)) {
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
e.vpand(e.xmm2, src2,
e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
e.vpcmpeqb(e.xmm3, e.xmm3);
switch (i.instr->flags) {
case INT8_TYPE:
e.vpsignb(e.xmm2, e.xmm3);
e.vpshab(i.dest, src1, e.xmm2);
break;
case INT16_TYPE:
e.vpsignw(e.xmm2, e.xmm3);
e.vpshaw(i.dest, src1, e.xmm2);
break;
case INT32_TYPE:
e.vpsignd(e.xmm2, e.xmm3);
e.vpshad(i.dest, src1, e.xmm2);
break;
}
} else {
switch (i.instr->flags) {
case INT8_TYPE:
EmitInt8(e, i);
break;
case INT16_TYPE:
EmitInt16(e, i);
break;
case INT32_TYPE:
EmitInt32(e, i);
break;
default:
assert_always();
break;
}
}
}
@@ -1412,55 +1523,29 @@ struct VECTOR_ROTATE_LEFT_V128
: Sequence<VECTOR_ROTATE_LEFT_V128,
I<OPCODE_VECTOR_ROTATE_LEFT, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
switch (i.instr->flags) {
case INT8_TYPE:
// TODO(benvanik): native version (with shift magic).
if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
} else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
}
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint8_t>));
e.vmovaps(i.dest, e.xmm0);
break;
case INT16_TYPE:
// TODO(benvanik): native version (with shift magic).
if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
} else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
}
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint16_t>));
e.vmovaps(i.dest, e.xmm0);
break;
case INT32_TYPE: {
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
e.vprolvd(i.dest, i.src1, i.src2);
} else if (e.IsFeatureEnabled(kX64EmitAVX2)) {
Xmm temp = i.dest;
if (i.dest == i.src1 || i.dest == i.src2) {
temp = e.xmm2;
}
// Shift left (to get high bits):
if (i.src2.is_constant) {
e.LoadConstantXmm(temp, i.src2.constant());
e.vpand(e.xmm0, temp, e.GetXmmConstPtr(XMMShiftMaskPS));
} else {
e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
}
e.vpsllvd(e.xmm1, i.src1, e.xmm0);
// Shift right (to get low bits):
e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32));
e.vpsubd(temp, e.xmm0);
e.vpsrlvd(i.dest, i.src1, temp);
// Merge:
e.vpor(i.dest, e.xmm1);
} else {
// TODO(benvanik): non-AVX2 native version.
if (cvars::xop_rotates && e.IsFeatureEnabled(kX64EmitXOP)) {
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
e.vpand(e.xmm2, src2,
e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
switch (i.instr->flags) {
case INT8_TYPE:
e.vprotb(i.dest, src1, e.xmm2);
break;
case INT16_TYPE:
e.vprotw(i.dest, src1, e.xmm2);
break;
case INT32_TYPE:
e.vprotd(i.dest, src1, e.xmm2);
break;
}
} else {
switch (i.instr->flags) {
case INT8_TYPE:
// TODO(benvanik): native version (with shift magic).
if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1),
e.StashConstantXmm(1, i.src2.constant()));
@@ -1469,14 +1554,63 @@ struct VECTOR_ROTATE_LEFT_V128
}
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint32_t>));
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint8_t>));
e.vmovaps(i.dest, e.xmm0);
break;
case INT16_TYPE:
// TODO(benvanik): native version (with shift magic).
if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1),
e.StashConstantXmm(1, i.src2.constant()));
} else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
}
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint16_t>));
e.vmovaps(i.dest, e.xmm0);
break;
case INT32_TYPE: {
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
e.vprolvd(i.dest, i.src1, i.src2);
} else if (e.IsFeatureEnabled(kX64EmitAVX2)) {
Xmm temp = i.dest;
if (i.dest == i.src1 || i.dest == i.src2) {
temp = e.xmm2;
}
// Shift left (to get high bits):
if (i.src2.is_constant) {
e.LoadConstantXmm(temp, i.src2.constant());
e.vpand(e.xmm0, temp, e.GetXmmConstPtr(XMMShiftMaskPS));
} else {
e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
}
e.vpsllvd(e.xmm1, i.src1, e.xmm0);
// Shift right (to get low bits):
e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32));
e.vpsubd(temp, e.xmm0);
e.vpsrlvd(i.dest, i.src1, temp);
// Merge:
e.vpor(i.dest, e.xmm1);
} else {
// TODO(benvanik): non-AVX2 native version.
if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1),
e.StashConstantXmm(1, i.src2.constant()));
} else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
}
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint32_t>));
e.vmovaps(i.dest, e.xmm0);
}
break;
}
break;
default:
assert_always();
break;
}
default:
assert_always();
break;
}
}
};
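
A note on the shift sequences above: XOP's VPSHL/VPSHA take a signed per-element count, where positive shifts left and negative shifts right, so the right-shift paths materialize all-ones with vpcmpeqb and then use vpsign* to negate the masked counts. A scalar model of the per-byte semantics (my own illustration based on the AMD documentation, with counts assumed pre-masked to the element width as the emitted vpand guarantees):

#include <cstdint>

// Per-byte model of XOP VPSHLB: signed count, logical shift.
uint8_t vpshlb_lane(uint8_t value, int8_t count) {
  if (count >= 0) return (uint8_t)(value << count);  // shift left
  return (uint8_t)(value >> -count);                 // logical shift right
}

// Per-byte model of XOP VPSHAB: signed count, arithmetic right shift.
int8_t vpshab_lane(int8_t value, int8_t count) {
  if (count >= 0) return (int8_t)(value << count);
  return (int8_t)(value >> -count);  // sign-extending on typical targets
}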

View File

@@ -50,10 +50,10 @@ DEFINE_bool(no_round_to_single, false,
"Not for users, breaks games. Skip rounding double values to "
"single precision and back",
"CPU");
DEFINE_bool(
inline_loadclock, false,
"Directly read cached guest clock without calling the LoadClock method (it gets repeatedly updated by calls from other threads)",
"CPU");
DEFINE_bool(inline_loadclock, false,
"Directly read cached guest clock without calling the LoadClock "
"method (it gets repeatedly updated by calls from other threads)",
"CPU");
namespace xe {
namespace cpu {
namespace backend {
@@ -549,7 +549,7 @@ struct MAX_F64 : Sequence<MAX_F64, I<OPCODE_MAX, F64Op, F64Op, F64Op>> {
struct MAX_V128 : Sequence<MAX_V128, I<OPCODE_MAX, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Vmx);
//if 0 and -0, return 0! opposite of minfp
// if 0 and -0, return 0! opposite of minfp
auto src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
auto src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
e.vmaxps(e.xmm2, src1, src2);
@@ -781,11 +781,15 @@ struct SELECT_V128_V128
} else if (mayblend == PermittedBlend::Ps) {
e.vblendvps(i.dest, src2, src3, src1);
} else {
//ideally we would have an xop path here...
// src1 ? src2 : src3;
e.vpandn(e.xmm3, src1, src2);
e.vpand(i.dest, src1, src3);
e.vpor(i.dest, i.dest, e.xmm3);
if (e.IsFeatureEnabled(kX64EmitXOP)) {
e.vpcmov(i.dest, src3, src2, src1);
} else {
// src1 ? src2 : src3;
e.vpandn(e.xmm3, src1, src2);
e.vpand(i.dest, src1, src3);
e.vpor(i.dest, i.dest, e.xmm3);
}
}
}
};
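
The XOP path in SELECT_V128_V128 works because VPCMOV is a pure bitwise select, dest = (src1 & selector) | (src2 & ~selector); passing vpcmov(i.dest, src3, src2, src1) therefore computes exactly what the vpandn/vpand/vpor fallback does, in one instruction and without clobbering a temporary. A one-line scalar model (illustrative, not commit code):

#include <cstdint>

// Bitwise model of XOP VPCMOV: selector bits that are set take from a,
// clear bits take from b. The fallback computes the same value as
// (selector & a) | (~selector & b) via vpand/vpandn/vpor.
uint64_t vpcmov_bits(uint64_t a, uint64_t b, uint64_t selector) {
  return (a & selector) | (b & ~selector);
}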

View File

@@ -84,7 +84,7 @@ bool SimplificationPass::Run(HIRBuilder* builder, bool& result) {
iter_result |= EliminateConversions(builder);
iter_result |= SimplifyAssignments(builder);
iter_result |= SimplifyBasicArith(builder);
iter_result |= SimplifyVectorOps(builder);
result |= iter_result;
} while (iter_result);
return true;
@@ -1393,6 +1393,65 @@ bool SimplificationPass::SimplifyBasicArith(hir::HIRBuilder* builder) {
return result;
}
static bool CouldEverProduceDenormal(hir::Instr* i) {
if (!i) {
return false;
}
Opcode denflushed_opcode = i->GetOpcodeNum();
if (denflushed_opcode == OPCODE_VECTOR_DENORMFLUSH) {
return false;
} else if (denflushed_opcode == OPCODE_UNPACK) {
// todo: more unpack operations likely cannot produce denormals
if (i->flags == PACK_TYPE_FLOAT16_4 || i->flags == PACK_TYPE_FLOAT16_2) {
return false; // xenos half float format does not support denormals
}
} else if (denflushed_opcode == OPCODE_VECTOR_CONVERT_I2F) {
return false;
}
return true; // todo: recurse, check values for min/max, abs, and others
}
bool SimplificationPass::SimplifyVectorOps(hir::Instr* i,
hir::HIRBuilder* builder) {
Opcode opc = i->GetOpcodeNum();
/*
if the input to an unconditional denormal flush is an output of an
unconditional denormal flush, it is a pointless instruction and should be
eliminated
*/
if (opc == OPCODE_VECTOR_DENORMFLUSH) {
hir::Instr* denflushed_def = i->src1.value->GetDefSkipAssigns();
if (denflushed_def) {
if (!CouldEverProduceDenormal(denflushed_def)) {
i->opcode = &OPCODE_ASSIGN_info;
return true;
}
}
}
return false;
}
bool SimplificationPass::SimplifyVectorOps(hir::HIRBuilder* builder) {
bool result = false;
auto block = builder->first_block();
while (block) {
auto i = block->instr_head;
while (i) {
bool looks_vectory = false;
i->VisitValueOperands([&looks_vectory](Value* val, uint32_t idx) {
if (val->type == VEC128_TYPE) {
looks_vectory = true;
}
});
result |= SimplifyVectorOps(i, builder);
i = i->next;
}
block = block->next;
}
return result;
}
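
The denormal rule above rests on flush-to-zero being idempotent: a value that already passed OPCODE_VECTOR_DENORMFLUSH, or that comes from an op which cannot produce denormals (float16 unpack, integer-to-float conversion), gains nothing from a second flush, so that flush degenerates to an assign. A scalar model of the idempotence (names are mine, not the pass's):

#include <cfloat>
#include <cmath>

// Scalar model of flush-to-zero: denormal inputs collapse to signed zero.
float FlushDenormal(float x) {
  if (x != 0.0f && std::fabs(x) < FLT_MIN) {
    return std::copysign(0.0f, x);
  }
  return x;
}
// FlushDenormal(FlushDenormal(x)) == FlushDenormal(x) for every x, which
// is why a flush whose input is already flushed can become an assign.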
/*
todo: add load-store simplification pass

View File

@@ -35,6 +35,9 @@ class SimplificationPass : public ConditionalGroupSubpass {
// handles simple multiplication/addition rules
bool SimplifyBasicArith(hir::HIRBuilder* builder);
bool SimplifyVectorOps(hir::HIRBuilder* builder);
bool SimplifyVectorOps(hir::Instr* i, hir::HIRBuilder* builder);
bool SimplifyBasicArith(hir::Instr* i, hir::HIRBuilder* builder);
bool SimplifyAddWithSHL(hir::Instr* i, hir::HIRBuilder* builder);
bool SimplifyAddToSelf(hir::Instr* i, hir::HIRBuilder* builder);

View File

@@ -31,10 +31,11 @@ struct SourceMapEntry {
uint32_t hir_offset; // Block ordinal (16b) | Instr ordinal (16b)
uint32_t code_offset; // Offset from emitted code start.
};
enum class SaveRestoreType : uint8_t { NONE, GPR, VMX, FPR };
class Function : public Symbol {
public:
enum class Behavior {
enum class Behavior : uint8_t {
kDefault = 0,
kProlog,
kEpilog,
@@ -53,6 +54,20 @@ class Function : public Symbol {
void set_behavior(Behavior value) { behavior_ = value; }
bool is_guest() const { return behavior_ != Behavior::kBuiltin; }
void SetSaverest(SaveRestoreType type, bool is_rest, uint8_t index) {
saverest_type_ = type;
is_restore_ = is_rest;
saverest_index_ = index;
}
bool IsSaverest() const { return saverest_type_ != SaveRestoreType::NONE; }
SaveRestoreType SaverestType() const { return saverest_type_; }
unsigned SaverestIndex() const { return saverest_index_; }
bool IsSave() const { return IsSaverest() && is_restore_ == 0; }
bool IsRestore() const { return IsSaverest() && is_restore_; }
bool ContainsAddress(uint32_t address) const {
if (!address_ || !end_address_) {
return false;
@@ -71,7 +86,11 @@
Function(Module* module, uint32_t address);
uint32_t end_address_ = 0;
Behavior behavior_ = Behavior::kDefault;
SaveRestoreType saverest_type_ = SaveRestoreType::NONE;
uint8_t is_restore_ = 0;
uint8_t saverest_index_ = 0;
};
class BuiltinFunction : public Function {

View File

@@ -1023,13 +1023,6 @@ Value* HIRBuilder::Truncate(Value* value, TypeName target_type) {
Value* HIRBuilder::Convert(Value* value, TypeName target_type,
RoundMode round_mode) {
if (value->type == target_type) {
return value;
} else if (value->IsConstant()) {
Value* dest = CloneValue(value);
dest->Convert(target_type, round_mode);
return dest;
}
Instr* i =
AppendInstr(OPCODE_CONVERT_info, round_mode, AllocValue(target_type));
@@ -1041,11 +1034,6 @@ Value* HIRBuilder::Convert(Value* value, TypeName target_type,
Value* HIRBuilder::Round(Value* value, RoundMode round_mode) {
ASSERT_FLOAT_OR_VECTOR_TYPE(value);
if (value->IsConstant()) {
Value* dest = CloneValue(value);
dest->Round(round_mode);
return dest;
}
Instr* i =
AppendInstr(OPCODE_ROUND_info, round_mode, AllocValue(value->type));
@@ -1295,7 +1283,7 @@ void HIRBuilder::SetNJM(Value* value) {
Value* HIRBuilder::Max(Value* value1, Value* value2) {
ASSERT_TYPES_EQUAL(value1, value2);
if (value1->type != VEC128_TYPE && value1->IsConstant() &&
if (IsScalarIntegralType(value1->type) && value1->IsConstant() &&
value2->IsConstant()) {
return value1->Compare(OPCODE_COMPARE_SLT, value2) ? value2 : value1;
}
@@ -1323,7 +1311,7 @@ Value* HIRBuilder::VectorMax(Value* value1, Value* value2, TypeName part_type,
Value* HIRBuilder::Min(Value* value1, Value* value2) {
ASSERT_TYPES_EQUAL(value1, value2);
if (value1->type != VEC128_TYPE && value1->IsConstant() &&
if (IsScalarIntegralType(value1->type) && value1->IsConstant() &&
value2->IsConstant()) {
return value1->Compare(OPCODE_COMPARE_SLT, value2) ? value1 : value2;
}
@@ -1351,8 +1339,9 @@ Value* HIRBuilder::VectorMin(Value* value1, Value* value2, TypeName part_type,
Value* HIRBuilder::Select(Value* cond, Value* value1, Value* value2) {
assert_true(cond->type == INT8_TYPE || cond->type == VEC128_TYPE); // for now
ASSERT_TYPES_EQUAL(value1, value2);
if (cond->IsConstant()) {
// chrispy: this was being done with V128, which was obviously breaking things
// because that should be an element-by-element select
if (cond->IsConstant() && IsScalarIntegralType(cond->type)) {
return cond->IsConstantTrue() ? value1 : value2;
}
@@ -1518,7 +1507,8 @@ Value* HIRBuilder::Add(Value* value1, Value* value2,
ASSERT_TYPES_EQUAL(value1, value2);
// TODO(benvanik): optimize when flags set.
if (!arithmetic_flags) {
if (!arithmetic_flags && IsScalarIntegralType(value1->type)) {
if (value1->IsConstantZero()) {
return value2;
} else if (value2->IsConstantZero()) {
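
The IsScalarIntegralType guards added throughout this file share one rationale: a VEC128 condition or operand acts per lane, so folding it as if it were a single boolean or a single number is wrong whenever the lanes disagree. A lane-wise model of select (illustrative only; names are mine):

#include <array>
#include <cstdint>

using vec4u = std::array<uint32_t, 4>;

// Element-by-element select: each lane's mask picks that lane only.
vec4u SelectLanes(const vec4u& cond, const vec4u& a, const vec4u& b) {
  vec4u out{};
  for (int lane = 0; lane < 4; ++lane) {
    out[lane] = (cond[lane] & a[lane]) | (~cond[lane] & b[lane]);
  }
  return out;
}
// With cond = {~0u, 0, ~0u, 0} the result mixes lanes of a and b, so
// the old scalar-style "cond ? value1 : value2" fold had no correct answer.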

View File

@@ -442,7 +442,18 @@ int InstrEmit_fabsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- abs(frB)
Value* v = f.Abs(f.LoadFPR(i.X.RB));
f.StoreFPR(i.X.RT, v);
f.UpdateFPSCR(v, i.X.Rc);
/*
The contents of frB with bit 0 cleared are placed into frD.
Note that the fabs instruction treats NaNs just like any other kind of value. That is, the sign
bit of a NaN may be altered by fabs. This instruction does not alter the FPSCR.
Other registers altered:
Condition Register (CR1 field):
Affected: FX, FEX, VX, OX (if Rc = 1)
*/
// f.UpdateFPSCR(v, i.X.Rc);
if (i.X.Rc) {
// todo
}
return 0;
}
@@ -458,7 +469,10 @@ int InstrEmit_fnabsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- !abs(frB)
Value* v = f.Neg(f.Abs(f.LoadFPR(i.X.RB)));
f.StoreFPR(i.X.RT, v);
f.UpdateFPSCR(v, i.X.Rc);
// f.UpdateFPSCR(v, i.X.Rc);
if (i.X.Rc) {
// todo
}
return 0;
}
@@ -466,7 +480,10 @@ int InstrEmit_fnegx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- ¬ frB[0] || frB[1-63]
Value* v = f.Neg(f.LoadFPR(i.X.RB));
f.StoreFPR(i.X.RT, v);
f.UpdateFPSCR(v, i.X.Rc);
// f.UpdateFPSCR(v, i.X.Rc);
if (i.X.Rc) {
// todo
}
return 0;
}
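
Per the PowerPC architecture books, fabs/fnabs/fneg only manipulate the sign bit and never touch the FPSCR, which is what removing the UpdateFPSCR calls reflects; the part left as a todo is that with Rc = 1 the CR1 field should receive FPSCR bits 0-3 (FX, FEX, VX, OX). A sketch of that extraction (hypothetical helper, not in the commit):

#include <cstdint>

// CR1 image for Rc = 1 floating-point ops: FPSCR bits 0-3 in IBM
// numbering (bit 0 = MSB), i.e. the register's top nibble.
uint32_t MakeCR1(uint32_t fpscr) {
  return (fpscr >> 28) & 0xF;  // FX | FEX | VX | OX
}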

View File

@@ -1598,6 +1598,8 @@ bool XexModule::FindSaveRest() {
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveGprLr;
function->set_behavior(Function::Behavior::kProlog);
function->set_status(Symbol::Status::kDeclared);
function->SetSaverest(cpu::SaveRestoreType::GPR, false, n);
address += 4;
}
address = gplr_start + 20 * 4;
@@ -1612,6 +1614,7 @@ bool XexModule::FindSaveRest() {
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestGprLr;
function->set_behavior(Function::Behavior::kEpilogReturn);
function->set_status(Symbol::Status::kDeclared);
function->SetSaverest(cpu::SaveRestoreType::GPR, true, n);
address += 4;
}
}
@@ -1628,6 +1631,8 @@ bool XexModule::FindSaveRest() {
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveFpr;
function->set_behavior(Function::Behavior::kProlog);
function->set_status(Symbol::Status::kDeclared);
function->SetSaverest(cpu::SaveRestoreType::FPR, false, n);
address += 4;
}
address = fpr_start + (18 * 4) + (1 * 4);
@@ -1642,6 +1647,7 @@ bool XexModule::FindSaveRest() {
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestFpr;
function->set_behavior(Function::Behavior::kEpilog);
function->set_status(Symbol::Status::kDeclared);
function->SetSaverest(cpu::SaveRestoreType::FPR, true, n);
address += 4;
}
}
@@ -1662,6 +1668,7 @@ bool XexModule::FindSaveRest() {
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveVmx;
function->set_behavior(Function::Behavior::kProlog);
function->set_status(Symbol::Status::kDeclared);
function->SetSaverest(cpu::SaveRestoreType::VMX, false, n);
address += 2 * 4;
}
address += 4;
@@ -1675,6 +1682,7 @@ bool XexModule::FindSaveRest() {
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveVmx;
function->set_behavior(Function::Behavior::kProlog);
function->set_status(Symbol::Status::kDeclared);
function->SetSaverest(cpu::SaveRestoreType::VMX, false, n);
address += 2 * 4;
}
address = vmx_start + (18 * 2 * 4) + (1 * 4) + (64 * 2 * 4) + (1 * 4);
@@ -1688,6 +1696,7 @@ bool XexModule::FindSaveRest() {
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestVmx;
function->set_behavior(Function::Behavior::kEpilog);
function->set_status(Symbol::Status::kDeclared);
function->SetSaverest(cpu::SaveRestoreType::VMX, true, n);
address += 2 * 4;
}
address += 4;
@@ -1701,6 +1710,7 @@ bool XexModule::FindSaveRest() {
// TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestVmx;
function->set_behavior(Function::Behavior::kEpilog);
function->set_status(Symbol::Status::kDeclared);
function->SetSaverest(cpu::SaveRestoreType::VMX, true, n);
address += 2 * 4;
}
}
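
FindSaveRest walks the toolchain-emitted save/restore thunk tables (__savegprlr_14 through __restgprlr_31 and their FPR/VMX counterparts): each register's entry point falls through the stores for all later registers, so consecutive entries sit one instruction apart, 4 bytes for GPR/FPR and 8 for the two-instruction VMX thunks, which is why the loops step address by 4 or 2 * 4. The entry-point arithmetic, as a sketch (names are mine):

#include <cstdint>

// Entry point for the thunk that saves GPRs n..31 (n in 14..31): one
// 4-byte store per register between consecutive entry points.
uint32_t SaveGprLrEntry(uint32_t gplr_start, unsigned n) {
  return gplr_start + (n - 14) * 4;
}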