Merge pull request #50 from chrisps/canary_experimental
Ton of cpu changes
commit 23ca3725c4
@@ -446,10 +446,11 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() {
EmitSaveNonvolatileRegs();

mov(rax, rcx);
mov(rsi, rdx);  // context
mov(rcx, r8);  // return address
mov(rsi, rdx);  // context
mov(rdi, ptr[rdx + offsetof(ppc::PPCContext, virtual_membase)]);  // membase
mov(rcx, r8);  // return address
call(rax);

vzeroupper();
EmitLoadNonvolatileRegs();

code_offsets.epilog = getSize();

@@ -500,7 +501,8 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {

code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();

// chrispy: added this for proper vmsum impl, avx2 bitshifts
vzeroupper();
// Save off volatile registers.
EmitSaveVolatileRegs();

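The vzeroupper calls added in both thunks above keep dirty upper ymm state from leaking across the guest/host boundary, where it would cause AVX/SSE transition stalls in host code compiled for legacy SSE. A minimal sketch of the same idea in plain C++, assuming a hypothetical CallHostHelper wrapper (not part of this commit):

#include <immintrin.h>

// Illustrative only: clear the upper halves of the ymm registers before
// calling into code that may use legacy SSE encodings, mirroring the emitted
// vzeroupper instruction.
inline void CallHostHelper(void (*host_fn)(void* ctx), void* ctx) {
  _mm256_zeroupper();  // same effect as the vzeroupper emitted by the thunk
  host_fn(ctx);
}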
@@ -101,13 +101,11 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);

TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
#undef TEST_EMIT_FEATURE
/*
fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in latest version of xbyak
fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
latest version of xbyak
*/
unsigned int data[4];
Xbyak::util::Cpu::getCpuid(0x80000001, data);
@@ -117,21 +115,19 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
}
}
if (cpu_.has(Xbyak::util::Cpu::tAMD)) {

bool is_zennish = cpu_.displayFamily >= 0x17;
bool is_zennish = cpu_.displayFamily >= 0x17;

if (is_zennish) {
feature_flags_ |= kX64FastJrcx;
if (is_zennish) {
feature_flags_ |= kX64FastJrcx;

if (cpu_.displayFamily > 0x17) {
feature_flags_ |= kX64FastLoop;
if (cpu_.displayFamily > 0x17) {
feature_flags_ |= kX64FastLoop;

} else if (cpu_.displayFamily == 0x17 && cpu_.displayModel >= 0x31) {
feature_flags_ |= kX64FastLoop;
} // todo:figure out at model zen+ became zen2, this is just the model
// for my cpu, which is ripper90

}
} else if (cpu_.displayFamily == 0x17 && cpu_.displayModel >= 0x31) {
feature_flags_ |= kX64FastLoop;
} // todo:figure out at model zen+ became zen2, this is just the model
// for my cpu, which is ripper90
}
}
}

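A compact sketch of the family/model heuristic used in the hunk above, with illustrative helper names (the real code reads cpu_.displayFamily and cpu_.displayModel from xbyak): Zen-based parts report display family 0x17 or newer, families above 0x17 always qualify for kX64FastLoop, and family 0x17 only qualifies from model 0x31 up.

// Sketch only; these names are not Xenia APIs.
struct AmdCpuId {
  unsigned display_family;
  unsigned display_model;
};

bool IsZennish(const AmdCpuId& c) { return c.display_family >= 0x17; }

bool HasFastJrcxz(const AmdCpuId& c) { return IsZennish(c); }

bool HasFastLoop(const AmdCpuId& c) {
  // Families above 0x17 always qualify; family 0x17 only from model 0x31 up,
  // matching the checks in the diff.
  return c.display_family > 0x17 ||
         (c.display_family == 0x17 && c.display_model >= 0x31);
}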
@@ -263,7 +259,10 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();

/*
* chrispy: removed this, it serves no purpose
mov(qword[rsp + StackLayout::GUEST_CTX_HOME], GetContextReg());
*/
mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rcx);
mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], 0);

@@ -296,9 +295,11 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
}

// Load membase.
mov(GetMembaseReg(),
/*
* chrispy: removed this, as long as we load it in HostToGuestThunk we can
count on no other code modifying it. mov(GetMembaseReg(),
qword[GetContextReg() + offsetof(ppc::PPCContext, virtual_membase)]);

*/
// Body.
auto block = builder->first_block();
while (block) {
@@ -318,7 +319,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
// NOTE: If you encounter this after adding a new instruction, do a full
// rebuild!
assert_always();
XELOGE("Unable to process HIR opcode {}", instr->opcode->name);
XELOGE("Unable to process HIR opcode {}", GetOpcodeName(instr->opcode));
break;
}
instr = new_tail;
@@ -331,8 +332,10 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
L(epilog_label);
epilog_label_ = nullptr;
EmitTraceUserCallReturn();
/*
* chrispy: removed this, it serves no purpose
mov(GetContextReg(), qword[rsp + StackLayout::GUEST_CTX_HOME]);

*/
code_offsets.epilog = getSize();

add(rsp, (uint32_t)stack_size);
@@ -342,7 +345,6 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {

if (cvars::emit_source_annotations) {
nop(5);

}

assert_zero(code_offsets.prolog);
@@ -676,37 +678,9 @@ Xbyak::Reg64 X64Emitter::GetNativeParam(uint32_t param) {
Xbyak::Reg64 X64Emitter::GetContextReg() { return rsi; }
Xbyak::Reg64 X64Emitter::GetMembaseReg() { return rdi; }

void X64Emitter::ReloadContext() {
mov(GetContextReg(), qword[rsp + StackLayout::GUEST_CTX_HOME]);
}

void X64Emitter::ReloadMembase() {
mov(GetMembaseReg(), qword[GetContextReg() + 8]); // membase
}
#define __NH_CONCAT(x, y) x##y
#define _MH_CONCAT(cb, ...) cb (__VA_ARGS__)

#define mh_concat2_m(x, y) __NH_CONCAT(x, y)

#define DECLNOP(n, ...) \
static constexpr unsigned char mh_concat2_m(nop_, n)[] = {__VA_ARGS__}

DECLNOP(1, 0x90);
DECLNOP(2, 0x66, 0x90);
DECLNOP(3, 0x0F, 0x1F, 0x00);
DECLNOP(4, 0x0F, 0x1F, 0x40, 0x00);
DECLNOP(5, 0x0F, 0x1F, 0x44, 0x00, 0x00);
DECLNOP(6, 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00);
DECLNOP(7, 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00);
DECLNOP(8, 0x0F, 0x1F, 0x84, 00, 00, 00, 00, 00);
DECLNOP(9, 0x66, 0x0F, 0x1F, 0x84, 00, 00, 00, 00, 00);

static constexpr const unsigned char* const g_noptable[] = {
&nop_1[0], &nop_1[0], &nop_2[0], &nop_3[0], &nop_4[0],
&nop_5[0], &nop_6[0], &nop_7[0], &nop_8[0], &nop_9[0]};

static constexpr unsigned LENGTHOF_NOPTABLE =
sizeof(g_noptable) / sizeof(g_noptable[0]);

// Len Assembly Byte Sequence
// ============================================================================
@@ -720,17 +694,8 @@ static constexpr unsigned LENGTHOF_NOPTABLE =
// 8b NOP DWORD ptr [EAX + EAX*1 + 00000000H] 0F 1F 84 00 00 00 00 00H
// 9b 66 NOP DWORD ptr [EAX + EAX*1 + 00000000H] 66 0F 1F 84 00 00 00 00 00H
void X64Emitter::nop(size_t length) {
while (length != 0) {
unsigned patchsize = length % LENGTHOF_NOPTABLE;

// patch_memory(locptr, size, (char*)g_noptable[patchsize]);

for (unsigned i = 0; i < patchsize; ++i) {
db(g_noptable[patchsize][i]);
}

//locptr += patchsize;
length -= patchsize;
for (size_t i = 0; i < length; ++i) {
db(0x90);
}
}

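The DECLNOP table and X64Emitter::nop above pad code with the recommended multi-byte NOP encodings. The hunk mixes the old and new bodies of the function, so as a rough sketch of how such a table is normally consumed (largest encoding first, plain 0x90 for the tail; the names here are illustrative, not the commit's code):

#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative padding routine, not the code from the commit.
static const uint8_t kNop9[] = {0x66, 0x0F, 0x1F, 0x84, 0x00,
                                0x00, 0x00, 0x00, 0x00};

void EmitPadding(std::vector<uint8_t>& out, size_t length) {
  while (length >= sizeof(kNop9)) {
    out.insert(out.end(), kNop9, kNop9 + sizeof(kNop9));
    length -= sizeof(kNop9);
  }
  // Remaining 1..8 bytes: single-byte NOPs keep the sketch simple; a full
  // implementation would pick the matching entry from the table above.
  out.insert(out.end(), length, 0x90);
}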
@@ -912,8 +877,17 @@ static const vec128_t xmm_consts[] = {
0x80, 0x80, 0x80, 0x80),
/*XMMShortsToBytes*/
v128_setr_bytes(0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80)
};
0x80, 0x80, 0x80),
/*XMMLVSLTableBase*/
vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
/*XMMLVSRTableBase*/
vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),
/* XMMSingleDenormalMask */
vec128i(0x7f800000),
/* XMMThreeFloatMask */
vec128i(~0U, ~0U, ~0U, 0U),
/*XMMXenosF16ExtRangeStart*/
vec128f(65504)};

void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
for (auto& vec : xmm_consts) {
@@ -1013,7 +987,6 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
// 1111...
vpcmpeqb(dest, dest);
} else {

for (size_t i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) {
if (xmm_consts[i] == v) {
vmovapd(dest, GetXmmConstPtr((XmmConst)i));
@@ -118,7 +118,12 @@ enum XmmConst {
XMM2To32,
XMMFloatInf,
XMMIntsToBytes,
XMMShortsToBytes
XMMShortsToBytes,
XMMLVSLTableBase,
XMMLVSRTableBase,
XMMSingleDenormalMask,
XMMThreeFloatMask, //for clearing the fourth float prior to DOT_PRODUCT_3
XMMXenosF16ExtRangeStart
};

// Unfortunately due to the design of xbyak we have to pass this to the ctor.
@@ -147,6 +152,7 @@ enum X64EmitterFeatureFlags {
kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ,
kX64FastJrcx = 1 << 12, //jrcxz is as fast as any other jump ( >= Zen1)
kX64FastLoop = 1 << 13, //loop/loope/loopne is as fast as any other jump ( >= Zen2)
kX64EmitAVX512VBMI = 1 << 14
};
class ResolvableGuestCall {
public:
@@ -225,7 +231,7 @@ class X64Emitter : public Xbyak::CodeGenerator {

Xbyak::Reg64 GetContextReg();
Xbyak::Reg64 GetMembaseReg();
void ReloadContext();

void ReloadMembase();

void nop(size_t length = 1);

@@ -127,6 +127,26 @@ struct VECTOR_CONVERT_F2I
};
EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_F2I, VECTOR_CONVERT_F2I);

struct VECTOR_DENORMFLUSH
: Sequence<VECTOR_DENORMFLUSH,
I<OPCODE_VECTOR_DENORMFLUSH, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vxorps(e.xmm1, e.xmm1, e.xmm1); // 0.25 P0123

e.vandps(e.xmm0, i.src1,
e.GetXmmConstPtr(XMMSingleDenormalMask)); // 0.25 P0123
e.vcmpneqps(e.xmm2, e.xmm0, e.xmm1); // 0.5 P01
e.vandps(e.xmm1, i.src1,
e.GetXmmConstPtr(XMMSignMaskF32)); // 0.5 P0123 take signs, zeros
// must keep their signs
e.vandps(e.xmm0, i.src1, e.xmm2); // P0123
e.vorps(i.dest, e.xmm0, e.xmm1); // P0123 make sure zeros keep signs

// if it does not equal zero, we stay
}
};
EMITTER_OPCODE_TABLE(OPCODE_VECTOR_DENORMFLUSH, VECTOR_DENORMFLUSH);

// ============================================================================
// OPCODE_LOAD_VECTOR_SHL
// ============================================================================
@@ -154,15 +174,20 @@ struct LOAD_VECTOR_SHL_I8
if (i.src1.is_constant) {
auto sh = i.src1.constant();
assert_true(sh < xe::countof(lvsl_table));
e.mov(e.rax, (uintptr_t)&lvsl_table[sh]);
e.vmovaps(i.dest, e.ptr[e.rax]);
if (sh == 0) {
e.vmovdqa(i.dest, e.GetXmmConstPtr(XMMLVSLTableBase));
} else {
// this is probably extremely rare
e.LoadConstantXmm(i.dest, lvsl_table[sh]);
}
} else {
// TODO(benvanik): find a cheaper way of doing this.
e.movzx(e.rdx, i.src1);
e.and_(e.dx, 0xF);
e.shl(e.dx, 4);
e.mov(e.rax, (uintptr_t)lvsl_table);
e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]);
// chrispy: removed mask, ppc_emit_altivec already pre-ands it.
e.vmovd(e.xmm0, i.src1.reg().cvt32());
// broadcast byte
// dont use broadcastb with avx2, its slower than shuf
e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMZero));
e.vpaddb(i.dest, e.xmm0, e.GetXmmConstPtr(XMMLVSLTableBase));
}
}
};
@@ -195,15 +220,23 @@ struct LOAD_VECTOR_SHR_I8
if (i.src1.is_constant) {
auto sh = i.src1.constant();
assert_true(sh < xe::countof(lvsr_table));
e.mov(e.rax, (uintptr_t)&lvsr_table[sh]);
e.vmovaps(i.dest, e.ptr[e.rax]);
if (sh == 0) {
e.vmovdqa(i.dest, e.GetXmmConstPtr(XMMLVSRTableBase));
} else {
e.LoadConstantXmm(i.dest, lvsr_table[sh]);
}
} else {
// TODO(benvanik): find a cheaper way of doing this.
e.movzx(e.rdx, i.src1);
e.and_(e.dx, 0xF);
e.shl(e.dx, 4);
e.mov(e.rax, (uintptr_t)lvsr_table);
e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]);

// chrispy: removed mask, ppc_emit_altivec already pre-ands it. removed
// lookup as well, compute from LVSR base instead
e.vmovd(e.xmm0, i.src1.reg().cvt32());
e.vmovdqa(e.xmm1, e.GetXmmConstPtr(XMMLVSRTableBase));
// broadcast byte
// dont use broadcastb with avx2, its slower than shuf
e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMZero));

e.vpsubb(i.dest, e.xmm1, e.xmm0);
}
}
};
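Both rewrites above replace the 16-entry lvsl/lvsr table load (indexed by sh) with arithmetic on a base constant: the shift amount is broadcast to every byte via vpshufb against XMMZero, then added to XMMLVSLTableBase for lvsl or subtracted from XMMLVSRTableBase for lvsr. A scalar model of the two control vectors (sketch only, not emulator code):

#include <array>
#include <cstdint>

// lvsl(sh)[i] = sh + i  ->  XMMLVSLTableBase {0..15} plus broadcast(sh)
std::array<uint8_t, 16> LvslControl(uint8_t sh) {
  std::array<uint8_t, 16> r{};
  for (int i = 0; i < 16; ++i) r[i] = static_cast<uint8_t>(sh + i);
  return r;
}

// lvsr(sh)[i] = 16 - sh + i  ->  XMMLVSRTableBase {16..31} minus broadcast(sh)
std::array<uint8_t, 16> LvsrControl(uint8_t sh) {
  std::array<uint8_t, 16> r{};
  for (int i = 0; i < 16; ++i) r[i] = static_cast<uint8_t>(16 - sh + i);
  return r;
}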
@@ -728,7 +761,7 @@ struct VECTOR_SHL_V128
}
}

static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
// TODO(benvanik): native version (with shift magic).

if (e.IsFeatureEnabled(kX64EmitAVX2)) {
@@ -1793,6 +1826,14 @@ struct PERMUTE_I32
}
}
};
//todo: use this on const src1
static vec128_t FixupConstantShuf8(vec128_t input) {
for (uint32_t i = 0; i < 16; ++i) {
input.u8[i] ^= 0x03;
input.u8[i] &= 0x1F;
}
return input;
}
struct PERMUTE_V128
: Sequence<PERMUTE_V128,
I<OPCODE_PERMUTE, V128Op, V128Op, V128Op, V128Op>> {
@@ -1855,7 +1896,8 @@ struct PERMUTE_V128
} else {
e.vpshufb(src3_shuf, i.src3, e.xmm2);
}
// Build a mask with values in src2 having 0 and values in src3 having 1.
// Build a mask with values in src2 having 0 and values in src3
// having 1.
e.vpcmpgtb(i.dest, e.xmm2, e.GetXmmConstPtr(XMMPermuteControl15));
e.vpblendvb(i.dest, src2_shuf, src3_shuf, i.dest);
}

@@ -35,11 +35,14 @@
#include "xenia/cpu/backend/x64/x64_emitter.h"
#include "xenia/cpu/backend/x64/x64_op.h"
#include "xenia/cpu/backend/x64/x64_tracers.h"
// needed for stmxcsr
#include "xenia/cpu/backend/x64/x64_stack_layout.h"
#include "xenia/cpu/hir/hir_builder.h"
#include "xenia/cpu/processor.h"

DEFINE_bool(use_fast_dot_product, false,
"Experimental optimization, much shorter sequence on dot products, treating inf as overflow instead of using mcxsr"
"Experimental optimization, much shorter sequence on dot products, "
"treating inf as overflow instead of using mcxsr"
"four insn dotprod",
"CPU");
namespace xe {
@@ -1996,8 +1999,8 @@ struct DIV_V128 : Sequence<DIV_V128, I<OPCODE_DIV, V128Op, V128Op, V128Op>> {
assert_true(!i.instr->flags);
EmitAssociativeBinaryXmmOp(e, i,
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
// e.vrcpps(e.xmm0, src2);
//e.vmulps(dest, src1, e.xmm0);
// e.vrcpps(e.xmm0, src2);
// e.vmulps(dest, src1, e.xmm0);
e.vdivps(dest, src1, src2);
});
}
@@ -2607,68 +2610,84 @@ struct LOG2_V128 : Sequence<LOG2_V128, I<OPCODE_LOG2, V128Op, V128Op>> {
};
EMITTER_OPCODE_TABLE(OPCODE_LOG2, LOG2_F32, LOG2_F64, LOG2_V128);

struct DOT_PRODUCT_V128 {
static void Emit(X64Emitter& e, Xmm dest, Xmm src1, Xmm src2, uint8_t imm) {
if (cvars::use_fast_dot_product) {
e.vdpps(dest, src1, src2, imm);
e.vandps(e.xmm0, dest, e.GetXmmConstPtr(XMMAbsMaskPS));
e.vcmpgeps(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMFloatInf));
e.vblendvps(dest, dest, e.GetXmmConstPtr(XMMQNaN), e.xmm0);

} else {
// TODO(benvanik): apparently this is very slow
// - find alternative?
Xbyak::Label end;
e.inLocalLabel();

// Grab space to put MXCSR.
// TODO(gibbed): stick this in TLS or
// something?
e.sub(e.rsp, 8);

// Grab MXCSR and mask off the overflow flag,
// because it's sticky.
e.vstmxcsr(e.dword[e.rsp]);
e.mov(e.eax, e.dword[e.rsp]);
e.and_(e.eax, uint32_t(~8));
e.mov(e.dword[e.rsp], e.eax);
e.vldmxcsr(e.dword[e.rsp]);

// Hey we can do the dot product now.
e.vdpps(dest, src1, src2, imm);

// Load MXCSR...
e.vstmxcsr(e.dword[e.rsp]);

// ..free our temporary space and get MXCSR at
// the same time
e.pop(e.rax);

// Did we overflow?
e.test(e.al, 8);
e.jz(end);

// Infinity? HA! Give NAN.
e.vmovdqa(dest, e.GetXmmConstPtr(XMMQNaN));

e.L(end);
e.outLocalLabel();
}
}
};

// ============================================================================
// OPCODE_DOT_PRODUCT_3
// ============================================================================
struct DOT_PRODUCT_3_V128
: Sequence<DOT_PRODUCT_3_V128,
I<OPCODE_DOT_PRODUCT_3, F32Op, V128Op, V128Op>> {
I<OPCODE_DOT_PRODUCT_3, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// https://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx
EmitCommutativeBinaryXmmOp(
e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
DOT_PRODUCT_V128::Emit(e, dest, src1, src2, 0b01110001);
});
// todo: add fast_dot_product path that just checks for infinity instead of
// using mxcsr
auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH64];

// this is going to hurt a bit...
/*
this implementation is accurate, it matches the results of xb360 vmsum3
except that vmsum3 is often off by 1 bit, but its extremely slow. it is a
long, unbroken chain of dependencies, and the three uses of mxcsr all cost
about 15-20 cycles at the very least on amd zen processors. on older amd the
figures agner has are pretty horrible. it looks like its just as bad on
modern intel cpus also up until just recently. perhaps a better way of
detecting overflow would be to just compare with inf. todo: test whether cmp
with inf can replace
*/
e.vstmxcsr(mxcsr_storage);

e.mov(e.eax, 8);

auto src1v = e.xmm0;
auto src2v = e.xmm1;
if (i.src1.is_constant) {
src1v = e.xmm0;
e.LoadConstantXmm(src1v, i.src1.constant());
} else {
src1v = i.src1.reg();
}
if (i.src2.is_constant) {
src2v = e.xmm1;
e.LoadConstantXmm(src2v, i.src2.constant());
} else {
src2v = i.src2.reg();
}
e.not_(e.eax);
// todo: maybe the top element should be cleared by the InstrEmit_ function
// so that in the future this could be optimized away if the top is known to
// be zero. Right now im not sure that happens often though and its
// currently not worth it also, maybe pre-and if constant
e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask));
e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask));

e.and_(mxcsr_storage, e.eax);
e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to
// go

e.vcvtps2pd(e.ymm0, e.xmm3);
e.vcvtps2pd(e.ymm1, e.xmm2);
/*
ymm0 = src1 as doubles, ele 3 cleared
ymm1 = src2 as doubles, ele 3 cleared
*/
e.vmulpd(e.ymm3, e.ymm0, e.ymm1);
e.vextractf128(e.xmm2, e.ymm3, 1);
e.vunpckhpd(e.xmm0, e.xmm3, e.xmm3); // get element [1] in xmm3
e.vaddsd(e.xmm3, e.xmm3, e.xmm2);
e.not_(e.eax);
e.vaddsd(e.xmm2, e.xmm3, e.xmm0);
e.vcvtsd2ss(e.xmm1, e.xmm2);

// this is awful
e.vstmxcsr(mxcsr_storage);
e.test(mxcsr_storage, e.eax);
Xbyak::Label ret_qnan;
Xbyak::Label done;
e.jnz(ret_qnan);
// e.vshufps(i.dest, e.xmm1,e.xmm1, 0); // broadcast
e.vbroadcastss(i.dest, e.xmm1);
e.jmp(done);
e.L(ret_qnan);
e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN));
e.L(done);
}
};
EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_3, DOT_PRODUCT_3_V128);

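As the long comment above explains, the new DOT_PRODUCT_3 sequence trades vdpps for an exact double-precision accumulation plus an MXCSR overflow check. A scalar model of what the emitted code computes (sketch only; the isinf test stands in for the sticky MXCSR overflow bit the real sequence reads back):

#include <cmath>
#include <limits>

float DotProduct3Model(const float a[4], const float b[4]) {
  // The fourth lane is masked off by XMMThreeFloatMask before conversion.
  double sum = static_cast<double>(a[0]) * b[0] +
               static_cast<double>(a[1]) * b[1] +
               static_cast<double>(a[2]) * b[2];
  float r = static_cast<float>(sum);  // single rounding back to float
  if (std::isinf(r)) {
    // The emitter detects overflow via MXCSR and stores QNaN instead.
    r = std::numeric_limits<float>::quiet_NaN();
  }
  return r;  // vbroadcastss splats this scalar across all lanes of i.dest
}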
@@ -2678,13 +2697,81 @@ EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_3, DOT_PRODUCT_3_V128);
// ============================================================================
struct DOT_PRODUCT_4_V128
: Sequence<DOT_PRODUCT_4_V128,
I<OPCODE_DOT_PRODUCT_4, F32Op, V128Op, V128Op>> {
I<OPCODE_DOT_PRODUCT_4, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// https://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx
EmitCommutativeBinaryXmmOp(
e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
DOT_PRODUCT_V128::Emit(e, dest, src1, src2, 0b11110001);
});
// todo: add fast_dot_product path that just checks for infinity instead of
// using mxcsr
auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH64];

e.vstmxcsr(mxcsr_storage);

e.mov(e.eax, 8);

auto src1v = e.xmm3;
auto src2v = e.xmm2;
if (i.src1.is_constant) {
src1v = e.xmm3;
e.LoadConstantXmm(src1v, i.src1.constant());
} else {
src1v = i.src1.reg();
}
if (i.src2.is_constant) {
src2v = e.xmm2;
e.LoadConstantXmm(src2v, i.src2.constant());
} else {
src2v = i.src2.reg();
}
e.not_(e.eax);

e.and_(mxcsr_storage, e.eax);
e.vldmxcsr(mxcsr_storage);

e.vcvtps2pd(e.ymm0, src1v);
e.vcvtps2pd(e.ymm1, src2v);
/*
e.vandps(e.xmm3, src1v, e.GetXmmConstPtr(XMMThreeFloatMask));
e.vandps(e.xmm2, src2v, e.GetXmmConstPtr(XMMThreeFloatMask));

e.and_(mxcsr_storage, e.eax);
e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to
// go

e.vcvtps2pd(e.ymm0, e.xmm3);
e.vcvtps2pd(e.ymm1, e.xmm2);

e.vmulpd(e.ymm5, e.ymm0, e.ymm1);
e.vextractf128(e.xmm4, e.ymm5, 1);
e.vunpckhpd(e.xmm3, e.xmm5, e.xmm5); // get element [1] in xmm3
e.vaddsd(e.xmm5, e.xmm5, e.xmm4);
e.not_(e.eax);
e.vaddsd(e.xmm2, e.xmm5, e.xmm3);
e.vcvtsd2ss(e.xmm1, e.xmm2);

*/
e.vmulpd(e.ymm3, e.ymm0, e.ymm1);
e.vextractf128(e.xmm2, e.ymm3, 1);
e.vaddpd(e.xmm3, e.xmm3, e.xmm2);

e.vunpckhpd(e.xmm0, e.xmm3, e.xmm3);
e.not_(e.eax);
e.vaddsd(e.xmm2, e.xmm3, e.xmm0);
e.vcvtsd2ss(e.xmm1, e.xmm2);

e.vstmxcsr(mxcsr_storage);

e.test(mxcsr_storage, e.eax);

Xbyak::Label ret_qnan;
Xbyak::Label done;
e.jnz(ret_qnan); // reorder these jmps later, just want to get this fix in
// e.vshufps(i.dest, e.xmm1, e.xmm1, 0);
e.vbroadcastss(i.dest, e.xmm1);
e.jmp(done);
e.L(ret_qnan);
e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN));
e.L(done);
// e.DebugBreak();
}
};
EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_4, DOT_PRODUCT_4_V128);

@@ -2759,7 +2846,6 @@ struct AND_I64 : Sequence<AND_I64, I<OPCODE_AND, I64Op, I64Op, I64Op>> {
};
struct AND_V128 : Sequence<AND_V128, I<OPCODE_AND, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {

EmitCommutativeBinaryXmmOp(e, i,
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
e.vpand(dest, src1, src2);
@@ -3419,7 +3505,7 @@ bool SelectSequence(X64Emitter* e, const Instr* i, const Instr** new_tail) {
return true;
}
}
XELOGE("No sequence match for variant {}", i->opcode->name);
XELOGE("No sequence match for variant {}", GetOpcodeName(i->opcode));
return false;
}

@@ -122,7 +122,8 @@ class StackLayout {
*
*/
static const size_t GUEST_STACK_SIZE = 104;
static const size_t GUEST_CTX_HOME = 80;
//was GUEST_CTX_HOME, can't remove because that'd throw stack alignment off. instead, can be used as a temporary in sequences
static const size_t GUEST_SCRATCH64 = 80;
static const size_t GUEST_RET_ADDR = 88;
static const size_t GUEST_CALL_RET_ADDR = 96;
};

@@ -312,13 +312,18 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
result = true;
} else if (i->src2.value->IsConstant() &&
i->src3.value->IsConstant()) {
// TODO: Select
// v->set_from(i->src2.value);
// v->Select(i->src3.value, i->src1.value);
// i->Remove();
v->set_from(i->src2.value);
v->Select(i->src3.value, i->src1.value);
i->Remove();
result = true;
}
} else {
// TODO: vec128 select
if (i->src2.value->IsConstant() && i->src3.value->IsConstant()) {
v->set_from(i->src2.value);
v->Select(i->src3.value, i->src1.value);
i->Remove();
result = true;
}
}
}
break;
@@ -744,8 +749,35 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
result = true;
}
break;
// TODO(benvanik): INSERT/EXTRACT
// TODO(benvanik): PERMUTE/SWIZZLE

case OPCODE_PERMUTE: {
if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
i->src3.value->IsConstant() &&
(i->flags == INT8_TYPE || i->flags == INT16_TYPE)) {
v->set_from(i->src1.value);
v->Permute(i->src2.value, i->src3.value, (TypeName)i->flags);
i->Remove();
result = true;
}
break;
}
case OPCODE_INSERT:
if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
i->src3.value->IsConstant()) {
v->set_from(i->src1.value);
v->Insert(i->src2.value, i->src3.value, (TypeName)i->flags);
i->Remove();
result = true;
}
break;
case OPCODE_SWIZZLE:
if (i->src1.value->IsConstant()) {
v->set_from(i->src1.value);
v->Swizzle((uint32_t)i->src2.offset, (TypeName)i->flags);
i->Remove();
result = true;
}
break;
case OPCODE_EXTRACT:
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
v->set_zero(v->type);
@@ -867,24 +899,6 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
}
break;

case OPCODE_DOT_PRODUCT_3:
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
v->set_from(i->src1.value);
v->DotProduct3(i->src2.value);
i->Remove();
result = true;
}
break;

case OPCODE_DOT_PRODUCT_4:
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
v->set_from(i->src1.value);
v->DotProduct4(i->src2.value);
i->Remove();
result = true;
}
break;

case OPCODE_VECTOR_AVERAGE:
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
v->set_from(i->src1.value);
@@ -896,7 +910,14 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
result = true;
}
break;

case OPCODE_VECTOR_DENORMFLUSH:
if (i->src1.value->IsConstant()) {
v->set_from(i->src1.value);
v->DenormalFlush();
i->Remove();
result = true;
}
break;
default:
// Ignored.
break;

@@ -132,10 +132,10 @@ void DataFlowAnalysisPass::AnalyzeFlow(HIRBuilder* builder,
while (outgoing_ordinal != -1) {
Value* src_value = value_map[outgoing_ordinal];
assert_not_null(src_value);
if (!src_value->local_slot) {
src_value->local_slot = builder->AllocLocal(src_value->type);
if (!src_value->HasLocalSlot()) {
src_value->SetLocalSlot(builder->AllocLocal(src_value->type));
}
builder->StoreLocal(src_value->local_slot, src_value);
builder->StoreLocal(src_value->GetLocalSlot(), src_value);

// If we are in the block the value was defined in:
if (src_value->def->block == block) {
@@ -168,10 +168,10 @@ void DataFlowAnalysisPass::AnalyzeFlow(HIRBuilder* builder,
while (incoming_ordinal != -1) {
Value* src_value = value_map[incoming_ordinal];
assert_not_null(src_value);
if (!src_value->local_slot) {
src_value->local_slot = builder->AllocLocal(src_value->type);
if (!src_value->HasLocalSlot()) {
src_value->SetLocalSlot(builder->AllocLocal(src_value->type));
}
Value* local_value = builder->LoadLocal(src_value->local_slot);
Value* local_value = builder->LoadLocal(src_value->GetLocalSlot());
builder->last_instr()->MoveBefore(block->instr_head);

// Swap uses of original value with the local value.

@@ -365,7 +365,7 @@ bool RegisterAllocationPass::SpillOneRegister(HIRBuilder* builder, Block* block,
auto new_head_use = next_use;

// Allocate local.
if (spill_value->local_slot) {
if (spill_value->HasLocalSlot()) {
// Value is already assigned a slot. Since we allocate in order and this is
// all SSA we know the stored value will be exactly what we want. Yay,
// we can prevent the redundant store!
@@ -373,10 +373,10 @@ bool RegisterAllocationPass::SpillOneRegister(HIRBuilder* builder, Block* block,
// use the spilled value and prevent the need for more locals.
} else {
// Allocate a local slot.
spill_value->local_slot = builder->AllocLocal(spill_value->type);
spill_value->SetLocalSlot(builder->AllocLocal(spill_value->type));

// Add store.
builder->StoreLocal(spill_value->local_slot, spill_value);
builder->StoreLocal(spill_value->GetLocalSlot(), spill_value);
auto spill_store = builder->last_instr();
auto spill_store_use = spill_store->src2_use;
assert_null(spill_store_use->prev);
@@ -417,7 +417,7 @@ bool RegisterAllocationPass::SpillOneRegister(HIRBuilder* builder, Block* block,
// use is after the instruction requesting the spill we know we haven't
// done allocation for that code yet and can let that be handled
// automatically when we get to it.
auto new_value = builder->LoadLocal(spill_value->local_slot);
auto new_value = builder->LoadLocal(spill_value->GetLocalSlot());
auto spill_load = builder->last_instr();
spill_load->MoveBefore(next_use->instr);
// Note: implicit first use added.
@@ -429,7 +429,7 @@ bool RegisterAllocationPass::SpillOneRegister(HIRBuilder* builder, Block* block,

// Set the local slot of the new value to our existing one. This way we will
// reuse that same memory if needed.
new_value->local_slot = spill_value->local_slot;
new_value->SetLocalSlot(spill_value->GetLocalSlot());

// Rename all future uses of the SSA value to the new value as loaded
// from the local.

@@ -260,9 +260,9 @@ void HIRBuilder::Dump(StringBuffer* str) {
str->Append(" = ");
}
if (i->flags) {
str->AppendFormat("{}.{}", info->name, i->flags);
str->AppendFormat("{}.{}", GetOpcodeName(info), i->flags);
} else {
str->Append(info->name);
str->Append(GetOpcodeName(info));
}
if (src1_type) {
str->Append(' ');
@@ -712,7 +712,6 @@ Value* HIRBuilder::AllocValue(TypeName type) {
value->use_head = NULL;
value->last_use = NULL;
value->local_slot = NULL;
value->tag = NULL;
value->reg.set = NULL;
value->reg.index = -1;
return value;
@@ -723,12 +722,11 @@ Value* HIRBuilder::CloneValue(Value* source) {
value->ordinal = next_value_ordinal_++;
value->type = source->type;
value->flags = source->flags;
value->local_slot = NULL;
value->constant.v128 = source->constant.v128;
value->def = NULL;
value->use_head = NULL;
value->last_use = NULL;
value->local_slot = NULL;
value->tag = NULL;
value->reg.set = NULL;
value->reg.index = -1;
return value;
@@ -1493,7 +1491,16 @@ Value* HIRBuilder::VectorCompareUGE(Value* value1, Value* value2,
return VectorCompareXX(OPCODE_VECTOR_COMPARE_UGE_info, value1, value2,
part_type);
}

Value* HIRBuilder::VectorDenormFlush(Value* value1) {
return value1;
ASSERT_VECTOR_TYPE(value1);
Instr* i =
AppendInstr(OPCODE_VECTOR_DENORMFLUSH_info, 0, AllocValue(VEC128_TYPE));
i->set_src1(value1);
i->src2.value = nullptr;
i->src3.value = nullptr;
return i->dest;
}
Value* HIRBuilder::Add(Value* value1, Value* value2,
uint32_t arithmetic_flags) {
ASSERT_TYPES_EQUAL(value1, value2);
@@ -1713,13 +1720,13 @@ Value* HIRBuilder::Log2(Value* value) {
return i->dest;
}

Value* HIRBuilder::DotProduct3(Value* value1, Value* value2) {
ASSERT_VECTOR_TYPE(value1);
ASSERT_VECTOR_TYPE(value2);
ASSERT_TYPES_EQUAL(value1, value2);

Instr* i =
AppendInstr(OPCODE_DOT_PRODUCT_3_info, 0, AllocValue(FLOAT32_TYPE));
Instr* i = AppendInstr(OPCODE_DOT_PRODUCT_3_info, 0, AllocValue(VEC128_TYPE));
i->set_src1(value1);
i->set_src2(value2);
i->src3.value = NULL;
@@ -1731,8 +1738,7 @@ Value* HIRBuilder::DotProduct4(Value* value1, Value* value2) {
ASSERT_VECTOR_TYPE(value2);
ASSERT_TYPES_EQUAL(value1, value2);

Instr* i =
AppendInstr(OPCODE_DOT_PRODUCT_4_info, 0, AllocValue(FLOAT32_TYPE));
Instr* i = AppendInstr(OPCODE_DOT_PRODUCT_4_info, 0, AllocValue(VEC128_TYPE));
i->set_src1(value1);
i->set_src2(value2);
i->src3.value = NULL;

@@ -199,6 +199,7 @@ class HIRBuilder {
Value* VectorCompareSGE(Value* value1, Value* value2, TypeName part_type);
Value* VectorCompareUGT(Value* value1, Value* value2, TypeName part_type);
Value* VectorCompareUGE(Value* value1, Value* value2, TypeName part_type);
Value* VectorDenormFlush(Value* value1);

Value* Add(Value* value1, Value* value2, uint32_t arithmetic_flags = 0);
Value* AddWithCarry(Value* value1, Value* value2, Value* value3,

@@ -15,14 +15,23 @@ namespace hir {

#define DEFINE_OPCODE(num, name, sig, flags) \
const OpcodeInfo num##_info = { \
num, \
flags, \
sig, \
name, \
num, \
};
#include "xenia/cpu/hir/opcodes.inl"
#undef DEFINE_OPCODE

const char* GetOpcodeName(Opcode num) {
switch (num) {
#define DEFINE_OPCODE(num, name, sig, flags) \
case num: \
return name;
#include "xenia/cpu/hir/opcodes.inl"
#undef DEFINE_OPCODE
}
return "invalid opcode";
}
} // namespace hir
} // namespace cpu
} // namespace xe

@@ -280,7 +280,8 @@ enum Opcode {
OPCODE_ATOMIC_EXCHANGE,
OPCODE_ATOMIC_COMPARE_EXCHANGE,
OPCODE_SET_ROUNDING_MODE,
__OPCODE_MAX_VALUE, // Keep at end.
OPCODE_VECTOR_DENORMFLUSH, // converts denormals to signed zeros in a vector
__OPCODE_MAX_VALUE, // Keep at end.
};

enum OpcodeFlags {
@@ -352,17 +353,42 @@ static bool IsOpcodeBinaryValue(uint32_t signature) {
((OPCODE_SIG_TYPE_V << 3) | (OPCODE_SIG_TYPE_V << 6));
}

static void UnpackOpcodeSig(uint32_t sig, OpcodeSignatureType& dest,
OpcodeSignatureType& src1,
OpcodeSignatureType& src2,
OpcodeSignatureType& src3) {
dest = GET_OPCODE_SIG_TYPE_DEST(sig);
src1 = GET_OPCODE_SIG_TYPE_SRC1(sig);
src2 = GET_OPCODE_SIG_TYPE_SRC2(sig);
src3 = GET_OPCODE_SIG_TYPE_SRC3(sig);
}

constexpr uint32_t GetNumOperandsForSig(uint32_t sig) {
sig >>= 3;

uint32_t result = 0;
while (sig) {
if (sig & 0x7) {
++result;
}
sig >>= 3;
}
return result;
}
typedef struct {
Opcode num;
uint32_t flags;
uint32_t signature;
const char* name;
Opcode num;
} OpcodeInfo;

#define DEFINE_OPCODE(num, name, sig, flags) extern const OpcodeInfo num##_info;
#include "xenia/cpu/hir/opcodes.inl"
#undef DEFINE_OPCODE

const char* GetOpcodeName(Opcode num);
static inline const char* GetOpcodeName(const OpcodeInfo* info) {
return GetOpcodeName(info->num);
}
} // namespace hir
} // namespace cpu
} // namespace xe

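A short usage sketch for the helpers added above (the caller is illustrative): GetNumOperandsForSig skips the destination slot and counts the non-empty source slots packed three bits apiece, and the OpcodeInfo overload of GetOpcodeName simply forwards to the Opcode overload.

#include <cstdio>

// Sketch; assumes the opcodes.h declarations above are in scope.
void DumpOpcode(const xe::cpu::hir::OpcodeInfo* info) {
  using namespace xe::cpu::hir;
  const char* name = GetOpcodeName(info);  // forwards to GetOpcodeName(info->num)
  uint32_t operand_count = GetNumOperandsForSig(info->signature);
  std::printf("%s takes %u operand(s)\n", name, operand_count);
}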
@@ -673,3 +673,10 @@ DEFINE_OPCODE(
"set_rounding_mode",
OPCODE_SIG_X_V,
0)

DEFINE_OPCODE(
OPCODE_VECTOR_DENORMFLUSH,
"vector_denormflush",
OPCODE_SIG_V_V,
0
)
@@ -864,10 +864,112 @@ void Value::Extract(Value* vec, Value* index) {
break;
}
}
void Value::Permute(Value* src1, Value* src2, TypeName type) {
if (type == INT8_TYPE) {
uint8_t table[32];

for (uint32_t i = 0; i < 16; ++i) {
table[i] = src1->constant.v128.u8[i];
table[i + 16] = src2->constant.v128.u8[i];
}

for (uint32_t i = 0; i < 16; ++i) {
constant.v128.u8[i] = table[(constant.v128.u8[i] ^ 3) & 0x1f];
}
} else if (type == INT16_TYPE) {
vec128_t perm = (constant.v128 & vec128s(0xF)) ^ vec128s(0x1);
vec128_t perm_ctrl = vec128b(0);
for (int i = 0; i < 8; i++) {
perm_ctrl.i16[i] = perm.i16[i] > 7 ? -1 : 0;

auto v = uint8_t(perm.u16[i]);
perm.u8[i * 2] = v * 2;
perm.u8[i * 2 + 1] = v * 2 + 1;
}
auto lod = [](const vec128_t& v) {
return _mm_loadu_si128((const __m128i*)&v);
};
auto sto = [](vec128_t& v, __m128i x) {
return _mm_storeu_si128((__m128i*)&v, x);
};

__m128i xmm1 = lod(src1->constant.v128);
__m128i xmm2 = lod(src2->constant.v128);
xmm1 = _mm_shuffle_epi8(xmm1, lod(perm));
xmm2 = _mm_shuffle_epi8(xmm2, lod(perm));
uint8_t mask = 0;
for (int i = 0; i < 8; i++) {
if (perm_ctrl.i16[i] == 0) {
mask |= 1 << (7 - i);
}
}

vec128_t unp_mask = vec128b(0);
for (int i = 0; i < 8; i++) {
if (mask & (1 << i)) {
unp_mask.u16[i] = 0xFFFF;
}
}

sto(constant.v128, _mm_blendv_epi8(xmm1, xmm2, lod(unp_mask)));

} else {
assert_unhandled_case(type);
}
}
void Value::Insert(Value* index, Value* part, TypeName type) {
vec128_t* me = &constant.v128;

switch (type) {
case INT8_TYPE:
me->u8[index->constant.u8 ^ 3] = part->constant.u8;
break;
case INT16_TYPE:
me->u16[index->constant.u8 ^ 1] = part->constant.u16;
break;
case INT32_TYPE:
me->u32[index->constant.u8] = part->constant.u32;
break;
}
}
void Value::Swizzle(uint32_t mask, TypeName type) {
if (type == INT32_TYPE || type == FLOAT32_TYPE) {
vec128_t result = vec128b(0);
for (uint32_t i = 0; i < 4; ++i) {
result.u32[i] = constant.v128.u32[(mask >> (i * 2)) & 0b11];
}
constant.v128 = result;
} else {
assert_unhandled_case(type);
}
}
void Value::Select(Value* other, Value* ctrl) {
// TODO
assert_always();
if (ctrl->type == VEC128_TYPE) {
constant.v128.low = (constant.v128.low & ~ctrl->constant.v128.low) |
(other->constant.v128.low & ctrl->constant.v128.low);
constant.v128.high = (constant.v128.high & ~ctrl->constant.v128.high) |
(other->constant.v128.high & ctrl->constant.v128.high);

} else {
if (ctrl->constant.u8) {
switch (other->type) {
case INT8_TYPE:
constant.u8 = other->constant.u8;
break;
case INT16_TYPE:
constant.u16 = other->constant.u16;
break;
case INT32_TYPE:
case FLOAT32_TYPE:
constant.u32 = other->constant.u32;
break;
case INT64_TYPE:
case FLOAT64_TYPE:
constant.u64 = other->constant.u64;
break;
}
}
}
}

void Value::Splat(Value* other) {
@@ -1532,7 +1634,15 @@ void Value::ByteSwap() {
break;
}
}

void Value::DenormalFlush() {
for (int i = 0; i < 4; ++i) {
uint32_t current_element = constant.v128.u32[i];
if ((current_element & 0x7f800000) == 0) {
current_element = current_element & 0x80000000;
}
constant.v128.u32[i] = current_element;
}
}
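Worked examples for DenormalFlush above: any lane whose exponent field (bits 30..23) is all zero collapses to a signed zero, everything else passes through unchanged.

// 0x00000001 (smallest positive denormal) -> 0x00000000 (+0.0f)
// 0x80000001 (smallest negative denormal) -> 0x80000000 (-0.0f)
// 0x80000000 (-0.0f)                      -> 0x80000000 (unchanged)
// 0x3F800000 (1.0f)                       -> 0x3F800000 (exponent non-zero, unchanged)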
void Value::CountLeadingZeros(const Value* other) {
switch (other->type) {
case INT8_TYPE:

@@ -104,6 +104,9 @@ struct ValueMask {

class Value {
public:
/*
todo : this should be intrusive and be part of Instr instead.
*/
typedef struct Use_s {
Instr* instr;
Use_s* prev;
@@ -128,17 +131,16 @@ class Value {
TypeName type;

uint32_t flags;
RegAssignment reg;
ConstantValue constant;

Instr* def;
Use* use_head;
// NOTE: for performance reasons this is not maintained during construction.
Instr* last_use;
Value* local_slot;

// TODO(benvanik): remove to shrink size.
void* tag;
RegAssignment reg;
union {
Value* local_slot;
ConstantValue constant;
};

Use* AddUse(Arena* arena, Instr* instr);
void RemoveUse(Use* use);
@@ -209,7 +211,20 @@ class Value {
flags = other->flags;
constant.v128 = other->constant.v128;
}
bool HasLocalSlot() const {
return !(flags & VALUE_IS_CONSTANT) && local_slot;
}
void SetLocalSlot(Value* lslot) {
assert(!(flags & VALUE_IS_CONSTANT));
local_slot = lslot;
}

Value* GetLocalSlot() {
return (flags & VALUE_IS_CONSTANT) ? nullptr : local_slot;
}
const Value* GetLocalSlot() const {
return (flags & VALUE_IS_CONSTANT) ? nullptr : local_slot;
}
inline bool IsConstant() const { return !!(flags & VALUE_IS_CONSTANT); }
bool IsConstantTrue() const {
if (type == VEC128_TYPE) {
@@ -555,7 +570,10 @@ class Value {
void Shr(Value* other);
void Sha(Value* other);
void RotateLeft(Value* other);
void Insert(Value* index, Value* part, TypeName type);
void Extract(Value* vec, Value* index);
void Permute(Value* src1, Value* src2, TypeName type);
void Swizzle(uint32_t mask, TypeName type);
void Select(Value* other, Value* ctrl);
void Splat(Value* other);
void VectorCompareEQ(Value* other, TypeName type);
@@ -575,6 +593,8 @@ class Value {
void VectorAverage(Value* other, TypeName type, bool is_unsigned,
bool saturate);
void ByteSwap();
void DenormalFlush();

void CountLeadingZeros(const Value* other);
bool Compare(Opcode opcode, Value* other);
hir::Instr* GetDefSkipAssigns();

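The value.h hunks above are the reason for the new accessors: local_slot now shares a union with constant, so reading local_slot on a value whose VALUE_IS_CONSTANT flag is set would reinterpret constant bits as a pointer. HasLocalSlot/SetLocalSlot/GetLocalSlot gate on that flag so the data-flow and register-allocation passes edited earlier never touch the aliased field for constants. A minimal illustration of the aliasing hazard, with simplified stand-in types (not the real class):

#include <cassert>
#include <cstdint>

struct ToyValue {
  static constexpr uint32_t kIsConstant = 1 << 0;
  uint32_t flags = 0;
  union {
    ToyValue* local_slot;    // only meaningful for non-constant values
    uint64_t constant_bits;  // only meaningful when kIsConstant is set
  };

  ToyValue() : local_slot(nullptr) {}

  bool HasLocalSlot() const { return !(flags & kIsConstant) && local_slot; }
  void SetLocalSlot(ToyValue* slot) {
    assert(!(flags & kIsConstant));  // would clobber the constant otherwise
    local_slot = slot;
  }
};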
@@ -279,14 +279,21 @@ int InstrEmit_stvlx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd,
Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
// ea &= ~0xF
ea = f.And(ea, f.LoadConstantUint64(~0xFull));
Value* shrs = f.LoadVectorShr(eb);
Value* zerovec = f.LoadZeroVec128();

// v = (old & ~mask) | ((new >> eb) & mask)
Value* new_value = f.Permute(f.LoadVectorShr(eb), f.LoadZeroVec128(),
f.LoadVR(vd), INT8_TYPE);
Value* new_value = f.Permute(shrs, zerovec, f.LoadVR(vd), INT8_TYPE);
Value* old_value = f.ByteSwap(f.Load(ea, VEC128_TYPE));
/*
these permutes need to be looked at closer. keep in mind Permute is meant to
emulate vmx's shuffles and does not generate particularly good code. The logic
here looks as if it might make more sense as a comparison (
*/
// mask = FFFF... >> eb
Value* mask = f.Permute(f.LoadVectorShr(eb), f.LoadZeroVec128(),
f.Not(f.LoadZeroVec128()), INT8_TYPE);
Value* v = f.Or(f.AndNot(old_value, mask), f.And(new_value, mask));
Value* mask = f.Permute(shrs, zerovec, f.Not(zerovec), INT8_TYPE);

Value* v = f.Select(mask, old_value, new_value);
// ea &= ~0xF (handled above)
f.Store(ea, f.ByteSwap(v));
return 0;
@@ -321,14 +328,14 @@ int InstrEmit_stvrx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd,
ea = CalculateEA_0(f, ra, rb);
eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
ea = f.And(ea, f.LoadConstantUint64(~0xFull));
Value* shrs = f.LoadVectorShr(eb);
Value* zerovec = f.LoadZeroVec128();
// v = (old & ~mask) | ((new << eb) & mask)
Value* new_value = f.Permute(f.LoadVectorShr(eb), f.LoadVR(vd),
f.LoadZeroVec128(), INT8_TYPE);
Value* new_value = f.Permute(shrs, f.LoadVR(vd), zerovec, INT8_TYPE);
Value* old_value = f.ByteSwap(f.Load(ea, VEC128_TYPE));
// mask = ~FFFF... >> eb
Value* mask = f.Permute(f.LoadVectorShr(eb), f.Not(f.LoadZeroVec128()),
f.LoadZeroVec128(), INT8_TYPE);
Value* v = f.Or(f.AndNot(old_value, mask), f.And(new_value, mask));
Value* mask = f.Permute(shrs, f.Not(zerovec), zerovec, INT8_TYPE);
Value* v = f.Select(mask, old_value, new_value);
// ea &= ~0xF (handled above)
f.Store(ea, f.ByteSwap(v));
f.MarkLabel(skip_label);
@@ -815,8 +822,16 @@ int InstrEmit_vlogefp128(PPCHIRBuilder& f, const InstrData& i) {

int InstrEmit_vmaddfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb,
uint32_t vc) {
/*
chrispy: testing on POWER8 revealed that altivec vmaddfp unconditionally
flushes denormal inputs to 0, regardless of NJM setting
*/
Value* a = f.VectorDenormFlush(f.LoadVR(va));
Value* b = f.VectorDenormFlush(f.LoadVR(vb));
Value* c = f.VectorDenormFlush(f.LoadVR(vc));
// (VD) <- ((VA) * (VC)) + (VB)
Value* v = f.MulAdd(f.LoadVR(va), f.LoadVR(vc), f.LoadVR(vb));
Value* v = f.MulAdd(a, c, b);
// todo: do denormal results also unconditionally become 0?
f.StoreVR(vd, v);
return 0;
}
@@ -832,9 +847,14 @@ int InstrEmit_vmaddfp128(PPCHIRBuilder& f, const InstrData& i) {
}

int InstrEmit_vmaddcfp128(PPCHIRBuilder& f, const InstrData& i) {
/*
see vmaddfp about these denormflushes
*/
Value* a = f.VectorDenormFlush(f.LoadVR(VX128_VA128));
Value* b = f.VectorDenormFlush(f.LoadVR(VX128_VB128));
Value* d = f.VectorDenormFlush(f.LoadVR(VX128_VD128));
// (VD) <- ((VA) * (VD)) + (VB)
Value* v = f.MulAdd(f.LoadVR(VX128_VA128), f.LoadVR(VX128_VD128),
f.LoadVR(VX128_VB128));
Value* v = f.MulAdd(a, d, b);
f.StoreVR(VX128_VD128, v);
return 0;
}
@@ -1085,7 +1105,8 @@ int InstrEmit_vmsum3fp128(PPCHIRBuilder& f, const InstrData& i) {
// Dot product XYZ.
// (VD.xyzw) = (VA.x * VB.x) + (VA.y * VB.y) + (VA.z * VB.z)
Value* v = f.DotProduct3(f.LoadVR(VX128_VA128), f.LoadVR(VX128_VB128));
v = f.Splat(v, VEC128_TYPE);
//chrispy: denormal outputs for Dot product are unconditionally made 0
v = f.VectorDenormFlush(v);
f.StoreVR(VX128_VD128, v);
return 0;
}
@@ -1094,7 +1115,7 @@ int InstrEmit_vmsum4fp128(PPCHIRBuilder& f, const InstrData& i) {
// Dot product XYZW.
// (VD.xyzw) = (VA.x * VB.x) + (VA.y * VB.y) + (VA.z * VB.z) + (VA.w * VB.w)
Value* v = f.DotProduct4(f.LoadVR(VX128_VA128), f.LoadVR(VX128_VB128));
v = f.Splat(v, VEC128_TYPE);
v = f.VectorDenormFlush(v);
f.StoreVR(VX128_VD128, v);
return 0;
}
@@ -1151,7 +1172,19 @@ int InstrEmit_vnmsubfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb,
// (VD) <- -(((VA) * (VC)) - (VB))
// NOTE: only one rounding should take place, but that's hard...
// This really needs VFNMSUB132PS/VFNMSUB213PS/VFNMSUB231PS but that's AVX.
Value* v = f.Neg(f.MulSub(f.LoadVR(va), f.LoadVR(vc), f.LoadVR(vb)));
// NOTE2: we could make vnmsub a new opcode, and then do it in double
// precision, rounding after the neg

/*
chrispy: this is untested, but i believe this has the same DAZ behavior for
inputs as vmadd
*/

Value* a = f.VectorDenormFlush(f.LoadVR(va));
Value* b = f.VectorDenormFlush(f.LoadVR(vb));
Value* c = f.VectorDenormFlush(f.LoadVR(vc));

Value* v = f.Neg(f.MulSub(a, c, b));
f.StoreVR(vd, v);
return 0;
}