Some guest function calls can now be resolved and embedded directly in
the emitted asm as rel32 calls. Disabled by default; enabled via
resolve_rel32_guest_calls.

- Detect whether the CPU has fast jrcxz and fast loop/loope/loopne.
- Much more thorough LoadConstantXMM.
- New cvar elide_e0_check that allows the backend to assume accesses
  via the SP or TLS register will not cross into the 0xE0000000 range.
- Add x64 codegen for vector shift uint8.
- If the CPU has fast jrcxz, use it for some traptrue/breaktrue
  instructions.
- Use phat nops.
- Add cvar use_fast_dot_product, which uses a four-instruction sequence
  for both dot product instructions that ought to be equivalent.
  Disabled by default.
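For reference, a rel32 call or jmp is a one-byte opcode (0xE8 for call,
0xE9 for jmp) followed by a signed 32-bit displacement measured from the
end of the 5-byte instruction. A minimal sketch of the patch math this
commit performs (a hypothetical standalone helper mirroring the
RGCEmitted patching in the diff below, not code from the diff itself):

    #include <cstdint>
    #include <cstring>

    // Turn a 5-byte placeholder at `site` into a rel32 call/jmp to `target`.
    // The displacement is relative to the first byte after the instruction.
    static void patch_rel32(uint8_t* site, const uint8_t* target,
                            bool is_jump) {
      site[0] = is_jump ? 0xE9 : 0xE8;  // jmp rel32 : call rel32
      const int32_t disp = static_cast<int32_t>(target - (site + 5));
      std::memcpy(site + 1, &disp, sizeof(disp));
    }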
commit 549ee28a93 (parent a4ff64c465)
@@ -43,7 +43,10 @@ DEFINE_bool(ignore_undefined_externs, true,
DEFINE_bool(emit_source_annotations, false,
            "Add extra movs and nops to make disassembly easier to read.",
            "CPU");

DEFINE_bool(resolve_rel32_guest_calls, false,
            "Experimental optimization, directly call already resolved "
            "functions via x86 rel32 call/jmp",
            "CPU");

namespace xe {
namespace cpu {
namespace backend {
@@ -99,7 +102,28 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
  TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
  TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);

#undef TEST_EMIT_FEATURE

  if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
    bool is_zennish = cpu_.displayFamily >= 0x17;
    if (is_zennish) {
      feature_flags_ |= kX64FastJrcx;
      if (cpu_.displayFamily > 0x17) {
        feature_flags_ |= kX64FastLoop;
      } else if (cpu_.displayFamily == 0x17 && cpu_.displayModel >= 0x31) {
        feature_flags_ |= kX64FastLoop;
      }  // TODO: figure out at which model Zen+ became Zen 2; this is just
         // the model of my CPU, which is ripper90.
    }
  }
}

X64Emitter::~X64Emitter() = default;
@@ -149,6 +173,26 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
  if (function) {
    code_cache_->PlaceGuestCode(function->address(), top_, func_info, function,
                                new_execute_address, new_write_address);
    if (cvars::resolve_rel32_guest_calls) {
      for (auto&& callsite : call_sites_) {
#pragma pack(push, 1)
        struct RGCEmitted {
          uint8_t ff_;
          uint32_t rgcid_;
        };
#pragma pack(pop)
        // Scan forward from the function entry for the 0xFF placeholder
        // byte followed by this callsite's unique id.
        RGCEmitted* hunter = reinterpret_cast<RGCEmitted*>(new_execute_address);
        while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) {
          hunter = reinterpret_cast<RGCEmitted*>(
              reinterpret_cast<char*>(hunter) + 1);
        }
        // Rewrite the placeholder as a rel32 call or jmp to the resolved
        // destination.
        hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8;
        hunter->rgcid_ =
            static_cast<uint32_t>(static_cast<intptr_t>(callsite.destination_) -
                                  reinterpret_cast<intptr_t>(hunter + 1));
      }
    }
  } else {
    code_cache_->PlaceHostCode(0, top_, func_info, new_execute_address,
                               new_write_address);
@@ -157,6 +201,7 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
  ready();
  top_ = old_address;
  reset();
  call_sites_.clear();
  return new_execute_address;
}
@@ -287,11 +332,8 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
  code_offsets.tail = getSize();

  if (cvars::emit_source_annotations) {
    nop(5);
  }

  assert_zero(code_offsets.prolog);
@@ -313,11 +355,9 @@ void X64Emitter::MarkSourceOffset(const Instr* i) {
  entry->code_offset = static_cast<uint32_t>(getSize());

  if (cvars::emit_source_annotations) {
    nop(2);
    mov(eax, entry->guest_address);
    nop(2);
  }

  if (debug_info_flags_ & DebugInfoFlags::kDebugInfoTraceFunctionCoverage) {
@@ -414,10 +454,44 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
  assert_not_null(function);
  auto fn = static_cast<X64Function*>(function);
  // Resolve address to the function to call and store in rax.

  if (cvars::resolve_rel32_guest_calls && fn->machine_code()) {
    ResolvableGuestCall rgc;
    rgc.destination_ = uint32_t(uint64_t(fn->machine_code()));
    rgc.offset_ = current_rgc_id_;
    current_rgc_id_++;

    if (!(instr->flags & hir::CALL_TAIL)) {
      mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);

      // Emit a 0xFF placeholder byte plus the callsite id; Emplace() patches
      // this into a rel32 call once the final address is known.
      db(0xFF);
      rgc.is_jump_ = false;
      dd(rgc.offset_);
    } else {
      // Tail call.
      EmitTraceUserCallReturn();
      rgc.is_jump_ = true;
      // Pass the caller's return address over.
      mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);

      add(rsp, static_cast<uint32_t>(stack_size()));
      db(0xFF);
      dd(rgc.offset_);
    }
    call_sites_.push_back(rgc);
    return;
  }

  if (fn->machine_code()) {
    // TODO(benvanik): is it worth it to do this? It removes the need for
    // a ResolveFunction call, but makes the table less useful.
    assert_zero(uint64_t(fn->machine_code()) & 0xFFFFFFFF00000000);
    // TODO: this should be changed so that we can actually do a call to
    // fn->machine_code(). The code will be emitted near us, so a 32-bit
    // rel jmp should be possible.
    mov(eax, uint32_t(uint64_t(fn->machine_code())));
  } else if (code_cache_->has_indirection_table()) {
    // Load the pointer to the indirection table maintained in X64CodeCache.
@@ -600,6 +674,30 @@ void X64Emitter::ReloadContext() {
void X64Emitter::ReloadMembase() {
  mov(GetMembaseReg(), qword[GetContextReg() + 8]);  // membase
}

#define __NH_CONCAT(x, y) x##y
#define _MH_CONCAT(cb, ...) cb(__VA_ARGS__)

#define mh_concat2_m(x, y) __NH_CONCAT(x, y)

#define DECLNOP(n, ...) \
  static constexpr unsigned char mh_concat2_m(nop_, n)[] = {__VA_ARGS__}

DECLNOP(1, 0x90);
DECLNOP(2, 0x66, 0x90);
DECLNOP(3, 0x0F, 0x1F, 0x00);
DECLNOP(4, 0x0F, 0x1F, 0x40, 0x00);
DECLNOP(5, 0x0F, 0x1F, 0x44, 0x00, 0x00);
DECLNOP(6, 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00);
DECLNOP(7, 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00);
DECLNOP(8, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00);
DECLNOP(9, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00);

static constexpr const unsigned char* const g_noptable[] = {
    &nop_1[0], &nop_1[0], &nop_2[0], &nop_3[0], &nop_4[0],
    &nop_5[0], &nop_6[0], &nop_7[0], &nop_8[0], &nop_9[0]};

static constexpr unsigned LENGTHOF_NOPTABLE =
    sizeof(g_noptable) / sizeof(g_noptable[0]);

// Len Assembly                                      Byte Sequence
// ============================================================================
@@ -613,9 +711,17 @@ void X64Emitter::ReloadMembase() {
// 8b  NOP DWORD ptr [EAX + EAX*1 + 00000000H]      0F 1F 84 00 00 00 00 00H
// 9b  66 NOP DWORD ptr [EAX + EAX*1 + 00000000H]   66 0F 1F 84 00 00 00 00 00H
void X64Emitter::nop(size_t length) {
  while (length != 0) {
    // Emit the largest single nop the table provides (up to 9 bytes), then
    // repeat until the requested length is covered.
    unsigned patchsize = static_cast<unsigned>(
        length < LENGTHOF_NOPTABLE ? length : LENGTHOF_NOPTABLE - 1);
    for (unsigned i = 0; i < patchsize; ++i) {
      db(g_noptable[patchsize][i]);
    }
    length -= patchsize;
  }
}
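For illustration, a host-side model of the emission loop above (a sketch,
not part of the diff; it prints the plan instead of emitting bytes):

    #include <cstddef>
    #include <cstdio>

    // Cover `length` bytes of padding with the largest single-instruction
    // nop the table provides (9 bytes), repeating until done.
    static void print_nop_plan(std::size_t length) {
      while (length != 0) {
        const std::size_t patch = length < 10 ? length : 9;
        std::printf("emit %zu-byte nop\n", patch);
        length -= patch;
      }
    }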
@@ -649,6 +755,35 @@ void X64Emitter::MovMem64(const Xbyak::RegExp& addr, uint64_t v) {
    mov(dword[addr + 4], static_cast<uint32_t>(v >> 32));
  }
}

static inline vec128_t v128_setr_bytes(unsigned char v0, unsigned char v1,
                                       unsigned char v2, unsigned char v3,
                                       unsigned char v4, unsigned char v5,
                                       unsigned char v6, unsigned char v7,
                                       unsigned char v8, unsigned char v9,
                                       unsigned char v10, unsigned char v11,
                                       unsigned char v12, unsigned char v13,
                                       unsigned char v14, unsigned char v15) {
  vec128_t result;
  result.u8[0] = v0;
  result.u8[1] = v1;
  result.u8[2] = v2;
  result.u8[3] = v3;
  result.u8[4] = v4;
  result.u8[5] = v5;
  result.u8[6] = v6;
  result.u8[7] = v7;
  result.u8[8] = v8;
  result.u8[9] = v9;
  result.u8[10] = v10;
  result.u8[11] = v11;
  result.u8[12] = v12;
  result.u8[13] = v13;
  result.u8[14] = v14;
  result.u8[15] = v15;
  return result;
}

static const vec128_t xmm_consts[] = {
    /* XMMZero */ vec128f(0.0f),
@@ -761,8 +896,60 @@ static const vec128_t xmm_consts[] = {
    /* XMMQNaN */ vec128i(0x7FC00000u),
    /* XMMInt127 */ vec128i(0x7Fu),
    /* XMM2To32 */ vec128f(0x1.0p32f),
    /* XMMFloatInf */ vec128i(0x7F800000),

    /* XMMIntsToBytes */
    v128_setr_bytes(0, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                    0x80, 0x80, 0x80, 0x80, 0x80),
    /* XMMShortsToBytes */
    v128_setr_bytes(0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80,
                    0x80, 0x80, 0x80)};

void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
  for (auto& vec : xmm_consts) {
    for (auto& u8 : vec.u8) {
      if (u8 == bytevalue) {
        return reinterpret_cast<void*>(backend_->emitter_data() +
                                       (&u8 - &xmm_consts[0].u8[0]));
      }
    }
  }
  return nullptr;
}

void* X64Emitter::FindWordConstantOffset(unsigned wordvalue) {
  for (auto& vec : xmm_consts) {
    for (auto& u16 : vec.u16) {
      if (u16 == wordvalue) {
        return reinterpret_cast<void*>(backend_->emitter_data() +
                                       ((&u16 - &xmm_consts[0].u16[0]) * 2));
      }
    }
  }
  return nullptr;
}

void* X64Emitter::FindDwordConstantOffset(unsigned dwordvalue) {
  for (auto& vec : xmm_consts) {
    for (auto& u32 : vec.u32) {
      if (u32 == dwordvalue) {
        return reinterpret_cast<void*>(backend_->emitter_data() +
                                       ((&u32 - &xmm_consts[0].u32[0]) * 4));
      }
    }
  }
  return nullptr;
}

void* X64Emitter::FindQwordConstantOffset(uint64_t qwordvalue) {
  for (auto& vec : xmm_consts) {
    for (auto& u64 : vec.u64) {
      if (u64 == qwordvalue) {
        return reinterpret_cast<void*>(backend_->emitter_data() +
                                       ((&u64 - &xmm_consts[0].u64[0]) * 8));
      }
    }
  }
  return nullptr;
}

// First location to try and place constants.
static const uintptr_t kConstDataLocation = 0x20000000;
static const uintptr_t kConstDataSize = sizeof(xmm_consts);
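These helpers let LoadConstantXmm (further down) feed vpbroadcastb/w/d/q
from any matching scalar already present in the constant table, instead of
spending a fresh 16-byte table slot. A scalar model of the lookup (a
sketch; the function name is illustrative, not from the diff):

    #include <cstddef>

    // Scan the table as a flat array of N-byte elements; a hit at element i
    // corresponds to byte offset i * sizeof(Elem) within the table copy
    // that lives at emitter_data().
    template <typename Elem>
    static std::ptrdiff_t find_elem_byte_offset(const Elem* table,
                                                std::size_t count,
                                                Elem value) {
      for (std::size_t i = 0; i < count; ++i) {
        if (table[i] == value) {
          return static_cast<std::ptrdiff_t>(i * sizeof(Elem));
        }
      }
      return -1;  // not present
    }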
@@ -806,7 +993,6 @@ Xbyak::Address X64Emitter::GetXmmConstPtr(XmmConst id) {
  return ptr[reinterpret_cast<void*>(backend_->emitter_data() +
                                     sizeof(vec128_t) * id)];
}

// Implies possible StashXmm(0, ...)!
void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
  // https://www.agner.org/optimize/optimizing_assembly.pdf
@@ -818,12 +1004,115 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
    // 1111...
    vpcmpeqb(dest, dest);
  } else {
    // Check the constant table for an exact match first.
    for (size_t i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) {
      if (xmm_consts[i] == v) {
        vmovapd(dest, GetXmmConstPtr((XmmConst)i));
        return;
      }
    }
    if (IsFeatureEnabled(kX64EmitAVX2)) {
      // If every lane repeats the same value, broadcast it from an existing
      // table scalar, or from a scalar stashed on the stack.
      bool all_equal_bytes = true;
      unsigned firstbyte = v.u8[0];
      for (unsigned i = 1; i < 16; ++i) {
        if (v.u8[i] != firstbyte) {
          all_equal_bytes = false;
          break;
        }
      }
      if (all_equal_bytes) {
        void* bval = FindByteConstantOffset(firstbyte);
        if (bval) {
          vpbroadcastb(dest, byte[bval]);
          return;
        }
        // Didn't find existing memory with the value.
        mov(byte[rsp + kStashOffset], firstbyte);
        vpbroadcastb(dest, byte[rsp + kStashOffset]);
        return;
      }

      bool all_equal_words = true;
      unsigned firstword = v.u16[0];
      for (unsigned i = 1; i < 8; ++i) {
        if (v.u16[i] != firstword) {
          all_equal_words = false;
          break;
        }
      }
      if (all_equal_words) {
        void* wval = FindWordConstantOffset(firstword);
        if (wval) {
          vpbroadcastw(dest, word[wval]);
          return;
        }
        // Didn't find existing memory with the value.
        mov(word[rsp + kStashOffset], firstword);
        vpbroadcastw(dest, word[rsp + kStashOffset]);
        return;
      }

      bool all_equal_dwords = true;
      unsigned firstdword = v.u32[0];
      for (unsigned i = 1; i < 4; ++i) {
        if (v.u32[i] != firstdword) {
          all_equal_dwords = false;
          break;
        }
      }
      if (all_equal_dwords) {
        void* dwval = FindDwordConstantOffset(firstdword);
        if (dwval) {
          vpbroadcastd(dest, dword[dwval]);
          return;
        }
        mov(dword[rsp + kStashOffset], firstdword);
        vpbroadcastd(dest, dword[rsp + kStashOffset]);
        return;
      }

      bool all_equal_qwords = v.low == v.high;
      if (all_equal_qwords) {
        void* qwval = FindQwordConstantOffset(v.low);
        if (qwval) {
          vpbroadcastq(dest, qword[qwval]);
          return;
        }
        MovMem64(rsp + kStashOffset, v.low);
        vpbroadcastq(dest, qword[rsp + kStashOffset]);
        return;
      }
    }

    if (v.high == 0 && v.low == ~0ULL) {
      // All ones in the low qword, zero high: set all bits, then clear the
      // upper 64 with a reg-to-reg movq.
      vpcmpeqb(dest, dest);
      movq(dest, dest);
      return;
    }
    if (v.high == 0) {
      if ((v.low & 0xFFFFFFFF) == v.low) {
        mov(dword[rsp + kStashOffset], static_cast<unsigned>(v.low));
        movd(dest, dword[rsp + kStashOffset]);
        return;
      }
      MovMem64(rsp + kStashOffset, v.low);
      movq(dest, qword[rsp + kStashOffset]);
      return;
    }

    // TODO(benvanik): see what other common values are.
    // TODO(benvanik): build constant table - 99% are reused.
    MovMem64(rsp + kStashOffset, v.low);
@@ -116,6 +116,9 @@ enum XmmConst {
  XMMQNaN,
  XMMInt127,
  XMM2To32,
  XMMFloatInf,
  XMMIntsToBytes,
  XMMShortsToBytes
};

// Unfortunately due to the design of xbyak we have to pass this to the ctor.
@@ -141,7 +144,16 @@ enum X64EmitterFeatureFlags {
  kX64EmitAVX512DQ = 1 << 11,

  kX64EmitAVX512Ortho = kX64EmitAVX512F | kX64EmitAVX512VL,
  kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ,
  kX64FastJrcx = 1 << 12,  // jrcxz is as fast as any other jump (>= Zen 1)
  kX64FastLoop =
      1 << 13,  // loop/loope/loopne is as fast as any other jump (>= Zen 2)
};

class ResolvableGuestCall {
 public:
  bool is_jump_;
  uintptr_t destination_;
  // Unique id (rgcid) used to locate the emitted placeholder when patching.
  unsigned offset_;
};

class X64Emitter : public Xbyak::CodeGenerator {
@@ -230,7 +242,10 @@ class X64Emitter : public Xbyak::CodeGenerator {
  Xbyak::Address StashConstantXmm(int index, float v);
  Xbyak::Address StashConstantXmm(int index, double v);
  Xbyak::Address StashConstantXmm(int index, const vec128_t& v);

  void* FindByteConstantOffset(unsigned bytevalue);
  void* FindWordConstantOffset(unsigned wordvalue);
  void* FindDwordConstantOffset(unsigned dwordvalue);
  void* FindQwordConstantOffset(uint64_t qwordvalue);

  bool IsFeatureEnabled(uint32_t feature_flag) const {
    return (feature_flags_ & feature_flag) == feature_flag;
  }
@@ -267,6 +282,8 @@ class X64Emitter : public Xbyak::CodeGenerator {
  static const uint32_t gpr_reg_map_[GPR_COUNT];
  static const uint32_t xmm_reg_map_[XMM_COUNT];
  uint32_t current_rgc_id_ = 0xEEDDF00F;
  std::vector<ResolvableGuestCall> call_sites_;
};

}  // namespace x64
@@ -109,22 +109,39 @@ struct DEBUG_BREAK_TRUE_I32
    : Sequence<DEBUG_BREAK_TRUE_I32,
               I<OPCODE_DEBUG_BREAK_TRUE, VoidOp, I32Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    if (e.IsFeatureEnabled(kX64FastJrcx)) {
      e.mov(e.ecx, i.src1);
      Xbyak::Label skip;
      e.jrcxz(skip);
      e.DebugBreak();
      e.L(skip);
    } else {
      e.test(i.src1, i.src1);
      Xbyak::Label skip;
      e.jz(skip);
      e.DebugBreak();
      e.L(skip);
    }
  }
};
struct DEBUG_BREAK_TRUE_I64
    : Sequence<DEBUG_BREAK_TRUE_I64,
               I<OPCODE_DEBUG_BREAK_TRUE, VoidOp, I64Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    if (e.IsFeatureEnabled(kX64FastJrcx)) {
      e.mov(e.rcx, i.src1);
      Xbyak::Label skip;
      e.jrcxz(skip);
      e.DebugBreak();
      e.L(skip);
    } else {
      e.test(i.src1, i.src1);
      Xbyak::Label skip;
      e.jz(skip);
      e.DebugBreak();
      e.L(skip);
    }
  }
};
struct DEBUG_BREAK_TRUE_F32
@@ -190,21 +207,37 @@ struct TRAP_TRUE_I16
struct TRAP_TRUE_I32
    : Sequence<TRAP_TRUE_I32, I<OPCODE_TRAP_TRUE, VoidOp, I32Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    if (e.IsFeatureEnabled(kX64FastJrcx)) {
      e.mov(e.ecx, i.src1);
      Xbyak::Label skip;
      e.jrcxz(skip);
      e.Trap(i.instr->flags);
      e.L(skip);
    } else {
      e.test(i.src1, i.src1);
      Xbyak::Label skip;
      e.jz(skip);
      e.Trap(i.instr->flags);
      e.L(skip);
    }
  }
};
struct TRAP_TRUE_I64
    : Sequence<TRAP_TRUE_I64, I<OPCODE_TRAP_TRUE, VoidOp, I64Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    if (e.IsFeatureEnabled(kX64FastJrcx)) {
      e.mov(e.rcx, i.src1);
      Xbyak::Label skip;
      e.jrcxz(skip);
      e.Trap(i.instr->flags);
      e.L(skip);
    } else {
      e.test(i.src1, i.src1);
      Xbyak::Label skip;
      e.jz(skip);
      e.Trap(i.instr->flags);
      e.L(skip);
    }
  }
};
struct TRAP_TRUE_F32
@@ -355,22 +388,39 @@ struct CALL_INDIRECT_TRUE_I32
    : Sequence<CALL_INDIRECT_TRUE_I32,
               I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, I32Op, I64Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    if (e.IsFeatureEnabled(kX64FastJrcx)) {
      e.mov(e.ecx, i.src1);
      Xbyak::Label skip;
      e.jrcxz(skip);
      e.CallIndirect(i.instr, i.src2);
      e.L(skip);
    } else {
      e.test(i.src1, i.src1);
      Xbyak::Label skip;
      e.jz(skip, CodeGenerator::T_NEAR);
      e.CallIndirect(i.instr, i.src2);
      e.L(skip);
    }
  }
};
struct CALL_INDIRECT_TRUE_I64
    : Sequence<CALL_INDIRECT_TRUE_I64,
               I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, I64Op, I64Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    if (e.IsFeatureEnabled(kX64FastJrcx)) {
      e.mov(e.rcx, i.src1);
      Xbyak::Label skip;
      e.jrcxz(skip);
      e.CallIndirect(i.instr, i.src2);
      e.L(skip);
    } else {
      e.test(i.src1, i.src1);
      Xbyak::Label skip;
      e.jz(skip, CodeGenerator::T_NEAR);
      e.CallIndirect(i.instr, i.src2);
      e.L(skip);
    }
  }
};
struct CALL_INDIRECT_TRUE_F32
@@ -15,6 +15,13 @@
#include "xenia/base/cvar.h"
#include "xenia/base/memory.h"
#include "xenia/cpu/backend/x64/x64_op.h"
#include "xenia/cpu/backend/x64/x64_tracers.h"
#include "xenia/cpu/ppc/ppc_context.h"

DEFINE_bool(
    elide_e0_check, false,
    "Eliminate the e0 check on some memory accesses, like to r13 (TLS) or "
    "r1 (SP)",
    "CPU");

namespace xe {
namespace cpu {
@@ -27,7 +34,30 @@ volatile int anchor_memory = 0;
RegExp ComputeContextAddress(X64Emitter& e, const OffsetOp& offset) {
  return e.GetContextReg() + offset.value;
}

// Returns true if the value is known to come from r1 (the stack pointer) or
// r13 (TLS), registers that are assumed never to point into the 0xE0000000
// range.
static bool is_eo_def(const hir::Value* v) {
  if (v->def) {
    auto df = v->def;
    if (df->opcode == &OPCODE_LOAD_CONTEXT_info) {
      size_t offs = df->src1.offset;
      if (offs == offsetof(ppc::PPCContext_s, r[1]) ||
          offs == offsetof(ppc::PPCContext_s, r[13])) {
        return true;
      }
    } else if (df->opcode == &OPCODE_ASSIGN_info) {
      return is_eo_def(df->src1.value);
    }
  }
  return false;
}

template <typename T>
static bool is_definitely_not_eo(const T& v) {
  if (!cvars::elide_e0_check) {
    return false;
  }
  return is_eo_def(v.value);
}

template <typename T>
RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest,
                                  const T& offset) {
@@ -49,7 +79,8 @@ RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest,
      return e.GetMembaseReg() + e.rax;
    }
  } else {
    if (xe::memory::allocation_granularity() > 0x1000 &&
        !is_definitely_not_eo(guest)) {
      // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
      // it via memory mapping.
      e.xor_(e.eax, e.eax);
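For context, a scalar sketch of the address adjustment whose guard is being
elided (inferred from the surrounding code, not taken from the diff):

    #include <cstdint>

    // Guest-to-host offset with the "e0 check": guest addresses at or above
    // 0xE0000000 are shifted up by one 4 KB page when the host cannot
    // reproduce that layout via memory mapping. SP/TLS-derived addresses are
    // assumed never to land in that range, so the check can be skipped.
    static uint64_t host_offset_model(uint32_t guest_addr) {
      uint64_t offset = guest_addr;
      if (guest_addr >= 0xE0000000u) {
        offset += 0x1000;
      }
      return offset;
    }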
@@ -60,12 +91,12 @@ RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest,
    } else {
      // Clear the top 32 bits, as they are likely garbage.
      // TODO(benvanik): find a way to avoid doing this.
      e.mov(e.eax, guest.reg().cvt32());
    }
    return e.GetMembaseReg() + e.rax + offset_const;
  }
}

// Note: most *should* be aligned, but needs to be checked!
template <typename T>
RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
@@ -86,7 +117,8 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
      return e.GetMembaseReg() + e.rax;
    }
  } else {
    if (xe::memory::allocation_granularity() > 0x1000 &&
        !is_definitely_not_eo(guest)) {
      // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
      // it via memory mapping.
      e.xor_(e.eax, e.eax);
@@ -728,28 +728,103 @@ struct VECTOR_SHL_V128
    }
  }

  static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
    // TODO(benvanik): native version (with shift magic).
    if (e.IsFeatureEnabled(kX64EmitAVX2)) {
      if (!i.src2.is_constant) {
        // Widen both operands to dwords across two ymm registers, do the
        // per-lane variable shift there, then narrow back down to bytes.
        e.vpunpckhqdq(e.xmm1, i.src1, i.src1);  // get high 8 bytes
        e.vpunpckhqdq(e.xmm3, i.src2, i.src2);

        e.vpmovzxbd(e.ymm0, i.src1);
        e.vpmovzxbd(e.ymm1, e.xmm1);

        e.vpmovzxbd(e.ymm2, i.src2);
        e.vpmovzxbd(e.ymm3, e.xmm3);

        e.vpsllvd(e.ymm0, e.ymm0, e.ymm2);
        e.vpsllvd(e.ymm1, e.ymm1, e.ymm3);
        e.vextracti128(e.xmm2, e.ymm0, 1);
        e.vextracti128(e.xmm3, e.ymm1, 1);
        e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMIntsToBytes));
        e.vpshufb(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMIntsToBytes));
        e.vpshufb(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMIntsToBytes));
        e.vpshufb(e.xmm3, e.xmm3, e.GetXmmConstPtr(XMMIntsToBytes));

        // Recombine in lane order: xmm0 holds bytes 0-3, xmm2 bytes 4-7,
        // xmm1 bytes 8-11, xmm3 bytes 12-15.
        e.vpunpckldq(e.xmm0, e.xmm0, e.xmm2);
        e.vpunpckldq(e.xmm1, e.xmm1, e.xmm3);
        e.vpunpcklqdq(i.dest, e.xmm0, e.xmm1);
        return;
      } else {
        vec128_t constmask = i.src2.constant();
        // Only the low three bits of each shift count matter.
        for (unsigned n = 0; n < 16; ++n) {
          constmask.u8[n] &= 7;
        }

        unsigned seenvalue = constmask.u8[0];
        bool all_same = true;
        for (unsigned n = 1; n < 16; ++n) {
          if (constmask.u8[n] != seenvalue) {
            all_same = false;
            break;
          }
        }
        if (all_same) {
          // Every count is the same: widen to words, shift by an immediate,
          // and narrow back down to bytes.
          /*if (seenvalue == 1) {
            // mul by two
            e.vpaddb(i.dest, i.src1, i.src1);
          } else if (seenvalue == 2) {
            e.vpaddb(i.dest, i.src1, i.src1);
            e.vpaddb(i.dest, i.dest, i.dest);
          } else if (seenvalue == 3) {
            // mul by 8
            e.vpaddb(i.dest, i.src1, i.src1);
            e.vpaddb(i.dest, i.dest, i.dest);
            e.vpaddb(i.dest, i.dest, i.dest);
          } else*/
          {
            e.vpmovzxbw(e.ymm0, i.src1);
            e.vpsllw(e.ymm0, e.ymm0, seenvalue);
            e.vextracti128(e.xmm1, e.ymm0, 1);

            e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMShortsToBytes));
            e.vpshufb(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMShortsToBytes));
            e.vpunpcklqdq(i.dest, e.xmm0, e.xmm1);
            return;
          }
        } else {
          // Mixed counts: same widening approach as the non-constant path,
          // with the masked counts loaded as a constant.
          e.LoadConstantXmm(e.xmm2, constmask);

          e.vpunpckhqdq(e.xmm1, i.src1, i.src1);
          e.vpunpckhqdq(e.xmm3, e.xmm2, e.xmm2);

          e.vpmovzxbd(e.ymm0, i.src1);
          e.vpmovzxbd(e.ymm1, e.xmm1);

          e.vpmovzxbd(e.ymm2, e.xmm2);
          e.vpmovzxbd(e.ymm3, e.xmm3);

          e.vpsllvd(e.ymm0, e.ymm0, e.ymm2);
          e.vpsllvd(e.ymm1, e.ymm1, e.ymm3);
          e.vextracti128(e.xmm2, e.ymm0, 1);
          e.vextracti128(e.xmm3, e.ymm1, 1);
          e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMIntsToBytes));
          e.vpshufb(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMIntsToBytes));
          e.vpshufb(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMIntsToBytes));
          e.vpshufb(e.xmm3, e.xmm3, e.GetXmmConstPtr(XMMIntsToBytes));

          e.vpunpckldq(e.xmm0, e.xmm0, e.xmm2);
          e.vpunpckldq(e.xmm1, e.xmm1, e.xmm3);
          e.vpunpcklqdq(i.dest, e.xmm0, e.xmm1);
          return;
        }
      }
    }
    if (i.src2.is_constant) {
      e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
    } else {
      e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
@@ -758,7 +833,6 @@ struct VECTOR_SHL_V128
    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint8_t>));
    e.vmovaps(i.dest, e.xmm0);
  }

  static void EmitInt16(X64Emitter& e, const EmitArgType& i) {
    Xmm src1;
    if (i.src1.is_constant) {
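A scalar model of what the widened AVX2 paths above compute per lane
(a sanity-check sketch, not part of the diff):

    #include <cstdint>

    // VECTOR_SHL on uint8 lanes: each byte shifts left by its per-lane
    // count masked to three bits, matching the `constmask.u8[n] &= 7`
    // normalization in the constant path.
    static void vector_shl_u8_model(uint8_t dst[16], const uint8_t src[16],
                                    const uint8_t counts[16]) {
      for (int lane = 0; lane < 16; ++lane) {
        dst[lane] = static_cast<uint8_t>(src[lane] << (counts[lane] & 7));
      }
    }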
@@ -38,6 +38,10 @@
#include "xenia/cpu/hir/hir_builder.h"
#include "xenia/cpu/processor.h"

DEFINE_bool(use_fast_dot_product, false,
            "Experimental optimization: a much shorter (four-instruction) "
            "sequence for both dot product instructions, treating inf as "
            "overflow instead of checking MXCSR",
            "CPU");

namespace xe {
namespace cpu {
namespace backend {
@@ -886,7 +890,10 @@ struct COMPARE_EQ_I8
          e.cmp(src1, src2);
        },
        [](X64Emitter& e, const Reg8& src1, int32_t constant) {
          if (constant == 0) {
            e.test(src1, src1);
          } else {
            e.cmp(src1, constant);
          }
        });
    e.sete(i.dest);
  }
@@ -900,7 +907,10 @@ struct COMPARE_EQ_I16
          e.cmp(src1, src2);
        },
        [](X64Emitter& e, const Reg16& src1, int32_t constant) {
          if (constant == 0) {
            e.test(src1, src1);
          } else {
            e.cmp(src1, constant);
          }
        });
    e.sete(i.dest);
  }
@@ -914,7 +924,10 @@ struct COMPARE_EQ_I32
          e.cmp(src1, src2);
        },
        [](X64Emitter& e, const Reg32& src1, int32_t constant) {
          if (constant == 0) {
            e.test(src1, src1);
          } else {
            e.cmp(src1, constant);
          }
        });
    e.sete(i.dest);
  }
@@ -928,7 +941,10 @@ struct COMPARE_EQ_I64
          e.cmp(src1, src2);
        },
        [](X64Emitter& e, const Reg64& src1, int32_t constant) {
          if (constant == 0) {
            e.test(src1, src1);
          } else {
            e.cmp(src1, constant);
          }
        });
    e.sete(i.dest);
  }
@@ -1980,6 +1996,8 @@ struct DIV_V128 : Sequence<DIV_V128, I<OPCODE_DIV, V128Op, V128Op, V128Op>> {
    assert_true(!i.instr->flags);
    EmitAssociativeBinaryXmmOp(e, i,
                               [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
                                 // e.vrcpps(e.xmm0, src2);
                                 // e.vmulps(dest, src1, e.xmm0);
                                 e.vdivps(dest, src1, src2);
                               });
  }
@@ -2591,43 +2609,51 @@ EMITTER_OPCODE_TABLE(OPCODE_LOG2, LOG2_F32, LOG2_F64, LOG2_V128);

struct DOT_PRODUCT_V128 {
  static void Emit(X64Emitter& e, Xmm dest, Xmm src1, Xmm src2, uint8_t imm) {
    if (cvars::use_fast_dot_product) {
      // Four-instruction sequence: do the dot product, then replace any
      // lane whose magnitude reached infinity with QNaN.
      e.vdpps(dest, src1, src2, imm);
      e.vandps(e.xmm0, dest, e.GetXmmConstPtr(XMMAbsMaskPS));
      e.vcmpgeps(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMFloatInf));
      e.vblendvps(dest, dest, e.GetXmmConstPtr(XMMQNaN), e.xmm0);
    } else {
      // TODO(benvanik): apparently this is very slow
      // - find alternative?
      Xbyak::Label end;
      e.inLocalLabel();

      // Grab space to put MXCSR.
      // TODO(gibbed): stick this in TLS or
      // something?
      e.sub(e.rsp, 8);

      // Grab MXCSR and mask off the overflow flag,
      // because it's sticky.
      e.vstmxcsr(e.dword[e.rsp]);
      e.mov(e.eax, e.dword[e.rsp]);
      e.and_(e.eax, uint32_t(~8));
      e.mov(e.dword[e.rsp], e.eax);
      e.vldmxcsr(e.dword[e.rsp]);

      // Hey we can do the dot product now.
      e.vdpps(dest, src1, src2, imm);

      // Load MXCSR...
      e.vstmxcsr(e.dword[e.rsp]);

      // ..free our temporary space and get MXCSR at
      // the same time
      e.pop(e.rax);

      // Did we overflow?
      e.test(e.al, 8);
      e.jz(end);

      // Infinity? HA! Give NAN.
      e.vmovdqa(dest, e.GetXmmConstPtr(XMMQNaN));

      e.L(end);
      e.outLocalLabel();
    }
  }
};
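A scalar model of the fast path's semantics (a sketch; the emitted sequence
applies this per lane selected by `imm`):

    #include <cmath>
    #include <limits>

    // Fast dot product: compute directly, then hand back a quiet NaN
    // wherever the magnitude reached infinity, which is the job done above
    // by vandps (abs), vcmpgeps (>= inf mask), and vblendvps (select QNaN).
    static float fast_dot_model(const float a[4], const float b[4]) {
      float dot = 0.0f;
      for (int lane = 0; lane < 4; ++lane) {
        dot += a[lane] * b[lane];
      }
      if (std::fabs(dot) >= std::numeric_limits<float>::infinity()) {
        return std::numeric_limits<float>::quiet_NaN();
      }
      return dot;
    }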