Merge pull request #46 from chrisps/canary_experimental

Some guest function calls can now be resolved and embedded directly in the generated code.

commit 9a72d6ab05
@@ -43,7 +43,10 @@ DEFINE_bool(ignore_undefined_externs, true,
DEFINE_bool(emit_source_annotations, false,
            "Add extra movs and nops to make disassembly easier to read.",
            "CPU");

DEFINE_bool(resolve_rel32_guest_calls, false,
            "Experimental optimization, directly call already resolved "
            "functions via x86 rel32 call/jmp",
            "CPU");

namespace xe {
namespace cpu {
namespace backend {
@@ -99,7 +102,28 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
  TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
  TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);

#undef TEST_EMIT_FEATURE

  if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
    bool is_zennish = cpu_.displayFamily >= 0x17;

    if (is_zennish) {
      feature_flags_ |= kX64FastJrcx;

      if (cpu_.displayFamily > 0x17) {
        feature_flags_ |= kX64FastLoop;
      } else if (cpu_.displayFamily == 0x17 && cpu_.displayModel >= 0x31) {
        feature_flags_ |= kX64FastLoop;
      }  // TODO: figure out at what model Zen+ became Zen 2; this is just
         // the model for my CPU, which is ripper90
    }
  }
}

X64Emitter::~X64Emitter() = default;
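For reference, the family/model gates above can be exercised in isolation with the same Xbyak CPU query the constructor uses. A minimal standalone sketch (the header path may vary by installation; family 0x17 covers Zen/Zen+/Zen 2, model 0x31+ within it is Zen 2, and higher families are Zen 3 and later):

```cpp
#include <cstdio>
#include <xbyak/xbyak_util.h>

int main() {
  Xbyak::util::Cpu cpu;
  if (cpu.has(Xbyak::util::Cpu::tAMD)) {
    bool fast_jrcx = cpu.displayFamily >= 0x17;
    bool fast_loop = cpu.displayFamily > 0x17 ||
                     (cpu.displayFamily == 0x17 && cpu.displayModel >= 0x31);
    std::printf("fast jrcxz: %d, fast loop: %d\n", fast_jrcx ? 1 : 0,
                fast_loop ? 1 : 0);
  }
  return 0;
}
```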
@@ -149,6 +173,26 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
  if (function) {
    code_cache_->PlaceGuestCode(function->address(), top_, func_info, function,
                                new_execute_address, new_write_address);
    if (cvars::resolve_rel32_guest_calls) {
      for (auto&& callsite : call_sites_) {
#pragma pack(push, 1)
        struct RGCEmitted {
          uint8_t ff_;
          uint32_t rgcid_;
        };
#pragma pack(pop)
        RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
        while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) {
          hunter = reinterpret_cast<RGCEmitted*>(
              reinterpret_cast<char*>(hunter) + 1);
        }

        hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8;
        hunter->rgcid_ =
            static_cast<uint32_t>(static_cast<intptr_t>(callsite.destination_) -
                                  reinterpret_cast<intptr_t>(hunter + 1));
      }
    }
  } else {
    code_cache_->PlaceHostCode(0, top_, func_info, new_execute_address,
                               new_write_address);

@@ -157,6 +201,7 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
  ready();
  top_ = old_address;
  reset();
  call_sites_.clear();
  return new_execute_address;
}
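The mechanism: `Call()` (further down in this diff) lays down a 5-byte placeholder, opcode byte `0xFF` followed by a unique 32-bit id, for each call whose target is already compiled; `Emplace` then scans the finished code for each id and rewrites the placeholder in place as a rel32 `call` (0xE8) or `jmp` (0xE9), whose displacement is relative to the end of the 5-byte instruction. A standalone sketch of that patch step, with a hypothetical helper name and no Xenia APIs:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

#pragma pack(push, 1)
struct RGCEmitted {
  uint8_t ff_;      // 0xFF placeholder, becomes the call/jmp opcode
  uint32_t rgcid_;  // unique id, becomes the rel32 displacement
};
#pragma pack(pop)

// Patch the placeholder tagged `id` inside [code, code + size) into a rel32
// call/jmp to `target`. A sketch: assumes the id was emitted exactly once and
// the target lies within +/-2 GiB of the patch site.
inline void PatchRel32(uint8_t* code, size_t size, uint32_t id,
                       uint8_t* target, bool is_jump) {
  for (uint8_t* p = code; p + sizeof(RGCEmitted) <= code + size; ++p) {
    RGCEmitted hunt;
    std::memcpy(&hunt, p, sizeof(hunt));  // memcpy avoids alignment issues
    if (hunt.ff_ == 0xFF && hunt.rgcid_ == id) {
      hunt.ff_ = is_jump ? 0xE9 : 0xE8;
      hunt.rgcid_ = static_cast<uint32_t>(
          target - (p + sizeof(RGCEmitted)));  // rel32 from next instruction
      std::memcpy(p, &hunt, sizeof(hunt));
      return;
    }
  }
}
```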
@@ -287,11 +332,8 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
  code_offsets.tail = getSize();

  if (cvars::emit_source_annotations) {
    nop(5);
  }

  assert_zero(code_offsets.prolog);
@@ -313,11 +355,9 @@ void X64Emitter::MarkSourceOffset(const Instr* i) {
  entry->code_offset = static_cast<uint32_t>(getSize());

  if (cvars::emit_source_annotations) {
    nop(2);
    mov(eax, entry->guest_address);
    nop(2);
  }

  if (debug_info_flags_ & DebugInfoFlags::kDebugInfoTraceFunctionCoverage) {
@@ -414,10 +454,44 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
  assert_not_null(function);
  auto fn = static_cast<X64Function*>(function);
  // Resolve address to the function to call and store in rax.

  if (cvars::resolve_rel32_guest_calls && fn->machine_code()) {
    ResolvableGuestCall rgc;
    rgc.destination_ = uint32_t(uint64_t(fn->machine_code()));
    rgc.offset_ = current_rgc_id_;
    current_rgc_id_++;

    if (!(instr->flags & hir::CALL_TAIL)) {
      mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);

      db(0xFF);
      rgc.is_jump_ = false;
      dd(rgc.offset_);
    } else {
      // Tail call.
      EmitTraceUserCallReturn();

      rgc.is_jump_ = true;
      // Pass the caller's return address over.
      mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);

      add(rsp, static_cast<uint32_t>(stack_size()));
      db(0xFF);
      dd(rgc.offset_);
    }
    call_sites_.push_back(rgc);
    return;
  }

  if (fn->machine_code()) {
    // TODO(benvanik): is it worth it to do this? It removes the need for
    // a ResolveFunction call, but makes the table less useful.
    assert_zero(uint64_t(fn->machine_code()) & 0xFFFFFFFF00000000);
    // TODO: this should be changed so that we can actually do a call to
    // fn->machine_code; the code will be emitted near us, so a 32-bit rel
    // jmp should be possible.
    mov(eax, uint32_t(uint64_t(fn->machine_code())));
  } else if (code_cache_->has_indirection_table()) {
    // Load the pointer to the indirection table maintained in X64CodeCache.
@@ -600,6 +674,30 @@ void X64Emitter::ReloadContext() {
void X64Emitter::ReloadMembase() {
  mov(GetMembaseReg(), qword[GetContextReg() + 8]);  // membase
}

#define __NH_CONCAT(x, y) x##y
#define _MH_CONCAT(cb, ...) cb(__VA_ARGS__)

#define mh_concat2_m(x, y) __NH_CONCAT(x, y)

#define DECLNOP(n, ...) \
  static constexpr unsigned char mh_concat2_m(nop_, n)[] = {__VA_ARGS__}

DECLNOP(1, 0x90);
DECLNOP(2, 0x66, 0x90);
DECLNOP(3, 0x0F, 0x1F, 0x00);
DECLNOP(4, 0x0F, 0x1F, 0x40, 0x00);
DECLNOP(5, 0x0F, 0x1F, 0x44, 0x00, 0x00);
DECLNOP(6, 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00);
DECLNOP(7, 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00);
DECLNOP(8, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00);
DECLNOP(9, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00);

static constexpr const unsigned char* const g_noptable[] = {
    &nop_1[0], &nop_1[0], &nop_2[0], &nop_3[0], &nop_4[0],
    &nop_5[0], &nop_6[0], &nop_7[0], &nop_8[0], &nop_9[0]};

static constexpr unsigned LENGTHOF_NOPTABLE =
    sizeof(g_noptable) / sizeof(g_noptable[0]);

// Len  Assembly                                      Byte Sequence
// ============================================================================
@@ -613,9 +711,17 @@ void X64Emitter::ReloadMembase() {
// 8b  NOP DWORD ptr [EAX + EAX*1 + 00000000H]       0F 1F 84 00 00 00 00 00H
// 9b  66 NOP DWORD ptr [EAX + EAX*1 + 00000000H]    66 0F 1F 84 00 00 00 00 00H
void X64Emitter::nop(size_t length) {
  while (length != 0) {
    // Emit the longest table entry that fits. Note that using
    // length % LENGTHOF_NOPTABLE here would loop forever whenever length is
    // a multiple of the table size, since it selects the zero-length entry.
    unsigned patchsize = length >= LENGTHOF_NOPTABLE
                             ? LENGTHOF_NOPTABLE - 1
                             : static_cast<unsigned>(length);

    for (unsigned i = 0; i < patchsize; ++i) {
      db(g_noptable[patchsize][i]);
    }

    length -= patchsize;
  }
}
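The table keeps pads cheap to decode: one long NOP costs a single decoded instruction where a run of `0x90` costs one per byte. A standalone sketch of the greedy split the loop above performs (the helper name is illustrative, not part of the emitter):

```cpp
#include <cstddef>
#include <cstdio>

// Report how a pad of `length` bytes is split into multi-byte NOPs;
// table entry i in g_noptable above is i bytes long, with 9 the longest.
void DescribeNopPad(size_t length) {
  const size_t kLongestNop = 9;
  while (length != 0) {
    size_t patchsize = length > kLongestNop ? kLongestNop : length;
    std::printf("%zu-byte NOP\n", patchsize);
    length -= patchsize;
  }
}

// DescribeNopPad(12) prints "9-byte NOP" then "3-byte NOP".
```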
@@ -649,6 +755,35 @@ void X64Emitter::MovMem64(const Xbyak::RegExp& addr, uint64_t v) {
    mov(dword[addr + 4], static_cast<uint32_t>(v >> 32));
  }
}

static inline vec128_t v128_setr_bytes(unsigned char v0, unsigned char v1,
                                       unsigned char v2, unsigned char v3,
                                       unsigned char v4, unsigned char v5,
                                       unsigned char v6, unsigned char v7,
                                       unsigned char v8, unsigned char v9,
                                       unsigned char v10, unsigned char v11,
                                       unsigned char v12, unsigned char v13,
                                       unsigned char v14, unsigned char v15) {
  vec128_t result;
  result.u8[0] = v0;
  result.u8[1] = v1;
  result.u8[2] = v2;
  result.u8[3] = v3;
  result.u8[4] = v4;
  result.u8[5] = v5;
  result.u8[6] = v6;
  result.u8[7] = v7;
  result.u8[8] = v8;
  result.u8[9] = v9;
  result.u8[10] = v10;
  result.u8[11] = v11;
  result.u8[12] = v12;
  result.u8[13] = v13;
  result.u8[14] = v14;
  result.u8[15] = v15;
  return result;
}

static const vec128_t xmm_consts[] = {
    /* XMMZero */ vec128f(0.0f),
@@ -761,8 +896,60 @@ static const vec128_t xmm_consts[] = {
    /* XMMQNaN */ vec128i(0x7FC00000u),
    /* XMMInt127 */ vec128i(0x7Fu),
    /* XMM2To32 */ vec128f(0x1.0p32f),
    /* XMMFloatInf */ vec128i(0x7F800000),

    /* XMMIntsToBytes */
    v128_setr_bytes(0, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                    0x80, 0x80, 0x80, 0x80, 0x80),
    /* XMMShortsToBytes */
    v128_setr_bytes(0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80,
                    0x80, 0x80, 0x80)};

void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
  for (auto& vec : xmm_consts) {
    for (auto& u8 : vec.u8) {
      if (u8 == bytevalue) {
        return reinterpret_cast<void*>(backend_->emitter_data() +
                                       (&u8 - &xmm_consts[0].u8[0]));
      }
    }
  }
  return nullptr;
}

void* X64Emitter::FindWordConstantOffset(unsigned wordvalue) {
  for (auto& vec : xmm_consts) {
    for (auto& u16 : vec.u16) {
      if (u16 == wordvalue) {
        return reinterpret_cast<void*>(backend_->emitter_data() +
                                       ((&u16 - &xmm_consts[0].u16[0]) * 2));
      }
    }
  }
  return nullptr;
}

void* X64Emitter::FindDwordConstantOffset(unsigned dwordvalue) {
  for (auto& vec : xmm_consts) {
    for (auto& u32 : vec.u32) {
      if (u32 == dwordvalue) {
        return reinterpret_cast<void*>(backend_->emitter_data() +
                                       ((&u32 - &xmm_consts[0].u32[0]) * 4));
      }
    }
  }
  return nullptr;
}

void* X64Emitter::FindQwordConstantOffset(uint64_t qwordvalue) {
  for (auto& vec : xmm_consts) {
    for (auto& u64 : vec.u64) {
      if (u64 == qwordvalue) {
        return reinterpret_cast<void*>(backend_->emitter_data() +
                                       ((&u64 - &xmm_consts[0].u64[0]) * 8));
      }
    }
  }
  return nullptr;
}
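These lookups let `LoadConstantXmm` (below) splat a constant from any byte, word, dword, or qword already sitting in the pool instead of appending a new 16-byte entry. A hypothetical use, written as the fragment would appear inside an emitter member function:

```cpp
// Mirrors the vpbroadcastb path in LoadConstantXmm below: splat 0x7F into
// dest without growing the constant pool (0x7F already lives in XMMInt127).
if (void* addr = FindByteConstantOffset(0x7F)) {
  vpbroadcastb(dest, byte[addr]);  // AVX2 byte broadcast from memory
}
```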
// First location to try and place constants.
static const uintptr_t kConstDataLocation = 0x20000000;
static const uintptr_t kConstDataSize = sizeof(xmm_consts);

@@ -806,7 +993,6 @@ Xbyak::Address X64Emitter::GetXmmConstPtr(XmmConst id) {
  return ptr[reinterpret_cast<void*>(backend_->emitter_data() +
                                     sizeof(vec128_t) * id)];
}

// Implies possible StashXmm(0, ...)!
void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
  // https://www.agner.org/optimize/optimizing_assembly.pdf
@@ -818,12 +1004,115 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
    // 1111...
    vpcmpeqb(dest, dest);
  } else {
    if (IsFeatureEnabled(kX64EmitAVX2)) {
      bool all_equal_bytes = true;

      unsigned firstbyte = v.u8[0];
      for (unsigned i = 1; i < 16; ++i) {
        if (v.u8[i] != firstbyte) {
          all_equal_bytes = false;
          break;
        }
      }

      if (all_equal_bytes) {
        void* bval = FindByteConstantOffset(firstbyte);
        if (bval) {
          vpbroadcastb(dest, byte[bval]);
          return;
        }
        // Didn't find existing memory with the value.
        mov(byte[rsp + kStashOffset], firstbyte);
        vpbroadcastb(dest, byte[rsp + kStashOffset]);
        return;
      }

      bool all_equal_words = true;
      unsigned firstword = v.u16[0];
      for (unsigned i = 1; i < 8; ++i) {
        if (v.u16[i] != firstword) {
          all_equal_words = false;
          break;
        }
      }
      if (all_equal_words) {
        void* wval = FindWordConstantOffset(firstword);
        if (wval) {
          vpbroadcastw(dest, word[wval]);
          return;
        }
        // Didn't find existing memory with the value.
        mov(word[rsp + kStashOffset], firstword);
        vpbroadcastw(dest, word[rsp + kStashOffset]);
        return;
      }

      bool all_equal_dwords = true;
      unsigned firstdword = v.u32[0];
      for (unsigned i = 1; i < 4; ++i) {
        if (v.u32[i] != firstdword) {
          all_equal_dwords = false;
          break;
        }
      }
      if (all_equal_dwords) {
        void* dwval = FindDwordConstantOffset(firstdword);
        if (dwval) {
          vpbroadcastd(dest, dword[dwval]);
          return;
        }
        mov(dword[rsp + kStashOffset], firstdword);
        vpbroadcastd(dest, dword[rsp + kStashOffset]);
        return;
      }

      bool all_equal_qwords = v.low == v.high;

      if (all_equal_qwords) {
        void* qwval = FindQwordConstantOffset(v.low);
        if (qwval) {
          vpbroadcastq(dest, qword[qwval]);
          return;
        }
        MovMem64(rsp + kStashOffset, v.low);
        vpbroadcastq(dest, qword[rsp + kStashOffset]);
        return;
      }
    }

    for (auto& vec : xmm_consts) {
      if (vec.low == v.low && vec.high == v.high) {
        vmovdqa(dest,
                ptr[reinterpret_cast<void*>(backend_->emitter_data() +
                                            ((&vec - &xmm_consts[0]) * 16))]);
        return;
      }
    }

    if (v.high == 0 && v.low == ~0ULL) {
      vpcmpeqb(dest, dest);
      movq(dest, dest);
      return;
    }
    if (v.high == 0) {
      if ((v.low & 0xFFFFFFFF) == v.low) {
        mov(dword[rsp + kStashOffset], static_cast<unsigned>(v.low));
        movd(dest, dword[rsp + kStashOffset]);
        return;
      }
      MovMem64(rsp + kStashOffset, v.low);
      movq(dest, qword[rsp + kStashOffset]);
      return;
    }

    // TODO(benvanik): see what other common values are.
    // TODO(benvanik): build constant table - 99% are reused.
    MovMem64(rsp + kStashOffset, v.low);
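The ladder above amounts to classifying the constant by its splat granularity and picking the matching `vpbroadcastb/w/d/q`. A standalone sketch of that classification (the helper name is illustrative):

```cpp
#include <cstdint>
#include <initializer_list>

// Return the narrowest width (1/2/4/8 bytes) at which the 16-byte constant
// is a splat, or 0 if it is not a splat and needs a full 16-byte load.
static unsigned SplatGranularity(const uint8_t (&bytes)[16]) {
  for (unsigned width : {1u, 2u, 4u, 8u}) {
    bool splat = true;
    for (unsigned i = width; i < 16 && splat; ++i) {
      splat = bytes[i] == bytes[i % width];  // pattern repeats every `width`
    }
    if (splat) return width;  // vpbroadcastb/w/d/q candidate
  }
  return 0;
}
```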
@@ -116,6 +116,9 @@ enum XmmConst {
  XMMQNaN,
  XMMInt127,
  XMM2To32,
  XMMFloatInf,
  XMMIntsToBytes,
  XMMShortsToBytes
};

// Unfortunately due to the design of xbyak we have to pass this to the ctor.
@@ -141,7 +144,16 @@ enum X64EmitterFeatureFlags {
  kX64EmitAVX512DQ = 1 << 11,

  kX64EmitAVX512Ortho = kX64EmitAVX512F | kX64EmitAVX512VL,
  kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ,
  kX64FastJrcx = 1 << 12,  // jrcxz is as fast as any other jump (>= Zen 1)
  kX64FastLoop = 1 << 13,  // loop/loope/loopne is as fast as any other jump
                           // (>= Zen 2)
};

class ResolvableGuestCall {
 public:
  bool is_jump_;
  uintptr_t destination_;
  // Unique id of the call site (rgcid).
  unsigned offset_;
};
class X64Emitter : public Xbyak::CodeGenerator {

@@ -230,7 +242,10 @@ class X64Emitter : public Xbyak::CodeGenerator {
  Xbyak::Address StashConstantXmm(int index, float v);
  Xbyak::Address StashConstantXmm(int index, double v);
  Xbyak::Address StashConstantXmm(int index, const vec128_t& v);

  void* FindByteConstantOffset(unsigned bytevalue);
  void* FindWordConstantOffset(unsigned wordvalue);
  void* FindDwordConstantOffset(unsigned dwordvalue);
  void* FindQwordConstantOffset(uint64_t qwordvalue);

  bool IsFeatureEnabled(uint32_t feature_flag) const {
    return (feature_flags_ & feature_flag) == feature_flag;
  }
@@ -267,6 +282,8 @@ class X64Emitter : public Xbyak::CodeGenerator {
  static const uint32_t gpr_reg_map_[GPR_COUNT];
  static const uint32_t xmm_reg_map_[XMM_COUNT];
  uint32_t current_rgc_id_ = 0xEEDDF00F;
  std::vector<ResolvableGuestCall> call_sites_;
};

}  // namespace x64
@@ -109,22 +109,39 @@ struct DEBUG_BREAK_TRUE_I32
    : Sequence<DEBUG_BREAK_TRUE_I32,
               I<OPCODE_DEBUG_BREAK_TRUE, VoidOp, I32Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    if (e.IsFeatureEnabled(kX64FastJrcx)) {
      e.mov(e.ecx, i.src1);
      Xbyak::Label skip;
      e.jrcxz(skip);
      e.DebugBreak();
      e.L(skip);
    } else {
      e.test(i.src1, i.src1);
      Xbyak::Label skip;
      e.jz(skip);
      e.DebugBreak();
      e.L(skip);
    }
  }
};
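The fast path relies on two facts: a 32-bit `mov` zero-extends into the full 64-bit register, so `jrcxz` tests all of `rcx`, and on Zen parts `jrcxz` is as cheap as any other jump (the `kX64FastJrcx` gate) while folding the compare into the branch. A minimal standalone Xbyak sketch of the same pattern (not Xenia code):

```cpp
#include <xbyak/xbyak.h>

// Skip a breakpoint when the 32-bit value in edi is zero.
struct JrcxzSketch : Xbyak::CodeGenerator {
  JrcxzSketch() {
    mov(ecx, edi);  // zero-extends into rcx
    Xbyak::Label skip;
    jrcxz(skip);    // jump if rcx == 0; no separate test instruction
    int3();         // reached only when the value is non-zero
    L(skip);
    ret();
  }
};
```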
struct DEBUG_BREAK_TRUE_I64
    : Sequence<DEBUG_BREAK_TRUE_I64,
               I<OPCODE_DEBUG_BREAK_TRUE, VoidOp, I64Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    if (e.IsFeatureEnabled(kX64FastJrcx)) {
      e.mov(e.rcx, i.src1);
      Xbyak::Label skip;
      e.jrcxz(skip);
      e.DebugBreak();
      e.L(skip);
    } else {
      e.test(i.src1, i.src1);
      Xbyak::Label skip;
      e.jz(skip);
      e.DebugBreak();
      e.L(skip);
    }
  }
};

struct DEBUG_BREAK_TRUE_F32
@@ -190,21 +207,37 @@ struct TRAP_TRUE_I16
struct TRAP_TRUE_I32
    : Sequence<TRAP_TRUE_I32, I<OPCODE_TRAP_TRUE, VoidOp, I32Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    if (e.IsFeatureEnabled(kX64FastJrcx)) {
      e.mov(e.ecx, i.src1);
      Xbyak::Label skip;
      e.jrcxz(skip);
      e.Trap(i.instr->flags);
      e.L(skip);
    } else {
      e.test(i.src1, i.src1);
      Xbyak::Label skip;
      e.jz(skip);
      e.Trap(i.instr->flags);
      e.L(skip);
    }
  }
};
struct TRAP_TRUE_I64
    : Sequence<TRAP_TRUE_I64, I<OPCODE_TRAP_TRUE, VoidOp, I64Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    if (e.IsFeatureEnabled(kX64FastJrcx)) {
      e.mov(e.rcx, i.src1);
      Xbyak::Label skip;
      e.jrcxz(skip);
      e.Trap(i.instr->flags);
      e.L(skip);
    } else {
      e.test(i.src1, i.src1);
      Xbyak::Label skip;
      e.jz(skip);
      e.Trap(i.instr->flags);
      e.L(skip);
    }
  }
};

struct TRAP_TRUE_F32
@@ -355,22 +388,39 @@ struct CALL_INDIRECT_TRUE_I32
    : Sequence<CALL_INDIRECT_TRUE_I32,
               I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, I32Op, I64Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    if (e.IsFeatureEnabled(kX64FastJrcx)) {
      e.mov(e.ecx, i.src1);
      Xbyak::Label skip;
      e.jrcxz(skip);
      e.CallIndirect(i.instr, i.src2);
      e.L(skip);
    } else {
      e.test(i.src1, i.src1);
      Xbyak::Label skip;
      e.jz(skip, CodeGenerator::T_NEAR);
      e.CallIndirect(i.instr, i.src2);
      e.L(skip);
    }
  }
};
struct CALL_INDIRECT_TRUE_I64
    : Sequence<CALL_INDIRECT_TRUE_I64,
               I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, I64Op, I64Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    if (e.IsFeatureEnabled(kX64FastJrcx)) {
      e.mov(e.rcx, i.src1);
      Xbyak::Label skip;
      e.jrcxz(skip);
      e.CallIndirect(i.instr, i.src2);
      e.L(skip);
    } else {
      e.test(i.src1, i.src1);
      Xbyak::Label skip;
      e.jz(skip, CodeGenerator::T_NEAR);
      e.CallIndirect(i.instr, i.src2);
      e.L(skip);
    }
  }
};

struct CALL_INDIRECT_TRUE_F32
@@ -15,6 +15,13 @@
#include "xenia/base/memory.h"
#include "xenia/cpu/backend/x64/x64_op.h"
#include "xenia/cpu/backend/x64/x64_tracers.h"
#include "xenia/cpu/ppc/ppc_context.h"
#include "xenia/base/cvar.h"

DEFINE_bool(
    elide_e0_check, false,
    "Eliminate the e0 check on some memory accesses, like to r13 (TLS) or "
    "r1 (SP)",
    "CPU");

namespace xe {
namespace cpu {
@@ -27,7 +34,30 @@ volatile int anchor_memory = 0;
RegExp ComputeContextAddress(X64Emitter& e, const OffsetOp& offset) {
  return e.GetContextReg() + offset.value;
}

// Returns true if the value is known to come from r1 (the stack pointer) or
// r13 (TLS), which never point into the 0xE0000000 range.
static bool is_eo_def(const hir::Value* v) {
  if (v->def) {
    auto df = v->def;
    if (df->opcode == &OPCODE_LOAD_CONTEXT_info) {
      size_t offs = df->src1.offset;
      if (offs == offsetof(ppc::PPCContext_s, r[1]) ||
          offs == offsetof(ppc::PPCContext_s, r[13])) {
        return true;
      }
    } else if (df->opcode == &OPCODE_ASSIGN_info) {
      return is_eo_def(df->src1.value);
    }
  }
  return false;
}

template <typename T>
static bool is_definitely_not_eo(const T& v) {
  if (!cvars::elide_e0_check) {
    return false;
  }

  return is_eo_def(v.value);
}

template <typename T>
RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest,
                                  const T& offset) {
@@ -49,7 +79,8 @@ RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest,
      return e.GetMembaseReg() + e.rax;
    }
  } else {
    if (xe::memory::allocation_granularity() > 0x1000 &&
        !is_definitely_not_eo(guest)) {
      // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
      // it via memory mapping.
      e.xor_(e.eax, e.eax);

@@ -60,12 +91,12 @@ RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest,
    } else {
      // Clear the top 32 bits, as they are likely garbage.
      // TODO(benvanik): find a way to avoid doing this.
      e.mov(e.eax, guest.reg().cvt32());
    }
    return e.GetMembaseReg() + e.rax + offset_const;
  }
}
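What the emitted address math computes, restated as plain C++ (a sketch; the function name is illustrative, not a Xenia API): when the host allocation granularity is coarser than 4 KB, guest addresses at 0xE0000000 and above need an extra 4 KB displacement, and values proven to come from r1 or r13 can skip the check entirely.

```cpp
#include <cstdint>

static uint8_t* TranslateGuestAddress(uint8_t* membase, uint32_t guest_addr,
                                      bool definitely_not_e0) {
  uint32_t displacement = 0;
  if (!definitely_not_e0 && guest_addr >= 0xE0000000u) {
    displacement = 0x1000u;  // the 4 KB fixup the emitted cmp/set computes
  }
  return membase + guest_addr + displacement;
}
```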
// Note: most *should* be aligned, but needs to be checked!
template <typename T>
RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {

@@ -86,7 +117,8 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
      return e.GetMembaseReg() + e.rax;
    }
  } else {
    if (xe::memory::allocation_granularity() > 0x1000 &&
        !is_definitely_not_eo(guest)) {
      // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
      // it via memory mapping.
      e.xor_(e.eax, e.eax);
@@ -728,28 +728,103 @@ struct VECTOR_SHL_V128
    }
  }

  static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
    // TODO(benvanik): native version (with shift magic).
    if (e.IsFeatureEnabled(kX64EmitAVX2)) {
      if (!i.src2.is_constant) {
        // Get the high 8 bytes of each operand.
        e.vpunpckhqdq(e.xmm1, i.src1, i.src1);
        e.vpunpckhqdq(e.xmm3, i.src2, i.src2);

        e.vpmovzxbd(e.ymm0, i.src1);
        e.vpmovzxbd(e.ymm1, e.xmm1);

        e.vpmovzxbd(e.ymm2, i.src2);
        e.vpmovzxbd(e.ymm3, e.xmm3);

        e.vpsllvd(e.ymm0, e.ymm0, e.ymm2);
        e.vpsllvd(e.ymm1, e.ymm1, e.ymm3);
        e.vextracti128(e.xmm2, e.ymm0, 1);
        e.vextracti128(e.xmm3, e.ymm1, 1);
        e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMIntsToBytes));
        e.vpshufb(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMIntsToBytes));
        e.vpshufb(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMIntsToBytes));
        e.vpshufb(e.xmm3, e.xmm3, e.GetXmmConstPtr(XMMIntsToBytes));

        e.vpunpckldq(e.xmm0, e.xmm0, e.xmm1);
        e.vpunpckldq(e.xmm2, e.xmm2, e.xmm3);
        e.vpunpcklqdq(i.dest, e.xmm0, e.xmm2);
        return;
      } else {
        vec128_t constmask = i.src2.constant();

        for (unsigned n = 0; n < 16; ++n) {
          constmask.u8[n] &= 7;
        }

        unsigned seenvalue = constmask.u8[0];
        bool all_same = true;
        for (unsigned n = 1; n < 16; ++n) {
          if (constmask.u8[n] != seenvalue) {
            all_same = false;
            break;
          }
        }
        if (all_same) {
          // mul by two
          /*if (seenvalue == 1) {
            e.vpaddb(i.dest, i.src1, i.src1);
          } else if (seenvalue == 2) {
            e.vpaddb(i.dest, i.src1, i.src1);
            e.vpaddb(i.dest, i.dest, i.dest);
          } else if (seenvalue == 3) {
            // mul by 8
            e.vpaddb(i.dest, i.src1, i.src1);
            e.vpaddb(i.dest, i.dest, i.dest);
            e.vpaddb(i.dest, i.dest, i.dest);
          } else*/
          {
            e.vpmovzxbw(e.ymm0, i.src1);
            e.vpsllw(e.ymm0, e.ymm0, seenvalue);
            e.vextracti128(e.xmm1, e.ymm0, 1);

            e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMShortsToBytes));
            e.vpshufb(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMShortsToBytes));
            e.vpunpcklqdq(i.dest, e.xmm0, e.xmm1);
            return;
          }
        } else {
          e.LoadConstantXmm(e.xmm2, constmask);

          e.vpunpckhqdq(e.xmm1, i.src1, i.src1);
          e.vpunpckhqdq(e.xmm3, e.xmm2, e.xmm2);

          e.vpmovzxbd(e.ymm0, i.src1);
          e.vpmovzxbd(e.ymm1, e.xmm1);

          e.vpmovzxbd(e.ymm2, e.xmm2);
          e.vpmovzxbd(e.ymm3, e.xmm3);

          e.vpsllvd(e.ymm0, e.ymm0, e.ymm2);
          e.vpsllvd(e.ymm1, e.ymm1, e.ymm3);
          e.vextracti128(e.xmm2, e.ymm0, 1);
          e.vextracti128(e.xmm3, e.ymm1, 1);
          e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMIntsToBytes));
          e.vpshufb(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMIntsToBytes));
          e.vpshufb(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMIntsToBytes));
          e.vpshufb(e.xmm3, e.xmm3, e.GetXmmConstPtr(XMMIntsToBytes));

          e.vpunpckldq(e.xmm0, e.xmm0, e.xmm1);
          e.vpunpckldq(e.xmm2, e.xmm2, e.xmm3);
          e.vpunpcklqdq(i.dest, e.xmm0, e.xmm2);

          return;
        }
      }
    }
    if (i.src2.is_constant) {
      e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
    } else {
      e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
@@ -758,7 +833,6 @@ struct VECTOR_SHL_V128
    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint8_t>));
    e.vmovaps(i.dest, e.xmm0);
  }
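A scalar reference of what the AVX2 paths compute (a sketch; `vec128_t` field names follow the ones used above, and only the low 3 bits of each count are used, matching the constant-path masking and `EmulateVectorShl<uint8_t>`):

```cpp
// Shift each byte lane of src left by the low 3 bits of the matching
// count byte, the Altivec vslb semantics the sequences above vectorize.
static vec128_t VectorShlInt8Reference(const vec128_t& src,
                                       const vec128_t& count) {
  vec128_t result;
  for (unsigned n = 0; n < 16; ++n) {
    result.u8[n] = uint8_t(src.u8[n] << (count.u8[n] & 7));
  }
  return result;
}
```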

  static void EmitInt16(X64Emitter& e, const EmitArgType& i) {
    Xmm src1;
    if (i.src1.is_constant) {
@@ -38,6 +38,10 @@
#include "xenia/cpu/hir/hir_builder.h"
#include "xenia/cpu/processor.h"

DEFINE_bool(use_fast_dot_product, false,
            "Experimental optimization: a much shorter (four-instruction) "
            "sequence on dot products, treating inf as overflow instead of "
            "checking MXCSR",
            "CPU");

namespace xe {
namespace cpu {
namespace backend {
@@ -886,7 +890,10 @@ struct COMPARE_EQ_I8
          e.cmp(src1, src2);
        },
        [](X64Emitter& e, const Reg8& src1, int32_t constant) {
          if (constant == 0) {
            e.test(src1, src1);
          } else {
            e.cmp(src1, constant);
          }
        });
    e.sete(i.dest);
  }
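The `constant == 0` special case is a peephole: `test r, r` sets ZF exactly like `cmp r, 0`, and `sete` only reads ZF, but `test` has the shorter encoding because it carries no immediate. A minimal Xbyak sketch of the two encodings (not Xenia code):

```cpp
#include <xbyak/xbyak.h>

// Both instructions set ZF when al is zero; test saves the immediate byte.
struct ZeroTestSketch : Xbyak::CodeGenerator {
  ZeroTestSketch() {
    cmp(al, 0);    // encodes as 3C 00
    test(al, al);  // encodes as 84 C0
    ret();
  }
};
```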
@@ -900,7 +907,10 @@ struct COMPARE_EQ_I16
          e.cmp(src1, src2);
        },
        [](X64Emitter& e, const Reg16& src1, int32_t constant) {
          if (constant == 0) {
            e.test(src1, src1);
          } else {
            e.cmp(src1, constant);
          }
        });
    e.sete(i.dest);
  }
|
@ -914,7 +924,10 @@ struct COMPARE_EQ_I32
|
|||
e.cmp(src1, src2);
|
||||
},
|
||||
[](X64Emitter& e, const Reg32& src1, int32_t constant) {
|
||||
e.cmp(src1, constant);
|
||||
if (constant == 0) {
|
||||
e.test(src1, src1);
|
||||
} else
|
||||
e.cmp(src1, constant);
|
||||
});
|
||||
e.sete(i.dest);
|
||||
}
|
||||
|
@@ -928,7 +941,10 @@ struct COMPARE_EQ_I64
          e.cmp(src1, src2);
        },
        [](X64Emitter& e, const Reg64& src1, int32_t constant) {
          if (constant == 0) {
            e.test(src1, src1);
          } else {
            e.cmp(src1, constant);
          }
        });
    e.sete(i.dest);
  }
@@ -1980,6 +1996,8 @@ struct DIV_V128 : Sequence<DIV_V128, I<OPCODE_DIV, V128Op, V128Op, V128Op>> {
  assert_true(!i.instr->flags);
  EmitAssociativeBinaryXmmOp(e, i,
                             [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
                               // e.vrcpps(e.xmm0, src2);
                               // e.vmulps(dest, src1, e.xmm0);
                               e.vdivps(dest, src1, src2);
                             });
}
@@ -2591,43 +2609,51 @@ EMITTER_OPCODE_TABLE(OPCODE_LOG2, LOG2_F32, LOG2_F64, LOG2_V128);

struct DOT_PRODUCT_V128 {
  static void Emit(X64Emitter& e, Xmm dest, Xmm src1, Xmm src2, uint8_t imm) {
    if (cvars::use_fast_dot_product) {
      e.vdpps(dest, src1, src2, imm);
      e.vandps(e.xmm0, dest, e.GetXmmConstPtr(XMMAbsMaskPS));
      e.vcmpgeps(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMFloatInf));
      e.vblendvps(dest, dest, e.GetXmmConstPtr(XMMQNaN), e.xmm0);
    } else {
      // TODO(benvanik): apparently this is very slow
      // - find alternative?
      Xbyak::Label end;
      e.inLocalLabel();

      // Grab space to put MXCSR.
      // TODO(gibbed): stick this in TLS or
      // something?
      e.sub(e.rsp, 8);

      // Grab MXCSR and mask off the overflow flag,
      // because it's sticky.
      e.vstmxcsr(e.dword[e.rsp]);
      e.mov(e.eax, e.dword[e.rsp]);
      e.and_(e.eax, uint32_t(~8));
      e.mov(e.dword[e.rsp], e.eax);
      e.vldmxcsr(e.dword[e.rsp]);

      // Hey we can do the dot product now.
      e.vdpps(dest, src1, src2, imm);

      // Load MXCSR...
      e.vstmxcsr(e.dword[e.rsp]);

      // ..free our temporary space and get MXCSR at
      // the same time
      e.pop(e.rax);

      // Did we overflow?
      e.test(e.al, 8);
      e.jz(end);

      // Infinity? HA! Give NAN.
      e.vmovdqa(dest, e.GetXmmConstPtr(XMMQNaN));

      e.L(end);
      e.outLocalLabel();
    }
  }
};
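A scalar sketch of the fast path's semantics, as read from the emitted sequence: compute the dot product directly, and if the summed product's magnitude reaches infinity, treat it as overflow and return a quiet NaN, instead of masking, re-reading, and testing the sticky MXCSR overflow bit.

```cpp
#include <cmath>

// Four-lane case for illustration; vdpps selects lanes via its immediate.
static float FastDot4(const float a[4], const float b[4]) {
  float sum = a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
  return std::isinf(sum) ? std::nanf("") : sum;  // QNaN on overflow
}
```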