implement dynamically allocatable guest-to-host callbacks

disjtqz 2023-10-11 11:58:15 -04:00 committed by Radosław Gliński
parent d0a6cec024
commit 43fd396db7
5 changed files with 243 additions and 43 deletions

View File

@ -25,37 +25,57 @@ BitMap::BitMap(uint64_t* data, size_t size_bits) {
data_.resize(size_bits / kDataSizeBits);
// Copy whole 64-bit words; the size argument is in bytes, not entries.
std::memcpy(data_.data(), data, (size_bits / kDataSizeBits) * sizeof(uint64_t));
}
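// Scans a single 64-bit word for a free slot (a set bit) and claims it with a
// compare-and-swap so that concurrent callers can never acquire the same
// index. Returns the global bit index on success, -1 if the word is full.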
inline size_t BitMap::TryAcquireAt(size_t i) {
uint64_t entry = 0;
uint64_t new_entry = 0;
int64_t acquired_idx = -1LL;
do {
entry = data_[i];
uint8_t index = lzcnt(entry);
if (index == kDataSizeBits) {
// None free.
acquired_idx = -1;
break;
}
// Entry has a free bit. Acquire it.
uint64_t bit = 1ull << (kDataSizeBits - index - 1);
new_entry = entry & ~bit;
assert_not_zero(entry & bit);
acquired_idx = index;
} while (!atomic_cas(entry, new_entry, &data_[i]));
if (acquired_idx != -1) {
// Acquired.
return (i * kDataSizeBits) + acquired_idx;
}
return -1LL;
}
size_t BitMap::Acquire() {
for (size_t i = 0; i < data_.size(); i++) {
size_t attempt_result = TryAcquireAt(i);
if (attempt_result != -1LL) {
return attempt_result;
}
}
return -1LL;
}
size_t BitMap::AcquireFromBack() {
if (!data_.size()) {
return -1LL;
}
for (ptrdiff_t i = data_.size() - 1; i >= 0; i--) {
size_t attempt_result = TryAcquireAt(static_cast<size_t>(i));
if (attempt_result != -1LL) {
return attempt_result;
}
}
return -1LL;
}
void BitMap::Release(size_t index) {

View File

@ -32,7 +32,7 @@ class BitMap {
// (threadsafe) Acquires an entry and returns its index. Returns -1 if there
// are no more free entries.
size_t Acquire();
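// (threadsafe) Same as Acquire, but searches from the last entry towards the
// first; used for long-term allocations so short-term ones stay fast.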
size_t AcquireFromBack();
// (threadsafe) Releases an entry by an index.
void Release(size_t index);
@ -49,6 +49,7 @@ class BitMap {
const static size_t kDataSize = 8;
const static size_t kDataSizeBits = kDataSize * 8;
std::vector<uint64_t> data_;
inline size_t TryAcquireAt(size_t i);
};
} // namespace xe

View File

@ -38,7 +38,9 @@ struct GuestPseudoStackTrace {
};
class Assembler;
class CodeCache;
using GuestTrampolineProc = void (*)(ppc::PPCContext* context, void* userarg1,
void* userarg2);
using SimpleGuestTrampolineProc = void (*)(ppc::PPCContext*);
class Backend {
public:
explicit Backend();
@ -95,11 +97,74 @@ class Backend {
virtual bool PopulatePseudoStacktrace(GuestPseudoStackTrace* st) {
return false;
}
virtual uint32_t CreateGuestTrampoline(GuestTrampolineProc proc,
void* userdata1, void* userdata2,
bool long_term = false) {
return 0;
}
uint32_t CreateGuestTrampoline(void (*func)(ppc::PPCContext*),
bool long_term = false) {
return CreateGuestTrampoline(
reinterpret_cast<GuestTrampolineProc>(reinterpret_cast<void*>(func)),
nullptr, nullptr, long_term);
}
// If long-term, allocate towards the back of the bitmap so that allocating
// short-term trampolines stays fast.
uint32_t CreateLongTermGuestTrampoline(void (*func)(ppc::PPCContext*)) {
return CreateGuestTrampoline(
reinterpret_cast<GuestTrampolineProc>(reinterpret_cast<void*>(func)),
nullptr, nullptr, true);
}
virtual void FreeGuestTrampoline(uint32_t trampoline_addr) {}
protected:
Processor* processor_ = nullptr;
MachineInfo machine_info_;
CodeCache* code_cache_ = nullptr;
};
/*
* A set of guest trampolines with shared ownership: each unique host proc maps
* to a single guest trampoline that is reused across requests and freed when
* the group is destroyed.
*/
struct GuestTrampolineGroup
: public std::map<SimpleGuestTrampolineProc, uint32_t> {
Backend* const m_backend;
xe_mutex m_mutex;
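// Returns the trampoline cached for proc, creating one under the mutex on
// first use (long-term entries come from the back of the address bitmap).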
uint32_t _NewTrampoline(SimpleGuestTrampolineProc proc, bool longterm) {
uint32_t result;
m_mutex.lock();
auto iter = this->find(proc);
if (iter == this->end()) {
uint32_t new_entry = longterm
? m_backend->CreateLongTermGuestTrampoline(proc)
: m_backend->CreateGuestTrampoline(proc);
this->emplace_hint(iter, proc, new_entry);
result = new_entry;
} else {
result = iter->second;
}
m_mutex.unlock();
return result;
}
public:
GuestTrampolineGroup(Backend* backend) : m_backend(backend) {}
~GuestTrampolineGroup() {
m_mutex.lock();
for (auto&& entry : *this) {
m_backend->FreeGuestTrampoline(entry.second);
}
m_mutex.unlock();
}
uint32_t NewLongtermTrampoline(SimpleGuestTrampolineProc proc) {
return _NewTrampoline(proc, true);
}
uint32_t NewTrampoline(SimpleGuestTrampolineProc proc) {
return _NewTrampoline(proc, false);
}
};
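// Usage sketch (illustrative, not part of this commit; `backend` is assumed
// to be a live Backend*): the group hands out one trampoline per unique host
// proc and frees them all when it is destroyed.
//
//   GuestTrampolineGroup group(backend);
//   uint32_t guest_addr = group.NewTrampoline(
//       [](ppc::PPCContext* ctx) { /* host code, runs on guest call */ });
//   // A second request with the same proc returns the cached guest_addr.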
} // namespace backend
} // namespace cpu

View File

@ -90,6 +90,25 @@ class X64HelperEmitter : public X64Emitter {
void EmitLoadNonvolatileRegs();
};
#if XE_PLATFORM_WIN32
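// Decodes to: mov rdx, imm64 / mov r8, imm64 / mov rcx, imm64 / mov rax,
// imm64 / jmp rax. The 0x0033445566778899 immediates are placeholders that
// CreateGuestTrampoline patches with the userdata, proc and thunk pointers.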
static constexpr unsigned char guest_trampoline_template[] = {
0x48, 0xBA, 0x99, 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0x49,
0xB8, 0x99, 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0x48, 0xB9,
0x99, 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0x48, 0xB8, 0x99,
0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0xFF, 0xE0};
#else
// SysV x64 ABI; the immediate operands sit at the same offsets as on Win32.
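// Decodes to: mov rdi, imm64 / mov rsi, imm64 / mov rcx, imm64 / mov rax,
// imm64 / jmp rax.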
static constexpr unsigned char guest_trampoline_template[] = {
0x48, 0xBF, 0x99, 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0x48,
0xBE, 0x99, 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0x48, 0xB9,
0x99, 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0x48, 0xB8, 0x99,
0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0xFF, 0xE0};
#endif
static constexpr uint32_t guest_trampoline_template_offset_arg1 = 2,
guest_trampoline_template_offset_arg2 = 0xC,
guest_trampoline_template_offset_rcx = 0x16,
guest_trampoline_template_offset_rax = 0x20;
X64Backend::X64Backend() : Backend(), code_cache_(nullptr) {
if (cs_open(CS_ARCH_X86, CS_MODE_64, &capstone_handle_) != CS_ERR_OK) {
assert_always("Failed to initialize capstone");
@ -97,6 +116,23 @@ X64Backend::X64Backend() : Backend(), code_cache_(nullptr) {
cs_option(capstone_handle_, CS_OPT_SYNTAX, CS_OPT_SYNTAX_INTEL);
cs_option(capstone_handle_, CS_OPT_DETAIL, CS_OPT_ON);
cs_option(capstone_handle_, CS_OPT_SKIPDATA, CS_OPT_OFF);
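// Probe for a fixed allocation in the low 2GiB: the indirection table stores
// the host trampoline address as a uint32_t (see CreateGuestTrampoline), so
// the buffer must sit at a 32-bit-representable address.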
uint32_t base_address = 0x10000;
void* buf_trampoline_code = nullptr;
while (base_address < 0x80000000) {
buf_trampoline_code = memory::AllocFixed(
(void*)(uintptr_t)base_address,
sizeof(guest_trampoline_template) * MAX_GUEST_TRAMPOLINES,
xe::memory::AllocationType::kReserveCommit,
xe::memory::PageAccess::kExecuteReadWrite);
if (!buf_trampoline_code) {
base_address += 65536;
} else {
break;
}
}
xenia_assert(buf_trampoline_code);
guest_trampoline_memory_ = (uint8_t*)buf_trampoline_code;
guest_trampoline_address_bitmap_.Resize(MAX_GUEST_TRAMPOLINES);
}
X64Backend::~X64Backend() {
@ -106,6 +142,13 @@ X64Backend::~X64Backend() {
X64Emitter::FreeConstData(emitter_data_);
ExceptionHandler::Uninstall(&ExceptionCallbackThunk, this);
if (guest_trampoline_memory_) {
memory::DeallocFixed(
guest_trampoline_memory_,
sizeof(guest_trampoline_template) * MAX_GUEST_TRAMPOLINES,
memory::DeallocationType::kRelease);
guest_trampoline_memory_ = nullptr;
}
}
static void ForwardMMIOAccessForRecording(void* context, void* hostaddr) {
@ -212,6 +255,9 @@ bool X64Backend::Initialize(Processor* processor) {
if (!code_cache_->Initialize()) {
return false;
}
// Commit the HV range up front so the guest trampoline indirections can live
// there.
code_cache()->CommitExecutableRange(GUEST_TRAMPOLINE_BASE,
GUEST_TRAMPOLINE_END);
// Allocate emitter constant data.
emitter_data_ = X64Emitter::PlaceConstData();
@ -241,7 +287,8 @@ bool X64Backend::Initialize(Processor* processor) {
reserved_store_32_helper = thunk_emitter.EmitReservedStoreHelper(false);
reserved_store_64_helper = thunk_emitter.EmitReservedStoreHelper(true);
vrsqrtefp_scalar_helper = thunk_emitter.EmitScalarVRsqrteHelper();
vrsqrtefp_vector_helper =
thunk_emitter.EmitVectorVRsqrteHelper(vrsqrtefp_scalar_helper);
frsqrtefp_helper = thunk_emitter.EmitFrsqrteHelper();
// Set the code cache to use the ResolveFunction thunk for default
// indirections.
@ -850,7 +897,7 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk(
_code_offsets code_offsets = {};
code_offsets.prolog = getSize();
pop(r8); // return address
switch (stack_element_size) {
case 4:
mov(r11d, ptr[r8]);
@ -919,11 +966,11 @@ void* X64HelperEmitter::EmitScalarVRsqrteHelper() {
bt(GetBackendFlagsPtr(), kX64BackendNJMOn);
jnc(handle_denormal_input, CodeGenerator::T_NEAR);
// handle denormal input with NJM on
// denorms get converted to zero w/ input sign, jump to our label
// that handles inputs of 0 for this
jmp(convert_to_signed_inf_and_ret);
L(L35);
@ -1038,7 +1085,6 @@ void* X64HelperEmitter::EmitScalarVRsqrteHelper() {
L(L1);
ret();
L(handle_denormal_input);
mov(r9d, r8d);
and_(r9d, 0x7FFFFFFF);
@ -1089,7 +1135,6 @@ void* X64HelperEmitter::EmitScalarVRsqrteHelper() {
dd(0x7FC00000);
dd(0x5F34FD00);
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();
code_offsets.prolog = getSize();
@ -1126,18 +1171,16 @@ void* X64HelperEmitter::EmitVectorVRsqrteHelper(void* scalar_helper) {
jnz(actual_vector_version);
vshufps(xmm0, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
call(scalar_helper);
// this->DebugBreak();
vinsertps(xmm0, xmm0, (3 << 4) | (0 << 6));
vblendps(xmm0, xmm0, ptr[backend()->LookupXMMConstantAddress(XMMFloatInf)],
0b0111);
ret();
L(actual_vector_version);
xor_(ecx, ecx);
vmovaps(result_ptr, xmm0);
@ -1172,7 +1215,7 @@ void* X64HelperEmitter::EmitFrsqrteHelper() {
code_offsets.epilog = getSize();
code_offsets.tail = getSize();
code_offsets.prolog = getSize();
Xbyak::Label L2, L7, L6, L9, L1, L12, L24, L3, L25, frsqrte_table2, LC1;
bt(GetBackendFlagsPtr(), kX64BackendNonIEEEMode);
vmovq(rax, xmm0);
@ -1190,7 +1233,7 @@ void* X64HelperEmitter::EmitFrsqrteHelper() {
not_(rcx);
and_(rcx, rdx);
}
jne(L6);
cmp(rax, rdx);
je(L1, CodeGenerator::T_NEAR);
@ -1199,7 +1242,7 @@ void* X64HelperEmitter::EmitFrsqrteHelper() {
jne(L7);
vcomisd(xmm0, xmm1);
jb(L12, CodeGenerator::T_NEAR);
L(L7);
mov(rdx, 0x7ff8000000000000ULL);
or_(rax, rdx);
@ -1236,7 +1279,7 @@ void* X64HelperEmitter::EmitFrsqrteHelper() {
sal(rax, 44);
or_(rax, rdx);
vmovq(xmm1, rax);
L(L1);
vmovapd(xmm0, xmm1);
ret();
@ -1255,7 +1298,7 @@ void* X64HelperEmitter::EmitFrsqrteHelper() {
jne(L2);
mov(rdx, 0x8000000000000000ULL);
and_(rax, rdx);
L(L3);
mov(rdx, 0x8000000000000000ULL);
and_(rax, rdx);
@ -1617,6 +1660,53 @@ uint64_t* X64Backend::GetProfilerRecordForFunction(uint32_t guest_address) {
}
#endif
// TODO: flush the instruction cache
uint32_t X64Backend::CreateGuestTrampoline(GuestTrampolineProc proc,
void* userdata1, void* userdata2,
bool longterm) {
size_t new_index;
if (longterm) {
new_index = guest_trampoline_address_bitmap_.AcquireFromBack();
} else {
new_index = guest_trampoline_address_bitmap_.Acquire();
}
xenia_assert(new_index != (size_t)-1);
uint8_t* write_pos =
&guest_trampoline_memory_[sizeof(guest_trampoline_template) * new_index];
memcpy(write_pos, guest_trampoline_template,
sizeof(guest_trampoline_template));
*reinterpret_cast<void**>(&write_pos[guest_trampoline_template_offset_arg1]) =
userdata1;
*reinterpret_cast<void**>(&write_pos[guest_trampoline_template_offset_arg2]) =
userdata2;
*reinterpret_cast<GuestTrampolineProc*>(
&write_pos[guest_trampoline_template_offset_rcx]) = proc;
*reinterpret_cast<GuestToHostThunk*>(
&write_pos[guest_trampoline_template_offset_rax]) = guest_to_host_thunk_;
uint32_t indirection_guest_addr =
GUEST_TRAMPOLINE_BASE +
(static_cast<uint32_t>(new_index) * GUEST_TRAMPOLINE_MIN_LEN);
code_cache()->AddIndirection(
indirection_guest_addr,
static_cast<uint32_t>(reinterpret_cast<uintptr_t>(write_pos)));
return indirection_guest_addr;
}
void X64Backend::FreeGuestTrampoline(uint32_t trampoline_addr) {
xenia_assert(trampoline_addr >= GUEST_TRAMPOLINE_BASE &&
trampoline_addr < GUEST_TRAMPOLINE_END);
size_t index =
(trampoline_addr - GUEST_TRAMPOLINE_BASE) / GUEST_TRAMPOLINE_MIN_LEN;
guest_trampoline_address_bitmap_.Release(index);
}
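// Usage sketch (illustrative; `backend` and `my_userdata` are placeholders):
//
//   uint32_t guest_cb = backend->CreateGuestTrampoline(
//       [](ppc::PPCContext* ctx, void* a, void* b) { /* host code */ },
//       my_userdata, nullptr);
//   // ... guest code may now call through the indirection at guest_cb ...
//   backend->FreeGuestTrampoline(guest_cb);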
} // namespace x64
} // namespace backend
} // namespace cpu

View File

@ -13,6 +13,7 @@
#include <memory>
#include "xenia/base/cvar.h"
#include "xenia/base/bit_map.h"
#include "xenia/cpu/backend/backend.h"
#if XE_PLATFORM_WIN32 == 1
@ -42,6 +43,19 @@ typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1);
typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1);
typedef void (*ResolveFunctionThunk)();
/*
Place guest trampolines in the memory range that the HV normally occupies.
This way guests can call in via the indirection table and we don't have to
clobber/reuse an existing memory range. The xboxkrnl range is already used by
export trampolines (see kernel/kernel_module.cc).
*/
static constexpr uint32_t GUEST_TRAMPOLINE_BASE = 0x80000000;
static constexpr uint32_t GUEST_TRAMPOLINE_END = 0x80040000;
static constexpr uint32_t GUEST_TRAMPOLINE_MIN_LEN = 8;
static constexpr uint32_t MAX_GUEST_TRAMPOLINES =
(GUEST_TRAMPOLINE_END - GUEST_TRAMPOLINE_BASE) / GUEST_TRAMPOLINE_MIN_LEN;
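// Each slot spans GUEST_TRAMPOLINE_MIN_LEN (8) guest bytes, so slot i maps to
// guest address 0x80000000 + i * 8; the 0x40000-byte range yields 32768
// trampolines.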
#define RESERVE_BLOCK_SHIFT 16
#define RESERVE_NUM_ENTRIES \
@ -155,6 +169,11 @@ class X64Backend : public Backend {
return reinterpret_cast<X64BackendContext*>(
reinterpret_cast<intptr_t>(ctx) - sizeof(X64BackendContext));
}
virtual uint32_t CreateGuestTrampoline(GuestTrampolineProc proc,
void* userdata1,
void* userdata2, bool long_term) override;
virtual void FreeGuestTrampoline(uint32_t trampoline_addr) override;
virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) override;
virtual bool PopulatePseudoStacktrace(GuestPseudoStackTrace* st) override;
void RecordMMIOExceptionForGuestInstruction(void* host_address);
@ -200,6 +219,11 @@ class X64Backend : public Backend {
#endif
alignas(64) ReserveHelper reserve_helper_;
// Allocates 8-byte-aligned addresses in a normally non-executable guest
// address range that is used to dispatch to host code.
BitMap guest_trampoline_address_bitmap_;
uint8_t* guest_trampoline_memory_;
};
} // namespace x64