Merge pull request #96 from chrisps/host_guest_stack_synchronization
Host/Guest stack sync, exception messagebox, kernel improvements, minor opt
commit 0674b68143
@@ -103,3 +103,5 @@ node_modules/.bin/
/tools/shader-playground/*.dll
/profile_print_times.py
/profile_times.txt
/cache1
/cache0
@@ -35,13 +35,15 @@ static bool has_shell_environment_variable() {
}

void AttachConsole() {
  bool has_console = ::AttachConsole(ATTACH_PARENT_PROCESS) == TRUE;

  bool has_console = ::AttachConsole(ATTACH_PARENT_PROCESS) == TRUE;
#if 0
  if (!has_console || !has_shell_environment_variable()) {
    // We weren't launched from a console, so just return.
    has_console_attached_ = false;
    return;
  }

#endif
  AllocConsole();

  has_console_attached_ = true;
@@ -410,34 +410,7 @@ static float ArchReciprocal(float den) {
  return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den)));
}

#if 0
using ArchFloatMask = float;

XE_FORCEINLINE
static ArchFloatMask ArchCmpneqFloatMask(float x, float y) {
  return _mm_cvtss_f32(_mm_cmpneq_ss(_mm_set_ss(x), _mm_set_ss(y)));
}
XE_FORCEINLINE
static ArchFloatMask ArchORFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return _mm_cvtss_f32(_mm_or_ps(_mm_set_ss(x), _mm_set_ss(y)));
}
XE_FORCEINLINE
static ArchFloatMask ArchXORFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x), _mm_set_ss(y)));
}

XE_FORCEINLINE
static ArchFloatMask ArchANDFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x), _mm_set_ss(y)));
}

XE_FORCEINLINE
static uint32_t ArchFloatMaskSignbit(ArchFloatMask x) {
  return static_cast<uint32_t>(_mm_movemask_ps(_mm_set_ss(x)));
}

constexpr ArchFloatMask floatmask_zero = .0f;
#else

using ArchFloatMask = __m128;

XE_FORCEINLINE

@@ -464,7 +437,7 @@ static uint32_t ArchFloatMaskSignbit(ArchFloatMask x) {
}

constexpr ArchFloatMask floatmask_zero{.0f};
#endif

#else
static float ArchMin(float x, float y) { return std::min<float>(x, y); }
static float ArchMax(float x, float y) { return std::max<float>(x, y); }
@@ -610,17 +583,17 @@ union IDivExtraInfo {
  } info;
};
// returns magicnum multiplier
static uint32_t PregenerateUint32Div(uint32_t _denom, uint32_t& out_extra) {
  IDivExtraInfo extra;
static constexpr uint32_t PregenerateUint32Div(uint32_t _denom, uint32_t& out_extra) {
  IDivExtraInfo extra{};

  uint32_t d = _denom;
  int p;
  uint32_t nc, delta, q1, r1, q2, r2;
  int p = 0;
  uint32_t nc = 0, delta = 0, q1 = 0, r1 = 0, q2 = 0, r2 = 0;
  struct {
    unsigned M;
    int a;
    int s;
  } magu;
  } magu{};
  magu.a = 0;
  nc = -1 - ((uint32_t) - (int32_t)d) % d;
  p = 31;

@@ -660,13 +633,13 @@ static uint32_t PregenerateUint32Div(uint32_t _denom, uint32_t& out_extra) {
  return static_cast<uint64_t>(q2 + 1);
}

static inline uint32_t ApplyUint32Div(uint32_t num, uint32_t mul,
static constexpr uint32_t ApplyUint32Div(uint32_t num, uint32_t mul,
                                         uint32_t extradata) {
  IDivExtraInfo extra;
  IDivExtraInfo extra{};

  extra.value_ = extradata;

  uint32_t result = ((uint64_t)(num) * (uint64_t)mul) >> 32;
  uint32_t result = static_cast<uint32_t>(
      (static_cast<uint64_t>(num) * static_cast<uint64_t>(mul)) >> 32);
  if (extra.info.add_) {
    uint32_t addend = result + num;
    addend = ((addend < result ? 0x80000000 : 0) | addend);

@@ -675,7 +648,7 @@ static inline uint32_t ApplyUint32Div(uint32_t num, uint32_t mul,
  return result >> extra.info.shift_;
}

static inline uint32_t ApplyUint32UMod(uint32_t num, uint32_t mul,
static constexpr uint32_t ApplyUint32UMod(uint32_t num, uint32_t mul,
                                          uint32_t extradata, uint32_t original) {
  uint32_t dived = ApplyUint32Div(num, mul, extradata);
  unsigned result = num - (dived * original);

@@ -686,12 +659,12 @@ static inline uint32_t ApplyUint32UMod(uint32_t num, uint32_t mul,
struct MagicDiv {
  uint32_t multiplier_;
  uint32_t extradata_;
  MagicDiv() : multiplier_(0), extradata_(0) {}
  MagicDiv(uint32_t original) {
  constexpr MagicDiv() : multiplier_(0), extradata_(0) {}
  constexpr MagicDiv(uint32_t original) : MagicDiv() {
    multiplier_ = PregenerateUint32Div(original, extradata_);
  }

  uint32_t Apply(uint32_t numerator) const {
  constexpr uint32_t Apply(uint32_t numerator) const {
    return ApplyUint32Div(numerator, multiplier_, extradata_);
  }
};
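Reviewer note: the constexpr-ified MagicDiv above is the classic Granlund–Montgomery "magic number" division: replace n / d by a widening multiply and a shift that can be hoisted out of hot loops. A minimal standalone sketch of the identity it relies on — not the PR's exact magic-number search (which also handles the add_/shift_ fixup cases); d, s, and the sampling stride here are arbitrary illustrative choices, and __uint128_t assumes GCC/Clang:

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t d = 641;  // arbitrary denominator
  const int s = 42;        // 32 + ceil(log2(d)) bits of reciprocal precision
  // ceil(2^s / d): the "magic number" multiplier.
  const uint64_t M = static_cast<uint64_t>(
      ((static_cast<__uint128_t>(1) << s) + d - 1) / d);
  for (uint64_t n = 0; n <= 0xFFFFFFFFull; n += 104729) {  // sampled numerators
    const uint64_t q = static_cast<uint64_t>((static_cast<__uint128_t>(M) * n) >> s);
    if (q != n / d) {
      std::printf("mismatch at n=%llu\n", static_cast<unsigned long long>(n));
      return 1;
    }
  }
  std::puts("multiply+shift agrees with division for all sampled n");
  return 0;
}

The over-allocated precision (s = 32 + ceil(log2(d))) makes the plain multiply+shift exact for all 32-bit numerators; PregenerateUint32Div instead finds the minimal multiplier and records an add/shift correction in extradata_ when one is needed.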
@@ -28,6 +28,9 @@ namespace xe {
namespace memory {

size_t page_size() {
#if XE_ARCH_AMD64 == 1
  return 4096;
#else
  static size_t value = 0;
  if (!value) {
    SYSTEM_INFO si;

@@ -35,9 +38,13 @@ size_t page_size() {
    value = si.dwPageSize;
  }
  return value;
#endif
}

size_t allocation_granularity() {
#if XE_ARCH_AMD64 == 1 && XE_PLATFORM_WIN32 == 1
  return 65536;
#else
  static size_t value = 0;
  if (!value) {
    SYSTEM_INFO si;

@@ -45,6 +52,7 @@ size_t allocation_granularity() {
    value = si.dwAllocationGranularity;
  }
  return value;
#endif
}

DWORD ToWin32ProtectFlags(PageAccess access) {
@@ -37,7 +37,7 @@
#define XE_USE_NTDLL_FUNCTIONS 1
//chrispy: disabling this for now, more research needs to be done imo, although it does work very well on my machine
//
#define XE_USE_KUSER_SHARED 0
#define XE_USE_KUSER_SHARED 1
#if XE_USE_NTDLL_FUNCTIONS == 1
/*
  ntdll versions of functions often skip through a lot of extra garbage in
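Reviewer note: for context on the XE_USE_KUSER_SHARED toggle above — KUSER_SHARED_DATA is a read-only page the NT kernel maps into every process at a fixed address, so timing values can be read without a syscall. A hedged sketch; the 0x7FFE0008 InterruptTime offset comes from public ntddk headers, not from this diff, and the single 64-bit load of LowPart/High1Time is an x64-only shortcut:

#include <cstdint>

// Illustrative only: read the NT interrupt time straight from the shared page.
inline uint64_t read_kuser_interrupt_time() {
  return *reinterpret_cast<const volatile uint64_t*>(0x7FFE0008ull);
}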
@@ -67,7 +67,22 @@ class Backend {
  // up until the start of ctx may be used by the backend to store whatever data
  // they want
  virtual void InitializeBackendContext(void* ctx) {}

  /*
    Free any dynamically allocated data/resources that the backendcontext uses
  */
  virtual void DeinitializeBackendContext(void* ctx) {}
  virtual void SetGuestRoundingMode(void* ctx, unsigned int mode){};
  /*
    called by KeSetCurrentStackPointers in xboxkrnl_threading.cc just prior
    to calling XThread::Reenter this is an opportunity for a backend to clear any
    data related to the guest stack

    in the case of the X64 backend, it means we reset the stackpoint index
    to 0, since its a new stack and all of our old entries are invalid now
  */
  virtual void PrepareForReentry(void* ctx) {}

 protected:
  Processor* processor_ = nullptr;
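Reviewer note: a minimal sketch of how a backend is expected to implement the new reentry hook. This mirrors the X64 implementation later in this diff; HypotheticalBackend is illustrative:

// On KeSetCurrentStackPointers -> XThread::Reenter the guest gets a fresh
// stack, so any recorded host<->guest stack mappings are stale.
void HypotheticalBackend::PrepareForReentry(void* ctx) {
  auto* bctx = BackendContextForGuestContext(ctx);
  bctx->current_stackpoint_depth = 0;  // drop all stale stackpoints
}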
@@ -31,7 +31,16 @@ DEFINE_bool(record_mmio_access_exceptions, true,
            "For guest addresses records whether we caught any mmio accesses "
            "for them. This info can then be used on a subsequent run to "
            "instruct the recompiler to emit checks",
            "CPU");
            "x64");

DEFINE_int64(max_stackpoints, 65536,
             "Max number of host->guest stack mappings we can record.", "x64");

DEFINE_bool(enable_host_guest_stack_synchronization, true,
            "Records entries for guest/host stack mappings at function starts "
            "and checks for reentry at return sites. Has slight performance "
            "impact, but fixes crashes in games that use setjmp/longjmp.",
            "x64");
#if XE_X64_PROFILER_AVAILABLE == 1
DECLARE_bool(instrument_call_times);
#endif
@@ -41,15 +50,29 @@ namespace cpu {
namespace backend {
namespace x64 {

class X64ThunkEmitter : public X64Emitter {
class X64HelperEmitter : public X64Emitter {
 public:
  X64ThunkEmitter(X64Backend* backend, XbyakAllocator* allocator);
  ~X64ThunkEmitter() override;
  struct _code_offsets {
    size_t prolog;
    size_t prolog_stack_alloc;
    size_t body;
    size_t epilog;
    size_t tail;
  };
  X64HelperEmitter(X64Backend* backend, XbyakAllocator* allocator);
  ~X64HelperEmitter() override;
  HostToGuestThunk EmitHostToGuestThunk();
  GuestToHostThunk EmitGuestToHostThunk();
  ResolveFunctionThunk EmitResolveFunctionThunk();
  void* EmitGuestAndHostSynchronizeStackHelper();
  // 1 for loading byte, 2 for halfword and 4 for word.
  // these specialized versions save space in the caller
  void* EmitGuestAndHostSynchronizeStackSizeLoadThunk(
      void* sync_func, unsigned stack_element_size);

 private:
  void* EmitCurrentForOffsets(const _code_offsets& offsets,
                              size_t stack_size = 0);
  // The following four functions provide save/load functionality for registers.
  // They assume at least StackLayout::THUNK_STACK_SIZE bytes have been
  // allocated on the stack.
@@ -184,11 +207,26 @@ bool X64Backend::Initialize(Processor* processor) {

  // Generate thunks used to transition between jitted code and host code.
  XbyakAllocator allocator;
  X64ThunkEmitter thunk_emitter(this, &allocator);
  X64HelperEmitter thunk_emitter(this, &allocator);
  host_to_guest_thunk_ = thunk_emitter.EmitHostToGuestThunk();
  guest_to_host_thunk_ = thunk_emitter.EmitGuestToHostThunk();
  resolve_function_thunk_ = thunk_emitter.EmitResolveFunctionThunk();

  if (cvars::enable_host_guest_stack_synchronization) {
    synchronize_guest_and_host_stack_helper_ =
        thunk_emitter.EmitGuestAndHostSynchronizeStackHelper();

    synchronize_guest_and_host_stack_helper_size8_ =
        thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk(
            synchronize_guest_and_host_stack_helper_, 1);
    synchronize_guest_and_host_stack_helper_size16_ =
        thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk(
            synchronize_guest_and_host_stack_helper_, 2);
    synchronize_guest_and_host_stack_helper_size32_ =
        thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk(
            synchronize_guest_and_host_stack_helper_, 4);
  }

  // Set the code cache to use the ResolveFunction thunk for default
  // indirections.
  assert_zero(uint64_t(resolve_function_thunk_) & 0xFFFFFFFF00000000ull);

@@ -203,9 +241,10 @@ bool X64Backend::Initialize(Processor* processor) {

  // Setup exception callback
  ExceptionHandler::Install(&ExceptionCallbackThunk, this);

  processor->memory()->SetMMIOExceptionRecordingCallback(
      ForwardMMIOAccessForRecording, (void*)this);
  if (cvars::record_mmio_access_exceptions) {
    processor->memory()->SetMMIOExceptionRecordingCallback(
        ForwardMMIOAccessForRecording, (void*)this);
  }

#if XE_X64_PROFILER_AVAILABLE == 1
  if (cvars::instrument_call_times) {
@@ -509,23 +548,32 @@ bool X64Backend::ExceptionCallback(Exception* ex) {
  return processor()->OnThreadBreakpointHit(ex);
}

X64ThunkEmitter::X64ThunkEmitter(X64Backend* backend, XbyakAllocator* allocator)
X64HelperEmitter::X64HelperEmitter(X64Backend* backend,
                                   XbyakAllocator* allocator)
    : X64Emitter(backend, allocator) {}

X64ThunkEmitter::~X64ThunkEmitter() {}
X64HelperEmitter::~X64HelperEmitter() {}
void* X64HelperEmitter::EmitCurrentForOffsets(const _code_offsets& code_offsets,
                                              size_t stack_size) {
  EmitFunctionInfo func_info = {};
  func_info.code_size.total = getSize();
  func_info.code_size.prolog = code_offsets.body - code_offsets.prolog;
  func_info.code_size.body = code_offsets.epilog - code_offsets.body;
  func_info.code_size.epilog = code_offsets.tail - code_offsets.epilog;
  func_info.code_size.tail = getSize() - code_offsets.tail;
  func_info.prolog_stack_alloc_offset =
      code_offsets.prolog_stack_alloc - code_offsets.prolog;
  func_info.stack_size = stack_size;

HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() {
  void* fn = Emplace(func_info);
  return fn;
}
HostToGuestThunk X64HelperEmitter::EmitHostToGuestThunk() {
  // rcx = target
  // rdx = arg0 (context)
  // r8 = arg1 (guest return address)

  struct _code_offsets {
    size_t prolog;
    size_t prolog_stack_alloc;
    size_t body;
    size_t epilog;
    size_t tail;
  } code_offsets = {};
  _code_offsets code_offsets = {};

  const size_t stack_size = StackLayout::THUNK_STACK_SIZE;

@@ -576,19 +624,13 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() {
  return (HostToGuestThunk)fn;
}

GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
GuestToHostThunk X64HelperEmitter::EmitGuestToHostThunk() {
  // rcx = target function
  // rdx = arg0
  // r8 = arg1
  // r9 = arg2

  struct _code_offsets {
    size_t prolog;
    size_t prolog_stack_alloc;
    size_t body;
    size_t epilog;
    size_t tail;
  } code_offsets = {};
  _code_offsets code_offsets = {};

  const size_t stack_size = StackLayout::THUNK_STACK_SIZE;

@@ -635,17 +677,11 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
// X64Emitter handles actually resolving functions.
uint64_t ResolveFunction(void* raw_context, uint64_t target_address);

ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() {
ResolveFunctionThunk X64HelperEmitter::EmitResolveFunctionThunk() {
  // ebx = target PPC address
  // rcx = context

  struct _code_offsets {
    size_t prolog;
    size_t prolog_stack_alloc;
    size_t body;
    size_t epilog;
    size_t tail;
  } code_offsets = {};
  _code_offsets code_offsets = {};

  const size_t stack_size = StackLayout::THUNK_STACK_SIZE;

@@ -688,8 +724,116 @@ ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() {
  void* fn = Emplace(func_info);
  return (ResolveFunctionThunk)fn;
}
// r11 = size of callers stack, r8 = return address w/ adjustment
// i'm not proud of this code, but it shouldn't be executed frequently at all
void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackHelper() {
  _code_offsets code_offsets = {};
  code_offsets.prolog = getSize();
  mov(rbx, GetBackendCtxPtr(offsetof(X64BackendContext, stackpoints)));
  mov(eax,
      GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)));

void X64ThunkEmitter::EmitSaveVolatileRegs() {
  lea(ecx, ptr[eax - 1]);
  mov(r9d, ptr[GetContextReg() + offsetof(ppc::PPCContext, r[1])]);

  Xbyak::Label looper{};
  Xbyak::Label loopout{};
  Xbyak::Label signed_underflow{};
  xor_(r12d, r12d);

  // todo: should use Loop instruction here if hasFastLoop,
  // currently xbyak does not support it but its super easy to modify xbyak to have it
  L(looper);
  imul(edx, ecx, sizeof(X64BackendStackpoint));
  mov(r10d, ptr[rbx + rdx + offsetof(X64BackendStackpoint, guest_stack_)]);

  cmp(r10d, r9d);

  jge(loopout, T_NEAR);

  inc(r12d);

  if (IsFeatureEnabled(kX64FlagsIndependentVars)) {
    dec(ecx);
  } else {
    sub(ecx, 1);
  }
  js(signed_underflow, T_NEAR);  // should be impossible!!

  jmp(looper, T_NEAR);
  L(loopout);
  Xbyak::Label skip_adjust{};
  cmp(r12d, 1);  // should never happen?
  jle(skip_adjust, T_NEAR);
  mov(rsp, ptr[rbx + rdx + offsetof(X64BackendStackpoint, host_stack_)]);
  if (IsFeatureEnabled(kX64FlagsIndependentVars)) {
    inc(ecx);
  } else {
    add(ecx, 1);
  }

  // this->DebugBreak();
  sub(rsp, r11);  // adjust stack

  mov(GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)),
      ecx);  // set next stackpoint index to be after the one we restored to
  L(skip_adjust);

  jmp(r8);  // return to caller
  code_offsets.prolog_stack_alloc = getSize();
  code_offsets.body = getSize();
  code_offsets.epilog = getSize();
  code_offsets.tail = getSize();

  L(signed_underflow);
  // find a good, compact way to signal error here
  // maybe an invalid opcode that we execute, then detect in an exception handler?

  this->DebugBreak();
  // stack unwinding, take first entry
  // actually, no reason to have this

  /*mov(rsp, ptr[rbx + offsetof(X64BackendStackpoint, host_stack_)]);
  mov(ptr[rbx + offsetof(X64BackendStackpoint, guest_stack_)], r9d);
  sub(rsp, r11);
  xor_(eax, eax);
  inc(eax);
  mov(GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)),
      eax);

  jmp(r8);*/
  // this->DebugBreak(); // err, add an xe::FatalError to call for this

  return EmitCurrentForOffsets(code_offsets);
}
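Reviewer note: in plain C++ the search loop this helper emits is roughly the following. A hedged model — X64BackendStackpoint mirrors the struct this diff adds in x64_backend.h; everything else is illustrative:

#include <cstdint>

struct X64BackendStackpoint {  // mirrors the diff's struct
  uint64_t host_stack_;
  unsigned guest_stack_;
  unsigned unused_;
};

// Walk from the most recent entry down until one records a guest stack pointer
// >= the current guest r1; that entry's host_stack_ (minus the caller's stack
// size, which the emitted version receives in r11) becomes the new host rsp.
uint32_t FindRestorePoint(const X64BackendStackpoint* points, uint32_t depth,
                          uint32_t current_guest_r1) {
  for (uint32_t i = depth; i-- != 0;) {
    if (points[i].guest_stack_ >= current_guest_r1) {
      return i;
    }
  }
  return UINT32_MAX;  // underflow; the emitted code treats this as impossible
}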

void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk(
    void* sync_func, unsigned stack_element_size) {
  _code_offsets code_offsets = {};
  code_offsets.prolog = getSize();
  pop(r8);  // return address

  switch (stack_element_size) {
    case 4:
      mov(r11d, ptr[r8]);
      break;
    case 2:
      movzx(r11d, word[r8]);
      break;
    case 1:
      movzx(r11d, byte[r8]);
      break;
  }
  add(r8, stack_element_size);
  jmp(sync_func, T_NEAR);
  code_offsets.prolog_stack_alloc = getSize();
  code_offsets.body = getSize();
  code_offsets.epilog = getSize();
  code_offsets.tail = getSize();
  return EmitCurrentForOffsets(code_offsets);
}
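Reviewer note: the call-site protocol for these size-load thunks, paraphrased from the emitter code and the long comment in ResolveFunction below — the caller's stack size is embedded as an inline literal right after the call instruction, so the thunk pops the return address, loads the literal through it, then skips past it before tail-jumping to the shared helper. Illustrative shape of an emitted return-site check (not emitted verbatim):

//   test esp, 15                 ; misaligned only on longjmp-style reentry
//   jnz  tail                    ; rare path lives in tail-emitted code
// tail:
//   call sync_size_load_thunk    ; return address points at the literal below
//   .byte <caller stack size>    ; 1/2/4 bytes depending on the thunk variant
//   ...                          ; execution resumes here with rsp restored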
void X64HelperEmitter::EmitSaveVolatileRegs() {
  // Save off volatile registers.
  // mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rax);
  mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rcx);

@@ -711,7 +855,7 @@ void X64ThunkEmitter::EmitSaveVolatileRegs() {
  vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[5])], xmm5);
}

void X64ThunkEmitter::EmitLoadVolatileRegs() {
void X64HelperEmitter::EmitLoadVolatileRegs() {
  // mov(rax, qword[rsp + offsetof(StackLayout::Thunk, r[0])]);
  mov(rcx, qword[rsp + offsetof(StackLayout::Thunk, r[1])]);
  mov(rdx, qword[rsp + offsetof(StackLayout::Thunk, r[2])]);

@@ -732,7 +876,7 @@ void X64ThunkEmitter::EmitLoadVolatileRegs() {
  vmovaps(xmm5, qword[rsp + offsetof(StackLayout::Thunk, xmm[5])]);
}

void X64ThunkEmitter::EmitSaveNonvolatileRegs() {
void X64HelperEmitter::EmitSaveNonvolatileRegs() {
  mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rbx);
  mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rbp);
#if XE_PLATFORM_WIN32

@@ -760,7 +904,7 @@ void X64ThunkEmitter::EmitSaveNonvolatileRegs() {
#endif
}

void X64ThunkEmitter::EmitLoadNonvolatileRegs() {
void X64HelperEmitter::EmitLoadNonvolatileRegs() {
  mov(rbx, qword[rsp + offsetof(StackLayout::Thunk, r[0])]);
  mov(rbp, qword[rsp + offsetof(StackLayout::Thunk, r[1])]);
#if XE_PLATFORM_WIN32

@@ -788,16 +932,41 @@ void X64ThunkEmitter::EmitLoadNonvolatileRegs() {
}
void X64Backend::InitializeBackendContext(void* ctx) {
  X64BackendContext* bctx = BackendContextForGuestContext(ctx);
  bctx->ResolveFunction_Ptr = reinterpret_cast<void*>(&ResolveFunction);
  bctx->mxcsr_fpu =
      DEFAULT_FPU_MXCSR;  // idk if this is right, check on rgh what the
                          // rounding on ppc is at startup

  /*
    todo: stackpoint arrays should be pooled virtual memory at the very
    least there may be some fancy virtual address tricks we can do here
  */

  bctx->stackpoints = cvars::enable_host_guest_stack_synchronization
                          ? new X64BackendStackpoint[cvars::max_stackpoints]
                          : nullptr;
  bctx->current_stackpoint_depth = 0;
  bctx->mxcsr_vmx = DEFAULT_VMX_MXCSR;
  bctx->flags = 0;
  // https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
  bctx->Ox1000 = 0x1000;
  bctx->guest_tick_count = Clock::GetGuestTickCountPointer();
}
void X64Backend::DeinitializeBackendContext(void* ctx) {
  X64BackendContext* bctx = BackendContextForGuestContext(ctx);

  if (bctx->stackpoints) {
    delete[] bctx->stackpoints;
    bctx->stackpoints = nullptr;
  }
}

void X64Backend::PrepareForReentry(void* ctx) {
  X64BackendContext* bctx = BackendContextForGuestContext(ctx);

  bctx->current_stackpoint_depth = 0;
}

const uint32_t mxcsr_table[8] = {
    0x1F80, 0x7F80, 0x5F80, 0x3F80, 0x9F80, 0xFF80, 0xDF80, 0xBF80,
};
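Reviewer note: the bookkeeping that PushStackpoint/PopStackpoint (in x64_emitter.cc further down) maintain against this per-thread context, modeled in plain C++. A hedged sketch — X64BackendContext and X64BackendStackpoint are the types from x64_backend.h in this diff; the emitted version additionally checks cvars::max_stackpoints and reports overflow:

// Conceptually executed at every guest function entry, before the host frame
// is allocated; the matching pop just decrements the depth on return.
void PushStackpoint(X64BackendContext* bctx, uint64_t host_rsp,
                    uint32_t guest_r1) {
  X64BackendStackpoint& sp = bctx->stackpoints[bctx->current_stackpoint_depth];
  sp.host_stack_ = host_rsp;   // host rsp prior to "sub rsp, stack_size"
  sp.guest_stack_ = guest_r1;  // guest stack pointer (r1) at entry
  bctx->current_stackpoint_depth++;  // overflow -> HandleStackpointOverflowError
}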
@@ -24,7 +24,8 @@
#endif

DECLARE_int64(x64_extension_mask);

DECLARE_int64(max_stackpoints);
DECLARE_bool(enable_host_guest_stack_synchronization);
namespace xe {
class Exception;
}  // namespace xe

@@ -41,14 +42,25 @@ typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1);
typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1);
typedef void (*ResolveFunctionThunk)();

struct X64BackendStackpoint {
  uint64_t host_stack_;
  unsigned guest_stack_;
  // pad to 16 bytes so we never end up having a 64 bit load/store for
  // host_stack_ straddling two lines. Consider this field reserved for future
  // use
  unsigned unused_;
};
// located prior to the ctx register
// some things it would be nice to have be per-emulator instance instead of per
// context (somehow placing a global X64BackendCtx prior to membase, so we can
// negatively index the membase reg)
struct X64BackendContext {
  void* ResolveFunction_Ptr;  // cached pointer to resolvefunction
  // guest_tick_count is used if inline_loadclock is used
  uint64_t* guest_tick_count;
  // records mapping of host_stack to guest_stack
  X64BackendStackpoint* stackpoints;

  unsigned int current_stackpoint_depth;
  unsigned int mxcsr_fpu;  // currently, the way we implement rounding mode
                           // affects both vmx and the fpu
  unsigned int mxcsr_vmx;

@@ -81,6 +93,19 @@ class X64Backend : public Backend {
    return resolve_function_thunk_;
  }

  void* synchronize_guest_and_host_stack_helper() const {
    return synchronize_guest_and_host_stack_helper_;
  }
  void* synchronize_guest_and_host_stack_helper_for_size(size_t sz) const {
    switch (sz) {
      case 1:
        return synchronize_guest_and_host_stack_helper_size8_;
      case 2:
        return synchronize_guest_and_host_stack_helper_size16_;
      default:
        return synchronize_guest_and_host_stack_helper_size32_;
    }
  }
  bool Initialize(Processor* processor) override;

  void CommitExecutableRange(uint32_t guest_low, uint32_t guest_high) override;

@@ -97,7 +122,8 @@ class X64Backend : public Backend {
  void InstallBreakpoint(Breakpoint* breakpoint, Function* fn) override;
  void UninstallBreakpoint(Breakpoint* breakpoint) override;
  virtual void InitializeBackendContext(void* ctx) override;

  virtual void DeinitializeBackendContext(void* ctx) override;
  virtual void PrepareForReentry(void* ctx) override;
  X64BackendContext* BackendContextForGuestContext(void* ctx) {
    return reinterpret_cast<X64BackendContext*>(
        reinterpret_cast<intptr_t>(ctx) - sizeof(X64BackendContext));

@@ -120,7 +146,12 @@ class X64Backend : public Backend {
  HostToGuestThunk host_to_guest_thunk_;
  GuestToHostThunk guest_to_host_thunk_;
  ResolveFunctionThunk resolve_function_thunk_;
  void* synchronize_guest_and_host_stack_helper_ = nullptr;

  // loads stack sizes 1 byte, 2 bytes or 4 bytes
  void* synchronize_guest_and_host_stack_helper_size8_ = nullptr;
  void* synchronize_guest_and_host_stack_helper_size16_ = nullptr;
  void* synchronize_guest_and_host_stack_helper_size32_ = nullptr;
#if XE_X64_PROFILER_AVAILABLE == 1
  GuestProfilerData profiler_data_;
#endif
@@ -213,6 +213,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
  func_info.stack_size = stack_size;
  stack_size_ = stack_size;

  PushStackpoint();
  sub(rsp, (uint32_t)stack_size);

  code_offsets.prolog_stack_alloc = getSize();

@@ -271,6 +272,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
  */
  // Body.
  auto block = builder->first_block();
  synchronize_stack_on_next_instruction_ = false;
  while (block) {
    ForgetMxcsrMode();  // at start of block, mxcsr mode is undefined

@@ -287,6 +289,12 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
    // Process instructions.
    const Instr* instr = block->instr_head;
    while (instr) {
      if (synchronize_stack_on_next_instruction_) {
        if (instr->GetOpcodeNum() != hir::OPCODE_SOURCE_OFFSET) {
          synchronize_stack_on_next_instruction_ = false;
          EnsureSynchronizedGuestAndHostStack();
        }
      }
      const Instr* new_tail = instr;
      if (!SelectSequence(this, instr, &new_tail)) {
        // No sequence found!

@@ -314,6 +322,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
  EmitProfilerEpilogue();

  add(rsp, (uint32_t)stack_size);
  PopStackpoint();
  ret();
  // todo: do some kind of sorting by alignment?
  for (auto&& tail_item : tail_code_) {

@@ -453,12 +462,186 @@ void X64Emitter::UnimplementedInstr(const hir::Instr* i) {

// This is used by the X64ThunkEmitter's ResolveFunctionThunk.
uint64_t ResolveFunction(void* raw_context, uint64_t target_address) {
  auto thread_state =
      reinterpret_cast<ppc::PPCContext_s*>(raw_context)->thread_state;
  auto guest_context = reinterpret_cast<ppc::PPCContext_s*>(raw_context);

  auto thread_state = guest_context->thread_state;

  // TODO(benvanik): required?
  assert_not_zero(target_address);

  /*
    todo: refactor this!

    The purpose of this code is to allow guest longjmp to call into
    the body of an existing host function. There are a lot of conditions we
    have to check here to ensure that we do not mess up a normal call to a
    function.

    The address must be within an XexModule (may need to make some changes
    to instructionaddressflags to remove this limitation). The target address
    must be a known return site. The guest address must be part of a function
    that was already translated.
  */

  if (cvars::enable_host_guest_stack_synchronization) {
    auto processor = thread_state->processor();
    auto module_for_address =
        processor->LookupModule(static_cast<uint32_t>(target_address));

    if (module_for_address) {
      XexModule* xexmod = dynamic_cast<XexModule*>(module_for_address);
      if (xexmod) {
        InfoCacheFlags* flags = xexmod->GetInstructionAddressFlags(
            static_cast<uint32_t>(target_address));
        if (flags) {
          if (flags->is_return_site) {
            auto ones_with_address = processor->FindFunctionsWithAddress(
                static_cast<uint32_t>(target_address));
            if (ones_with_address.size() != 0) {
              // this loop to find a host address for the guest address is
              // necessary because FindFunctionsWithAddress works via a range
              // check, but if the function consists of multiple blocks
              // scattered around with "holes" of instructions that cannot be
              // reached in between those holes, the instructions that cannot
              // be reached will incorrectly be considered members of the
              // function

              X64Function* candidate = nullptr;
              uintptr_t host_address = 0;
              for (auto&& entry : ones_with_address) {
                X64Function* xfunc = static_cast<X64Function*>(entry);

                host_address = xfunc->MapGuestAddressToMachineCode(
                    static_cast<uint32_t>(target_address));
                // host address does exist within the function, and that host
                // address is not the start of the function, it is instead
                // somewhere within its existing body
                // i originally did not have this (xfunc->machine_code() !=
                // reinterpret_cast<const uint8_t*>(host_address)) condition
                // here when i distributed builds for testing, no issues arose
                // related to it but i wanted to be more explicit
                if (host_address &&
                    xfunc->machine_code() !=
                        reinterpret_cast<const uint8_t*>(host_address)) {
                  candidate = xfunc;
                  break;
                }
              }
              // we found an existing X64Function, and a return site within
              // that function that has a host address w/ native code
              if (candidate && host_address) {
                X64Backend* backend =
                    static_cast<X64Backend*>(processor->backend());
                // grab the backend context; next we have to check whether the
                // guest and host stack are out of sync. if they arent, its
                // fine for the backend to create a new function for the guest
                // address we're resolving. if they are, it means that the
                // reason we're resolving this address is because context is
                // being restored (probably by longjmp)
                X64BackendContext* backend_context =
                    backend->BackendContextForGuestContext(guest_context);

                uint32_t current_stackpoint_index =
                    backend_context->current_stackpoint_depth;

                --current_stackpoint_index;

                X64BackendStackpoint* stackpoints =
                    backend_context->stackpoints;

                uint32_t current_guest_stackpointer =
                    static_cast<uint32_t>(guest_context->r[1]);
                uint32_t num_frames_bigger = 0;

                /*
                  if the current guest stack pointer is bigger than the
                  recorded pointer for one stack frame thats fine, plenty of
                  functions restore the original stack pointer early

                  if more than 1... we're longjmping and sure of it at this
                  point (jumping to a return site that has already been
                  emitted)
                */
                while (current_stackpoint_index != 0xFFFFFFFF) {
                  if (current_guest_stackpointer >
                      stackpoints[current_stackpoint_index].guest_stack_) {
                    --current_stackpoint_index;
                    ++num_frames_bigger;
                  } else {
                    break;
                  }
                }
                /*
                  DEFINITELY a longjmp, return the original host address.
                  returning the existing host address is going to set off some
                  extra machinery we have set up to support this.

                  to break it down: our caller (us being this ResolveFunction
                  that this comment is in) is
                  X64Backend::resolve_function_thunk_, which is implemented in
                  x64_backend.cc X64HelperEmitter::EmitResolveFunctionThunk,
                  or a call from the resolver table.

                  the x64 fastcall abi dictates that the stack must always be
                  16 byte aligned. We select our stack size for functions to
                  ensure that we keep rsp aligned to 16 bytes.

                  but by calling into the body of an existing function we've
                  pushed our return address onto the stack (dont worry about
                  this return address, it gets discarded in a later step).

                  this means that the stack is no longer 16 byte aligned,
                  (rsp % 16) now == 8, and this is the only time outside of
                  the prolog or epilog of a function that this will be the
                  case.

                  so, after all direct or indirect function calls we set
                  X64Emitter::synchronize_stack_on_next_instruction_ to true.
                  On the next instruction that is not OPCODE_SOURCE_OFFSET we
                  emit a check when we see
                  synchronize_stack_on_next_instruction_ is true. We have to
                  skip OPCODE_SOURCE_OFFSET because its not a "real"
                  instruction, and if we emit on it the return address of the
                  function call will point to AFTER our check, so itll never
                  be executed.

                  our check is just going to do test esp, 15 to see if the
                  stack is misaligned (using esp instead of rsp saves 1 byte).
                  We tail emit the handling for when the check succeeds,
                  because in 99.99999% of function calls it will be aligned;
                  in the end the runtime cost of these checks is 5 bytes for
                  the test instruction, which ought to be one cycle, and 5
                  bytes for the jmp, with no cycles taken for the jump which
                  will be predicted not taken.

                  Our handling for the check is implemented in
                  X64HelperEmitter::EmitGuestAndHostSynchronizeStackHelper. we
                  don't call it directly though; instead we go through
                  backend()->synchronize_guest_and_host_stack_helper_for_size(
                  num_bytes_needed_to_represent_stack_size). we place the
                  stack size after the call instruction so we can load it in
                  the helper and readjust the return address to point after
                  the literal value.

                  The helper is going to search the array of stackpoints to
                  find the first one that is greater than or equal to the
                  current guest stack pointer. when it finds the entry it will
                  set the current host rsp to the host stack pointer value in
                  the entry, and then subtract the stack size of the caller
                  from that. the current stackpoint index is adjusted to point
                  to the one after the stackpoint we restored to.

                  The helper then jumps back to the function that was
                  longjmp'ed to, with the host stack in its proper state. it
                  just works!
                */

                if (num_frames_bigger > 1) {
                  return host_address;
                }
              }
            }
          }
        }
      }
    }
  }
  auto fn = thread_state->processor()->ResolveFunction(
      static_cast<uint32_t>(target_address));
  assert_not_null(fn);
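Reviewer note: the frame-counting heuristic above, condensed into a hedged standalone model. Names are illustrative; the real code additionally requires the target to be a flagged return site inside an already-translated XexModule function:

// More than one recorded frame strictly below the current guest r1 means the
// guest discarded frames without returning through them: a longjmp.
bool LooksLikeLongjmp(const X64BackendStackpoint* stackpoints, uint32_t depth,
                      uint32_t current_guest_r1) {
  uint32_t num_frames_bigger = 0;
  for (uint32_t i = depth - 1;
       i != 0xFFFFFFFF && current_guest_r1 > stackpoints[i].guest_stack_;
       --i) {
    ++num_frames_bigger;
  }
  return num_frames_bigger > 1;
}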
@@ -479,7 +662,7 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
      mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);

      call((void*)fn->machine_code());

      synchronize_stack_on_next_instruction_ = true;
    } else {
      // tail call
      EmitTraceUserCallReturn();

@@ -488,8 +671,10 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
      mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);

      add(rsp, static_cast<uint32_t>(stack_size()));
      PopStackpoint();
      jmp((void*)fn->machine_code(), T_NEAR);
    }

    return;
  } else if (code_cache_->has_indirection_table()) {
    // Load the pointer to the indirection table maintained in X64CodeCache.

@@ -513,12 +698,14 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
    mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);

    add(rsp, static_cast<uint32_t>(stack_size()));
    PopStackpoint();
    jmp(rax);
  } else {
    // Return address is from the previous SET_RETURN_ADDRESS.
    mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);

    call(rax);
    synchronize_stack_on_next_instruction_ = true;
  }
}

@@ -557,12 +744,14 @@ void X64Emitter::CallIndirect(const hir::Instr* instr,
    mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);

    add(rsp, static_cast<uint32_t>(stack_size()));
    PopStackpoint();
    jmp(rax);
  } else {
    // Return address is from the previous SET_RETURN_ADDRESS.
    mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);

    call(rax);
    synchronize_stack_on_next_instruction_ = true;
  }
}
@@ -1458,6 +1647,126 @@ Xbyak::Address X64Emitter::GetBackendFlagsPtr() const {
  pt.setBit(32);
  return pt;
}

void X64Emitter::HandleStackpointOverflowError(ppc::PPCContext* context) {
  // context->lr
  // todo: show lr in message?
  xe::FatalError(
      "Overflowed stackpoints! Please report this error for this title to "
      "Xenia developers.");
}

void X64Emitter::PushStackpoint() {
  if (!cvars::enable_host_guest_stack_synchronization) {
    return;
  }
  // push the current host and guest stack pointers.
  // this is done before a stack frame is set up or any guest instructions are
  // executed. this code is probably the most intrusive part of the stackpoint
  mov(rbx, GetBackendCtxPtr(offsetof(X64BackendContext, stackpoints)));
  mov(eax,
      GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)));

  mov(r8, qword[GetContextReg() + offsetof(ppc::PPCContext, r[1])]);

  imul(r9d, eax, sizeof(X64BackendStackpoint));
  add(rbx, r9);

  mov(qword[rbx + offsetof(X64BackendStackpoint, host_stack_)], rsp);
  mov(dword[rbx + offsetof(X64BackendStackpoint, guest_stack_)], r8d);
  if (IsFeatureEnabled(kX64FlagsIndependentVars)) {
    inc(eax);
  } else {
    add(eax, 1);
  }

  mov(GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)),
      eax);

  cmp(eax, (uint32_t)cvars::max_stackpoints);

  Xbyak::Label& overflowed_stackpoints =
      AddToTail([](X64Emitter& e, Xbyak::Label& our_tail_label) {
        e.L(our_tail_label);
        // we never subtracted anything from rsp, so our stack is misaligned
        // and will fault in guesttohostthunk
        // e.sub(e.rsp, 8);
        e.push(e.rax);  // easier realign, 1 byte opcode vs 4 bytes for sub

        e.CallNativeSafe((void*)X64Emitter::HandleStackpointOverflowError);
      });
  jge(overflowed_stackpoints, T_NEAR);
}
void X64Emitter::PopStackpoint() {
  if (!cvars::enable_host_guest_stack_synchronization) {
    return;
  }
  // todo: maybe verify that rsp and r1 == the stackpoint?
  Xbyak::Address stackpoint_pos_pointer =
      GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth));
  stackpoint_pos_pointer.setBit(32);
  dec(stackpoint_pos_pointer);
}

void X64Emitter::EnsureSynchronizedGuestAndHostStack() {
  if (!cvars::enable_host_guest_stack_synchronization) {
    return;
  }
  // chrispy: keeping this old slower test here in case in the future changes
  // need to be made that result in the stack not being 8 byte misaligned on
  // context reentry
#if 0
  Xbyak::Label skip{};
  mov(r8, qword[GetContextReg() + offsetof(ppc::PPCContext, r[1])]);
  mov(rbx, GetBackendCtxPtr(offsetof(X64BackendContext, stackpoints)));
  imul(eax,
       GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)),
       sizeof(X64BackendStackpoint));
  sub(eax, sizeof(X64BackendStackpoint));
  add(rbx, rax);

  cmp(r8d, dword[rbx + offsetof(X64BackendStackpoint, guest_stack_)]);
  jle(skip, T_NEAR);
  mov(r11d, stack_size());
  call(backend_->synchronize_guest_and_host_stack_helper());
  L(skip);
#endif

  Xbyak::Label& return_from_sync = this->NewCachedLabel();

  // if we got here somehow from setjmp or the like we ought to have a
  // misaligned stack right now! this provides us with a very fast pretest for
  // this condition
  test(esp, 15);

  Xbyak::Label& sync_label = this->AddToTail(
      [&return_from_sync](X64Emitter& e, Xbyak::Label& our_tail_label) {
        e.L(our_tail_label);

        uint32_t stack32 = static_cast<uint32_t>(e.stack_size());
        auto backend = e.backend();

        if (stack32 < 256) {
          e.call(backend->synchronize_guest_and_host_stack_helper_for_size(1));
          e.db(stack32);
        } else if (stack32 < 65536) {
          e.call(backend->synchronize_guest_and_host_stack_helper_for_size(2));
          e.dw(stack32);
        } else {
          // ought to be impossible, a host stack bigger than 65536??
          e.call(backend->synchronize_guest_and_host_stack_helper_for_size(4));
          e.dd(stack32);
        }
        e.jmp(return_from_sync, T_NEAR);
      });

  jnz(sync_label, T_NEAR);

  L(return_from_sync);
}
}  // namespace x64
}  // namespace backend
}  // namespace cpu
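Reviewer note: a compact model of the pretest EnsureSynchronizedGuestAndHostStack emits. Hedged — the emitted code does this with "test esp, 15" plus a tail-emitted jnz target rather than a C++ branch:

// After a normal call/ret pair rsp is 16-byte aligned again; entering a return
// site via the resolver (the longjmp path) leaves one extra return address
// pushed, so (rsp & 15) == 8 exactly in the case that needs fixing.
inline bool NeedsHostStackSync(uint64_t host_rsp) {
  return (host_rsp & 15) != 0;
}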
@@ -299,6 +299,11 @@ class X64Emitter : public Xbyak::CodeGenerator {

  Xbyak::Label& AddToTail(TailEmitCallback callback, uint32_t alignment = 0);
  Xbyak::Label& NewCachedLabel();

  void PushStackpoint();
  void PopStackpoint();

  void EnsureSynchronizedGuestAndHostStack();
  FunctionDebugInfo* debug_info() const { return debug_info_; }

  size_t stack_size() const { return stack_size_; }

@@ -381,13 +386,14 @@ class X64Emitter : public Xbyak::CodeGenerator {
  bool Emit(hir::HIRBuilder* builder, EmitFunctionInfo& func_info);
  void EmitGetCurrentThreadId();
  void EmitTraceUserCallReturn();

  static void HandleStackpointOverflowError(ppc::PPCContext* context);

 protected:
  Processor* processor_ = nullptr;
  X64Backend* backend_ = nullptr;
  X64CodeCache* code_cache_ = nullptr;
  XbyakAllocator* allocator_ = nullptr;
  XexModule* guest_module_ = nullptr;
  bool synchronize_stack_on_next_instruction_ = false;
  Xbyak::util::Cpu cpu_;
  uint64_t feature_flags_ = 0;
  uint32_t current_guest_function_ = 0;
@@ -56,6 +56,8 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) {
  if (entry) {
    // If we aren't ready yet spin and wait.
    if (entry->status == Entry::STATUS_COMPILING) {
      // chrispy: i think this is dead code, if we are compiling we're holding
      // the global lock, arent we? so we wouldnt be executing here
      // Still compiling, so spin.
      do {
        global_lock.unlock();
@@ -110,8 +110,13 @@ uint32_t GuestFunction::MapGuestAddressToMachineCodeOffset(
uintptr_t GuestFunction::MapGuestAddressToMachineCode(
    uint32_t guest_address) const {
  auto entry = LookupGuestAddress(guest_address);
  return reinterpret_cast<uintptr_t>(machine_code()) +
         (entry ? entry->code_offset : 0);

  if (entry) {
    return reinterpret_cast<uintptr_t>(machine_code()) + entry->code_offset;
  } else {
    return 0;
  }
}

uint32_t GuestFunction::MapMachineCodeToGuestAddress(
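Reviewer note: a behavior change worth calling out — on a failed lookup this function now returns 0 instead of the machine-code base address, so callers (like ResolveFunction above) can use the result as a found/not-found signal. A hedged usage sketch:

// 0 now means "guest address is not mapped inside this function's code".
uintptr_t host_address =
    xfunc->MapGuestAddressToMachineCode(static_cast<uint32_t>(target_address));
if (!host_address) {
  // keep searching other candidate functions
}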
@@ -27,18 +27,13 @@
#include "xenia/cpu/ppc/ppc_frontend.h"
#include "xenia/cpu/ppc/ppc_opcode_info.h"
#include "xenia/cpu/processor.h"

#include "xenia/cpu/xex_module.h"
DEFINE_bool(
    break_on_unimplemented_instructions, true,
    "Break to the host debugger (or crash if no debugger attached) if an "
    "unimplemented PowerPC instruction is encountered.",
    "CPU");

DEFINE_bool(
    emit_useless_fpscr_updates, false,
    "Emit useless fpscr update instructions (pre-10/30/2022 behavior). ",
    "CPU");

namespace xe {
namespace cpu {
namespace ppc {

@@ -94,8 +89,9 @@ bool PPCHIRBuilder::Emit(GuestFunction* function, uint32_t flags) {

  function_ = function;
  start_address_ = function_->address();
  //chrispy: i've seen this one happen, not sure why but i think from trying to precompile twice
  //i've also seen ones with a start and end address that are the same...
  // chrispy: i've seen this one happen, not sure why but i think from trying to
  // precompile twice i've also seen ones with a start and end address that are
  // the same...
  assert_true(function_->address() <= function_->end_address());
  instr_count_ = (function_->end_address() - function_->address()) / 4 + 1;

@@ -250,7 +246,8 @@ void PPCHIRBuilder::MaybeBreakOnInstruction(uint32_t address) {
}

void PPCHIRBuilder::AnnotateLabel(uint32_t address, Label* label) {
  //chrispy: label->name is unused, it would be nice to be able to remove the field and this code
  // chrispy: label->name is unused, it would be nice to be able to remove the
  // field and this code
  char name_buffer[13];
  auto format_result = fmt::format_to_n(name_buffer, 12, "loc_{:08X}", address);
  name_buffer[format_result.size] = '\0';

@@ -457,37 +454,38 @@ void PPCHIRBuilder::UpdateFPSCR(Value* result, bool update_cr1) {
  // TODO(benvanik): detect overflow and nan cases.
  // fx and vx are the most important.
  /*
    chrispy: stubbed this out because right now all it does is waste
    memory and CPU time
    chrispy: i stubbed this out at one point because all it does is waste
    memory and CPU time, however, this introduced issues with raiden
    (substitute w/ titleid later) which probably means they stash stuff in the
    fpscr?
  */
  if (cvars::emit_useless_fpscr_updates) {
    Value* fx = LoadConstantInt8(0);
    Value* fex = LoadConstantInt8(0);
    Value* vx = LoadConstantInt8(0);
    Value* ox = LoadConstantInt8(0);

    if (update_cr1) {
      // Store into the CR1 field.
      // We do this instead of just calling CopyFPSCRToCR1 so that we don't
      // have to read back the bits and do shifting work.
      StoreContext(offsetof(PPCContext, cr1.cr1_fx), fx);
      StoreContext(offsetof(PPCContext, cr1.cr1_fex), fex);
      StoreContext(offsetof(PPCContext, cr1.cr1_vx), vx);
      StoreContext(offsetof(PPCContext, cr1.cr1_ox), ox);
    }
  Value* fx = LoadConstantInt8(0);
  Value* fex = LoadConstantInt8(0);
  Value* vx = LoadConstantInt8(0);
  Value* ox = LoadConstantInt8(0);

  // Generate our new bits.
  Value* new_bits = Shl(ZeroExtend(fx, INT32_TYPE), 31);
  new_bits = Or(new_bits, Shl(ZeroExtend(fex, INT32_TYPE), 30));
  new_bits = Or(new_bits, Shl(ZeroExtend(vx, INT32_TYPE), 29));
  new_bits = Or(new_bits, Shl(ZeroExtend(ox, INT32_TYPE), 28));

  // Mix into fpscr while preserving sticky bits (FX and OX).
  Value* bits = LoadFPSCR();
  bits = Or(And(bits, LoadConstantUint32(0x9FFFFFFF)), new_bits);
  StoreFPSCR(bits);
    if (update_cr1) {
      // Store into the CR1 field.
      // We do this instead of just calling CopyFPSCRToCR1 so that we don't
      // have to read back the bits and do shifting work.
      StoreContext(offsetof(PPCContext, cr1.cr1_fx), fx);
      StoreContext(offsetof(PPCContext, cr1.cr1_fex), fex);
      StoreContext(offsetof(PPCContext, cr1.cr1_vx), vx);
      StoreContext(offsetof(PPCContext, cr1.cr1_ox), ox);
    }

    // Generate our new bits.
    Value* new_bits = Shl(ZeroExtend(fx, INT32_TYPE), 31);
    new_bits = Or(new_bits, Shl(ZeroExtend(fex, INT32_TYPE), 30));
    new_bits = Or(new_bits, Shl(ZeroExtend(vx, INT32_TYPE), 29));
    new_bits = Or(new_bits, Shl(ZeroExtend(ox, INT32_TYPE), 28));

    // Mix into fpscr while preserving sticky bits (FX and OX).
    Value* bits = LoadFPSCR();
    bits = Or(And(bits, LoadConstantUint32(0x9FFFFFFF)), new_bits);
    StoreFPSCR(bits);
  }

void PPCHIRBuilder::CopyFPSCRToCR1() {

@@ -587,7 +585,24 @@ void PPCHIRBuilder::StoreReserved(Value* val) {
Value* PPCHIRBuilder::LoadReserved() {
  return LoadContext(offsetof(PPCContext, reserved_val), INT64_TYPE);
}
void PPCHIRBuilder::SetReturnAddress(Value* value) {
  /*
    Record the address as being a possible target of a return. This is
    needed for longjmp emulation. See x64_emitter.cc's ResolveFunction
  */
  Module* mod = this->function_->module();
  if (value && value->IsConstant()) {
    if (mod) {
      XexModule* xexmod = dynamic_cast<XexModule*>(mod);
      if (xexmod) {
        auto flags = xexmod->GetInstructionAddressFlags(value->AsUint32());
        flags->is_return_site = true;
      }
    }
  }

  HIRBuilder::SetReturnAddress(value);
}
}  // namespace ppc
}  // namespace cpu
}  // namespace xe
@@ -80,7 +80,8 @@ class PPCHIRBuilder : public hir::HIRBuilder {

  void StoreReserved(Value* val);
  Value* LoadReserved();

  // calls original impl in hirbuilder, but also records the is_return_site bit
  // into flags in the guestmodule
  void SetReturnAddress(Value* value);

 private:
  void MaybeBreakOnInstruction(uint32_t address);
  void AnnotateLabel(uint32_t address, Label* label);
@@ -263,12 +263,11 @@ Function* Processor::ResolveFunction(uint32_t address) {
      return nullptr;
    }

    if (!DemandFunction(function)) {
      entry->status = Entry::STATUS_FAILED;
      return nullptr;
    }
    // only add it to the list of resolved functions if resolving succeeded
    auto module_for = function->module();

    auto xexmod = dynamic_cast<XexModule*>(module_for);

@@ -291,23 +290,23 @@ Function* Processor::ResolveFunction(uint32_t address) {
    return nullptr;
  }
}

Module* Processor::LookupModule(uint32_t address) {
  auto global_lock = global_critical_region_.Acquire();
  // TODO(benvanik): sort by code address (if contiguous) so can bsearch.
  // TODO(benvanik): cache last module low/high, as likely to be in there.
  for (const auto& module : modules_) {
    if (module->ContainsAddress(address)) {
      return module.get();
    }
  }
  return nullptr;
}
Function* Processor::LookupFunction(uint32_t address) {
  // TODO(benvanik): fast reject invalid addresses/log errors.

  // Find the module that contains the address.
  Module* code_module = nullptr;
  {
    auto global_lock = global_critical_region_.Acquire();
    // TODO(benvanik): sort by code address (if contiguous) so can bsearch.
    // TODO(benvanik): cache last module low/high, as likely to be in there.
    for (const auto& module : modules_) {
      if (module->ContainsAddress(address)) {
        code_module = module.get();
        break;
      }
    }
  }
  Module* code_module = LookupModule(address);

  if (!code_module) {
    // No module found that could contain the address.
    return nullptr;
@@ -115,6 +115,7 @@ class Processor {
  void RemoveFunctionByAddress(uint32_t address);

  Function* LookupFunction(uint32_t address);
  Module* LookupModule(uint32_t address);
  Function* LookupFunction(Module* module, uint32_t address);
  Function* ResolveFunction(uint32_t address);
@@ -78,7 +78,7 @@ ThreadState::ThreadState(Processor* processor, uint32_t thread_id,
  // Allocate with 64b alignment.

  context_ = reinterpret_cast<ppc::PPCContext*>(AllocateContext());
  processor->backend()->InitializeBackendContext(context_);
  assert_true(((uint64_t)context_ & 0x3F) == 0);
  std::memset(context_, 0, sizeof(ppc::PPCContext));

@@ -105,9 +105,9 @@ ThreadState::~ThreadState() {
    thread_state_ = nullptr;
  }
  if (context_) {
    processor_->backend()->DeinitializeBackendContext(context_);
    FreeContext(reinterpret_cast<void*>(context_));
  }
  // memory::AlignedFree(context_);
}

void ThreadState::Bind(ThreadState* thread_state) {
@ -38,9 +38,10 @@ DEFINE_bool(disable_instruction_infocache, false,
|
|||
"CPU");
|
||||
|
||||
DEFINE_bool(
|
||||
disable_early_precompilation, false,
|
||||
"Disables pre-compiling guest functions that we know we've called/that "
|
||||
"we've recognized as being functions via simple heuristics.",
|
||||
enable_early_precompilation, false,
|
||||
"Enable pre-compiling guest functions that we know we've called/that "
|
||||
"we've recognized as being functions via simple heuristics, good for error "
|
||||
"finding/stress testing with the JIT",
|
||||
"CPU");
|
||||
|
||||
static const uint8_t xe_xex2_retail_key[16] = {
|
||||
|
@ -1115,6 +1116,7 @@ void XexModule::Precompile() {
|
|||
if (!FindSaveRest()) {
|
||||
return;
|
||||
}
|
||||
|
||||
info_cache_.Init(this);
|
||||
PrecompileDiscoveredFunctions();
|
||||
}
|
||||
|
@ -1343,22 +1345,26 @@ void XexInfoCache::Init(XexModule* xexmod) {
|
|||
num_codebytes += 3; // round up to nearest multiple of 4
|
||||
num_codebytes &= ~3;
|
||||
|
||||
bool did_exist = true;
|
||||
if (!std::filesystem::exists(infocache_path)) {
|
||||
recreate:
|
||||
xe::filesystem::CreateEmptyFile(infocache_path);
|
||||
did_exist = false;
|
||||
}
|
||||
auto try_open = [this, &infocache_path, num_codebytes]() {
|
||||
bool did_exist = true;
|
||||
|
||||
// todo: prepopulate with stuff from pdata, dll exports
|
||||
if (!std::filesystem::exists(infocache_path)) {
|
||||
xe::filesystem::CreateEmptyFile(infocache_path);
|
||||
did_exist = false;
|
||||
}
|
||||
|
||||
this->executable_addr_flags_ = std::move(xe::MappedMemory::Open(
|
||||
infocache_path, xe::MappedMemory::Mode::kReadWrite, 0,
|
||||
sizeof(InfoCacheFlagsHeader) +
|
||||
(sizeof(InfoCacheFlags) *
|
||||
(num_codebytes /
|
||||
4)))); // one infocacheflags entry for each PPC instr-sized addr
|
||||
// todo: prepopulate with stuff from pdata, dll exports
|
||||
|
||||
this->executable_addr_flags_ = std::move(xe::MappedMemory::Open(
|
||||
infocache_path, xe::MappedMemory::Mode::kReadWrite, 0,
|
||||
sizeof(InfoCacheFlagsHeader) +
|
||||
(sizeof(InfoCacheFlags) *
|
||||
(num_codebytes /
|
||||
4)))); // one infocacheflags entry for each PPC instr-sized addr
|
||||
return did_exist;
|
||||
};
|
||||
|
||||
bool did_exist = try_open();
|
||||
if (!did_exist) {
|
||||
GetHeader()->version = CURRENT_INFOCACHE_VERSION;
|
||||
|
||||
|
@ -1366,7 +1372,7 @@ void XexInfoCache::Init(XexModule* xexmod) {
|
|||
if (GetHeader()->version != CURRENT_INFOCACHE_VERSION) {
|
||||
this->executable_addr_flags_->Close();
|
||||
std::filesystem::remove(infocache_path);
|
||||
goto recreate;
|
||||
try_open();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1380,7 +1386,7 @@ InfoCacheFlags* XexModule::GetInstructionAddressFlags(uint32_t guest_addr) {
|
|||
return info_cache_.LookupFlags(guest_addr);
|
||||
}
|
||||
void XexModule::PrecompileDiscoveredFunctions() {
|
||||
if (cvars::disable_early_precompilation) {
|
||||
if (!cvars::enable_early_precompilation) {
|
||||
return;
|
||||
}
|
||||
auto others = PreanalyzeCode();
|
||||
|
@ -1397,7 +1403,7 @@ void XexModule::PrecompileDiscoveredFunctions() {
|
|||
}
|
||||
}
|
||||
void XexModule::PrecompileKnownFunctions() {
|
||||
if (cvars::disable_early_precompilation) {
|
||||
if (!cvars::enable_early_precompilation) {
|
||||
return;
|
||||
}
|
||||
uint32_t start = 0;
|
||||
|
@@ -1435,18 +1441,14 @@ static bool IsOpcodeBL(unsigned w) {

 std::vector<uint32_t> XexModule::PreanalyzeCode() {
   uint32_t low_8_aligned = xe::align<uint32_t>(low_address_, 8);
-
-
-
   uint32_t highest_exec_addr = 0;

   for (auto&& sec : pe_sections_) {
     if ((sec.flags & kXEPESectionContainsCode)) {
-
       highest_exec_addr =
           std::max<uint32_t>(highest_exec_addr, sec.address + sec.size);
     }
   }
   uint32_t high_8_aligned = highest_exec_addr & ~(8U - 1);
   uint32_t n_possible_8byte_addresses = (high_8_aligned - low_8_aligned) / 8;
@@ -1476,7 +1478,7 @@ std::vector<uint32_t> XexModule::PreanalyzeCode() {
   uint32_t mfspr_r12_lr32 =
       *reinterpret_cast<const uint32_t*>(&mfspr_r12_lr[0]);

   auto add_new_func = [funcstart_candidate_stack, &stack_pos](uint32_t addr) {
     funcstart_candidate_stack[stack_pos++] = addr;
   };
   /*
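The candidate scan works by matching known prologue instruction words such as mfspr r12, lr against aligned addresses in the executable sections and pushing hits onto a candidate stack. A hedged, self-contained sketch of that kind of scan (buffer handling illustrative; only the opcode constant and the general approach are taken from the surrounding code):

// Sketch of a prologue-pattern scan over big-endian PPC code. The opcode
// constant encodes mfspr r12, lr (mflr r12); inputs are illustrative.
#include <cstddef>
#include <cstdint>
#include <vector>

static uint32_t LoadBE32(const uint8_t* p) {
  return (uint32_t(p[0]) << 24) | (uint32_t(p[1]) << 16) |
         (uint32_t(p[2]) << 8) | uint32_t(p[3]);
}

std::vector<uint32_t> FindCandidates(const uint8_t* code, std::size_t size,
                                     uint32_t image_base) {
  constexpr uint32_t kMfsprR12Lr = 0x7D8802A6;  // mfspr r12, lr
  std::vector<uint32_t> candidates;
  for (std::size_t i = 0; i + 4 <= size; i += 4) {
    if (LoadBE32(code + i) == kMfsprR12Lr) {
      candidates.push_back(image_base + uint32_t(i));  // likely function start
    }
  }
  return candidates;
}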
@@ -1926,7 +1928,7 @@ bool XexModule::FindSaveRest() {
       address += 2 * 4;
     }
   }
-  if (!cvars::disable_early_precompilation) {
+  if (cvars::enable_early_precompilation) {
     for (auto&& to_ensure_precompiled : resolve_on_exit) {
       // we want to make sure an address for these functions is available before
       // any other functions are compiled for code generation purposes but we do
@@ -29,23 +29,27 @@ constexpr fourcc_t kXEX1Signature = make_fourcc("XEX1");
 constexpr fourcc_t kXEX2Signature = make_fourcc("XEX2");
 constexpr fourcc_t kElfSignature = make_fourcc(0x7F, 'E', 'L', 'F');

 class Runtime;
 struct InfoCacheFlags {
   uint32_t was_resolved : 1;  // has this address ever been called/requested
                               // via resolvefunction?
   uint32_t accessed_mmio : 1;
   uint32_t is_syscall_func : 1;
-  uint32_t reserved : 29;
+  uint32_t is_return_site : 1;  // address can be reached from another function
+                                // by returning
+  uint32_t reserved : 28;
 };
 static_assert(sizeof(InfoCacheFlags) == 4,
               "InfoCacheFlags size should be equal to sizeof ppc instruction.");

 struct XexInfoCache {
-  //increment this to invalidate all user infocaches
-  static constexpr uint32_t CURRENT_INFOCACHE_VERSION = 1;
+  // increment this to invalidate all user infocaches
+  static constexpr uint32_t CURRENT_INFOCACHE_VERSION = 4;

   struct InfoCacheFlagsHeader {
     uint32_t version;

     unsigned char reserved[252];

     InfoCacheFlags* LookupFlags(unsigned offset) {
       return &reinterpret_cast<InfoCacheFlags*>(&this[1])[offset];
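All of the per-instruction booleans must pack into one 32-bit word for the static_assert above to hold, which is what keeps the cache at one record per instruction. A minimal mirror of that packing, runnable on its own (field names copied from the diff; everything else illustrative):

// Minimal mirror of the flag-packing idea: the bit widths must sum to at
// most 32 so the whole record stays instruction-sized (4 bytes).
#include <cstdint>

struct FlagsMirror {
  uint32_t was_resolved : 1;
  uint32_t accessed_mmio : 1;
  uint32_t is_syscall_func : 1;
  uint32_t is_return_site : 1;
  uint32_t reserved : 28;  // 1+1+1+1+28 == 32 bits total
};
static_assert(sizeof(FlagsMirror) == 4, "must stay instruction-sized");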
@@ -228,7 +232,8 @@ class XexModule : public xe::cpu::Module {

   InfoCacheFlags* GetInstructionAddressFlags(uint32_t guest_addr);

   virtual void Precompile() override;

  protected:
   std::unique_ptr<Function> CreateFunction(uint32_t address) override;
@@ -1911,21 +1911,8 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase(
 void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
                                                        uint32_t base,
                                                        uint32_t num_registers) {
-  RingBuffer::ReadRange range =
-      ring->BeginRead(num_registers * sizeof(uint32_t));
-
-  XE_LIKELY_IF(!range.second) {
-    uint32_t num_regs_firstrange =
-        static_cast<uint32_t>(range.first_length / sizeof(uint32_t));
-
-    D3D12CommandProcessor::WriteRegistersFromMem(
-        base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
-        num_regs_firstrange);
-    ring->EndRead(range);
-  }
-  else {
-    return WriteRegisterRangeFromRing_WraparoundCase(ring, base, num_registers);
-  }
+  WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF>(ring, base,
+                                                       num_registers);
 }

 template <uint32_t register_lower_bound, uint32_t register_upper_bound>
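BeginRead hands back up to two spans because a read may cross the end of the ring's backing store; a non-null second span is the wrapped tail, which is why the fast path checks !range.second and the slow path goes to the wraparound handler. A simplified sketch of that contract (not the actual xe::RingBuffer API):

// Simplified two-span ring read: if the request crosses the end of the
// backing buffer, the read comes back as two pieces (head + wrapped tail).
#include <cstddef>
#include <cstdint>

struct ReadRange {
  const uint8_t* first;
  std::size_t first_length;
  const uint8_t* second;  // null unless the read wrapped
  std::size_t second_length;
};

struct Ring {
  const uint8_t* data;
  std::size_t capacity;
  std::size_t read_offset;

  ReadRange BeginRead(std::size_t count) {
    std::size_t until_end = capacity - read_offset;
    if (count <= until_end) {
      return {data + read_offset, count, nullptr, 0};  // contiguous fast path
    }
    // Wraparound: the tail continues at the start of the buffer.
    return {data + read_offset, until_end, data, count - until_end};
  }

  void EndRead(const ReadRange& range) {
    read_offset =
        (read_offset + range.first_length + range.second_length) % capacity;
  }
};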
@@ -2042,7 +2029,6 @@ D3D12CommandProcessor::WriteRegisterRangeFromRing_WithKnownBound(
   RingBuffer::ReadRange range =
       ring->BeginRead(num_registers * sizeof(uint32_t));
-
   XE_LIKELY_IF(!range.second) {
     WriteRegisterRangeFromMem_WithKnownBound<register_lower_bound,
                                              register_upper_bound>(
@@ -2710,9 +2696,9 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
   }

   if (vfetch_current_queued) {
-    // so far, i have never seen vfetch_current_queued > 4. 1 is most common, 2 happens occasionally. did not test many games though
-    // pre-acquire the critical region so we're not repeatedly re-acquiring it
-    // in requestrange
+    // so far, i have never seen vfetch_current_queued > 4. 1 is most common,
+    // 2 happens occasionally. did not test many games though pre-acquire the
+    // critical region so we're not repeatedly re-acquiring it in requestrange
     auto shared_memory_request_range_hoisted =
         global_critical_region::Acquire();
@@ -4351,7 +4337,8 @@ bool D3D12CommandProcessor::UpdateBindings(
       uint32_t float_constant_index;
       while (xe::bit_scan_forward(float_constant_map_entry,
                                   &float_constant_index)) {
-        float_constant_map_entry = xe::clear_lowest_bit(float_constant_map_entry);
+        float_constant_map_entry =
+            xe::clear_lowest_bit(float_constant_map_entry);
         std::memcpy(float_constants,
                     &regs[XE_GPU_REG_SHADER_CONSTANT_000_X + (i << 8) +
                           (float_constant_index << 2)]
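The float-constant loop above is the standard iterate-set-bits idiom: find the lowest set bit, use its index, then clear it with x & (x - 1). The same idiom in portable C++20, with std::countr_zero standing in for xe::bit_scan_forward:

// Iterate the set bits of a mask, lowest first. Clearing the lowest set
// bit is x & (x - 1); countr_zero gives the index of that bit.
#include <bit>
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t mask = 0b1010010;  // bits 1, 4, 6 set
  while (mask) {
    unsigned index = std::countr_zero(mask);
    mask &= mask - 1;  // clear lowest set bit
    printf("bit %u set\n", index);
  }
}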
@@ -4382,7 +4369,8 @@ bool D3D12CommandProcessor::UpdateBindings(
       uint32_t float_constant_index;
       while (xe::bit_scan_forward(float_constant_map_entry,
                                   &float_constant_index)) {
-        float_constant_map_entry = xe::clear_lowest_bit(float_constant_map_entry);
+        float_constant_map_entry =
+            xe::clear_lowest_bit(float_constant_map_entry);
         std::memcpy(float_constants,
                     &regs[XE_GPU_REG_SHADER_CONSTANT_256_X + (i << 8) +
                           (float_constant_index << 2)]
@@ -41,10 +41,23 @@ DECLARE_XAM_EXPORT1(XamEnableInactivityProcessing, kInput, kStub);

 // https://msdn.microsoft.com/en-us/library/windows/desktop/microsoft.directx_sdk.reference.xinputgetcapabilities(v=vs.85).aspx
 dword_result_t XamInputGetCapabilities_entry(
-    dword_t user_index, dword_t flags, pointer_t<X_INPUT_CAPABILITIES> caps) {
+    dword_t user_index, dword_t _flags, pointer_t<X_INPUT_CAPABILITIES> caps) {
+  unsigned flags = _flags;
+  // chrispy: actually, it appears that caps is never checked for null, it is
+  // memset at the start regardless
   if (!caps) {
     return X_ERROR_BAD_ARGUMENTS;
   }
+  if ((flags & 0x40000000) != 0) {
+    // should trap
+  }
+
+  if ((flags & 4) != 0) {
+    // should trap
+  }
+  if (!flags) {
+    flags = 3;
+  }

   if ((flags & 0xFF) && (flags & XINPUT_FLAG_GAMEPAD) == 0) {
     // Ignore any query for other types of devices.
@@ -118,7 +131,7 @@ dword_result_t XamInputGetState_entry(dword_t user_index, dword_t flags,
 DECLARE_XAM_EXPORT2(XamInputGetState, kInput, kImplemented, kHighFrequency);

 // https://msdn.microsoft.com/en-us/library/windows/desktop/microsoft.directx_sdk.reference.xinputsetstate(v=vs.85).aspx
-dword_result_t XamInputSetState_entry(dword_t user_index, dword_t unk,
+dword_result_t XamInputSetState_entry(dword_t user_index,
+                                      dword_t flags, /* flags, as far as i can see, is not used */
                                       pointer_t<X_INPUT_VIBRATION> vibration) {
   if (user_index >= 4) {
     return X_E_DEVICE_NOT_CONNECTED;
@@ -508,7 +508,16 @@ dword_result_t RtlInitializeCriticalSectionAndSpinCount_entry(
 DECLARE_XBOXKRNL_EXPORT1(RtlInitializeCriticalSectionAndSpinCount, kNone,
                          kImplemented);

+static void CriticalSectionPrefetchW(const void* vp) {
+#if XE_ARCH_AMD64 == 1
+  if (amd64::GetFeatureFlags() & amd64::kX64EmitPrefetchW) {
+    swcache::PrefetchW(vp);
+  }
+#endif
+}
+
 void RtlEnterCriticalSection_entry(pointer_t<X_RTL_CRITICAL_SECTION> cs) {
+  CriticalSectionPrefetchW(&cs->lock_count);
   uint32_t cur_thread = XThread::GetCurrentThread()->guest_object();
   uint32_t spin_count = cs->header.absolute * 256;
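PREFETCHW requests the target cache line in exclusive (writable) state ahead of the CAS, avoiding a second ownership transition when the locked write lands. A hedged sketch with compiler builtins (__builtin_prefetch is the GCC/Clang spelling; the amd64 feature-flag gate above is Xenia-specific):

// Sketch: hint the line for writing, then attempt the compare-and-swap.
// __builtin_prefetch(p, 1, ...) is the GCC/Clang write-prefetch hint; on
// MSVC, _m_prefetchw is the closest equivalent.
#include <atomic>
#include <cstdint>

static bool TryAcquire(std::atomic<uint32_t>* lock, uint32_t owner) {
#if defined(__GNUC__) || defined(__clang__)
  __builtin_prefetch(lock, /*rw=*/1, /*locality=*/3);
#endif
  uint32_t expected = 0;  // unowned
  return lock->compare_exchange_strong(expected, owner,
                                       std::memory_order_acquire);
}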
@@ -544,6 +553,7 @@ DECLARE_XBOXKRNL_EXPORT2(RtlEnterCriticalSection, kNone, kImplemented,

 dword_result_t RtlTryEnterCriticalSection_entry(
     pointer_t<X_RTL_CRITICAL_SECTION> cs) {
+  CriticalSectionPrefetchW(&cs->lock_count);
   uint32_t thread = XThread::GetCurrentThread()->guest_object();

   if (xe::atomic_cas(-1, 0, &cs->lock_count)) {
@@ -7,6 +7,7 @@
 ******************************************************************************
 */

+#include "xenia/kernel/xboxkrnl/xboxkrnl_threading.h"
 #include <algorithm>
 #include <vector>
 #include "xenia/base/atomic.h"
@@ -18,7 +19,6 @@
 #include "xenia/kernel/user_module.h"
 #include "xenia/kernel/util/shim_utils.h"
 #include "xenia/kernel/xboxkrnl/xboxkrnl_private.h"
-#include "xenia/kernel/xboxkrnl/xboxkrnl_threading.h"
 #include "xenia/kernel/xevent.h"
 #include "xenia/kernel/xmutant.h"
 #include "xenia/kernel/xsemaphore.h"
@@ -165,8 +165,16 @@ dword_result_t NtResumeThread_entry(dword_t handle,
   uint32_t suspend_count = 0;

   auto thread = kernel_state()->object_table()->LookupObject<XThread>(handle);

   if (thread) {
-    result = thread->Resume(&suspend_count);
+    if (thread->type() == XObject::Type::Thread) {
+      result = thread->Resume(&suspend_count);
+    } else {
+      return X_STATUS_OBJECT_TYPE_MISMATCH;
+    }
   } else {
     return X_STATUS_INVALID_HANDLE;
   }
   if (suspend_count_ptr) {
     *suspend_count_ptr = suspend_count;
@@ -190,15 +198,27 @@ dword_result_t KeResumeThread_entry(lpvoid_t thread_ptr) {
 DECLARE_XBOXKRNL_EXPORT1(KeResumeThread, kThreading, kImplemented);

 dword_result_t NtSuspendThread_entry(dword_t handle,
-                                     lpdword_t suspend_count_ptr) {
+                                     lpdword_t suspend_count_ptr,
+                                     const ppc_context_t& context) {
   X_RESULT result = X_STATUS_SUCCESS;
   uint32_t suspend_count = 0;

   auto thread = kernel_state()->object_table()->LookupObject<XThread>(handle);
   if (thread) {
-    result = thread->Suspend(&suspend_count);
+    if (thread->type() == XObject::Type::Thread) {
+      auto current_pcr = context->TranslateVirtualGPR<X_KPCR*>(context->r[13]);
+
+      if (current_pcr->current_thread == thread->guest_object() ||
+          !thread->guest_object<X_KTHREAD>()->terminated) {
+        result = thread->Suspend(&suspend_count);
+      } else {
+        return X_STATUS_THREAD_IS_TERMINATING;
+      }
+    } else {
+      return X_STATUS_OBJECT_TYPE_MISMATCH;
+    }
   } else {
-    result = X_STATUS_INVALID_HANDLE;
+    return X_STATUS_INVALID_HANDLE;
   }

   if (suspend_count_ptr) {
@@ -213,23 +233,23 @@ void KeSetCurrentStackPointers_entry(lpvoid_t stack_ptr,
                                      pointer_t<X_KTHREAD> thread,
                                      lpvoid_t stack_alloc_base,
                                      lpvoid_t stack_base,
-                                     lpvoid_t stack_limit) {
+                                     lpvoid_t stack_limit, const ppc_context_t& context) {
   auto current_thread = XThread::GetCurrentThread();
-  auto context = current_thread->thread_state()->context();
-  auto pcr = kernel_memory()->TranslateVirtual<X_KPCR*>(
-      static_cast<uint32_t>(context->r[13]));

+  auto pcr = context->TranslateVirtualGPR<X_KPCR*>(context->r[13]);

   thread->stack_alloc_base = stack_alloc_base.value();
   thread->stack_base = stack_base.value();
   thread->stack_limit = stack_limit.value();
   pcr->stack_base_ptr = stack_base.guest_address();
   pcr->stack_end_ptr = stack_limit.guest_address();
   context->r[1] = stack_ptr.guest_address();

+  // If a fiber is set, and the thread matches, reenter to avoid issues with
+  // host stack overflowing.
+  if (thread->fiber_ptr &&
+      current_thread->guest_object() == thread.guest_address()) {
+    context->processor->backend()->PrepareForReentry(context.value());
+    current_thread->Reenter(static_cast<uint32_t>(context->lr));
+  }
 }
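The reentry above exists because naively emulating a guest stack switch would keep deepening the host thread's own call stack. One conceptual illustration, heavily simplified, using setjmp/longjmp to unwind back to a fixed dispatch point (Xenia uses its backend's fiber machinery, not this literal mechanism):

// Conceptual sketch only: unwind the host stack to a fixed reentry point
// whenever the guest swaps stacks, instead of recursing deeper.
#include <csetjmp>
#include <cstdint>
#include <cstdio>

static std::jmp_buf g_reentry_point;
static uint32_t g_resume_guest_pc;

static void EmulateFrom(uint32_t guest_pc) {
  printf("executing guest code at %08X\n", guest_pc);
}

static void OnGuestStackSwitch(uint32_t return_pc) {
  g_resume_guest_pc = return_pc;
  std::longjmp(g_reentry_point, 1);  // discard the deep host frames
}

int main() {
  if (setjmp(g_reentry_point)) {
    // Re-entered with a fresh (shallow) host stack.
    EmulateFrom(g_resume_guest_pc);
    return 0;
  }
  EmulateFrom(0x82000000);
  OnGuestStackSwitch(0x82001000);
}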
@@ -1018,7 +1038,8 @@ void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr,
   assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13]));

   PrefetchForCAS(lock);
-  while (!xe::atomic_cas(0, xe::byte_swap(static_cast<uint32_t>(ppc_ctx->r[13])), lock)) {
+  while (!xe::atomic_cas(
+      0, xe::byte_swap(static_cast<uint32_t>(ppc_ctx->r[13])), lock)) {
 #if XE_ARCH_AMD64 == 1
     // todo: this is just a nop if they don't have SMT, which is not great
     // either...
@@ -1038,7 +1059,8 @@ dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(
   auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
   assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13]));
   PrefetchForCAS(lock);
-  if (!xe::atomic_cas(0, xe::byte_swap(static_cast<uint32_t>(ppc_ctx->r[13])), lock)) {
+  if (!xe::atomic_cas(0, xe::byte_swap(static_cast<uint32_t>(ppc_ctx->r[13])),
+                      lock)) {
     return 0;
   }
   return 1;
@@ -1281,7 +1303,8 @@ DECLARE_XBOXKRNL_EXPORT1(ExInitializeReadWriteLock, kThreading, kImplemented);

 void ExAcquireReadWriteLockExclusive_entry(pointer_t<X_ERWLOCK> lock_ptr,
                                            const ppc_context_t& ppc_context) {
-  auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
+  auto old_irql =
+      xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);

   int32_t lock_count = ++lock_ptr->lock_count;
   if (!lock_count) {
@@ -1318,7 +1341,8 @@ DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockExclusive, kThreading,

 void ExAcquireReadWriteLockShared_entry(pointer_t<X_ERWLOCK> lock_ptr,
                                         const ppc_context_t& ppc_context) {
-  auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
+  auto old_irql =
+      xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);

   int32_t lock_count = ++lock_ptr->lock_count;
   if (!lock_count ||
@@ -33,8 +33,15 @@ DEFINE_bool(ignore_thread_priorities, true,
 DEFINE_bool(ignore_thread_affinities, true,
             "Ignores game-specified thread affinities.", "Kernel");

+#if 0
+DEFINE_int64(stack_size_multiplier_hack, 1,
+             "A hack for games with setjmp/longjmp issues.", "Kernel");
+DEFINE_int64(main_xthread_stack_size_multiplier_hack, 1,
+             "A hack for games with setjmp/longjmp issues.", "Kernel");
+#endif
 namespace xe {
 namespace kernel {

 const uint32_t XAPC::kSize;
 const uint32_t XAPC::kDummyKernelRoutine;
@@ -373,8 +380,23 @@ X_STATUS XThread::Create() {
   RetainHandle();

   xe::threading::Thread::CreationParameters params;
-  params.stack_size = 16_MiB;  // Allocate a big host stack.
   params.create_suspended = true;

+#if 0
+  uint64_t stack_size_mult = cvars::stack_size_multiplier_hack;
+
+  if (main_thread_) {
+    stack_size_mult =
+        static_cast<uint64_t>(cvars::main_xthread_stack_size_multiplier_hack);
+  }
+#else
+  uint64_t stack_size_mult = 1;
+#endif
+  params.stack_size = 16_MiB * stack_size_mult;  // Allocate a big host stack.
   thread_ = xe::threading::Thread::Create(params, [this]() {
     // Set thread ID override. This is used by logging.
     xe::threading::set_current_thread_id(handle());
@@ -433,6 +455,9 @@ X_STATUS XThread::Create() {
 X_STATUS XThread::Exit(int exit_code) {
   // This may only be called on the thread itself.
   assert_true(XThread::GetCurrentThread() == this);
+  // TODO(chrispy): not sure if this order is correct, should it come after
+  // apcs?
+  guest_object<X_KTHREAD>()->terminated = 1;

   // TODO(benvanik): dispatch events? waiters? etc?
   RundownAPCs();
@@ -121,7 +121,7 @@ struct X_KTHREAD {
   uint8_t unk_B4[0x8];     // 0xB4
   uint8_t suspend_count;   // 0xBC
   uint8_t unk_BD;          // 0xBD
-  uint8_t unk_BE;          // 0xBE
+  uint8_t terminated;      // 0xBE
   uint8_t current_cpu;     // 0xBF
   uint8_t unk_C0[0x10];    // 0xC0
   xe::be<uint32_t> stack_alloc_base;  // 0xD0
@@ -316,8 +316,8 @@ void Memory::Reset() {
   heaps_.v90000000.Reset();
   heaps_.physical.Reset();
 }
-//clang does not like non-standard layout offsetof
-#if XE_COMPILER_MSVC == 1 && XE_COMPILER_CLANG_CL==0
+// clang does not like non-standard layout offsetof
+#if XE_COMPILER_MSVC == 1 && XE_COMPILER_CLANG_CL == 0
 XE_NOALIAS
 const BaseHeap* Memory::LookupHeap(uint32_t address) const {
 #define HEAP_INDEX(name) \
@@ -359,7 +359,6 @@ const BaseHeap* Memory::LookupHeap(uint32_t address) const {
 #else
 XE_NOALIAS
 const BaseHeap* Memory::LookupHeap(uint32_t address) const {
-
   if (address < 0x40000000) {
     return &heaps_.v00000000;
   } else if (address < 0x7F000000) {
@@ -964,6 +963,14 @@ bool BaseHeap::AllocFixed(uint32_t base_address, uint32_t size,

   return true;
 }
+template <typename T>
+static inline T QuickMod(T value, uint32_t modv) {
+  if (xe::is_pow2(modv)) {
+    return value & (modv - 1);
+  } else {
+    return value % modv;
+  }
+}

 bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address,
                           uint32_t size, uint32_t alignment,
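QuickMod falls back to real modulo only when the divisor is not a power of two; for pow2 divisors, x % m == x & (m - 1). A quick standalone check of that equivalence (std::has_single_bit standing in for xe::is_pow2):

// x % m == x & (m - 1) holds exactly when m is a power of two.
#include <bit>
#include <cassert>
#include <cstdint>

template <typename T>
static T QuickMod(T value, uint32_t modv) {
  return std::has_single_bit(modv) ? value & (modv - 1) : value % modv;
}

int main() {
  for (uint32_t x = 0; x < 1000; ++x) {
    assert(QuickMod(x, 8u) == x % 8);    // pow2 path: single AND
    assert(QuickMod(x, 12u) == x % 12);  // general path: real modulo
  }
}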
@@ -976,8 +983,9 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address,
   low_address = std::max(heap_base_, xe::align(low_address, alignment));
   high_address = std::min(heap_base_ + (heap_size_ - 1),
                           xe::align(high_address, alignment));
-  uint32_t low_page_number = (low_address - heap_base_) / page_size_;
-  uint32_t high_page_number = (high_address - heap_base_) / page_size_;
+
+  uint32_t low_page_number = (low_address - heap_base_) >> page_size_shift_;
+  uint32_t high_page_number = (high_address - heap_base_) >> page_size_shift_;
   low_page_number = std::min(uint32_t(page_table_.size()) - 1, low_page_number);
   high_page_number =
       std::min(uint32_t(page_table_.size()) - 1, high_page_number);
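Since page_size_ is a power of two, dividing by it and shifting right by page_size_shift_ are interchangeable for unsigned values; the shift form just bakes in log2(page_size_). A small sketch of deriving and validating the shift (assuming the member is initialized from the page size somewhere; std::countr_zero used here):

// For pow2 page sizes, division by page_size equals a right shift by
// log2(page_size); countr_zero of a pow2 value is exactly that log2.
#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t page_size = 0x1000;                             // 4 KiB
  const uint32_t page_size_shift = std::countr_zero(page_size);  // 12

  uint32_t heap_base = 0x40000000;
  uint32_t address = 0x40005234;
  assert(((address - heap_base) >> page_size_shift) ==
         ((address - heap_base) / page_size));  // both yield page 5
}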
@@ -995,8 +1003,10 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address,
   // TODO(benvanik): optimized searching (free list buckets, bitmap, etc).
   uint32_t start_page_number = UINT_MAX;
   uint32_t end_page_number = UINT_MAX;
-  uint32_t page_scan_stride = alignment / page_size_;
-  high_page_number = high_page_number - (high_page_number % page_scan_stride);
+  // chrispy: todo, page_scan_stride is probably always a power of two...
+  uint32_t page_scan_stride = alignment >> page_size_shift_;
+  high_page_number =
+      high_page_number - QuickMod(high_page_number, page_scan_stride);
   if (top_down) {
     for (int64_t base_page_number =
              high_page_number - xe::round_up(page_count, page_scan_stride);
@@ -1024,7 +1034,7 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address,
           base_page_number = -1;
         } else {
           base_page_number = page_number - page_count;
-          base_page_number -= base_page_number % page_scan_stride;
+          base_page_number -= QuickMod(base_page_number, page_scan_stride);
           base_page_number += page_scan_stride;  // cancel out loop logic
         }
         break;
@@ -1072,7 +1082,7 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address,
   if (start_page_number == UINT_MAX || end_page_number == UINT_MAX) {
     // Out of memory.
     XELOGE("BaseHeap::Alloc failed to find contiguous range");
-    //assert_always("Heap exhausted!");
+    // assert_always("Heap exhausted!");
     return false;
   }
@@ -1084,15 +1094,15 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address,
             ? xe::memory::AllocationType::kCommit
             : xe::memory::AllocationType::kReserve;
     void* result = xe::memory::AllocFixed(
-        TranslateRelative(start_page_number * page_size_),
-        page_count * page_size_, alloc_type, ToPageAccess(protect));
+        TranslateRelative(start_page_number << page_size_shift_),
+        page_count << page_size_shift_, alloc_type, ToPageAccess(protect));
     if (!result) {
       XELOGE("BaseHeap::Alloc failed to alloc range from host");
       return false;
     }

     if (cvars::scribble_heap && (protect & kMemoryProtectWrite)) {
-      std::memset(result, 0xCD, page_count * page_size_);
+      std::memset(result, 0xCD, page_count << page_size_shift_);
     }
   }
@@ -1108,7 +1118,7 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address,
     unreserved_page_count_--;
   }

-  *out_address = heap_base_ + (start_page_number * page_size_);
+  *out_address = heap_base_ + (start_page_number << page_size_shift_);
   return true;
 }
@@ -1719,8 +1729,7 @@ XE_NOINLINE void PhysicalHeap::EnableAccessCallbacksInner(
   uint32_t first_guest_page = SystemPagenumToGuestPagenum(system_page_first);
   uint32_t last_guest_page = SystemPagenumToGuestPagenum(system_page_last);

-  uint32_t guest_one =
-      SystemPagenumToGuestPagenum(1);
+  uint32_t guest_one = SystemPagenumToGuestPagenum(1);

   uint32_t system_one = GuestPagenumToSystemPagenum(1);
   for (; i <= system_page_last; ++i) {
@@ -1755,7 +1764,6 @@ XE_NOINLINE void PhysicalHeap::EnableAccessCallbacksInner(
 #endif

       uint32_t guest_page_number = SystemPagenumToGuestPagenum(i);
-      // swcache::PrefetchL1(&page_table_ptr[guest_page_number + 8]);
       xe::memory::PageAccess current_page_access =
           ToPageAccess(page_table_ptr[guest_page_number].current_protect);
       bool protect_system_page = false;
@@ -19,11 +19,96 @@

 DEFINE_bool(enable_console, false, "Open a console window with the main window",
             "General");
+#if XE_ARCH_AMD64 == 1
+DEFINE_bool(enable_rdrand_ntdll_patch, true,
+            "Hot-patches ntdll at the start of the process to not use rdrand "
+            "as part of the RNG for heap randomization. Can reduce CPU usage "
+            "significantly, but is untested on all Windows versions.",
+            "Win32");
+// begin ntdll hack
+#include <psapi.h>
+static bool g_didfailtowrite = false;
+static void write_process_memory(HANDLE process, uintptr_t offset,
+                                 unsigned size, const unsigned char* bvals) {
+  if (!WriteProcessMemory(process, (void*)offset, bvals, size, nullptr)) {
+    if (!g_didfailtowrite) {
+      MessageBoxA(nullptr, "Failed to write to process!", "Failed", MB_OK);
+      g_didfailtowrite = true;
+    }
+  }
+}
+
+static const unsigned char pattern_cmp_processorfeature_28_[] = {
+    0x80, 0x3C, 0x25, 0x90,
+    0x02, 0xFE, 0x7F, 0x00};  // cmp byte ptr ds:7FFE0290h, 0
+static const unsigned char pattern_replacement[] = {
+    0x48, 0x39, 0xE4,             // cmp rsp, rsp = always Z
+    0x0F, 0x1F, 0x44, 0x00, 0x00  // 5-byte nop
+};
+static void patch_ntdll_instance(HANDLE process, uintptr_t ntdll_base) {
+  MODULEINFO modinfo;
+
+  GetModuleInformation(process, (HMODULE)ntdll_base, &modinfo,
+                       sizeof(MODULEINFO));
+
+  std::vector<uintptr_t> possible_places{};
+
+  unsigned char* strt = (unsigned char*)modinfo.lpBaseOfDll;
+
+  // Stop the scan one pattern-length before the end of the image so the
+  // inner comparison cannot read past the mapping.
+  for (unsigned i = 0;
+       i + sizeof(pattern_cmp_processorfeature_28_) <= modinfo.SizeOfImage;
+       ++i) {
+    for (unsigned j = 0; j < sizeof(pattern_cmp_processorfeature_28_); ++j) {
+      if (strt[i + j] != pattern_cmp_processorfeature_28_[j]) {
+        goto miss;
+      }
+    }
+    possible_places.push_back((uintptr_t)(&strt[i]));
+  miss:;
+  }
+
+  for (auto&& place : possible_places) {
+    write_process_memory(process, place, sizeof(pattern_replacement),
+                         pattern_replacement);
+  }
+}
+
+static void do_ntdll_hack_this_process() {
+  patch_ntdll_instance(GetCurrentProcess(),
+                       (uintptr_t)GetModuleHandleA("ntdll.dll"));
+}
+#endif
+// end ntdll hack
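The signature scan above is a plain byte-pattern search over ntdll's in-memory image; the same thing can be expressed with std::search. A hedged alternative sketch (buffers illustrative; offsets are relative to the image base):

// Equivalent signature scan using std::search: find every occurrence of
// `pattern` in `image` and record its offset from the image base.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<std::size_t> FindPattern(const uint8_t* image,
                                     std::size_t image_size,
                                     const uint8_t* pattern,
                                     std::size_t pattern_size) {
  std::vector<std::size_t> hits;
  const uint8_t* end = image + image_size;
  const uint8_t* cursor = image;
  while (true) {
    const uint8_t* found =
        std::search(cursor, end, pattern, pattern + pattern_size);
    if (found == end) break;
    hits.push_back(std::size_t(found - image));
    cursor = found + 1;  // continue past this hit, like the nested loop above
  }
  return hits;
}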
+LONG _UnhandledExceptionFilter(_EXCEPTION_POINTERS* ExceptionInfo) {
+  PVOID exception_addr = ExceptionInfo->ExceptionRecord->ExceptionAddress;
+
+  DWORD64 last_stackpointer = ExceptionInfo->ContextRecord->Rsp;
+
+  DWORD64 last_rip = ExceptionInfo->ContextRecord->Rip;
+
+  DWORD except_code = ExceptionInfo->ExceptionRecord->ExceptionCode;
+
+  DWORD last_error = GetLastError();
+
+  // gs:[0x1250] is LastStatusValue in the x64 TEB (assumed offset).
+  NTSTATUS stat = __readgsdword(0x1250);
+
+  int last_errno_value = errno;
+
+  char except_message_buf[1024];
+
+  sprintf_s(except_message_buf,
+            "Exception encountered!\nException address: %p\nStackpointer: "
+            "%p\nInstruction pointer: %p\nExceptionCode: 0x%X\nLast Win32 "
+            "Error: 0x%X\nLast NTSTATUS: 0x%X\nLast errno value: 0x%X\n",
+            exception_addr, (void*)last_stackpointer, (void*)last_rip,
+            except_code, last_error, stat, last_errno_value);
+  MessageBoxA(nullptr, except_message_buf, "Unhandled Exception", MB_ICONERROR);
+  return EXCEPTION_CONTINUE_SEARCH;
+}
 int WINAPI wWinMain(HINSTANCE hinstance, HINSTANCE hinstance_prev,
                     LPWSTR command_line, int show_cmd) {
   int result;

+  SetUnhandledExceptionFilter(_UnhandledExceptionFilter);
   {
     xe::ui::Win32WindowedAppContext app_context(hinstance, show_cmd);
     // TODO(Triang3l): Initialize creates a window. Set DPI awareness via the
@@ -40,13 +125,6 @@ int WINAPI wWinMain(HINSTANCE hinstance, HINSTANCE hinstance_prev,
       return EXIT_FAILURE;
     }

-    // TODO(Triang3l): Rework this, need to initialize the console properly,
-    // disable has_console_attached_ by default in windowed apps, and attach
-    // only if needed.
-    if (cvars::enable_console) {
-      xe::AttachConsole();
-    }
-
     // Initialize COM on the UI thread with the apartment-threaded concurrency
     // model, so dialogs can be used.
     if (FAILED(CoInitializeEx(nullptr, COINIT_APARTMENTTHREADED))) {
@@ -55,8 +133,22 @@ int WINAPI wWinMain(HINSTANCE hinstance, HINSTANCE hinstance_prev,

     xe::InitializeWin32App(app->GetName());

-    result =
-        app->OnInitialize() ? app_context.RunMainMessageLoop() : EXIT_FAILURE;
+    if (app->OnInitialize()) {
+#if XE_ARCH_AMD64 == 1
+      if (cvars::enable_rdrand_ntdll_patch) {
+        do_ntdll_hack_this_process();
+      }
+#endif
+      // TODO(Triang3l): Rework this, need to initialize the console properly,
+      // disable has_console_attached_ by default in windowed apps, and attach
+      // only if needed.
+      if (cvars::enable_console) {
+        xe::AttachConsole();
+      }
+      result = app_context.RunMainMessageLoop();
+    } else {
+      result = EXIT_FAILURE;
+    }

     app->InvokeOnDestroy();
   }
@@ -61,6 +61,7 @@ typedef uint32_t X_STATUS;
 #define X_STATUS_OBJECT_NAME_COLLISION          ((X_STATUS)0xC0000035L)
 #define X_STATUS_INVALID_PAGE_PROTECTION        ((X_STATUS)0xC0000045L)
 #define X_STATUS_MUTANT_NOT_OWNED               ((X_STATUS)0xC0000046L)
+#define X_STATUS_THREAD_IS_TERMINATING          ((X_STATUS)0xC000004BL)
 #define X_STATUS_PROCEDURE_NOT_FOUND            ((X_STATUS)0xC000007AL)
 #define X_STATUS_INSUFFICIENT_RESOURCES         ((X_STATUS)0xC000009AL)
 #define X_STATUS_MEMORY_NOT_ALLOCATED           ((X_STATUS)0xC00000A0L)