Merge pull request #96 from chrisps/host_guest_stack_synchronization
Host/Guest stack sync, exception messagebox, kernel improvements, minor opt
Commit 0674b68143
@@ -103,3 +103,5 @@ node_modules/.bin/
 /tools/shader-playground/*.dll
 /profile_print_times.py
 /profile_times.txt
+/cache1
+/cache0
@@ -35,13 +35,15 @@ static bool has_shell_environment_variable() {
 }
 
 void AttachConsole() {
   bool has_console = ::AttachConsole(ATTACH_PARENT_PROCESS) == TRUE;
+#if 0
   if (!has_console || !has_shell_environment_variable()) {
     // We weren't launched from a console, so just return.
     has_console_attached_ = false;
     return;
   }
+#endif
   AllocConsole();
 
   has_console_attached_ = true;
@@ -410,34 +410,7 @@ static float ArchReciprocal(float den) {
   return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den)));
 }
 
-#if 0
-using ArchFloatMask = float;
-
-XE_FORCEINLINE
-static ArchFloatMask ArchCmpneqFloatMask(float x, float y) {
-  return _mm_cvtss_f32(_mm_cmpneq_ss(_mm_set_ss(x), _mm_set_ss(y)));
-}
-XE_FORCEINLINE
-static ArchFloatMask ArchORFloatMask(ArchFloatMask x, ArchFloatMask y) {
-  return _mm_cvtss_f32(_mm_or_ps(_mm_set_ss(x), _mm_set_ss(y)));
-}
-XE_FORCEINLINE
-static ArchFloatMask ArchXORFloatMask(ArchFloatMask x, ArchFloatMask y) {
-  return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x), _mm_set_ss(y)));
-}
-
-XE_FORCEINLINE
-static ArchFloatMask ArchANDFloatMask(ArchFloatMask x, ArchFloatMask y) {
-  return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x), _mm_set_ss(y)));
-}
-
-XE_FORCEINLINE
-static uint32_t ArchFloatMaskSignbit(ArchFloatMask x) {
-  return static_cast<uint32_t>(_mm_movemask_ps(_mm_set_ss(x)));
-}
-
-constexpr ArchFloatMask floatmask_zero = .0f;
-#else
 using ArchFloatMask = __m128;
 
 XE_FORCEINLINE
@@ -464,7 +437,7 @@ static uint32_t ArchFloatMaskSignbit(ArchFloatMask x) {
 }
 
 constexpr ArchFloatMask floatmask_zero{.0f};
-#endif
 #else
 static float ArchMin(float x, float y) { return std::min<float>(x, y); }
 static float ArchMax(float x, float y) { return std::max<float>(x, y); }
@@ -610,17 +583,17 @@ union IDivExtraInfo {
   } info;
 };
 // returns magicnum multiplier
-static uint32_t PregenerateUint32Div(uint32_t _denom, uint32_t& out_extra) {
-  IDivExtraInfo extra;
+static constexpr uint32_t PregenerateUint32Div(uint32_t _denom, uint32_t& out_extra) {
+  IDivExtraInfo extra{};
 
   uint32_t d = _denom;
-  int p;
-  uint32_t nc, delta, q1, r1, q2, r2;
+  int p = 0;
+  uint32_t nc = 0, delta = 0, q1 = 0, r1 = 0, q2 = 0, r2 = 0;
   struct {
     unsigned M;
     int a;
     int s;
-  } magu;
+  } magu{};
   magu.a = 0;
   nc = -1 - ((uint32_t) - (int32_t)d) % d;
   p = 31;
@@ -660,13 +633,13 @@ static uint32_t PregenerateUint32Div(uint32_t _denom, uint32_t& out_extra) {
   return static_cast<uint64_t>(q2 + 1);
 }
 
-static inline uint32_t ApplyUint32Div(uint32_t num, uint32_t mul,
+static constexpr uint32_t ApplyUint32Div(uint32_t num, uint32_t mul,
                                       uint32_t extradata) {
-  IDivExtraInfo extra;
+  IDivExtraInfo extra{};
 
   extra.value_ = extradata;
 
-  uint32_t result = ((uint64_t)(num) * (uint64_t)mul) >> 32;
+  uint32_t result = static_cast<uint32_t>((static_cast<uint64_t>(num) * static_cast<uint64_t>(mul)) >> 32);
   if (extra.info.add_) {
     uint32_t addend = result + num;
     addend = ((addend < result ? 0x80000000 : 0) | addend);
@@ -675,7 +648,7 @@ static inline uint32_t ApplyUint32Div(uint32_t num, uint32_t mul,
   return result >> extra.info.shift_;
 }
 
-static inline uint32_t ApplyUint32UMod(uint32_t num, uint32_t mul,
+static constexpr uint32_t ApplyUint32UMod(uint32_t num, uint32_t mul,
                                        uint32_t extradata, uint32_t original) {
   uint32_t dived = ApplyUint32Div(num, mul, extradata);
   unsigned result = num - (dived * original);
@@ -686,12 +659,12 @@ static inline uint32_t ApplyUint32UMod(uint32_t num, uint32_t mul,
 struct MagicDiv {
   uint32_t multiplier_;
   uint32_t extradata_;
-  MagicDiv() : multiplier_(0), extradata_(0) {}
-  MagicDiv(uint32_t original) {
+  constexpr MagicDiv() : multiplier_(0), extradata_(0) {}
+  constexpr MagicDiv(uint32_t original) : MagicDiv() {
     multiplier_ = PregenerateUint32Div(original, extradata_);
   }
 
-  uint32_t Apply(uint32_t numerator) const {
+  constexpr uint32_t Apply(uint32_t numerator) const {
     return ApplyUint32Div(numerator, multiplier_, extradata_);
   }
 };
@@ -28,6 +28,9 @@ namespace xe {
 namespace memory {
 
 size_t page_size() {
+#if XE_ARCH_AMD64 == 1
+  return 4096;
+#else
   static size_t value = 0;
   if (!value) {
     SYSTEM_INFO si;
@@ -35,9 +38,13 @@ size_t page_size() {
     value = si.dwPageSize;
   }
   return value;
+#endif
 }
 
 size_t allocation_granularity() {
+#if XE_ARCH_AMD64 == 1 && XE_PLATFORM_WIN32 == 1
+  return 65536;
+#else
   static size_t value = 0;
   if (!value) {
     SYSTEM_INFO si;
@@ -45,6 +52,7 @@ size_t allocation_granularity() {
     value = si.dwAllocationGranularity;
   }
   return value;
+#endif
 }
 
 DWORD ToWin32ProtectFlags(PageAccess access) {
@@ -37,7 +37,7 @@
 #define XE_USE_NTDLL_FUNCTIONS 1
 //chrispy: disabling this for now, more research needs to be done imo, although it does work very well on my machine
 //
-#define XE_USE_KUSER_SHARED 0
+#define XE_USE_KUSER_SHARED 1
 #if XE_USE_NTDLL_FUNCTIONS == 1
 /*
        ntdll versions of functions often skip through a lot of extra garbage in
@@ -67,7 +67,22 @@ class Backend {
   // up until the start of ctx may be used by the backend to store whatever data
   // they want
   virtual void InitializeBackendContext(void* ctx) {}
 
+  /*
+     Free any dynamically allocated data/resources that the backendcontext uses
+  */
+  virtual void DeinitializeBackendContext(void* ctx) {}
   virtual void SetGuestRoundingMode(void* ctx, unsigned int mode){};
+  /*
+     called by KeSetCurrentStackPointers in xboxkrnl_threading.cc just prior
+     to calling XThread::Reenter this is an opportunity for a backend to clear any
+     data related to the guest stack
+
+     in the case of the X64 backend, it means we reset the stackpoint index
+     to 0, since its a new stack and all of our old entries are invalid now
+  */
+  virtual void PrepareForReentry(void* ctx) {}
+
  protected:
   Processor* processor_ = nullptr;
@@ -31,7 +31,16 @@ DEFINE_bool(record_mmio_access_exceptions, true,
             "For guest addresses records whether we caught any mmio accesses "
             "for them. This info can then be used on a subsequent run to "
             "instruct the recompiler to emit checks",
-            "CPU");
+            "x64");
+
+DEFINE_int64(max_stackpoints, 65536,
+             "Max number of host->guest stack mappings we can record.", "x64");
+
+DEFINE_bool(enable_host_guest_stack_synchronization, true,
+            "Records entries for guest/host stack mappings at function starts "
+            "and checks for reentry at return sites. Has slight performance "
+            "impact, but fixes crashes in games that use setjmp/longjmp.",
+            "x64");
 #if XE_X64_PROFILER_AVAILABLE == 1
 DECLARE_bool(instrument_call_times);
 #endif
@@ -41,15 +50,29 @@ namespace cpu {
 namespace backend {
 namespace x64 {
 
-class X64ThunkEmitter : public X64Emitter {
+class X64HelperEmitter : public X64Emitter {
  public:
-  X64ThunkEmitter(X64Backend* backend, XbyakAllocator* allocator);
-  ~X64ThunkEmitter() override;
+  struct _code_offsets {
+    size_t prolog;
+    size_t prolog_stack_alloc;
+    size_t body;
+    size_t epilog;
+    size_t tail;
+  };
+  X64HelperEmitter(X64Backend* backend, XbyakAllocator* allocator);
+  ~X64HelperEmitter() override;
   HostToGuestThunk EmitHostToGuestThunk();
   GuestToHostThunk EmitGuestToHostThunk();
   ResolveFunctionThunk EmitResolveFunctionThunk();
+  void* EmitGuestAndHostSynchronizeStackHelper();
+  // 1 for loading byte, 2 for halfword and 4 for word.
+  // these specialized versions save space in the caller
+  void* EmitGuestAndHostSynchronizeStackSizeLoadThunk(
+      void* sync_func, unsigned stack_element_size);
 
  private:
+  void* EmitCurrentForOffsets(const _code_offsets& offsets,
+                              size_t stack_size = 0);
   // The following four functions provide save/load functionality for registers.
   // They assume at least StackLayout::THUNK_STACK_SIZE bytes have been
   // allocated on the stack.
@@ -184,11 +207,26 @@ bool X64Backend::Initialize(Processor* processor) {
 
   // Generate thunks used to transition between jitted code and host code.
   XbyakAllocator allocator;
-  X64ThunkEmitter thunk_emitter(this, &allocator);
+  X64HelperEmitter thunk_emitter(this, &allocator);
   host_to_guest_thunk_ = thunk_emitter.EmitHostToGuestThunk();
   guest_to_host_thunk_ = thunk_emitter.EmitGuestToHostThunk();
   resolve_function_thunk_ = thunk_emitter.EmitResolveFunctionThunk();
 
+  if (cvars::enable_host_guest_stack_synchronization) {
+    synchronize_guest_and_host_stack_helper_ =
+        thunk_emitter.EmitGuestAndHostSynchronizeStackHelper();
+
+    synchronize_guest_and_host_stack_helper_size8_ =
+        thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk(
+            synchronize_guest_and_host_stack_helper_, 1);
+    synchronize_guest_and_host_stack_helper_size16_ =
+        thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk(
+            synchronize_guest_and_host_stack_helper_, 2);
+    synchronize_guest_and_host_stack_helper_size32_ =
+        thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk(
+            synchronize_guest_and_host_stack_helper_, 4);
+  }
+
   // Set the code cache to use the ResolveFunction thunk for default
   // indirections.
   assert_zero(uint64_t(resolve_function_thunk_) & 0xFFFFFFFF00000000ull);
@@ -203,9 +241,10 @@ bool X64Backend::Initialize(Processor* processor) {
 
   // Setup exception callback
   ExceptionHandler::Install(&ExceptionCallbackThunk, this);
+  if (cvars::record_mmio_access_exceptions) {
     processor->memory()->SetMMIOExceptionRecordingCallback(
         ForwardMMIOAccessForRecording, (void*)this);
+  }
 
 #if XE_X64_PROFILER_AVAILABLE == 1
   if (cvars::instrument_call_times) {
@@ -509,23 +548,32 @@ bool X64Backend::ExceptionCallback(Exception* ex) {
   return processor()->OnThreadBreakpointHit(ex);
 }
 
-X64ThunkEmitter::X64ThunkEmitter(X64Backend* backend, XbyakAllocator* allocator)
+X64HelperEmitter::X64HelperEmitter(X64Backend* backend,
+                                   XbyakAllocator* allocator)
     : X64Emitter(backend, allocator) {}
 
-X64ThunkEmitter::~X64ThunkEmitter() {}
-
-HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() {
+X64HelperEmitter::~X64HelperEmitter() {}
+void* X64HelperEmitter::EmitCurrentForOffsets(const _code_offsets& code_offsets,
+                                              size_t stack_size) {
+  EmitFunctionInfo func_info = {};
+  func_info.code_size.total = getSize();
+  func_info.code_size.prolog = code_offsets.body - code_offsets.prolog;
+  func_info.code_size.body = code_offsets.epilog - code_offsets.body;
+  func_info.code_size.epilog = code_offsets.tail - code_offsets.epilog;
+  func_info.code_size.tail = getSize() - code_offsets.tail;
+  func_info.prolog_stack_alloc_offset =
+      code_offsets.prolog_stack_alloc - code_offsets.prolog;
+  func_info.stack_size = stack_size;
+
+  void* fn = Emplace(func_info);
+  return fn;
+}
+HostToGuestThunk X64HelperEmitter::EmitHostToGuestThunk() {
   // rcx = target
   // rdx = arg0 (context)
   // r8 = arg1 (guest return address)
 
-  struct _code_offsets {
-    size_t prolog;
-    size_t prolog_stack_alloc;
-    size_t body;
-    size_t epilog;
-    size_t tail;
-  } code_offsets = {};
+  _code_offsets code_offsets = {};
 
   const size_t stack_size = StackLayout::THUNK_STACK_SIZE;
@@ -576,19 +624,13 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() {
   return (HostToGuestThunk)fn;
 }
 
-GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
+GuestToHostThunk X64HelperEmitter::EmitGuestToHostThunk() {
   // rcx = target function
   // rdx = arg0
   // r8 = arg1
   // r9 = arg2
 
-  struct _code_offsets {
-    size_t prolog;
-    size_t prolog_stack_alloc;
-    size_t body;
-    size_t epilog;
-    size_t tail;
-  } code_offsets = {};
+  _code_offsets code_offsets = {};
 
   const size_t stack_size = StackLayout::THUNK_STACK_SIZE;
@@ -635,17 +677,11 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
 // X64Emitter handles actually resolving functions.
 uint64_t ResolveFunction(void* raw_context, uint64_t target_address);
 
-ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() {
+ResolveFunctionThunk X64HelperEmitter::EmitResolveFunctionThunk() {
   // ebx = target PPC address
   // rcx = context
 
-  struct _code_offsets {
-    size_t prolog;
-    size_t prolog_stack_alloc;
-    size_t body;
-    size_t epilog;
-    size_t tail;
-  } code_offsets = {};
+  _code_offsets code_offsets = {};
 
   const size_t stack_size = StackLayout::THUNK_STACK_SIZE;
@@ -688,8 +724,116 @@ ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() {
   void* fn = Emplace(func_info);
   return (ResolveFunctionThunk)fn;
 }
+// r11 = size of callers stack, r8 = return address w/ adjustment
+// i'm not proud of this code, but it shouldn't be executed frequently at all
+void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackHelper() {
+  _code_offsets code_offsets = {};
+  code_offsets.prolog = getSize();
+  mov(rbx, GetBackendCtxPtr(offsetof(X64BackendContext, stackpoints)));
+  mov(eax,
+      GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)));
+  lea(ecx, ptr[eax - 1]);
+  mov(r9d, ptr[GetContextReg() + offsetof(ppc::PPCContext, r[1])]);
+
+  Xbyak::Label looper{};
+  Xbyak::Label loopout{};
+  Xbyak::Label signed_underflow{};
+  xor_(r12d, r12d);
+
+  // todo: should use Loop instruction here if hasFastLoop,
+  // currently xbyak does not support it but its super easy to modify xbyak to have it
+  L(looper);
+  imul(edx, ecx, sizeof(X64BackendStackpoint));
+  mov(r10d, ptr[rbx + rdx + offsetof(X64BackendStackpoint, guest_stack_)]);
+
+  cmp(r10d, r9d);
+
+  jge(loopout, T_NEAR);
+
+  inc(r12d);
+
+  if (IsFeatureEnabled(kX64FlagsIndependentVars)) {
+    dec(ecx);
+  } else {
+    sub(ecx, 1);
+  }
+  js(signed_underflow, T_NEAR);  // should be impossible!!
+
+  jmp(looper, T_NEAR);
+  L(loopout);
+  Xbyak::Label skip_adjust{};
+  cmp(r12d, 1);  // should never happen?
+  jle(skip_adjust, T_NEAR);
+  mov(rsp, ptr[rbx + rdx + offsetof(X64BackendStackpoint, host_stack_)]);
+  if (IsFeatureEnabled(kX64FlagsIndependentVars)) {
+    inc(ecx);
+  } else {
+    add(ecx, 1);
+  }
+
+  // this->DebugBreak();
+  sub(rsp, r11);  // adjust stack
+
+  mov(GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)),
+      ecx);  // set next stackpoint index to be after the one we restored to
+  L(skip_adjust);
+
+  jmp(r8);  // return to caller
+  code_offsets.prolog_stack_alloc = getSize();
+  code_offsets.body = getSize();
+  code_offsets.epilog = getSize();
+  code_offsets.tail = getSize();
+
+  L(signed_underflow);
+  // find a good, compact way to signal error here
+  // maybe an invalid opcode that we execute, then detect in an exception handler?
+
+  this->DebugBreak();
+  // stack unwinding, take first entry
+  // actually, no reason to have this
+
+  /*mov(rsp, ptr[rbx + offsetof(X64BackendStackpoint, host_stack_)]);
+  mov(ptr[rbx + offsetof(X64BackendStackpoint, guest_stack_)], r9d);
+  sub(rsp, r11);
+  xor_(eax, eax);
+  inc(eax);
+  mov(GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)),
+      eax);
+
+  jmp(r8);*/
+  // this->DebugBreak(); // err, add an xe::FatalError to call for this
+
+  return EmitCurrentForOffsets(code_offsets);
+}
+
+void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk(
+    void* sync_func, unsigned stack_element_size) {
+  _code_offsets code_offsets = {};
+  code_offsets.prolog = getSize();
+  pop(r8);  // return address
+
+  switch (stack_element_size) {
+    case 4:
+      mov(r11d, ptr[r8]);
+      break;
+    case 2:
+      movzx(r11d, word[r8]);
+      break;
+    case 1:
+      movzx(r11d, byte[r8]);
+      break;
+  }
+  add(r8, stack_element_size);
+  jmp(sync_func, T_NEAR);
+  code_offsets.prolog_stack_alloc = getSize();
+  code_offsets.body = getSize();
+  code_offsets.epilog = getSize();
+  code_offsets.tail = getSize();
+  return EmitCurrentForOffsets(code_offsets);
+}
 
-void X64ThunkEmitter::EmitSaveVolatileRegs() {
+void X64HelperEmitter::EmitSaveVolatileRegs() {
   // Save off volatile registers.
   // mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rax);
   mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rcx);
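Aside: a rough C++ restatement of what the emitted helper above computes, written only to make the register-level loop easier to follow (illustrative; the names mirror this PR, but this function is not part of the commit):

#include <cstdint>

// Walk the recorded guest stack pointers from the newest entry downward and
// return the index of the first entry whose recorded guest stack pointer is
// >= the current guest r1. Entries below that belong to frames the guest has
// abandoned (e.g. via longjmp); the emitted helper then restores rsp from that
// entry's host_stack_ minus the caller's stack size and sets
// current_stackpoint_depth to index + 1.
inline uint32_t FindRestoreStackpoint(const unsigned* recorded_guest_stacks,
                                      uint32_t depth, uint32_t guest_r1) {
  uint32_t index = depth - 1;
  while (index != 0xFFFFFFFFu && recorded_guest_stacks[index] < guest_r1) {
    --index;  // this frame's guest stack lies below r1: it has been unwound
  }
  return index;
}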
@@ -711,7 +855,7 @@ void X64ThunkEmitter::EmitSaveVolatileRegs() {
   vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[5])], xmm5);
 }
 
-void X64ThunkEmitter::EmitLoadVolatileRegs() {
+void X64HelperEmitter::EmitLoadVolatileRegs() {
   // mov(rax, qword[rsp + offsetof(StackLayout::Thunk, r[0])]);
   mov(rcx, qword[rsp + offsetof(StackLayout::Thunk, r[1])]);
   mov(rdx, qword[rsp + offsetof(StackLayout::Thunk, r[2])]);
@@ -732,7 +876,7 @@ void X64ThunkEmitter::EmitLoadVolatileRegs() {
   vmovaps(xmm5, qword[rsp + offsetof(StackLayout::Thunk, xmm[5])]);
 }
 
-void X64ThunkEmitter::EmitSaveNonvolatileRegs() {
+void X64HelperEmitter::EmitSaveNonvolatileRegs() {
   mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rbx);
   mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rbp);
 #if XE_PLATFORM_WIN32
@@ -760,7 +904,7 @@ void X64ThunkEmitter::EmitSaveNonvolatileRegs() {
 #endif
 }
 
-void X64ThunkEmitter::EmitLoadNonvolatileRegs() {
+void X64HelperEmitter::EmitLoadNonvolatileRegs() {
   mov(rbx, qword[rsp + offsetof(StackLayout::Thunk, r[0])]);
   mov(rbp, qword[rsp + offsetof(StackLayout::Thunk, r[1])]);
 #if XE_PLATFORM_WIN32
@@ -788,16 +932,41 @@ void X64ThunkEmitter::EmitLoadNonvolatileRegs() {
 }
 void X64Backend::InitializeBackendContext(void* ctx) {
   X64BackendContext* bctx = BackendContextForGuestContext(ctx);
-  bctx->ResolveFunction_Ptr = reinterpret_cast<void*>(&ResolveFunction);
   bctx->mxcsr_fpu =
       DEFAULT_FPU_MXCSR;  // idk if this is right, check on rgh what the
                           // rounding on ppc is at startup
+  /*
+     todo: stackpoint arrays should be pooled virtual memory at the very
+     least there may be some fancy virtual address tricks we can do here
+  */
+  bctx->stackpoints = cvars::enable_host_guest_stack_synchronization
+                          ? new X64BackendStackpoint[cvars::max_stackpoints]
+                          : nullptr;
+  bctx->current_stackpoint_depth = 0;
   bctx->mxcsr_vmx = DEFAULT_VMX_MXCSR;
   bctx->flags = 0;
   // https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
   bctx->Ox1000 = 0x1000;
   bctx->guest_tick_count = Clock::GetGuestTickCountPointer();
 }
+void X64Backend::DeinitializeBackendContext(void* ctx) {
+  X64BackendContext* bctx = BackendContextForGuestContext(ctx);
+
+  if (bctx->stackpoints) {
+    delete[] bctx->stackpoints;
+    bctx->stackpoints = nullptr;
+  }
+}
+
+void X64Backend::PrepareForReentry(void* ctx) {
+  X64BackendContext* bctx = BackendContextForGuestContext(ctx);
+
+  bctx->current_stackpoint_depth = 0;
+}
+
 const uint32_t mxcsr_table[8] = {
     0x1F80, 0x7F80, 0x5F80, 0x3F80, 0x9F80, 0xFF80, 0xDF80, 0xBF80,
 };
@@ -24,7 +24,8 @@
 #endif
 
 DECLARE_int64(x64_extension_mask);
+DECLARE_int64(max_stackpoints);
+DECLARE_bool(enable_host_guest_stack_synchronization);
 namespace xe {
 class Exception;
 }  // namespace xe
@@ -41,14 +42,25 @@ typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1);
 typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1);
 typedef void (*ResolveFunctionThunk)();
 
+struct X64BackendStackpoint {
+  uint64_t host_stack_;
+  unsigned guest_stack_;
+  // pad to 16 bytes so we never end up having a 64 bit load/store for
+  // host_stack_ straddling two lines. Consider this field reserved for future
+  // use
+  unsigned unused_;
+};
 // located prior to the ctx register
 // some things it would be nice to have be per-emulator instance instead of per
 // context (somehow placing a global X64BackendCtx prior to membase, so we can
 // negatively index the membase reg)
 struct X64BackendContext {
-  void* ResolveFunction_Ptr;  // cached pointer to resolvefunction
+  // guest_tick_count is used if inline_loadclock is used
   uint64_t* guest_tick_count;
+  // records mapping of host_stack to guest_stack
+  X64BackendStackpoint* stackpoints;
+
+  unsigned int current_stackpoint_depth;
   unsigned int mxcsr_fpu;  // currently, the way we implement rounding mode
                            // affects both vmx and the fpu
   unsigned int mxcsr_vmx;
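Aside: a quick sanity check of the padding comment above; on the x64 targets this backend supports, the reserved field keeps every entry at exactly 16 bytes, so a host_stack_ load or store never straddles two entries (illustrative snippet, not part of the commit):

#include <cstdint>

struct X64BackendStackpoint {
  uint64_t host_stack_;
  unsigned guest_stack_;
  unsigned unused_;  // reserved padding, per the comment in the diff
};

// 8 + 4 + 4 bytes with 8-byte alignment: one entry per 16-byte block.
static_assert(sizeof(X64BackendStackpoint) == 16,
              "stackpoint entries should stay exactly 16 bytes");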
@@ -81,6 +93,19 @@ class X64Backend : public Backend {
     return resolve_function_thunk_;
   }
 
+  void* synchronize_guest_and_host_stack_helper() const {
+    return synchronize_guest_and_host_stack_helper_;
+  }
+  void* synchronize_guest_and_host_stack_helper_for_size(size_t sz) const {
+    switch (sz) {
+      case 1:
+        return synchronize_guest_and_host_stack_helper_size8_;
+      case 2:
+        return synchronize_guest_and_host_stack_helper_size16_;
+      default:
+        return synchronize_guest_and_host_stack_helper_size32_;
+    }
+  }
   bool Initialize(Processor* processor) override;
 
   void CommitExecutableRange(uint32_t guest_low, uint32_t guest_high) override;
@@ -97,7 +122,8 @@ class X64Backend : public Backend {
   void InstallBreakpoint(Breakpoint* breakpoint, Function* fn) override;
   void UninstallBreakpoint(Breakpoint* breakpoint) override;
   virtual void InitializeBackendContext(void* ctx) override;
+  virtual void DeinitializeBackendContext(void* ctx) override;
+  virtual void PrepareForReentry(void* ctx) override;
   X64BackendContext* BackendContextForGuestContext(void* ctx) {
     return reinterpret_cast<X64BackendContext*>(
         reinterpret_cast<intptr_t>(ctx) - sizeof(X64BackendContext));
@@ -120,7 +146,12 @@ class X64Backend : public Backend {
   HostToGuestThunk host_to_guest_thunk_;
   GuestToHostThunk guest_to_host_thunk_;
   ResolveFunctionThunk resolve_function_thunk_;
+  void* synchronize_guest_and_host_stack_helper_ = nullptr;
+
+  // loads stack sizes 1 byte, 2 bytes or 4 bytes
+  void* synchronize_guest_and_host_stack_helper_size8_ = nullptr;
+  void* synchronize_guest_and_host_stack_helper_size16_ = nullptr;
+  void* synchronize_guest_and_host_stack_helper_size32_ = nullptr;
 #if XE_X64_PROFILER_AVAILABLE == 1
   GuestProfilerData profiler_data_;
 #endif
@@ -213,6 +213,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
   func_info.stack_size = stack_size;
   stack_size_ = stack_size;
 
+  PushStackpoint();
   sub(rsp, (uint32_t)stack_size);
 
   code_offsets.prolog_stack_alloc = getSize();
@@ -271,6 +272,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
   */
   // Body.
   auto block = builder->first_block();
+  synchronize_stack_on_next_instruction_ = false;
   while (block) {
     ForgetMxcsrMode();  // at start of block, mxcsr mode is undefined
 
@@ -287,6 +289,12 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
     // Process instructions.
     const Instr* instr = block->instr_head;
     while (instr) {
+      if (synchronize_stack_on_next_instruction_) {
+        if (instr->GetOpcodeNum() != hir::OPCODE_SOURCE_OFFSET) {
+          synchronize_stack_on_next_instruction_ = false;
+          EnsureSynchronizedGuestAndHostStack();
+        }
+      }
       const Instr* new_tail = instr;
       if (!SelectSequence(this, instr, &new_tail)) {
         // No sequence found!
@@ -314,6 +322,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
     EmitProfilerEpilogue();
 
   add(rsp, (uint32_t)stack_size);
+  PopStackpoint();
   ret();
   // todo: do some kind of sorting by alignment?
   for (auto&& tail_item : tail_code_) {
@@ -453,12 +462,186 @@ void X64Emitter::UnimplementedInstr(const hir::Instr* i) {
 
 // This is used by the X64ThunkEmitter's ResolveFunctionThunk.
 uint64_t ResolveFunction(void* raw_context, uint64_t target_address) {
-  auto thread_state =
-      reinterpret_cast<ppc::PPCContext_s*>(raw_context)->thread_state;
+  auto guest_context = reinterpret_cast<ppc::PPCContext_s*>(raw_context);
+  auto thread_state = guest_context->thread_state;
 
   // TODO(benvanik): required?
   assert_not_zero(target_address);
+
+  /*
+     todo: refactor this!
+
+     The purpose of this code is to allow guest longjmp to call into the body
+     of an existing host function. There are a lot of conditions we have to
+     check here to ensure that we do not mess up a normal call to a function.
+
+     The address must be within an XexModule (may need to make some changes to
+     instructionaddressflags to remove this limitation). The target address
+     must be a known return site. The guest address must be part of a function
+     that was already translated.
+  */
+  if (cvars::enable_host_guest_stack_synchronization) {
+    auto processor = thread_state->processor();
+    auto module_for_address =
+        processor->LookupModule(static_cast<uint32_t>(target_address));
+
+    if (module_for_address) {
+      XexModule* xexmod = dynamic_cast<XexModule*>(module_for_address);
+      if (xexmod) {
+        InfoCacheFlags* flags = xexmod->GetInstructionAddressFlags(
+            static_cast<uint32_t>(target_address));
+        if (flags) {
+          if (flags->is_return_site) {
+            auto ones_with_address = processor->FindFunctionsWithAddress(
+                static_cast<uint32_t>(target_address));
+            if (ones_with_address.size() != 0) {
+              // this loop to find a host address for the guest address is
+              // necessary because FindFunctionsWithAddress works via a range
+              // check, but if the function consists of multiple blocks
+              // scattered around with "holes" of instructions that cannot be
+              // reached in between those holes, the instructions that cannot
+              // be reached will incorrectly be considered members of the
+              // function
+              X64Function* candidate = nullptr;
+              uintptr_t host_address = 0;
+              for (auto&& entry : ones_with_address) {
+                X64Function* xfunc = static_cast<X64Function*>(entry);
+
+                host_address = xfunc->MapGuestAddressToMachineCode(
+                    static_cast<uint32_t>(target_address));
+                // host address does exist within the function, and that host
+                // address is not the start of the function, it is instead
+                // somewhere within its existing body.
+                // i originally did not have this (xfunc->machine_code() !=
+                // reinterpret_cast<const uint8_t*>(host_address)) condition
+                // here when i distributed builds for testing; no issues arose
+                // related to it but i wanted to be more explicit
+                if (host_address &&
+                    xfunc->machine_code() !=
+                        reinterpret_cast<const uint8_t*>(host_address)) {
+                  candidate = xfunc;
+                  break;
+                }
+              }
+              // we found an existing X64Function, and a return site within
+              // that function that has a host address w/ native code
+              if (candidate && host_address) {
+                X64Backend* backend =
+                    static_cast<X64Backend*>(processor->backend());
+                // grab the backend context; next we have to check whether the
+                // guest and host stack are out of sync. if they arent, its
+                // fine for the backend to create a new function for the guest
+                // address we're resolving. if they are, it means that the
+                // reason we're resolving this address is because context is
+                // being restored (probably by longjmp)
+                X64BackendContext* backend_context =
+                    backend->BackendContextForGuestContext(guest_context);
+
+                uint32_t current_stackpoint_index =
+                    backend_context->current_stackpoint_depth;
+
+                --current_stackpoint_index;
+
+                X64BackendStackpoint* stackpoints =
+                    backend_context->stackpoints;
+
+                uint32_t current_guest_stackpointer =
+                    static_cast<uint32_t>(guest_context->r[1]);
+                uint32_t num_frames_bigger = 0;
+
+                /*
+                   if the current guest stack pointer is bigger than the
+                   recorded pointer for this stack thats fine, plenty of
+                   functions restore the original stack pointer early
+
+                   if more than 1... we're longjmping and sure of it at this
+                   point (jumping to a return site that has already been
+                   emitted)
+                */
+                while (current_stackpoint_index != 0xFFFFFFFF) {
+                  if (current_guest_stackpointer >
+                      stackpoints[current_stackpoint_index].guest_stack_) {
+                    --current_stackpoint_index;
+                    ++num_frames_bigger;
+                  } else {
+                    break;
+                  }
+                }
+                /*
+                   DEFINITELY a longjmp, return the original host address.
+                   returning the existing host address is going to set off
+                   some extra machinery we have set up to support this.
+
+                   to break it down: our caller (us being this ResolveFunction
+                   that this comment is in) is
+                   X64Backend::resolve_function_thunk_, which is implemented
+                   in x64_backend.cc X64HelperEmitter::EmitResolveFunctionThunk,
+                   or a call from the resolver table.
+
+                   the x64 fastcall abi dictates that the stack must always be
+                   16 byte aligned. We select our stack size for functions to
+                   ensure that we keep rsp aligned to 16 bytes.
+
+                   but by calling into the body of an existing function we've
+                   pushed our return address onto the stack (dont worry about
+                   this return address, it gets discarded in a later step).
+
+                   this means that the stack is no longer 16 byte aligned,
+                   (rsp % 16) now == 8, and this is the only time outside of
+                   the prolog or epilog of a function that this will be the
+                   case.
+
+                   so, after all direct or indirect function calls we set
+                   X64Emitter::synchronize_stack_on_next_instruction_ to true.
+                   On the next instruction that is not OPCODE_SOURCE_OFFSET we
+                   will emit a check when we see
+                   synchronize_stack_on_next_instruction_ is true. We have to
+                   skip OPCODE_SOURCE_OFFSET because its not a "real"
+                   instruction, and if we emit on it the return address of the
+                   function call will point to AFTER our check, so itll never
+                   be executed.
+
+                   our check is just going to do test esp, 15 to see if the
+                   stack is misaligned (using esp instead of rsp saves 1
+                   byte). We tail emit the handling for when the check
+                   succeeds because in 99.99999% of function calls it will be
+                   aligned; in the end the runtime cost of these checks is 5
+                   bytes for the test instruction, which ought to be one
+                   cycle, and 5 bytes for the jmp with no cycles taken for the
+                   jump, which will be predicted not taken.
+
+                   Our handling for the check is implemented in
+                   X64HelperEmitter::EmitGuestAndHostSynchronizeStackHelper.
+                   we don't call it directly though; instead we go through
+                   backend()->synchronize_guest_and_host_stack_helper_for_size(
+                   num_bytes_needed_to_represent_stack_size). we place the
+                   stack size after the call instruction so we can load it in
+                   the helper and readjust the return address to point after
+                   the literal value.
+
+                   The helper is going to search the array of stackpoints to
+                   find the first one that is greater than or equal to the
+                   current stack pointer. when it finds the entry it will set
+                   the current host rsp to the host stack pointer value in the
+                   entry, and then subtract the stack size of the caller from
+                   that. the current stackpoint index is adjusted to point to
+                   the one after the stackpoint we restored to.
+
+                   The helper then jumps back to the function that was
+                   longjmp'ed to, with the host stack in its proper state. it
+                   just works!
+                */
+                if (num_frames_bigger > 1) {
+                  return host_address;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
   auto fn = thread_state->processor()->ResolveFunction(
       static_cast<uint32_t>(target_address));
   assert_not_null(fn);
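Aside: the longjmp heuristic buried in the block above, restated in isolation (illustrative sketch, not code from this commit; one skipped frame is tolerated because many functions restore r1 before their final return):

#include <cstdint>

// Returns true when resolving a known return site looks like a longjmp-style
// reentry: more than one recorded stackpoint lies below the current guest
// stack pointer (r1).
inline bool LooksLikeLongjmpReentry(const unsigned* recorded_guest_stacks,
                                    uint32_t depth, uint32_t guest_r1) {
  uint32_t index = depth - 1;  // newest recorded frame
  uint32_t num_frames_bigger = 0;
  while (index != 0xFFFFFFFFu && guest_r1 > recorded_guest_stacks[index]) {
    --index;
    ++num_frames_bigger;
  }
  return num_frames_bigger > 1;
}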
@@ -479,7 +662,7 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
     mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
 
     call((void*)fn->machine_code());
+    synchronize_stack_on_next_instruction_ = true;
   } else {
     // tail call
     EmitTraceUserCallReturn();
@@ -488,8 +671,10 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
     mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
 
     add(rsp, static_cast<uint32_t>(stack_size()));
+    PopStackpoint();
     jmp((void*)fn->machine_code(), T_NEAR);
   }
 
   return;
 } else if (code_cache_->has_indirection_table()) {
   // Load the pointer to the indirection table maintained in X64CodeCache.
@@ -513,12 +698,14 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
     mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
 
     add(rsp, static_cast<uint32_t>(stack_size()));
+    PopStackpoint();
     jmp(rax);
   } else {
     // Return address is from the previous SET_RETURN_ADDRESS.
     mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
 
     call(rax);
+    synchronize_stack_on_next_instruction_ = true;
   }
 }
@@ -557,12 +744,14 @@ void X64Emitter::CallIndirect(const hir::Instr* instr,
     mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
 
     add(rsp, static_cast<uint32_t>(stack_size()));
+    PopStackpoint();
     jmp(rax);
   } else {
     // Return address is from the previous SET_RETURN_ADDRESS.
     mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
 
     call(rax);
+    synchronize_stack_on_next_instruction_ = true;
   }
 }
@@ -1458,6 +1647,126 @@ Xbyak::Address X64Emitter::GetBackendFlagsPtr() const {
   pt.setBit(32);
   return pt;
 }
+
+void X64Emitter::HandleStackpointOverflowError(ppc::PPCContext* context) {
+  // context->lr
+  // todo: show lr in message?
+  xe::FatalError(
+      "Overflowed stackpoints! Please report this error for this title to "
+      "Xenia developers.");
+}
+
+void X64Emitter::PushStackpoint() {
+  if (!cvars::enable_host_guest_stack_synchronization) {
+    return;
+  }
+  // push the current host and guest stack pointers
+  // this is done before a stack frame is set up or any guest instructions are
+  // executed. this code is probably the most intrusive part of the stackpoint
+  mov(rbx, GetBackendCtxPtr(offsetof(X64BackendContext, stackpoints)));
+  mov(eax,
+      GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)));
+
+  mov(r8, qword[GetContextReg() + offsetof(ppc::PPCContext, r[1])]);
+
+  imul(r9d, eax, sizeof(X64BackendStackpoint));
+  add(rbx, r9);
+
+  mov(qword[rbx + offsetof(X64BackendStackpoint, host_stack_)], rsp);
+  mov(dword[rbx + offsetof(X64BackendStackpoint, guest_stack_)], r8d);
+  if (IsFeatureEnabled(kX64FlagsIndependentVars)) {
+    inc(eax);
+  } else {
+    add(eax, 1);
+  }
+
+  mov(GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)),
+      eax);
+
+  cmp(eax, (uint32_t)cvars::max_stackpoints);
+
+  Xbyak::Label& overflowed_stackpoints =
+      AddToTail([](X64Emitter& e, Xbyak::Label& our_tail_label) {
+        e.L(our_tail_label);
+        // we never subtracted anything from rsp, so our stack is misaligned and
+        // will fault in guesttohostthunk
+        // e.sub(e.rsp, 8);
+        e.push(e.rax);  // easier realign, 1 byte opcode vs 4 bytes for sub
+
+        e.CallNativeSafe((void*)X64Emitter::HandleStackpointOverflowError);
+      });
+  jge(overflowed_stackpoints, T_NEAR);
+}
+void X64Emitter::PopStackpoint() {
+  if (!cvars::enable_host_guest_stack_synchronization) {
+    return;
+  }
+  // todo: maybe verify that rsp and r1 == the stackpoint?
+  Xbyak::Address stackpoint_pos_pointer =
+      GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth));
+  stackpoint_pos_pointer.setBit(32);
+  dec(stackpoint_pos_pointer);
+}
+
+void X64Emitter::EnsureSynchronizedGuestAndHostStack() {
+  if (!cvars::enable_host_guest_stack_synchronization) {
+    return;
+  }
+  // chrispy: keeping this old slower test here in case in the future changes
+  // need to be made
+  // that result in the stack not being 8 byte misaligned on context reentry
+#if 0
+  Xbyak::Label skip{};
+  mov(r8, qword[GetContextReg() + offsetof(ppc::PPCContext, r[1])]);
+  mov(rbx, GetBackendCtxPtr(offsetof(X64BackendContext, stackpoints)));
+  imul(eax,
+       GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)),
+       sizeof(X64BackendStackpoint));
+  sub(eax, sizeof(X64BackendStackpoint));
+  add(rbx, rax);
+
+  cmp(r8d, dword[rbx + offsetof(X64BackendStackpoint, guest_stack_)]);
+  jle(skip, T_NEAR);
+  mov(r11d, stack_size());
+  call(backend_->synchronize_guest_and_host_stack_helper());
+  L(skip);
+#endif
+
+  Xbyak::Label& return_from_sync = this->NewCachedLabel();
+
+  // if we got here somehow from setjmp or the like we ought to have a
+  // misaligned stack right now! this provides us with a very fast pretest for
+  // this condition
+  test(esp, 15);
+
+  Xbyak::Label& sync_label = this->AddToTail(
+      [&return_from_sync](X64Emitter& e, Xbyak::Label& our_tail_label) {
+        e.L(our_tail_label);
+
+        uint32_t stack32 = static_cast<uint32_t>(e.stack_size());
+        auto backend = e.backend();
+
+        if (stack32 < 256) {
+          e.call(backend->synchronize_guest_and_host_stack_helper_for_size(1));
+          e.db(stack32);
+        } else if (stack32 < 65536) {
+          e.call(backend->synchronize_guest_and_host_stack_helper_for_size(2));
+          e.dw(stack32);
+        } else {
+          // ought to be impossible, a host stack bigger than 65536??
+          e.call(backend->synchronize_guest_and_host_stack_helper_for_size(4));
+          e.dd(stack32);
+        }
+        e.jmp(return_from_sync, T_NEAR);
+      });
+
+  jnz(sync_label, T_NEAR);
+
+  L(return_from_sync);
+}
 }  // namespace x64
 }  // namespace backend
 }  // namespace cpu
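Aside: the `test esp, 15` pretest emitted above relies on the alignment invariant described in ResolveFunction's comment; an equivalent check written out in plain C++ (illustrative only, not part of the commit):

#include <cstdint>

// Outside of prologs and epilogs the emitter keeps rsp 16-byte aligned, so a
// misaligned host stack pointer at a guest return site is the signature of
// having re-entered a function body through ResolveFunction (setjmp/longjmp).
inline bool HostStackNeedsSynchronization(uint64_t host_rsp) {
  return (host_rsp & 15) != 0;  // same predicate as the emitted `test esp, 15`
}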
@@ -299,6 +299,11 @@ class X64Emitter : public Xbyak::CodeGenerator {
 
   Xbyak::Label& AddToTail(TailEmitCallback callback, uint32_t alignment = 0);
   Xbyak::Label& NewCachedLabel();
+
+  void PushStackpoint();
+  void PopStackpoint();
+
+  void EnsureSynchronizedGuestAndHostStack();
   FunctionDebugInfo* debug_info() const { return debug_info_; }
 
   size_t stack_size() const { return stack_size_; }
@@ -381,13 +386,14 @@ class X64Emitter : public Xbyak::CodeGenerator {
   bool Emit(hir::HIRBuilder* builder, EmitFunctionInfo& func_info);
   void EmitGetCurrentThreadId();
   void EmitTraceUserCallReturn();
+  static void HandleStackpointOverflowError(ppc::PPCContext* context);
  protected:
   Processor* processor_ = nullptr;
   X64Backend* backend_ = nullptr;
   X64CodeCache* code_cache_ = nullptr;
   XbyakAllocator* allocator_ = nullptr;
   XexModule* guest_module_ = nullptr;
+  bool synchronize_stack_on_next_instruction_ = false;
   Xbyak::util::Cpu cpu_;
   uint64_t feature_flags_ = 0;
   uint32_t current_guest_function_ = 0;
@@ -56,6 +56,8 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) {
   if (entry) {
     // If we aren't ready yet spin and wait.
     if (entry->status == Entry::STATUS_COMPILING) {
+      // chrispy: i think this is dead code, if we are compiling we're holding
+      // the global lock, arent we? so we wouldnt be executing here
       // Still compiling, so spin.
       do {
         global_lock.unlock();
@@ -110,8 +110,13 @@ uint32_t GuestFunction::MapGuestAddressToMachineCodeOffset(
 uintptr_t GuestFunction::MapGuestAddressToMachineCode(
     uint32_t guest_address) const {
   auto entry = LookupGuestAddress(guest_address);
-  return reinterpret_cast<uintptr_t>(machine_code()) +
-         (entry ? entry->code_offset : 0);
+
+  if (entry) {
+    return reinterpret_cast<uintptr_t>(machine_code()) + entry->code_offset;
+  } else {
+    return 0;
+  }
 }
 
 uint32_t GuestFunction::MapMachineCodeToGuestAddress(
@@ -27,18 +27,13 @@
 #include "xenia/cpu/ppc/ppc_frontend.h"
 #include "xenia/cpu/ppc/ppc_opcode_info.h"
 #include "xenia/cpu/processor.h"
+#include "xenia/cpu/xex_module.h"
 DEFINE_bool(
     break_on_unimplemented_instructions, true,
     "Break to the host debugger (or crash if no debugger attached) if an "
     "unimplemented PowerPC instruction is encountered.",
     "CPU");

+DEFINE_bool(
+    emit_useless_fpscr_updates, false,
+    "Emit useless fpscr update instructions (pre-10/30/2022 behavior). ",
+    "CPU");

 namespace xe {
 namespace cpu {
 namespace ppc {
@@ -94,8 +89,9 @@ bool PPCHIRBuilder::Emit(GuestFunction* function, uint32_t flags) {
   function_ = function;
   start_address_ = function_->address();
-  //chrispy: i've seen this one happen, not sure why but i think from trying to precompile twice
-  //i've also seen ones with a start and end address that are the same...
+  // chrispy: i've seen this one happen, not sure why but i think from trying to
+  // precompile twice i've also seen ones with a start and end address that are
+  // the same...
   assert_true(function_->address() <= function_->end_address());
   instr_count_ = (function_->end_address() - function_->address()) / 4 + 1;
@@ -250,7 +246,8 @@ void PPCHIRBuilder::MaybeBreakOnInstruction(uint32_t address) {
 }

 void PPCHIRBuilder::AnnotateLabel(uint32_t address, Label* label) {
-  //chrispy: label->name is unused, it would be nice to be able to remove the field and this code
+  // chrispy: label->name is unused, it would be nice to be able to remove the
+  // field and this code
   char name_buffer[13];
   auto format_result = fmt::format_to_n(name_buffer, 12, "loc_{:08X}", address);
   name_buffer[format_result.size] = '\0';
@@ -457,37 +454,38 @@ void PPCHIRBuilder::UpdateFPSCR(Value* result, bool update_cr1) {
   // TODO(benvanik): detect overflow and nan cases.
   // fx and vx are the most important.
   /*
-     chrispy: stubbed this out because right now all it does is waste
-     memory and CPU time
+     chrispy: i stubbed this out at one point because all it does is waste
+     memory and CPU time, however, this introduced issues with raiden
+     (substitute w/ titleid later) which probably means they stash stuff in the
+     fpscr?
   */
-  Value* fx = LoadConstantInt8(0);
-  Value* fex = LoadConstantInt8(0);
-  Value* vx = LoadConstantInt8(0);
-  Value* ox = LoadConstantInt8(0);
-
-  if (update_cr1) {
-    // Store into the CR1 field.
-    // We do this instead of just calling CopyFPSCRToCR1 so that we don't
-    // have to read back the bits and do shifting work.
-    StoreContext(offsetof(PPCContext, cr1.cr1_fx), fx);
-    StoreContext(offsetof(PPCContext, cr1.cr1_fex), fex);
-    StoreContext(offsetof(PPCContext, cr1.cr1_vx), vx);
-    StoreContext(offsetof(PPCContext, cr1.cr1_ox), ox);
-  }
-
-  // Generate our new bits.
-  Value* new_bits = Shl(ZeroExtend(fx, INT32_TYPE), 31);
-  new_bits = Or(new_bits, Shl(ZeroExtend(fex, INT32_TYPE), 30));
-  new_bits = Or(new_bits, Shl(ZeroExtend(vx, INT32_TYPE), 29));
-  new_bits = Or(new_bits, Shl(ZeroExtend(ox, INT32_TYPE), 28));
-
-  // Mix into fpscr while preserving sticky bits (FX and OX).
-  Value* bits = LoadFPSCR();
-  bits = Or(And(bits, LoadConstantUint32(0x9FFFFFFF)), new_bits);
-  StoreFPSCR(bits);
+  if (cvars::emit_useless_fpscr_updates) {
+    Value* fx = LoadConstantInt8(0);
+    Value* fex = LoadConstantInt8(0);
+    Value* vx = LoadConstantInt8(0);
+    Value* ox = LoadConstantInt8(0);
+
+    if (update_cr1) {
+      // Store into the CR1 field.
+      // We do this instead of just calling CopyFPSCRToCR1 so that we don't
+      // have to read back the bits and do shifting work.
+      StoreContext(offsetof(PPCContext, cr1.cr1_fx), fx);
+      StoreContext(offsetof(PPCContext, cr1.cr1_fex), fex);
+      StoreContext(offsetof(PPCContext, cr1.cr1_vx), vx);
+      StoreContext(offsetof(PPCContext, cr1.cr1_ox), ox);
+    }
+
+    // Generate our new bits.
+    Value* new_bits = Shl(ZeroExtend(fx, INT32_TYPE), 31);
+    new_bits = Or(new_bits, Shl(ZeroExtend(fex, INT32_TYPE), 30));
+    new_bits = Or(new_bits, Shl(ZeroExtend(vx, INT32_TYPE), 29));
+    new_bits = Or(new_bits, Shl(ZeroExtend(ox, INT32_TYPE), 28));
+
+    // Mix into fpscr while preserving sticky bits (FX and OX).
+    Value* bits = LoadFPSCR();
+    bits = Or(And(bits, LoadConstantUint32(0x9FFFFFFF)), new_bits);
+    StoreFPSCR(bits);
+  }
 }

 void PPCHIRBuilder::CopyFPSCRToCR1() {
@@ -587,7 +585,24 @@ void PPCHIRBuilder::StoreReserved(Value* val) {
 Value* PPCHIRBuilder::LoadReserved() {
   return LoadContext(offsetof(PPCContext, reserved_val), INT64_TYPE);
 }
+
+void PPCHIRBuilder::SetReturnAddress(Value* value) {
+  /*
+    Record the address as being a possible target of a return. This is
+    needed for longjmp emulation. See x64_emitter.cc's ResolveFunction
+  */
+  Module* mod = this->function_->module();
+  if (value && value->IsConstant()) {
+    if (mod) {
+      XexModule* xexmod = dynamic_cast<XexModule*>(mod);
+      if (xexmod) {
+        auto flags = xexmod->GetInstructionAddressFlags(value->AsUint32());
+        flags->is_return_site = true;
+      }
+    }
+  }
+  HIRBuilder::SetReturnAddress(value);
+}
 }  // namespace ppc
 }  // namespace cpu
 }  // namespace xe
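SetReturnAddress receives the constant return address of a guest call, so for a PowerPC bl at guest address A the flagged address is A + 4 (PPC instructions are fixed 4-byte words). A one-line worked example with an illustrative address:

    uint32_t call_site = 0x82001000;       // address of the bl (illustrative)
    uint32_t return_site = call_site + 4;  // 0x82001004 gets is_return_site = 1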
@@ -80,7 +80,8 @@ class PPCHIRBuilder : public hir::HIRBuilder {
   void StoreReserved(Value* val);
   Value* LoadReserved();
+  // calls original impl in hirbuilder, but also records the is_return_site bit into flags in the guestmodule
+  void SetReturnAddress(Value* value);

  private:
   void MaybeBreakOnInstruction(uint32_t address);
   void AnnotateLabel(uint32_t address, Label* label);
@@ -263,12 +263,11 @@ Function* Processor::ResolveFunction(uint32_t address) {
       return nullptr;
     }

     if (!DemandFunction(function)) {
       entry->status = Entry::STATUS_FAILED;
       return nullptr;
     }
     // only add it to the list of resolved functions if resolving succeeded
     auto module_for = function->module();

     auto xexmod = dynamic_cast<XexModule*>(module_for);

@@ -291,23 +290,23 @@ Function* Processor::ResolveFunction(uint32_t address) {
     return nullptr;
   }
 }
+
+Module* Processor::LookupModule(uint32_t address) {
+  auto global_lock = global_critical_region_.Acquire();
+  // TODO(benvanik): sort by code address (if contiguous) so can bsearch.
+  // TODO(benvanik): cache last module low/high, as likely to be in there.
+  for (const auto& module : modules_) {
+    if (module->ContainsAddress(address)) {
+      return module.get();
+    }
+  }
+  return nullptr;
+}
 Function* Processor::LookupFunction(uint32_t address) {
   // TODO(benvanik): fast reject invalid addresses/log errors.

   // Find the module that contains the address.
-  Module* code_module = nullptr;
-  {
-    auto global_lock = global_critical_region_.Acquire();
-    // TODO(benvanik): sort by code address (if contiguous) so can bsearch.
-    // TODO(benvanik): cache last module low/high, as likely to be in there.
-    for (const auto& module : modules_) {
-      if (module->ContainsAddress(address)) {
-        code_module = module.get();
-        break;
-      }
-    }
-  }
+  Module* code_module = LookupModule(address);
   if (!code_module) {
     // No module found that could contain the address.
     return nullptr;
@@ -115,6 +115,7 @@ class Processor {
   void RemoveFunctionByAddress(uint32_t address);

   Function* LookupFunction(uint32_t address);
+  Module* LookupModule(uint32_t address);
   Function* LookupFunction(Module* module, uint32_t address);
   Function* ResolveFunction(uint32_t address);
@@ -78,7 +78,7 @@ ThreadState::ThreadState(Processor* processor, uint32_t thread_id,
   // Allocate with 64b alignment.

   context_ = reinterpret_cast<ppc::PPCContext*>(
       AllocateContext());
   processor->backend()->InitializeBackendContext(context_);
   assert_true(((uint64_t)context_ & 0x3F) == 0);
   std::memset(context_, 0, sizeof(ppc::PPCContext));

@@ -105,9 +105,9 @@ ThreadState::~ThreadState() {
     thread_state_ = nullptr;
   }
   if (context_) {
+    processor_->backend()->DeinitializeBackendContext(context_);
     FreeContext(reinterpret_cast<void*>(context_));
   }
-  // memory::AlignedFree(context_);
 }

 void ThreadState::Bind(ThreadState* thread_state) {
@@ -38,9 +38,10 @@ DEFINE_bool(disable_instruction_infocache, false,
             "CPU");

 DEFINE_bool(
-    disable_early_precompilation, false,
-    "Disables pre-compiling guest functions that we know we've called/that "
-    "we've recognized as being functions via simple heuristics.",
+    enable_early_precompilation, false,
+    "Enable pre-compiling guest functions that we know we've called/that "
+    "we've recognized as being functions via simple heuristics, good for error "
+    "finding/stress testing with the JIT",
     "CPU");

 static const uint8_t xe_xex2_retail_key[16] = {
@@ -1115,6 +1116,7 @@ void XexModule::Precompile() {
   if (!FindSaveRest()) {
     return;
   }

   info_cache_.Init(this);
   PrecompileDiscoveredFunctions();
 }
|
||||||
num_codebytes += 3; // round up to nearest multiple of 4
|
num_codebytes += 3; // round up to nearest multiple of 4
|
||||||
num_codebytes &= ~3;
|
num_codebytes &= ~3;
|
||||||
|
|
||||||
bool did_exist = true;
|
auto try_open = [this, &infocache_path, num_codebytes]() {
|
||||||
if (!std::filesystem::exists(infocache_path)) {
|
bool did_exist = true;
|
||||||
recreate:
|
|
||||||
xe::filesystem::CreateEmptyFile(infocache_path);
|
|
||||||
did_exist = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// todo: prepopulate with stuff from pdata, dll exports
|
if (!std::filesystem::exists(infocache_path)) {
|
||||||
|
xe::filesystem::CreateEmptyFile(infocache_path);
|
||||||
|
did_exist = false;
|
||||||
|
}
|
||||||
|
|
||||||
this->executable_addr_flags_ = std::move(xe::MappedMemory::Open(
|
// todo: prepopulate with stuff from pdata, dll exports
|
||||||
infocache_path, xe::MappedMemory::Mode::kReadWrite, 0,
|
|
||||||
sizeof(InfoCacheFlagsHeader) +
|
|
||||||
(sizeof(InfoCacheFlags) *
|
|
||||||
(num_codebytes /
|
|
||||||
4)))); // one infocacheflags entry for each PPC instr-sized addr
|
|
||||||
|
|
||||||
|
this->executable_addr_flags_ = std::move(xe::MappedMemory::Open(
|
||||||
|
infocache_path, xe::MappedMemory::Mode::kReadWrite, 0,
|
||||||
|
sizeof(InfoCacheFlagsHeader) +
|
||||||
|
(sizeof(InfoCacheFlags) *
|
||||||
|
(num_codebytes /
|
||||||
|
4)))); // one infocacheflags entry for each PPC instr-sized addr
|
||||||
|
return did_exist;
|
||||||
|
};
|
||||||
|
|
||||||
|
bool did_exist = try_open();
|
||||||
if (!did_exist) {
|
if (!did_exist) {
|
||||||
GetHeader()->version = CURRENT_INFOCACHE_VERSION;
|
GetHeader()->version = CURRENT_INFOCACHE_VERSION;
|
||||||
|
|
||||||
|
@ -1366,7 +1372,7 @@ void XexInfoCache::Init(XexModule* xexmod) {
|
||||||
if (GetHeader()->version != CURRENT_INFOCACHE_VERSION) {
|
if (GetHeader()->version != CURRENT_INFOCACHE_VERSION) {
|
||||||
this->executable_addr_flags_->Close();
|
this->executable_addr_flags_->Close();
|
||||||
std::filesystem::remove(infocache_path);
|
std::filesystem::remove(infocache_path);
|
||||||
goto recreate;
|
try_open();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1380,7 +1386,7 @@ InfoCacheFlags* XexModule::GetInstructionAddressFlags(uint32_t guest_addr) {
|
||||||
return info_cache_.LookupFlags(guest_addr);
|
return info_cache_.LookupFlags(guest_addr);
|
||||||
}
|
}
|
||||||
void XexModule::PrecompileDiscoveredFunctions() {
|
void XexModule::PrecompileDiscoveredFunctions() {
|
||||||
if (cvars::disable_early_precompilation) {
|
if (!cvars::enable_early_precompilation) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
auto others = PreanalyzeCode();
|
auto others = PreanalyzeCode();
|
||||||
|
@ -1397,7 +1403,7 @@ void XexModule::PrecompileDiscoveredFunctions() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
void XexModule::PrecompileKnownFunctions() {
|
void XexModule::PrecompileKnownFunctions() {
|
||||||
if (cvars::disable_early_precompilation) {
|
if (!cvars::enable_early_precompilation) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
uint32_t start = 0;
|
uint32_t start = 0;
|
||||||
|
@ -1435,18 +1441,14 @@ static bool IsOpcodeBL(unsigned w) {
|
||||||
|
|
||||||
std::vector<uint32_t> XexModule::PreanalyzeCode() {
|
std::vector<uint32_t> XexModule::PreanalyzeCode() {
|
||||||
uint32_t low_8_aligned = xe::align<uint32_t>(low_address_, 8);
|
uint32_t low_8_aligned = xe::align<uint32_t>(low_address_, 8);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
uint32_t highest_exec_addr = 0;
|
uint32_t highest_exec_addr = 0;
|
||||||
|
|
||||||
for (auto&& sec : pe_sections_) {
|
for (auto&& sec : pe_sections_) {
|
||||||
if ((sec.flags & kXEPESectionContainsCode)) {
|
if ((sec.flags & kXEPESectionContainsCode)) {
|
||||||
|
highest_exec_addr =
|
||||||
|
|
||||||
highest_exec_addr =
|
|
||||||
std::max<uint32_t>(highest_exec_addr, sec.address + sec.size);
|
std::max<uint32_t>(highest_exec_addr, sec.address + sec.size);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
uint32_t high_8_aligned = highest_exec_addr & ~(8U - 1);
|
uint32_t high_8_aligned = highest_exec_addr & ~(8U - 1);
|
||||||
uint32_t n_possible_8byte_addresses = (high_8_aligned - low_8_aligned) / 8;
|
uint32_t n_possible_8byte_addresses = (high_8_aligned - low_8_aligned) / 8;
|
||||||
|
@ -1476,7 +1478,7 @@ std::vector<uint32_t> XexModule::PreanalyzeCode() {
|
||||||
uint32_t mfspr_r12_lr32 =
|
uint32_t mfspr_r12_lr32 =
|
||||||
*reinterpret_cast<const uint32_t*>(&mfspr_r12_lr[0]);
|
*reinterpret_cast<const uint32_t*>(&mfspr_r12_lr[0]);
|
||||||
|
|
||||||
auto add_new_func = [funcstart_candidate_stack, &stack_pos](uint32_t addr) {
|
auto add_new_func = [funcstart_candidate_stack, &stack_pos](uint32_t addr) {
|
||||||
funcstart_candidate_stack[stack_pos++] = addr;
|
funcstart_candidate_stack[stack_pos++] = addr;
|
||||||
};
|
};
|
||||||
/*
|
/*
|
||||||
|
@ -1926,7 +1928,7 @@ bool XexModule::FindSaveRest() {
|
||||||
address += 2 * 4;
|
address += 2 * 4;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!cvars::disable_early_precompilation) {
|
if (cvars::enable_early_precompilation) {
|
||||||
for (auto&& to_ensure_precompiled : resolve_on_exit) {
|
for (auto&& to_ensure_precompiled : resolve_on_exit) {
|
||||||
// we want to make sure an address for these functions is available before
|
// we want to make sure an address for these functions is available before
|
||||||
// any other functions are compiled for code generation purposes but we do
|
// any other functions are compiled for code generation purposes but we do
|
||||||
|
|
|
@ -29,23 +29,27 @@ constexpr fourcc_t kXEX1Signature = make_fourcc("XEX1");
|
||||||
constexpr fourcc_t kXEX2Signature = make_fourcc("XEX2");
|
constexpr fourcc_t kXEX2Signature = make_fourcc("XEX2");
|
||||||
constexpr fourcc_t kElfSignature = make_fourcc(0x7F, 'E', 'L', 'F');
|
constexpr fourcc_t kElfSignature = make_fourcc(0x7F, 'E', 'L', 'F');
|
||||||
|
|
||||||
|
|
||||||
class Runtime;
|
class Runtime;
|
||||||
struct InfoCacheFlags {
|
struct InfoCacheFlags {
|
||||||
uint32_t was_resolved : 1; // has this address ever been called/requested
|
uint32_t was_resolved : 1; // has this address ever been called/requested
|
||||||
// via resolvefunction?
|
// via resolvefunction?
|
||||||
uint32_t accessed_mmio : 1;
|
uint32_t accessed_mmio : 1;
|
||||||
uint32_t is_syscall_func : 1;
|
uint32_t is_syscall_func : 1;
|
||||||
uint32_t reserved : 29;
|
uint32_t is_return_site : 1; // address can be reached from another function
|
||||||
|
// by returning
|
||||||
|
uint32_t reserved : 28;
|
||||||
};
|
};
|
||||||
|
static_assert(sizeof(InfoCacheFlags) == 4,
|
||||||
|
"InfoCacheFlags size should be equal to sizeof ppc instruction.");
|
||||||
|
|
||||||
struct XexInfoCache {
|
struct XexInfoCache {
|
||||||
//increment this to invalidate all user infocaches
|
// increment this to invalidate all user infocaches
|
||||||
static constexpr uint32_t CURRENT_INFOCACHE_VERSION = 1;
|
static constexpr uint32_t CURRENT_INFOCACHE_VERSION = 4;
|
||||||
|
|
||||||
struct InfoCacheFlagsHeader {
|
struct InfoCacheFlagsHeader {
|
||||||
uint32_t version;
|
uint32_t version;
|
||||||
|
|
||||||
unsigned char reserved[252];
|
unsigned char reserved[252];
|
||||||
|
|
||||||
InfoCacheFlags* LookupFlags(unsigned offset) {
|
InfoCacheFlags* LookupFlags(unsigned offset) {
|
||||||
return &reinterpret_cast<InfoCacheFlags*>(&this[1])[offset];
|
return &reinterpret_cast<InfoCacheFlags*>(&this[1])[offset];
|
||||||
|
@ -228,7 +232,8 @@ class XexModule : public xe::cpu::Module {
|
||||||
|
|
||||||
InfoCacheFlags* GetInstructionAddressFlags(uint32_t guest_addr);
|
InfoCacheFlags* GetInstructionAddressFlags(uint32_t guest_addr);
|
||||||
|
|
||||||
virtual void Precompile() override;
|
virtual void Precompile() override;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
std::unique_ptr<Function> CreateFunction(uint32_t address) override;
|
std::unique_ptr<Function> CreateFunction(uint32_t address) override;
|
||||||
|
|
||||||
|
|
|
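LookupFlags indexes past the 256-byte header into a flat array of InfoCacheFlags, one per instruction slot, which is what the new static_assert protects. The offset it takes is presumably derived from a guest address by subtracting the module's lowest code address and dividing by 4; that subtraction is not shown in this hunk, so the glue below is an assumption:

    // Hypothetical indexing glue; XexModule::GetInstructionAddressFlags is the
    // real entry point in the sources.
    InfoCacheFlags* LookupForGuestAddress(XexInfoCache& cache, uint32_t guest_addr,
                                          uint32_t low_code_address) {
      unsigned index = (guest_addr - low_code_address) / 4;  // one entry per instruction
      return cache.GetHeader()->LookupFlags(index);
    }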
@@ -1911,21 +1911,8 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase(
 void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
                                                        uint32_t base,
                                                        uint32_t num_registers) {
-  RingBuffer::ReadRange range =
-      ring->BeginRead(num_registers * sizeof(uint32_t));
-
-  XE_LIKELY_IF(!range.second) {
-    uint32_t num_regs_firstrange =
-        static_cast<uint32_t>(range.first_length / sizeof(uint32_t));
-
-    D3D12CommandProcessor::WriteRegistersFromMem(
-        base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
-        num_regs_firstrange);
-    ring->EndRead(range);
-  }
-  else {
-    return WriteRegisterRangeFromRing_WraparoundCase(ring, base, num_registers);
-  }
+  WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF>(ring, base,
+                                                       num_registers);
 }

 template <uint32_t register_lower_bound, uint32_t register_upper_bound>

@@ -2042,7 +2029,6 @@ D3D12CommandProcessor::WriteRegisterRangeFromRing_WithKnownBound(
   RingBuffer::ReadRange range =
       ring->BeginRead(num_registers * sizeof(uint32_t));

-
   XE_LIKELY_IF(!range.second) {
     WriteRegisterRangeFromMem_WithKnownBound<register_lower_bound,
                                              register_upper_bound>(
|
||||||
}
|
}
|
||||||
|
|
||||||
if (vfetch_current_queued) {
|
if (vfetch_current_queued) {
|
||||||
// so far, i have never seen vfetch_current_queued > 4. 1 is most common, 2 happens occasionally. did not test many games though
|
// so far, i have never seen vfetch_current_queued > 4. 1 is most common,
|
||||||
// pre-acquire the critical region so we're not repeatedly re-acquiring it
|
// 2 happens occasionally. did not test many games though pre-acquire the
|
||||||
// in requestrange
|
// critical region so we're not repeatedly re-acquiring it in requestrange
|
||||||
auto shared_memory_request_range_hoisted =
|
auto shared_memory_request_range_hoisted =
|
||||||
global_critical_region::Acquire();
|
global_critical_region::Acquire();
|
||||||
|
|
||||||
|
@ -4351,7 +4337,8 @@ bool D3D12CommandProcessor::UpdateBindings(
|
||||||
uint32_t float_constant_index;
|
uint32_t float_constant_index;
|
||||||
while (xe::bit_scan_forward(float_constant_map_entry,
|
while (xe::bit_scan_forward(float_constant_map_entry,
|
||||||
&float_constant_index)) {
|
&float_constant_index)) {
|
||||||
float_constant_map_entry = xe::clear_lowest_bit(float_constant_map_entry);
|
float_constant_map_entry =
|
||||||
|
xe::clear_lowest_bit(float_constant_map_entry);
|
||||||
std::memcpy(float_constants,
|
std::memcpy(float_constants,
|
||||||
®s[XE_GPU_REG_SHADER_CONSTANT_000_X + (i << 8) +
|
®s[XE_GPU_REG_SHADER_CONSTANT_000_X + (i << 8) +
|
||||||
(float_constant_index << 2)]
|
(float_constant_index << 2)]
|
||||||
|
@ -4382,7 +4369,8 @@ bool D3D12CommandProcessor::UpdateBindings(
|
||||||
uint32_t float_constant_index;
|
uint32_t float_constant_index;
|
||||||
while (xe::bit_scan_forward(float_constant_map_entry,
|
while (xe::bit_scan_forward(float_constant_map_entry,
|
||||||
&float_constant_index)) {
|
&float_constant_index)) {
|
||||||
float_constant_map_entry = xe::clear_lowest_bit(float_constant_map_entry);
|
float_constant_map_entry =
|
||||||
|
xe::clear_lowest_bit(float_constant_map_entry);
|
||||||
std::memcpy(float_constants,
|
std::memcpy(float_constants,
|
||||||
®s[XE_GPU_REG_SHADER_CONSTANT_256_X + (i << 8) +
|
®s[XE_GPU_REG_SHADER_CONSTANT_256_X + (i << 8) +
|
||||||
(float_constant_index << 2)]
|
(float_constant_index << 2)]
|
||||||
|
|
|
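Both constant-upload loops walk a dirty bitmask with xe::bit_scan_forward and xe::clear_lowest_bit, so only registers whose bits are set get copied. The same traversal in portable C++20, with std::countr_zero standing in for bit_scan_forward:

    #include <bit>
    #include <cstdint>

    template <typename Fn>
    void for_each_set_bit(uint64_t mask, Fn&& fn) {
      while (mask) {
        uint32_t index = static_cast<uint32_t>(std::countr_zero(mask));
        mask &= mask - 1;  // clear lowest set bit, like xe::clear_lowest_bit
        fn(index);         // e.g. copy float constant `index`
      }
    }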
@@ -41,10 +41,23 @@ DECLARE_XAM_EXPORT1(XamEnableInactivityProcessing, kInput, kStub);

 // https://msdn.microsoft.com/en-us/library/windows/desktop/microsoft.directx_sdk.reference.xinputgetcapabilities(v=vs.85).aspx
 dword_result_t XamInputGetCapabilities_entry(
-    dword_t user_index, dword_t flags, pointer_t<X_INPUT_CAPABILITIES> caps) {
+    dword_t user_index, dword_t _flags, pointer_t<X_INPUT_CAPABILITIES> caps) {
+  unsigned flags = _flags;
+  //chrispy: actually, it appears that caps is never checked for null, it is memset at the start regardless
   if (!caps) {
     return X_ERROR_BAD_ARGUMENTS;
   }
+  if ((flags & 0x40000000) != 0) {
+    // should trap
+  }
+  if ((flags & 4) != 0) {
+    // should trap
+  }
+  if (!flags) {
+    flags = 3;
+  }
+
   if ((flags & 0xFF) && (flags & XINPUT_FLAG_GAMEPAD) == 0) {
     // Ignore any query for other types of devices.

@@ -118,7 +131,7 @@ dword_result_t XamInputGetState_entry(dword_t user_index, dword_t flags,
 DECLARE_XAM_EXPORT2(XamInputGetState, kInput, kImplemented, kHighFrequency);

 // https://msdn.microsoft.com/en-us/library/windows/desktop/microsoft.directx_sdk.reference.xinputsetstate(v=vs.85).aspx
-dword_result_t XamInputSetState_entry(dword_t user_index, dword_t unk,
+dword_result_t XamInputSetState_entry(dword_t user_index, dword_t flags, /* flags, as far as i can see, is not used*/
                                       pointer_t<X_INPUT_VIBRATION> vibration) {
   if (user_index >= 4) {
     return X_E_DEVICE_NOT_CONNECTED;
@@ -508,7 +508,16 @@ dword_result_t RtlInitializeCriticalSectionAndSpinCount_entry(
 DECLARE_XBOXKRNL_EXPORT1(RtlInitializeCriticalSectionAndSpinCount, kNone,
                          kImplemented);

+static void CriticalSectionPrefetchW(const void* vp) {
+#if XE_ARCH_AMD64 == 1
+  if (amd64::GetFeatureFlags() & amd64::kX64EmitPrefetchW) {
+    swcache::PrefetchW(vp);
+  }
+#endif
+}
+
 void RtlEnterCriticalSection_entry(pointer_t<X_RTL_CRITICAL_SECTION> cs) {
+  CriticalSectionPrefetchW(&cs->lock_count);
   uint32_t cur_thread = XThread::GetCurrentThread()->guest_object();
   uint32_t spin_count = cs->header.absolute * 256;

@@ -544,6 +553,7 @@ DECLARE_XBOXKRNL_EXPORT2(RtlEnterCriticalSection, kNone, kImplemented,

 dword_result_t RtlTryEnterCriticalSection_entry(
     pointer_t<X_RTL_CRITICAL_SECTION> cs) {
+  CriticalSectionPrefetchW(&cs->lock_count);
   uint32_t thread = XThread::GetCurrentThread()->guest_object();

   if (xe::atomic_cas(-1, 0, &cs->lock_count)) {
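PrefetchW asks the CPU to pull the critical section's lock word into the cache in a writable state before the atomic compare-and-swap, saving the shared-to-owned upgrade the CAS would otherwise pay for; the feature-flag check mirrors the fact that PREFETCHW needs CPUID support. A minimal standalone sketch of the same idea using compiler intrinsics rather than Xenia's swcache wrapper:

    static inline void prefetch_for_write(const void* p) {
    #if defined(_MSC_VER)
      _m_prefetchw(p);           // PREFETCHW (requires <intrin.h>); gated on CPU support above
    #else
      __builtin_prefetch(p, 1);  // write-intent prefetch on GCC/Clang
    #endif
    }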
@@ -7,6 +7,7 @@
  ******************************************************************************
  */

+#include "xenia/kernel/xboxkrnl/xboxkrnl_threading.h"
 #include <algorithm>
 #include <vector>
 #include "xenia/base/atomic.h"

@@ -18,7 +19,6 @@
 #include "xenia/kernel/user_module.h"
 #include "xenia/kernel/util/shim_utils.h"
 #include "xenia/kernel/xboxkrnl/xboxkrnl_private.h"
-#include "xenia/kernel/xboxkrnl/xboxkrnl_threading.h"
 #include "xenia/kernel/xevent.h"
 #include "xenia/kernel/xmutant.h"
 #include "xenia/kernel/xsemaphore.h"
@@ -165,8 +165,16 @@ dword_result_t NtResumeThread_entry(dword_t handle,
   uint32_t suspend_count = 0;

   auto thread = kernel_state()->object_table()->LookupObject<XThread>(handle);

   if (thread) {
-    result = thread->Resume(&suspend_count);
+    if (thread->type() == XObject::Type::Thread) {
+      result = thread->Resume(&suspend_count);
+    } else {
+      return X_STATUS_OBJECT_TYPE_MISMATCH;
+    }
+  } else {
+    return X_STATUS_INVALID_HANDLE;
   }
   if (suspend_count_ptr) {
     *suspend_count_ptr = suspend_count;

@@ -190,15 +198,27 @@ dword_result_t KeResumeThread_entry(lpvoid_t thread_ptr) {
 DECLARE_XBOXKRNL_EXPORT1(KeResumeThread, kThreading, kImplemented);

 dword_result_t NtSuspendThread_entry(dword_t handle,
-                                     lpdword_t suspend_count_ptr) {
+                                     lpdword_t suspend_count_ptr,
+                                     const ppc_context_t& context) {
   X_RESULT result = X_STATUS_SUCCESS;
   uint32_t suspend_count = 0;

   auto thread = kernel_state()->object_table()->LookupObject<XThread>(handle);
   if (thread) {
-    result = thread->Suspend(&suspend_count);
+    if (thread->type() == XObject::Type::Thread) {
+      auto current_pcr = context->TranslateVirtualGPR<X_KPCR*>(context->r[13]);
+
+      if (current_pcr->current_thread == thread->guest_object() ||
+          !thread->guest_object<X_KTHREAD>()->terminated) {
+        result = thread->Suspend(&suspend_count);
+      } else {
+        return X_STATUS_THREAD_IS_TERMINATING;
+      }
+    } else {
+      return X_STATUS_OBJECT_TYPE_MISMATCH;
+    }
   } else {
-    result = X_STATUS_INVALID_HANDLE;
+    return X_STATUS_INVALID_HANDLE;
   }

   if (suspend_count_ptr) {
|
||||||
pointer_t<X_KTHREAD> thread,
|
pointer_t<X_KTHREAD> thread,
|
||||||
lpvoid_t stack_alloc_base,
|
lpvoid_t stack_alloc_base,
|
||||||
lpvoid_t stack_base,
|
lpvoid_t stack_base,
|
||||||
lpvoid_t stack_limit) {
|
lpvoid_t stack_limit, const ppc_context_t& context) {
|
||||||
auto current_thread = XThread::GetCurrentThread();
|
auto current_thread = XThread::GetCurrentThread();
|
||||||
auto context = current_thread->thread_state()->context();
|
|
||||||
auto pcr = kernel_memory()->TranslateVirtual<X_KPCR*>(
|
|
||||||
static_cast<uint32_t>(context->r[13]));
|
|
||||||
|
|
||||||
|
auto pcr = context->TranslateVirtualGPR<X_KPCR*>(context->r[13]);
|
||||||
|
|
||||||
thread->stack_alloc_base = stack_alloc_base.value();
|
thread->stack_alloc_base = stack_alloc_base.value();
|
||||||
thread->stack_base = stack_base.value();
|
thread->stack_base = stack_base.value();
|
||||||
thread->stack_limit = stack_limit.value();
|
thread->stack_limit = stack_limit.value();
|
||||||
pcr->stack_base_ptr = stack_base.guest_address();
|
pcr->stack_base_ptr = stack_base.guest_address();
|
||||||
pcr->stack_end_ptr = stack_limit.guest_address();
|
pcr->stack_end_ptr = stack_limit.guest_address();
|
||||||
context->r[1] = stack_ptr.guest_address();
|
context->r[1] = stack_ptr.guest_address();
|
||||||
|
|
||||||
// If a fiber is set, and the thread matches, reenter to avoid issues with
|
// If a fiber is set, and the thread matches, reenter to avoid issues with
|
||||||
// host stack overflowing.
|
// host stack overflowing.
|
||||||
if (thread->fiber_ptr &&
|
if (thread->fiber_ptr &&
|
||||||
current_thread->guest_object() == thread.guest_address()) {
|
current_thread->guest_object() == thread.guest_address()) {
|
||||||
|
context->processor->backend()->PrepareForReentry(context.value());
|
||||||
current_thread->Reenter(static_cast<uint32_t>(context->lr));
|
current_thread->Reenter(static_cast<uint32_t>(context->lr));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1018,7 +1038,8 @@ void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr,
|
||||||
assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13]));
|
assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13]));
|
||||||
|
|
||||||
PrefetchForCAS(lock);
|
PrefetchForCAS(lock);
|
||||||
while (!xe::atomic_cas(0, xe::byte_swap(static_cast<uint32_t>(ppc_ctx->r[13])), lock)) {
|
while (!xe::atomic_cas(
|
||||||
|
0, xe::byte_swap(static_cast<uint32_t>(ppc_ctx->r[13])), lock)) {
|
||||||
#if XE_ARCH_AMD64 == 1
|
#if XE_ARCH_AMD64 == 1
|
||||||
// todo: this is just a nop if they don't have SMT, which is not great
|
// todo: this is just a nop if they don't have SMT, which is not great
|
||||||
// either...
|
// either...
|
||||||
|
@ -1038,7 +1059,8 @@ dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(
|
||||||
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
|
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
|
||||||
assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13]));
|
assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13]));
|
||||||
PrefetchForCAS(lock);
|
PrefetchForCAS(lock);
|
||||||
if (!xe::atomic_cas(0, xe::byte_swap(static_cast<uint32_t>(ppc_ctx->r[13])), lock)) {
|
if (!xe::atomic_cas(0, xe::byte_swap(static_cast<uint32_t>(ppc_ctx->r[13])),
|
||||||
|
lock)) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
return 1;
|
return 1;
|
||||||
|
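The guest spinlock word holds 0 when free and the byte-swapped r13 of the owner (the guest KPCR address in guest byte order) when held, so acquisition is a CAS from 0 to that value, and the assert above catches re-acquisition by the same PCR. The same acquire/try-acquire shape in plain std::atomic terms, as a sketch rather than the kernel's exact memory-ordering choices:

    #include <atomic>
    #include <cstdint>

    bool try_acquire(std::atomic<uint32_t>* lock, uint32_t owner_r13_be) {
      uint32_t expected = 0;
      return lock->compare_exchange_strong(expected, owner_r13_be);
    }

    void acquire(std::atomic<uint32_t>* lock, uint32_t owner_r13_be) {
      uint32_t expected = 0;
      while (!lock->compare_exchange_weak(expected, owner_r13_be)) {
        expected = 0;  // reset and spin; a pause/yield hint would go here
      }
    }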
@@ -1281,7 +1303,8 @@ DECLARE_XBOXKRNL_EXPORT1(ExInitializeReadWriteLock, kThreading, kImplemented);

 void ExAcquireReadWriteLockExclusive_entry(pointer_t<X_ERWLOCK> lock_ptr,
                                            const ppc_context_t& ppc_context) {
-  auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
+  auto old_irql =
+      xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);

   int32_t lock_count = ++lock_ptr->lock_count;
   if (!lock_count) {

@@ -1318,7 +1341,8 @@ DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockExclusive, kThreading,

 void ExAcquireReadWriteLockShared_entry(pointer_t<X_ERWLOCK> lock_ptr,
                                         const ppc_context_t& ppc_context) {
-  auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
+  auto old_irql =
+      xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);

   int32_t lock_count = ++lock_ptr->lock_count;
   if (!lock_count ||
@@ -33,8 +33,15 @@ DEFINE_bool(ignore_thread_priorities, true,
 DEFINE_bool(ignore_thread_affinities, true,
             "Ignores game-specified thread affinities.", "Kernel");

+#if 0
+DEFINE_int64(stack_size_multiplier_hack, 1,
+             "A hack for games with setjmp/longjmp issues.", "Kernel");
+DEFINE_int64(main_xthread_stack_size_multiplier_hack, 1,
+             "A hack for games with setjmp/longjmp issues.", "Kernel");
+#endif
 namespace xe {
 namespace kernel {

 const uint32_t XAPC::kSize;
 const uint32_t XAPC::kDummyKernelRoutine;
|
||||||
RetainHandle();
|
RetainHandle();
|
||||||
|
|
||||||
xe::threading::Thread::CreationParameters params;
|
xe::threading::Thread::CreationParameters params;
|
||||||
params.stack_size = 16_MiB; // Allocate a big host stack.
|
|
||||||
|
|
||||||
|
|
||||||
params.create_suspended = true;
|
params.create_suspended = true;
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
uint64_t stack_size_mult = cvars::stack_size_multiplier_hack;
|
||||||
|
|
||||||
|
if (main_thread_) {
|
||||||
|
stack_size_mult =
|
||||||
|
static_cast<uint64_t>(cvars::main_xthread_stack_size_multiplier_hack);
|
||||||
|
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
uint64_t stack_size_mult = 1;
|
||||||
|
#endif
|
||||||
|
params.stack_size = 16_MiB * stack_size_mult; // Allocate a big host stack.
|
||||||
thread_ = xe::threading::Thread::Create(params, [this]() {
|
thread_ = xe::threading::Thread::Create(params, [this]() {
|
||||||
// Set thread ID override. This is used by logging.
|
// Set thread ID override. This is used by logging.
|
||||||
xe::threading::set_current_thread_id(handle());
|
xe::threading::set_current_thread_id(handle());
|
||||||
|
@ -433,6 +455,9 @@ X_STATUS XThread::Create() {
|
||||||
X_STATUS XThread::Exit(int exit_code) {
|
X_STATUS XThread::Exit(int exit_code) {
|
||||||
// This may only be called on the thread itself.
|
// This may only be called on the thread itself.
|
||||||
assert_true(XThread::GetCurrentThread() == this);
|
assert_true(XThread::GetCurrentThread() == this);
|
||||||
|
//TODO(chrispy): not sure if this order is correct, should it come after apcs?
|
||||||
|
guest_object<X_KTHREAD>()->terminated = 1;
|
||||||
|
|
||||||
|
|
||||||
// TODO(benvanik): dispatch events? waiters? etc?
|
// TODO(benvanik): dispatch events? waiters? etc?
|
||||||
RundownAPCs();
|
RundownAPCs();
|
||||||
|
|
|
@@ -121,7 +121,7 @@ struct X_KTHREAD {
   uint8_t unk_B4[0x8];    // 0xB4
   uint8_t suspend_count;  // 0xBC
   uint8_t unk_BD;         // 0xBD
-  uint8_t unk_BE;         // 0xBE
+  uint8_t terminated;     // 0xBE
   uint8_t current_cpu;    // 0xBF
   uint8_t unk_C0[0x10];   // 0xC0
   xe::be<uint32_t> stack_alloc_base;  // 0xD0
@@ -316,8 +316,8 @@ void Memory::Reset() {
   heaps_.v90000000.Reset();
   heaps_.physical.Reset();
 }
-//clang does not like non-standard layout offsetof
-#if XE_COMPILER_MSVC == 1 && XE_COMPILER_CLANG_CL==0
+// clang does not like non-standard layout offsetof
+#if XE_COMPILER_MSVC == 1 && XE_COMPILER_CLANG_CL == 0
 XE_NOALIAS
 const BaseHeap* Memory::LookupHeap(uint32_t address) const {
 #define HEAP_INDEX(name) \

@@ -359,7 +359,6 @@ const BaseHeap* Memory::LookupHeap(uint32_t address) const {
 #else
 XE_NOALIAS
 const BaseHeap* Memory::LookupHeap(uint32_t address) const {
-
   if (address < 0x40000000) {
     return &heaps_.v00000000;
   } else if (address < 0x7F000000) {
@@ -964,6 +963,14 @@ bool BaseHeap::AllocFixed(uint32_t base_address, uint32_t size,

   return true;
 }
+template <typename T>
+static inline T QuickMod(T value, uint32_t modv) {
+  if (xe::is_pow2(modv)) {
+    return value & (modv - 1);
+  } else {
+    return value % modv;
+  }
+}

 bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address,
                           uint32_t size, uint32_t alignment,

@@ -976,8 +983,9 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address,
   low_address = std::max(heap_base_, xe::align(low_address, alignment));
   high_address = std::min(heap_base_ + (heap_size_ - 1),
                           xe::align(high_address, alignment));
-  uint32_t low_page_number = (low_address - heap_base_) / page_size_;
-  uint32_t high_page_number = (high_address - heap_base_) / page_size_;
+  uint32_t low_page_number = (low_address - heap_base_) >> page_size_shift_;
+  uint32_t high_page_number = (high_address - heap_base_) >> page_size_shift_;
   low_page_number = std::min(uint32_t(page_table_.size()) - 1, low_page_number);
   high_page_number =
       std::min(uint32_t(page_table_.size()) - 1, high_page_number);

@@ -995,8 +1003,10 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address,
   // TODO(benvanik): optimized searching (free list buckets, bitmap, etc).
   uint32_t start_page_number = UINT_MAX;
   uint32_t end_page_number = UINT_MAX;
-  uint32_t page_scan_stride = alignment / page_size_;
-  high_page_number = high_page_number - (high_page_number % page_scan_stride);
+  // chrispy:todo, page_scan_stride is probably always a power of two...
+  uint32_t page_scan_stride = alignment >> page_size_shift_;
+  high_page_number =
+      high_page_number - QuickMod(high_page_number, page_scan_stride);
   if (top_down) {
     for (int64_t base_page_number =
              high_page_number - xe::round_up(page_count, page_scan_stride);

@@ -1024,7 +1034,7 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address,
           base_page_number = -1;
         } else {
           base_page_number = page_number - page_count;
-          base_page_number -= base_page_number % page_scan_stride;
+          base_page_number -= QuickMod(base_page_number, page_scan_stride);
           base_page_number += page_scan_stride;  // cancel out loop logic
         }
         break;

@@ -1072,7 +1082,7 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address,
   if (start_page_number == UINT_MAX || end_page_number == UINT_MAX) {
     // Out of memory.
     XELOGE("BaseHeap::Alloc failed to find contiguous range");
-    //assert_always("Heap exhausted!");
+    // assert_always("Heap exhausted!");
     return false;
   }

@@ -1084,15 +1094,15 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address,
             ? xe::memory::AllocationType::kCommit
             : xe::memory::AllocationType::kReserve;
     void* result = xe::memory::AllocFixed(
-        TranslateRelative(start_page_number * page_size_),
-        page_count * page_size_, alloc_type, ToPageAccess(protect));
+        TranslateRelative(start_page_number << page_size_shift_),
+        page_count << page_size_shift_, alloc_type, ToPageAccess(protect));
     if (!result) {
       XELOGE("BaseHeap::Alloc failed to alloc range from host");
       return false;
     }

     if (cvars::scribble_heap && (protect & kMemoryProtectWrite)) {
-      std::memset(result, 0xCD, page_count * page_size_);
+      std::memset(result, 0xCD, page_count << page_size_shift_);
     }
   }

@@ -1108,7 +1118,7 @@ bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address,
     unreserved_page_count_--;
   }

-  *out_address = heap_base_ + (start_page_number * page_size_);
+  *out_address = heap_base_ + (start_page_number << page_size_shift_);
   return true;
 }

@@ -1719,8 +1729,7 @@ XE_NOINLINE void PhysicalHeap::EnableAccessCallbacksInner(
   uint32_t first_guest_page = SystemPagenumToGuestPagenum(system_page_first);
   uint32_t last_guest_page = SystemPagenumToGuestPagenum(system_page_last);

-  uint32_t guest_one =
-      SystemPagenumToGuestPagenum(1);
+  uint32_t guest_one = SystemPagenumToGuestPagenum(1);

   uint32_t system_one = GuestPagenumToSystemPagenum(1);
   for (; i <= system_page_last; ++i) {

@@ -1755,7 +1764,6 @@ XE_NOINLINE void PhysicalHeap::EnableAccessCallbacksInner(
 #endif

     uint32_t guest_page_number = SystemPagenumToGuestPagenum(i);
-    //swcache::PrefetchL1(&page_table_ptr[guest_page_number + 8]);
     xe::memory::PageAccess current_page_access =
         ToPageAccess(page_table_ptr[guest_page_number].current_protect);
     bool protect_system_page = false;
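The page math in the hunks above replaces divides and modulos by page_size_ and page_scan_stride with shifts and QuickMod, which only falls back to a real modulo when the divisor is not a power of two. A small self-checking example of the equivalences being relied on (the 4 KiB page size is illustrative):

    #include <cassert>
    #include <cstdint>

    static bool is_pow2(uint32_t v) { return v && !(v & (v - 1)); }

    static uint32_t quick_mod(uint32_t value, uint32_t modv) {
      return is_pow2(modv) ? (value & (modv - 1)) : (value % modv);
    }

    int main() {
      constexpr uint32_t page_size_shift = 12;       // 4 KiB pages (example)
      assert((0x12345u >> page_size_shift) == 0x12345u / 4096u);
      assert(quick_mod(4097u, 4096u) == 4097u % 4096u);
      assert(quick_mod(100u, 24u) == 100u % 24u);    // non-power-of-two path
      return 0;
    }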
@@ -19,11 +19,96 @@

 DEFINE_bool(enable_console, false, "Open a console window with the main window",
             "General");
+#if XE_ARCH_AMD64 == 1
+DEFINE_bool(enable_rdrand_ntdll_patch, true,
+            "Hot-patches ntdll at the start of the process to not use rdrand "
+            "as part of the RNG for heap randomization. Can reduce CPU usage "
+            "significantly, but is untested on all Windows versions.",
+            "Win32");
+// begin ntdll hack
+#include <psapi.h>
+static bool g_didfailtowrite = false;
+static void write_process_memory(HANDLE process, uintptr_t offset,
+                                 unsigned size, const unsigned char* bvals) {
+  if (!WriteProcessMemory(process, (void*)offset, bvals, size, nullptr)) {
+    if (!g_didfailtowrite) {
+      MessageBoxA(nullptr, "Failed to write to process!", "Failed", MB_OK);
+      g_didfailtowrite = true;
+    }
+  }
+}
+
+static const unsigned char pattern_cmp_processorfeature_28_[] = {
+    0x80, 0x3C, 0x25, 0x90,
+    0x02, 0xFE, 0x7F, 0x00};  // cmp byte ptr ds:7FFE0290h, 0
+static const unsigned char pattern_replacement[] = {
+    0x48, 0x39, 0xe4,             // cmp rsp, rsp = always Z
+    0x0F, 0x1F, 0x44, 0x00, 0x00  // 5byte nop
+};
+static void patch_ntdll_instance(HANDLE process, uintptr_t ntdll_base) {
+  MODULEINFO modinfo;
+
+  GetModuleInformation(process, (HMODULE)ntdll_base, &modinfo,
+                       sizeof(MODULEINFO));
+
+  std::vector<uintptr_t> possible_places{};
+
+  unsigned char* strt = (unsigned char*)modinfo.lpBaseOfDll;
+
+  for (unsigned i = 0; i < modinfo.SizeOfImage; ++i) {
+    for (unsigned j = 0; j < sizeof(pattern_cmp_processorfeature_28_); ++j) {
+      if (strt[i + j] != pattern_cmp_processorfeature_28_[j]) {
+        goto miss;
+      }
+    }
+    possible_places.push_back((uintptr_t)(&strt[i]));
+  miss:;
+  }
+
+  for (auto&& place : possible_places) {
+    write_process_memory(process, place, sizeof(pattern_replacement),
+                         pattern_replacement);
+  }
+}
+
+static void do_ntdll_hack_this_process() {
+  patch_ntdll_instance(GetCurrentProcess(),
+                       (uintptr_t)GetModuleHandleA("ntdll.dll"));
+}
+#endif
+// end ntdll hack
+LONG _UnhandledExceptionFilter(_EXCEPTION_POINTERS* ExceptionInfo) {
+  PVOID exception_addr = ExceptionInfo->ExceptionRecord->ExceptionAddress;
+  DWORD64 last_stackpointer = ExceptionInfo->ContextRecord->Rsp;
+  DWORD64 last_rip = ExceptionInfo->ContextRecord->Rip;
+  DWORD except_code = ExceptionInfo->ExceptionRecord->ExceptionCode;
+  DWORD last_error = GetLastError();
+  NTSTATUS stat = __readgsdword(0x1250);
+  int last_errno_value = errno;
+
+  char except_message_buf[1024];
+  sprintf_s(except_message_buf,
+            "Exception encountered!\nException address: %p\nStackpointer: "
+            "%p\nInstruction pointer: %p\nExceptionCode: 0x%X\nLast Win32 "
+            "Error: 0x%X\nLast NTSTATUS: 0x%X\nLast errno value: 0x%X\n",
+            exception_addr, (void*)last_stackpointer, (void*)last_rip, except_code,
+            last_error, stat, last_errno_value);
+  MessageBoxA(nullptr, except_message_buf, "Unhandled Exception", MB_ICONERROR);
+  return EXCEPTION_CONTINUE_SEARCH;
+}
 int WINAPI wWinMain(HINSTANCE hinstance, HINSTANCE hinstance_prev,
                     LPWSTR command_line, int show_cmd) {
   int result;
+  SetUnhandledExceptionFilter(_UnhandledExceptionFilter);
   {
     xe::ui::Win32WindowedAppContext app_context(hinstance, show_cmd);
     // TODO(Triang3l): Initialize creates a window. Set DPI awareness via the
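The matched bytes decode to `cmp byte ptr [7FFE0290h], 0`. 0x7FFE0000 is the fixed user-mode mapping of KUSER_SHARED_DATA, whose ProcessorFeatures array begins at offset 0x274, so +0x290 is index 28, i.e. PF_RDRAND_INSTRUCTION_AVAILABLE; replacing the test with `cmp rsp, rsp` forces the zero flag, so the patched check behaves as if the feature byte were 0 and ntdll's heap-randomization path presumably skips rdrand. A compile-time check of that address arithmetic (the constants are standard Windows values, not defined anywhere in this diff):

    static_assert(0x7FFE0000u + 0x274u + 28u == 0x7FFE0290u,
                  "KUSER_SHARED_DATA.ProcessorFeatures[PF_RDRAND_INSTRUCTION_AVAILABLE]");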
@@ -40,13 +125,6 @@ int WINAPI wWinMain(HINSTANCE hinstance, HINSTANCE hinstance_prev,
       return EXIT_FAILURE;
     }

-    // TODO(Triang3l): Rework this, need to initialize the console properly,
-    // disable has_console_attached_ by default in windowed apps, and attach
-    // only if needed.
-    if (cvars::enable_console) {
-      xe::AttachConsole();
-    }
-
     // Initialize COM on the UI thread with the apartment-threaded concurrency
     // model, so dialogs can be used.
     if (FAILED(CoInitializeEx(nullptr, COINIT_APARTMENTTHREADED))) {

@@ -55,8 +133,22 @@ int WINAPI wWinMain(HINSTANCE hinstance, HINSTANCE hinstance_prev,

     xe::InitializeWin32App(app->GetName());

-    result =
-        app->OnInitialize() ? app_context.RunMainMessageLoop() : EXIT_FAILURE;
+    if (app->OnInitialize()) {
+#if XE_ARCH_AMD64 == 1
+      if (cvars::enable_rdrand_ntdll_patch) {
+        do_ntdll_hack_this_process();
+      }
+#endif
+      // TODO(Triang3l): Rework this, need to initialize the console properly,
+      // disable has_console_attached_ by default in windowed apps, and attach
+      // only if needed.
+      if (cvars::enable_console) {
+        xe::AttachConsole();
+      }
+      result = app_context.RunMainMessageLoop();
+    } else {
+      result = EXIT_FAILURE;
+    }

     app->InvokeOnDestroy();
   }
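In the exception filter above, `__readgsdword(0x1250)` reads the 64-bit TEB through gs; offset 0x1250 is conventionally LastStatusValue, the NTSTATUS counterpart of the Win32 last error, which is why the message box can report both. An equivalent, slightly more self-describing form (the offset is the same assumption the filter itself makes):

    // Effectively what ntdll's undocumented RtlGetLastNtStatus() returns.
    ULONG last_ntstatus = __readgsdword(0x1250);  // TEB64.LastStatusValue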
@@ -61,6 +61,7 @@ typedef uint32_t X_STATUS;
 #define X_STATUS_OBJECT_NAME_COLLISION ((X_STATUS)0xC0000035L)
 #define X_STATUS_INVALID_PAGE_PROTECTION ((X_STATUS)0xC0000045L)
 #define X_STATUS_MUTANT_NOT_OWNED ((X_STATUS)0xC0000046L)
+#define X_STATUS_THREAD_IS_TERMINATING ((X_STATUS)0xC000004BL)
 #define X_STATUS_PROCEDURE_NOT_FOUND ((X_STATUS)0xC000007AL)
 #define X_STATUS_INSUFFICIENT_RESOURCES ((X_STATUS)0xC000009AL)
 #define X_STATUS_MEMORY_NOT_ALLOCATED ((X_STATUS)0xC00000A0L)