Merge pull request #75 from chrisps/canary_experimental

misc stuff again
chrisps 2022-09-17 06:43:50 -07:00 committed by GitHub
commit a29a7436e0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
35 changed files with 942 additions and 738 deletions

View File

@ -20,6 +20,8 @@ namespace apu {
namespace conversion {
#if XE_ARCH_AMD64
#if 0
inline void sequential_6_BE_to_interleaved_6_LE(float* output,
const float* input,
size_t ch_sample_count) {
@ -41,7 +43,44 @@ inline void sequential_6_BE_to_interleaved_6_LE(float* output,
out[sample * 6 + 5] = sample2;
}
}
#else
XE_NOINLINE
static void _generic_sequential_6_BE_to_interleaved_6_LE(
float* XE_RESTRICT output, const float* XE_RESTRICT input,
unsigned ch_sample_count) {
for (unsigned sample = 0; sample < ch_sample_count; sample++) {
for (unsigned channel = 0; channel < 6; channel++) {
unsigned int value = *reinterpret_cast<const unsigned int*>(
&input[channel * ch_sample_count + sample]);
*reinterpret_cast<unsigned int*>(&output[sample * 6 + channel]) =
xe::byte_swap(value);
}
}
}
XE_NOINLINE
static void _movbe_sequential_6_BE_to_interleaved_6_LE(
float* XE_RESTRICT output, const float* XE_RESTRICT input,
unsigned ch_sample_count) {
for (unsigned sample = 0; sample < ch_sample_count; sample++) {
for (unsigned channel = 0; channel < 6; channel++) {
*reinterpret_cast<unsigned int*>(&output[sample * 6 + channel]) =
_load_be_u32(reinterpret_cast<const unsigned int*>(
&input[channel * ch_sample_count + sample]));
}
}
}
inline static void sequential_6_BE_to_interleaved_6_LE(
float* output, const float* input, unsigned ch_sample_count) {
if (amd64::GetFeatureFlags() & amd64::kX64EmitMovbe) {
_movbe_sequential_6_BE_to_interleaved_6_LE(output, input, ch_sample_count);
} else {
_generic_sequential_6_BE_to_interleaved_6_LE(output, input,
ch_sample_count);
}
}
#endif
inline void sequential_6_BE_to_interleaved_2_LE(float* output,
const float* input,
size_t ch_sample_count) {
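
The dispatch added above picks a MOVBE-backed loop when the cached CPU feature flags report support and otherwise falls back to a plain byte-swapping loop. As a minimal standalone sketch of the generic transform (planar big-endian 5.1 samples to interleaved little-endian), using memcpy type-punning and GCC/Clang's __builtin_bswap32 in place of xe::byte_swap / _load_be_u32:

#include <cstddef>
#include <cstdint>
#include <cstring>

// Planar BE input: 6 channel planes of ch_sample_count floats each.
// Interleaved LE output: sample-major, 6 floats per sample.
void sequential_6_be_to_interleaved_6_le(float* out, const float* in,
                                         size_t ch_sample_count) {
  for (size_t sample = 0; sample < ch_sample_count; ++sample) {
    for (size_t channel = 0; channel < 6; ++channel) {
      uint32_t bits;
      std::memcpy(&bits, &in[channel * ch_sample_count + sample], sizeof(bits));
      bits = __builtin_bswap32(bits);  // MSVC equivalent: _byteswap_ulong
      std::memcpy(&out[sample * 6 + channel], &bits, sizeof(bits));
    }
  }
}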

View File

@ -335,7 +335,8 @@ ICommandVar* define_cmdvar(const char* name, T* default_value,
#define DEFINE_uint64(name, default_value, description, category) \
DEFINE_CVar(name, default_value, description, category, false, uint64_t)
#define DEFINE_int64(name, default_value, description, category) \
DEFINE_CVar(name, default_value, description, category, false, int64_t)
#define DEFINE_double(name, default_value, description, category) \
DEFINE_CVar(name, default_value, description, category, false, double)
@ -383,7 +384,7 @@ ICommandVar* define_cmdvar(const char* name, T* default_value,
#define DECLARE_uint32(name) DECLARE_CVar(name, uint32_t)
#define DECLARE_uint64(name) DECLARE_CVar(name, uint64_t)
#define DECLARE_int64(name) DECLARE_CVar(name, int64_t)
#define DECLARE_double(name) DECLARE_CVar(name, double)
#define DECLARE_string(name) DECLARE_CVar(name, std::string)

View File

@ -26,7 +26,7 @@ check this and release the mutex. One way to do this is by using FlsAlloc and
PFLS_CALLBACK_FUNCTION, which gets called with the fiber-local data when a
thread exits.
*/
thread_local unsigned global_mutex_depth = 0;
static CRITICAL_SECTION* global_critical_section(xe_global_mutex* mutex) {
return reinterpret_cast<CRITICAL_SECTION*>(mutex);
}
@ -38,29 +38,16 @@ xe_global_mutex::xe_global_mutex() {
xe_global_mutex ::~xe_global_mutex() {
DeleteCriticalSection(global_critical_section(this));
}
void xe_global_mutex::lock() {
if (global_mutex_depth) {
} else {
EnterCriticalSection(global_critical_section(this));
}
global_mutex_depth++;
EnterCriticalSection(global_critical_section(this));
}
void xe_global_mutex::unlock() {
if (--global_mutex_depth == 0) {
LeaveCriticalSection(global_critical_section(this));
}
LeaveCriticalSection(global_critical_section(this));
}
bool xe_global_mutex::try_lock() {
if (global_mutex_depth) {
++global_mutex_depth;
return true;
} else {
BOOL success = TryEnterCriticalSection(global_critical_section(this));
if (success) {
++global_mutex_depth;
}
return success;
}
BOOL success = TryEnterCriticalSection(global_critical_section(this));
return success;
}
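
The per-thread depth counter is dropped above because a Win32 CRITICAL_SECTION is already recursive for its owning thread, so nested lock()/unlock() pairs balance on their own. A tiny standalone illustration of that property:

#include <windows.h>

int main() {
  CRITICAL_SECTION cs;
  InitializeCriticalSection(&cs);
  EnterCriticalSection(&cs);   // recursion depth 1
  EnterCriticalSection(&cs);   // depth 2, same owning thread: no deadlock
  LeaveCriticalSection(&cs);   // back to depth 1
  LeaveCriticalSection(&cs);   // fully released
  DeleteCriticalSection(&cs);
  return 0;
}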
CRITICAL_SECTION* fast_crit(xe_fast_mutex* mutex) {

View File

@ -116,15 +116,15 @@
#define XE_LIKELY(...) (!!(__VA_ARGS__))
#define XE_UNLIKELY(...) (!!(__VA_ARGS__))
#define XE_MSVC_ASSUME(...) __assume(__VA_ARGS__)
#define XE_NOALIAS __declspec(noalias)
#define XE_NOALIAS __declspec(noalias)
#elif XE_COMPILER_HAS_GNU_EXTENSIONS == 1
#define XE_FORCEINLINE __attribute__((always_inline))
#define XE_NOINLINE __attribute__((noinline))
#define XE_COLD __attribute__((cold))
#define XE_LIKELY(...) __builtin_expect(!!(__VA_ARGS__), true)
#define XE_UNLIKELY(...) __builtin_expect(!!(__VA_ARGS__), false)
#define XE_NOALIAS
//cant do unevaluated assume
#define XE_NOALIAS
// cant do unevaluated assume
#define XE_MSVC_ASSUME(...) static_cast<void>(0)
#else
#define XE_FORCEINLINE inline
@ -137,7 +137,13 @@
#define XE_MSVC_ASSUME(...) static_cast<void>(0)
#endif
#if XE_COMPILER_HAS_MSVC_EXTENSIONS == 1
#define XE_MSVC_OPTIMIZE_SMALL() __pragma(optimize("s", on))
#define XE_MSVC_OPTIMIZE_REVERT() __pragma(optimize("", on))
#else
#define XE_MSVC_OPTIMIZE_SMALL()
#define XE_MSVC_OPTIMIZE_REVERT()
#endif
#if XE_COMPILER_HAS_GNU_EXTENSIONS == 1
#define XE_LIKELY_IF(...) if (XE_LIKELY(__VA_ARGS__))
#define XE_UNLIKELY_IF(...) if (XE_UNLIKELY(__VA_ARGS__))
@ -180,7 +186,7 @@ const char kPathSeparator = '/';
const char kGuestPathSeparator = '\\';
} // namespace xe
#if XE_ARCH_AMD64==1
#if XE_ARCH_AMD64 == 1
#include "platform_amd64.h"
#endif
#endif // XENIA_BASE_PLATFORM_H_
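
The new XE_MSVC_OPTIMIZE_SMALL()/XE_MSVC_OPTIMIZE_REVERT() pair expands to MSVC's optimize pragma (and to nothing on other compilers); later in this commit it brackets cold regions such as GetResolveInfo. A hypothetical usage sketch, assuming platform.h is included:

// Ask MSVC to optimize the functions that follow for size, then restore the
// command-line optimization settings. ColdDiagnosticsDump is illustrative only.
XE_MSVC_OPTIMIZE_SMALL()

void ColdDiagnosticsDump() {
  // rarely executed; smaller code keeps it out of the hot instruction cache
}

XE_MSVC_OPTIMIZE_REVERT()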

View File

@ -7,13 +7,12 @@
******************************************************************************
*/
#include "xenia/base/cvar.h"
#include "xenia/base/platform.h"
#include "third_party/xbyak/xbyak/xbyak.h"
#include "third_party/xbyak/xbyak/xbyak_util.h"
DEFINE_int32(x64_extension_mask, -1,
DEFINE_int64(x64_extension_mask, -1LL,
"Allow the detection and utilization of specific instruction set "
"features.\n"
" 0 = x86_64 + AVX1\n"
@ -33,79 +32,92 @@ DEFINE_int32(x64_extension_mask, -1,
"x64");
namespace xe {
namespace amd64 {
static uint32_t g_feature_flags = 0U;
static uint64_t g_feature_flags = 0U;
static bool g_did_initialize_feature_flags = false;
uint32_t GetFeatureFlags() {
xenia_assert(g_did_initialize_feature_flags);
return g_feature_flags;
uint64_t GetFeatureFlags() {
xenia_assert(g_did_initialize_feature_flags);
return g_feature_flags;
}
XE_COLD
XE_NOINLINE
void InitFeatureFlags() {
uint32_t feature_flags_ = 0U;
Xbyak::util::Cpu cpu_;
uint64_t feature_flags_ = 0U;
{
Xbyak::util::Cpu cpu_;
#define TEST_EMIT_FEATURE(emit, ext) \
if ((cvars::x64_extension_mask & emit) == emit) { \
feature_flags_ |= (cpu_.has(ext) ? emit : 0); \
}
TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2);
TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA);
TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2);
TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA);
TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
#undef TEST_EMIT_FEATURE
/*
fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
latest version of xbyak
*/
unsigned int data[4];
Xbyak::util::Cpu::getCpuid(0x80000001, data);
unsigned amd_flags = data[2];
if (amd_flags & (1U << 5)) {
if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
feature_flags_ |= kX64EmitLZCNT;
}
}
// todo: although not reported by cpuid, zen 1 and zen+ also have fma4
if (amd_flags & (1U << 16)) {
if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
feature_flags_ |= kX64EmitFMA4;
}
}
if (amd_flags & (1U << 21)) {
if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
feature_flags_ |= kX64EmitTBM;
}
}
if (amd_flags & (1U << 11)) {
if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
feature_flags_ |= kX64EmitXOP;
}
}
if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
bool is_zennish = cpu_.displayFamily >= 0x17;
/*
chrispy: according to agner's tables, all amd architectures that
we support (ones with avx) have the same timings for
jrcxz/loop/loope/loopne as for other jmps
*/
feature_flags_ |= kX64FastJrcx;
feature_flags_ |= kX64FastLoop;
if (is_zennish) {
// ik that i heard somewhere that this is the case for zen, but i need to
// verify. cant find my original source for that.
// todo: ask agner?
feature_flags_ |= kX64FlagsIndependentVars;
fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
latest version of xbyak
*/
unsigned int data[4];
Xbyak::util::Cpu::getCpuid(0x80000001, data);
unsigned amd_flags = data[2];
if (amd_flags & (1U << 5)) {
if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
feature_flags_ |= kX64EmitLZCNT;
}
}
// todo: although not reported by cpuid, zen 1 and zen+ also have fma4
if (amd_flags & (1U << 16)) {
if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
feature_flags_ |= kX64EmitFMA4;
}
}
if (amd_flags & (1U << 21)) {
if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
feature_flags_ |= kX64EmitTBM;
}
}
if (amd_flags & (1U << 11)) {
if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
feature_flags_ |= kX64EmitXOP;
}
}
if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
bool is_zennish = cpu_.displayFamily >= 0x17;
/*
chrispy: according to agner's tables, all amd architectures
that we support (ones with avx) have the same timings for
jrcxz/loop/loope/loopne as for other jmps
*/
feature_flags_ |= kX64FastJrcx;
feature_flags_ |= kX64FastLoop;
if (is_zennish) {
// ik that i heard somewhere that this is the case for zen, but i need
// to verify. cant find my original source for that. todo: ask agner?
feature_flags_ |= kX64FlagsIndependentVars;
}
}
}
{
unsigned int data[4];
memset(data, 0, sizeof(data));
// intel extended features
Xbyak::util::Cpu::getCpuidEx(7, 0, data);
if ((data[2] & (1 << 28)) &&
(cvars::x64_extension_mask & kX64EmitMovdir64M)) {
feature_flags_ |= kX64EmitMovdir64M;
}
if ((data[1] & (1 << 9)) && (cvars::x64_extension_mask & kX64FastRepMovs)) {
feature_flags_ |= kX64FastRepMovs;
}
}
g_feature_flags = feature_flags_;
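
The xbyak workaround above reads CPUID leaf 0x80000001 directly; in ECX, bit 5 is ABM (LZCNT/POPCNT), bit 11 XOP, bit 16 FMA4, and bit 21 TBM. A standalone sketch of the same query using GCC/Clang's __get_cpuid:

#include <cpuid.h>
#include <cstdio>

int main() {
  unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
  if (__get_cpuid(0x80000001u, &eax, &ebx, &ecx, &edx)) {
    std::printf("ABM/LZCNT: %u\n", (ecx >> 5) & 1u);
    std::printf("XOP:       %u\n", (ecx >> 11) & 1u);
    std::printf("FMA4:      %u\n", (ecx >> 16) & 1u);
    std::printf("TBM:       %u\n", (ecx >> 21) & 1u);
  }
  return 0;
}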

View File

@ -13,7 +13,7 @@
namespace xe {
namespace amd64 {
enum X64FeatureFlags {
enum X64FeatureFlags : uint64_t {
kX64EmitAVX2 = 1 << 0,
kX64EmitFMA = 1 << 1,
kX64EmitLZCNT = 1 << 2, // this is actually ABM and includes popcount
@ -44,14 +44,13 @@ enum X64FeatureFlags {
// instructions, and FX users need the boost
kX64EmitFMA4 = 1 << 17, // todo: also use on zen1?
kX64EmitTBM = 1 << 18,
// kX64XMMRegisterMergeOptimization = 1 << 19, //section 2.11.5, amd family
// 17h/19h optimization manuals. allows us to save 1 byte on certain xmm
// instructions by using the legacy sse version if we recently cleared the
// high 128 bits of the
kX64EmitMovdir64M = 1 << 19,
kX64FastRepMovs = 1 << 20
};
XE_NOALIAS
uint32_t GetFeatureFlags();
uint64_t GetFeatureFlags();
XE_COLD
void InitFeatureFlags();

View File

@ -299,6 +299,12 @@ class Event : public WaitHandle {
// the nonsignaled state after releasing the appropriate number of waiting
// threads.
virtual void Pulse() = 0;
#if XE_PLATFORM_WIN32 == 1
// SetEvent, but if there is a waiter we immediately transfer execution to it
virtual void SetBoostPriority() = 0;
#else
void SetBoostPriority() { Set(); }
#endif
};
// Models a Win32-like semaphore object.
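
On Windows the new SetBoostPriority goes through NtSetEventBoostPriority (imported below in threading_win.cc); everywhere else it degenerates to Set(). A hedged standalone sketch of the same idea outside the emulator's import machinery, resolving the ntdll entry point at runtime and falling back to SetEvent:

#include <windows.h>

using NtSetEventBoostPriority_t = LONG(NTAPI*)(HANDLE);

// Sketch only: signal the event and, when the native call is available, let
// the kernel hand the remainder of the timeslice to a waiting thread.
void SetEventWithBoost(HANDLE event) {
  static auto nt_set_event_boost = reinterpret_cast<NtSetEventBoostPriority_t>(
      GetProcAddress(GetModuleHandleW(L"ntdll.dll"), "NtSetEventBoostPriority"));
  if (nt_set_event_boost) {
    nt_set_event_boost(event);
  } else {
    SetEvent(event);
  }
}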

View File

@ -39,6 +39,8 @@ XE_NTDLL_IMPORT(NtWaitForSingleObject, cls_NtWaitForSingleObject,
NtWaitForSingleObjectPointer);
XE_NTDLL_IMPORT(NtSetEvent, cls_NtSetEvent, NtSetEventPointer);
XE_NTDLL_IMPORT(NtSetEventBoostPriority, cls_NtSetEventBoostPriority,
NtSetEventBoostPriorityPointer);
// The difference between NtClearEvent and NtResetEvent is that NtResetEvent
// returns the event's state prior to the call, but we don't need that. Might
// need to check whether one or the other is faster in the kernel, though; just
@ -53,6 +55,7 @@ XE_NTDLL_IMPORT(NtReleaseSemaphore, cls_NtReleaseSemaphore,
XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
NtDelayExecutionPointer);
namespace xe {
namespace threading {
@ -137,7 +140,7 @@ void MaybeYield() {
#endif
#endif
// memorybarrier is really not necessary here...
MemoryBarrier();
// MemoryBarrier();
}
void SyncMemory() { MemoryBarrier(); }
@ -288,11 +291,19 @@ class Win32Event : public Win32Handle<Event> {
void Set() override { NtSetEventPointer.invoke(handle_, nullptr); }
void Reset() override { NtClearEventPointer.invoke(handle_); }
void Pulse() override { NtPulseEventPointer.invoke(handle_, nullptr); }
void SetBoostPriority() override {
// no previous state for boostpriority
NtSetEventBoostPriorityPointer.invoke(handle_);
}
#else
void Set() override { SetEvent(handle_); }
void Reset() override { ResetEvent(handle_); }
void Pulse() override { PulseEvent(handle_); }
void SetBoostPriority() override {
// no win32 version of boostpriority
SetEvent(handle_);
}
#endif
};

View File

@ -23,7 +23,7 @@
#define XE_X64_PROFILER_AVAILABLE 1
#endif
DECLARE_int32(x64_extension_mask);
DECLARE_int64(x64_extension_mask);
namespace xe {
class Exception;

View File

@ -103,74 +103,9 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
"FAQ for system requirements at https://xenia.jp");
return;
}
#if 1
feature_flags_ = amd64::GetFeatureFlags();
#else
#define TEST_EMIT_FEATURE(emit, ext) \
if ((cvars::x64_extension_mask & emit) == emit) { \
feature_flags_ |= (cpu_.has(ext) ? emit : 0); \
}
TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2);
TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA);
TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
#undef TEST_EMIT_FEATURE
/*
fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
latest version of xbyak
*/
unsigned int data[4];
Xbyak::util::Cpu::getCpuid(0x80000001, data);
unsigned amd_flags = data[2];
if (amd_flags & (1U << 5)) {
if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
feature_flags_ |= kX64EmitLZCNT;
}
}
// todo: although not reported by cpuid, zen 1 and zen+ also have fma4
if (amd_flags & (1U << 16)) {
if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
feature_flags_ |= kX64EmitFMA4;
}
}
if (amd_flags & (1U << 21)) {
if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
feature_flags_ |= kX64EmitTBM;
}
}
if (amd_flags & (1U << 11)) {
if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
feature_flags_ |= kX64EmitXOP;
XELOGCPU("Cpu support XOP!\n\n");
}
}
if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
bool is_zennish = cpu_.displayFamily >= 0x17;
/*
chrispy: according to agner's tables, all amd architectures that
we support (ones with avx) have the same timings for
jrcxz/loop/loope/loopne as for other jmps
*/
feature_flags_ |= kX64FastJrcx;
feature_flags_ |= kX64FastLoop;
if (is_zennish) {
// ik that i heard somewhere that this is the case for zen, but i need to
// verify. cant find my original source for that.
// todo: ask agner?
feature_flags_ |= kX64FlagsIndependentVars;
}
}
#endif
feature_flags_ = amd64::GetFeatureFlags();
may_use_membase32_as_zero_reg_ =
static_cast<uint32_t>(reinterpret_cast<uintptr_t>(
processor()->memory()->virtual_membase())) == 0;

View File

@ -299,7 +299,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
void* FindWordConstantOffset(unsigned wordvalue);
void* FindDwordConstantOffset(unsigned bytevalue);
void* FindQwordConstantOffset(uint64_t bytevalue);
bool IsFeatureEnabled(uint32_t feature_flag) const {
bool IsFeatureEnabled(uint64_t feature_flag) const {
return (feature_flags_ & feature_flag) == feature_flag;
}
@ -395,7 +395,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
XbyakAllocator* allocator_ = nullptr;
XexModule* guest_module_ = nullptr;
Xbyak::util::Cpu cpu_;
uint32_t feature_flags_ = 0;
uint64_t feature_flags_ = 0;
uint32_t current_guest_function_ = 0;
Xbyak::Label* epilog_label_ = nullptr;

View File

@ -39,7 +39,7 @@
#include "xenia/cpu/backend/x64/x64_stack_layout.h"
#include "xenia/cpu/hir/hir_builder.h"
#include "xenia/cpu/processor.h"
XE_MSVC_OPTIMIZE_SMALL()
DEFINE_bool(use_fast_dot_product, false,
"Experimental optimization, much shorter sequence on dot products, "
"treating inf as overflow instead of using mcxsr"

View File

@ -19,16 +19,19 @@ EntryTable::EntryTable() = default;
EntryTable::~EntryTable() {
auto global_lock = global_critical_region_.Acquire();
for (auto it : map_) {
Entry* entry = it.second;
for (auto it : map_.Values()) {
Entry* entry = it;
delete entry;
}
}
Entry* EntryTable::Get(uint32_t address) {
auto global_lock = global_critical_region_.Acquire();
const auto& it = map_.find(address);
Entry* entry = it != map_.end() ? it->second : nullptr;
uint32_t idx = map_.IndexForKey(address);
if (idx == map_.size() || *map_.KeyAt(idx) != address) {
return nullptr;
}
Entry* entry = *map_.ValueAt(idx);
if (entry) {
// TODO(benvanik): wait if needed?
if (entry->status != Entry::STATUS_READY) {
@ -43,8 +46,12 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) {
// https://github.com/facebook/folly/blob/master/folly/AtomicHashMap.h
auto global_lock = global_critical_region_.Acquire();
const auto& it = map_.find(address);
Entry* entry = it != map_.end() ? it->second : nullptr;
uint32_t idx = map_.IndexForKey(address);
Entry* entry = idx != map_.size() && *map_.KeyAt(idx) == address
? *map_.ValueAt(idx)
: nullptr;
Entry::Status status;
if (entry) {
// If we aren't ready yet spin and wait.
@ -65,7 +72,8 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) {
entry->end_address = 0;
entry->status = Entry::STATUS_COMPILING;
entry->function = 0;
map_[address] = entry;
map_.InsertAt(address, entry, idx);
// map_[address] = entry;
status = Entry::STATUS_NEW;
}
global_lock.unlock();
@ -75,18 +83,18 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) {
void EntryTable::Delete(uint32_t address) {
auto global_lock = global_critical_region_.Acquire();
const auto itr = map_.find(address);
if (itr != map_.cend()) {
map_.erase(itr);
// doesn't this leak memory by not deleting the entry?
uint32_t idx = map_.IndexForKey(address);
if (idx != map_.size() && *map_.KeyAt(idx) == address) {
map_.EraseAt(idx);
}
}
std::vector<Function*> EntryTable::FindWithAddress(uint32_t address) {
auto global_lock = global_critical_region_.Acquire();
std::vector<Function*> fns;
for (auto& it : map_) {
Entry* entry = it.second;
for (auto& it : map_.Values()) {
Entry* entry = it;
if (address >= entry->address && address <= entry->end_address) {
if (entry->status == Entry::STATUS_READY) {
fns.push_back(entry->function);
@ -95,6 +103,5 @@ std::vector<Function*> EntryTable::FindWithAddress(uint32_t address) {
}
return fns;
}
} // namespace cpu
} // namespace xe
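
xe::split_map itself isn't shown in this commit, but the lookup pattern above (IndexForKey returning a position that must be checked against KeyAt before use) is the classic sorted structure-of-arrays map. A rough illustration of that layout, not the actual xe::split_map implementation:

#include <algorithm>
#include <cstdint>
#include <vector>

// Keys and values live in parallel sorted vectors; lookup is a binary search
// over the dense key array, which is friendlier to the cache than chasing
// unordered_map buckets.
struct SplitMapSketch {
  std::vector<uint32_t> keys;   // kept sorted
  std::vector<void*> values;    // parallel to keys

  size_t IndexForKey(uint32_t key) const {
    return std::lower_bound(keys.begin(), keys.end(), key) - keys.begin();
  }
  void* Find(uint32_t key) const {
    size_t idx = IndexForKey(key);
    return (idx != keys.size() && keys[idx] == key) ? values[idx] : nullptr;
  }
  void InsertAt(uint32_t key, void* value, size_t idx) {
    keys.insert(keys.begin() + idx, key);
    values.insert(values.begin() + idx, value);
  }
};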

View File

@ -14,7 +14,7 @@
#include <vector>
#include "xenia/base/mutex.h"
#include "xenia/base/split_map.h"
namespace xe {
namespace cpu {
@ -48,7 +48,8 @@ class EntryTable {
private:
xe::global_critical_region global_critical_region_;
// TODO(benvanik): replace with a better data structure.
std::unordered_map<uint32_t, Entry*> map_;
xe::split_map<uint32_t, Entry*> map_;
//std::unordered_map<uint32_t, Entry*> map_;
};
} // namespace cpu

View File

@ -334,7 +334,7 @@ void CommandProcessor::EnableReadPointerWriteBack(uint32_t ptr,
void CommandProcessor::UpdateWritePointer(uint32_t value) {
write_ptr_index_ = value;
write_ptr_index_event_->Set();
write_ptr_index_event_->SetBoostPriority();
}
void CommandProcessor::HandleSpecialRegisterWrite(uint32_t index,
uint32_t value) {
@ -665,6 +665,11 @@ uint32_t CommandProcessor::ExecutePrimaryBuffer(uint32_t read_index,
reader_.set_read_offset(read_index * sizeof(uint32_t));
reader_.set_write_offset(write_index * sizeof(uint32_t));
// Prefetch the wraparound range. It is likely already in L3 cache, but on a
// Zen system it may be in another chiplet's L3.
reader_.BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
GetCurrentRingReadCount());
do {
if (!ExecutePacket()) {
// This probably should be fatal - but we're going to continue anyways.
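
BeginPrefetchedRead's body isn't part of this diff; conceptually, prefetching the ring's readable range ahead of the packet loop boils down to touching each cache line with a T1 hint, roughly as below (assuming 64-byte lines):

#include <xmmintrin.h>
#include <cstddef>

// Issue one prefetcht1 per 64-byte line so the range is pulled toward L2
// before the packet-decoding loop starts reading it.
void prefetch_range_l2(const void* data, size_t length) {
  const char* p = static_cast<const char*>(data);
  for (size_t offset = 0; offset < length; offset += 64) {
    _mm_prefetch(p + offset, _MM_HINT_T1);
  }
}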

File diff suppressed because it is too large.

View File

@ -45,7 +45,10 @@
namespace xe {
namespace gpu {
namespace d3d12 {
struct MemExportRange {
uint32_t base_address_dwords;
uint32_t size_dwords;
};
class D3D12CommandProcessor final : public CommandProcessor {
public:
#include "../pm4_command_processor_declare.h"
@ -287,8 +290,21 @@ class D3D12CommandProcessor final : public CommandProcessor {
bool IssueDraw(xenos::PrimitiveType primitive_type, uint32_t index_count,
IndexBufferInfo* index_buffer_info,
bool major_mode_explicit) override;
XE_COLD
XE_NOINLINE
bool HandleMemexportGuestDMA(ID3D12Resource*& scratch_index_buffer,
D3D12_INDEX_BUFFER_VIEW& index_buffer_view,
uint32_t guest_index_base,
bool& retflag);
XE_NOINLINE
XE_COLD
bool GatherMemexportRangesAndMakeResident(bool& retflag);
XE_NOINLINE
XE_COLD
void HandleMemexportDrawOrdering_AndReadback();
bool IssueCopy() override;
XE_NOINLINE
bool IssueCopy_ReadbackResolvePath();
void InitializeTrace() override;
private:
@ -363,6 +379,8 @@ class D3D12CommandProcessor final : public CommandProcessor {
};
// Gets the indices of optional root parameters. Returns the total parameter
// count.
XE_NOINLINE
XE_COLD
static uint32_t GetRootBindfulExtraParameterIndices(
const DxbcShader* vertex_shader, const DxbcShader* pixel_shader,
RootBindfulExtraParameterIndices& indices_out);
@ -437,6 +455,18 @@ class D3D12CommandProcessor final : public CommandProcessor {
bool UpdateBindings(const D3D12Shader* vertex_shader,
const D3D12Shader* pixel_shader,
ID3D12RootSignature* root_signature);
XE_COLD
XE_NOINLINE
void UpdateBindings_UpdateRootBindful();
XE_NOINLINE
XE_COLD
bool UpdateBindings_BindfulPath(
const size_t texture_layout_uid_vertex,
const std::vector<xe::gpu::DxbcShader::TextureBinding>& textures_vertex,
const size_t texture_layout_uid_pixel,
const std::vector<xe::gpu::DxbcShader::TextureBinding>* textures_pixel,
const size_t sampler_count_vertex, const size_t sampler_count_pixel,
bool& retflag);
// Returns dword count for one element for a memexport format, or 0 if it's
// not supported by the D3D12 command processor (if it's smaller that 1 dword,
@ -743,6 +773,9 @@ class D3D12CommandProcessor final : public CommandProcessor {
draw_util::GetViewportInfoArgs previous_viewport_info_args_;
draw_util::ViewportInfo previous_viewport_info_;
// scratch memexport data
MemExportRange memexport_ranges_[512];
uint32_t memexport_range_count_ = 0;
};
} // namespace d3d12

View File

@ -266,22 +266,9 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list,
void* DeferredCommandList::WriteCommand(Command command,
size_t arguments_size_bytes) {
size_t arguments_size_elements =
round_up(arguments_size_bytes, sizeof(uintmax_t), false);
//(arguments_size_bytes + sizeof(uintmax_t) - 1) / sizeof(uintmax_t);
#if 0
size_t offset = command_stream_.size();
command_stream_.resize(offset + kCommandHeaderSizeElements +
arguments_size_elements);
CommandHeader& header =
*reinterpret_cast<CommandHeader*>(command_stream_.data() + offset);
header.command = command;
header.arguments_size_elements = uint32_t(arguments_size_elements);
return command_stream_.data() + (offset + kCommandHeaderSizeElements);
#else
size_t offset = command_stream_.size();
constexpr size_t kCommandHeaderSizeBytes =
kCommandHeaderSizeElements * sizeof(uintmax_t);
@ -290,9 +277,9 @@ void* DeferredCommandList::WriteCommand(Command command,
CommandHeader& header =
*reinterpret_cast<CommandHeader*>(command_stream_.data() + offset);
header.command = command;
header.arguments_size_elements = uint32_t(arguments_size_elements) / sizeof(uintmax_t);
header.arguments_size_elements =
uint32_t(arguments_size_elements) / sizeof(uintmax_t);
return command_stream_.data() + (offset + kCommandHeaderSizeBytes);
#endif
}
} // namespace d3d12

View File

@ -183,7 +183,7 @@ void PipelineCache::Shutdown() {
// creating them.
if (!creation_threads_.empty()) {
{
std::lock_guard<std::mutex> lock(creation_request_lock_);
std::lock_guard<xe_mutex> lock(creation_request_lock_);
creation_threads_shutdown_from_ = 0;
}
creation_request_cond_.notify_all();
@ -681,7 +681,7 @@ void PipelineCache::InitializeShaderStorage(
if (!creation_threads_.empty()) {
// Submit the pipeline for creation to any available thread.
{
std::lock_guard<std::mutex> lock(creation_request_lock_);
std::lock_guard<xe_mutex> lock(creation_request_lock_);
creation_queue_.push_back(new_pipeline);
}
creation_request_cond_.notify_one();
@ -695,7 +695,7 @@ void PipelineCache::InitializeShaderStorage(
CreateQueuedPipelinesOnProcessorThread();
if (creation_threads_.size() > creation_thread_original_count) {
{
std::lock_guard<std::mutex> lock(creation_request_lock_);
std::lock_guard<xe_mutex> lock(creation_request_lock_);
creation_threads_shutdown_from_ = creation_thread_original_count;
// Assuming the queue is empty because of
// CreateQueuedPipelinesOnProcessorThread.
@ -708,7 +708,7 @@ void PipelineCache::InitializeShaderStorage(
bool await_creation_completion_event;
{
// Cleanup so additional threads can be created later again.
std::lock_guard<std::mutex> lock(creation_request_lock_);
std::lock_guard<xe_mutex> lock(creation_request_lock_);
creation_threads_shutdown_from_ = SIZE_MAX;
// If the invocation is blocking, all the shader storage
// initialization is expected to be done before proceeding, to avoid
@ -813,7 +813,7 @@ void PipelineCache::EndSubmission() {
// Await creation of all queued pipelines.
bool await_creation_completion_event;
{
std::lock_guard<std::mutex> lock(creation_request_lock_);
std::lock_guard<xe_mutex> lock(creation_request_lock_);
// Assuming the creation queue is already empty (because the processor
// thread also worked on creating the leftover pipelines), so only check
// if there are threads with pipelines currently being created.
@ -834,7 +834,7 @@ bool PipelineCache::IsCreatingPipelines() {
if (creation_threads_.empty()) {
return false;
}
std::lock_guard<std::mutex> lock(creation_request_lock_);
std::lock_guard<xe_mutex> lock(creation_request_lock_);
return !creation_queue_.empty() || creation_threads_busy_ != 0;
}
@ -1076,7 +1076,7 @@ bool PipelineCache::ConfigurePipeline(
if (!creation_threads_.empty()) {
// Submit the pipeline for creation to any available thread.
{
std::lock_guard<std::mutex> lock(creation_request_lock_);
std::lock_guard<xe_mutex> lock(creation_request_lock_);
creation_queue_.push_back(new_pipeline);
}
creation_request_cond_.notify_one();
@ -3314,7 +3314,7 @@ void PipelineCache::CreationThread(size_t thread_index) {
// Check if need to shut down or set the completion event and dequeue the
// pipeline if there is any.
{
std::unique_lock<std::mutex> lock(creation_request_lock_);
std::unique_lock<xe_mutex> lock(creation_request_lock_);
if (thread_index >= creation_threads_shutdown_from_ ||
creation_queue_.empty()) {
if (creation_completion_set_event_ && creation_threads_busy_ == 0) {
@ -3345,7 +3345,7 @@ void PipelineCache::CreationThread(size_t thread_index) {
// completion event if needed (at the next iteration, or in some other
// thread).
{
std::lock_guard<std::mutex> lock(creation_request_lock_);
std::lock_guard<xe_mutex> lock(creation_request_lock_);
--creation_threads_busy_;
}
}
@ -3356,7 +3356,7 @@ void PipelineCache::CreateQueuedPipelinesOnProcessorThread() {
while (true) {
Pipeline* pipeline_to_create;
{
std::lock_guard<std::mutex> lock(creation_request_lock_);
std::lock_guard<xe_mutex> lock(creation_request_lock_);
if (creation_queue_.empty()) {
break;
}

View File

@ -403,8 +403,8 @@ class PipelineCache {
// Pipeline creation threads.
void CreationThread(size_t thread_index);
void CreateQueuedPipelinesOnProcessorThread();
std::mutex creation_request_lock_;
std::condition_variable creation_request_cond_;
xe_mutex creation_request_lock_;
std::condition_variable_any creation_request_cond_;
// Protected with creation_request_lock_, notify_one creation_request_cond_
// when set.
std::deque<Pipeline*> creation_queue_;
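
The switch from std::condition_variable to std::condition_variable_any above is what allows pairing the queue with xe_mutex: std::condition_variable only accepts std::unique_lock<std::mutex>, while condition_variable_any works with any lockable type. A generic standalone sketch of the pattern (std::mutex standing in for xe_mutex):

#include <condition_variable>
#include <deque>
#include <mutex>

template <typename Mutex>
struct WorkQueue {
  Mutex lock;
  std::condition_variable_any cond;  // works with any BasicLockable
  std::deque<int> items;

  void Push(int item) {
    {
      std::lock_guard<Mutex> guard(lock);
      items.push_back(item);
    }
    cond.notify_one();
  }
  int Pop() {
    std::unique_lock<Mutex> guard(lock);
    cond.wait(guard, [&] { return !items.empty(); });
    int item = items.front();
    items.pop_front();
    return item;
  }
};

// Usage: WorkQueue<std::mutex> queue; queue.Push(1); int v = queue.Pop();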

View File

@ -650,7 +650,8 @@ uint32_t GetNormalizedColorMask(const RegisterFile& regs,
}
return normalized_color_mask;
}
XE_NOINLINE
XE_NOALIAS
xenos::CopySampleSelect SanitizeCopySampleSelect(
xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples,
bool is_depth) {
@ -737,7 +738,7 @@ const ResolveCopyShaderInfo
{"Resolve Copy Full 64bpp", true, 2, 4, 5, 3},
{"Resolve Copy Full 128bpp", true, 2, 4, 4, 3},
};
XE_MSVC_OPTIMIZE_SMALL()
bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
TraceWriter& trace_writer, uint32_t draw_resolution_scale_x,
uint32_t draw_resolution_scale_y,
@ -869,7 +870,8 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
y1 = y0 + int32_t(xenos::kMaxResolveSize);
}
// fails in forza horizon 1
assert_true(x0 < x1 && y0 < y1);
// x0 is 0, x1 is 0x100, y0 is 0x100, y1 is 0x100
assert_true(x0 <= x1 && y0 <= y1);
if (x0 >= x1 || y0 >= y1) {
XELOGE("Resolve region is empty");
return false;
@ -1108,7 +1110,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
info_out.rb_depth_clear = regs[XE_GPU_REG_RB_DEPTH_CLEAR].u32;
info_out.rb_color_clear = regs[XE_GPU_REG_RB_COLOR_CLEAR].u32;
info_out.rb_color_clear_lo = regs[XE_GPU_REG_RB_COLOR_CLEAR_LO].u32;
#if 0
XELOGD(
"Resolve: {},{} <= x,y < {},{}, {} -> {} at 0x{:08X} (potentially "
"modified memory range 0x{:08X} to 0x{:08X})",
@ -1119,10 +1121,10 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
xenos::ColorRenderTargetFormat(color_edram_info.format)),
FormatInfo::GetName(dest_format), rb_copy_dest_base, copy_dest_extent_start,
copy_dest_extent_end);
#endif
return true;
}
XE_MSVC_OPTIMIZE_REVERT()
ResolveCopyShaderIndex ResolveInfo::GetCopyShader(
uint32_t draw_resolution_scale_x, uint32_t draw_resolution_scale_y,
ResolveCopyShaderConstants& constants_out, uint32_t& group_count_x_out,

View File

@ -475,6 +475,8 @@ inline uint32_t GetD3D10SampleIndexForGuest2xMSAA(
// To avoid passing values that the shader won't understand (even though
// Direct3D 9 shouldn't pass them anyway).
XE_NOINLINE
XE_NOALIAS
xenos::CopySampleSelect SanitizeCopySampleSelect(
xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples,
bool is_depth);

View File

@ -14,6 +14,11 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr,
new (&reader_)
RingBuffer(memory_->TranslatePhysical(ptr), count * sizeof(uint32_t));
reader_.set_write_offset(count * sizeof(uint32_t));
// Prefetch the wraparound range. It is likely already in L3 cache, but on a
// Zen system it may be in another chiplet's L3.
reader_.BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
COMMAND_PROCESSOR::GetCurrentRingReadCount());
do {
if (COMMAND_PROCESSOR::ExecutePacket()) {
continue;
@ -30,11 +35,6 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr,
}
bool COMMAND_PROCESSOR::ExecutePacket() {
// prefetch the wraparound range
// it likely is already in L3 cache, but in a zen system it may be another
// chiplets l3
reader_.BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
COMMAND_PROCESSOR::GetCurrentRingReadCount());
const uint32_t packet = reader_.ReadAndSwap<uint32_t>();
const uint32_t packet_type = packet >> 30;
@ -495,7 +495,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM(
} else {
xe::threading::Sleep(std::chrono::milliseconds(wait / 0x100));
}
xe::threading::SyncMemory();
// xe::threading::SyncMemory();
ReturnFromWait();
if (!worker_running_) {
@ -599,27 +599,28 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_COND_WRITE(
value = register_file_->values[poll_reg_addr].u32;
}
bool matched = false;
value &= mask;
switch (wait_info & 0x7) {
case 0x0: // Never.
matched = false;
break;
case 0x1: // Less than reference.
matched = (value & mask) < ref;
matched = value < ref;
break;
case 0x2: // Less than or equal to reference.
matched = (value & mask) <= ref;
matched = value <= ref;
break;
case 0x3: // Equal to reference.
matched = (value & mask) == ref;
matched = value == ref;
break;
case 0x4: // Not equal to reference.
matched = (value & mask) != ref;
matched = value != ref;
break;
case 0x5: // Greater than or equal to reference.
matched = (value & mask) >= ref;
matched = value >= ref;
break;
case 0x6: // Greater than reference.
matched = (value & mask) > ref;
matched = value > ref;
break;
case 0x7: // Always
matched = true;
@ -1064,7 +1065,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_IM_LOAD_IMMEDIATE(
assert_true(count - 2 >= size_dwords);
auto shader = COMMAND_PROCESSOR::LoadShader(
shader_type, uint32_t(reader_.read_ptr()),
reinterpret_cast<uint32_t*>(reader_.read_ptr()), size_dwords);
reinterpret_cast<uint32_t*>(reader_.read_ptr()), size_dwords);
switch (shader_type) {
case xenos::ShaderType::kVertex:
active_vertex_shader_ = shader;

View File

@ -430,7 +430,7 @@ class PrimitiveProcessor {
--count;
uint32_t index = *(source++) & low_bits_mask_guest_endian;
*(dest++) = index != reset_index_guest_endian
? xenos::GpuSwap(index, HostSwap)
? xenos::GpuSwapInline(index, HostSwap)
: UINT32_MAX;
}
if (count >= kSimdVectorU32Elements) {
@ -442,10 +442,10 @@ class PrimitiveProcessor {
__m128i host_swap_shuffle;
if constexpr (HostSwap != xenos::Endian::kNone) {
host_swap_shuffle = _mm_set_epi32(
int32_t(xenos::GpuSwap(uint32_t(0x0F0E0D0C), HostSwap)),
int32_t(xenos::GpuSwap(uint32_t(0x0B0A0908), HostSwap)),
int32_t(xenos::GpuSwap(uint32_t(0x07060504), HostSwap)),
int32_t(xenos::GpuSwap(uint32_t(0x03020100), HostSwap)));
int32_t(xenos::GpuSwapInline(uint32_t(0x0F0E0D0C), HostSwap)),
int32_t(xenos::GpuSwapInline(uint32_t(0x0B0A0908), HostSwap)),
int32_t(xenos::GpuSwapInline(uint32_t(0x07060504), HostSwap)),
int32_t(xenos::GpuSwapInline(uint32_t(0x03020100), HostSwap)));
}
#endif // XE_ARCH_AMD64
while (count >= kSimdVectorU32Elements) {
@ -490,7 +490,7 @@ class PrimitiveProcessor {
while (count--) {
uint32_t index = *(source++) & low_bits_mask_guest_endian;
*(dest++) = index != reset_index_guest_endian
? xenos::GpuSwap(index, HostSwap)
? xenos::GpuSwapInline(index, HostSwap)
: UINT32_MAX;
}
}
@ -510,19 +510,19 @@ class PrimitiveProcessor {
};
struct To24Swapping8In16IndexTransform {
uint32_t operator()(uint32_t index) const {
return xenos::GpuSwap(index, xenos::Endian::k8in16) &
return xenos::GpuSwapInline(index, xenos::Endian::k8in16) &
xenos::kVertexIndexMask;
}
};
struct To24Swapping8In32IndexTransform {
uint32_t operator()(uint32_t index) const {
return xenos::GpuSwap(index, xenos::Endian::k8in32) &
return xenos::GpuSwapInline(index, xenos::Endian::k8in32) &
xenos::kVertexIndexMask;
}
};
struct To24Swapping16In32IndexTransform {
uint32_t operator()(uint32_t index) const {
return xenos::GpuSwap(index, xenos::Endian::k16in32) &
return xenos::GpuSwapInline(index, xenos::Endian::k16in32) &
xenos::kVertexIndexMask;
}
};

View File

@ -388,6 +388,7 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length,
bool any_data_resolved = false;
uint32_t block_first = page_first >> 6;
swcache::PrefetchL1(&system_page_flags_[block_first]);
uint32_t block_last = page_last >> 6;
uint32_t range_start = UINT32_MAX;

View File

@ -464,7 +464,8 @@ TextureGuestLayout GetGuestTextureLayout(
return layout;
}
XE_NOINLINE
XE_NOALIAS
int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch,
uint32_t bytes_per_block_log2) {
// https://github.com/gildor2/UModel/blob/de8fbd3bc922427ea056b7340202dcdcc19ccff5/Unreal/UnTexture.cpp#L489
@ -481,7 +482,8 @@ int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch,
return ((offset & ~0x1FF) << 3) + ((y & 16) << 7) + ((offset & 0x1C0) << 2) +
(((((y & 8) >> 2) + (x >> 3)) & 3) << 6) + (offset & 0x3F);
}
XE_NOINLINE
XE_NOALIAS
int32_t GetTiledOffset3D(int32_t x, int32_t y, int32_t z, uint32_t pitch,
uint32_t height, uint32_t bytes_per_block_log2) {
// Reconstructed from disassembly of XGRAPHICS::TileVolume.
@ -509,7 +511,8 @@ int32_t GetTiledOffset3D(int32_t x, int32_t y, int32_t z, uint32_t pitch,
address += offset2 & 63;
return address;
}
XE_NOINLINE
XE_NOALIAS
uint32_t GetTiledAddressUpperBound2D(uint32_t right, uint32_t bottom,
uint32_t pitch,
uint32_t bytes_per_block_log2) {
@ -538,7 +541,8 @@ uint32_t GetTiledAddressUpperBound2D(uint32_t right, uint32_t bottom,
}
return upper_bound;
}
XE_NOINLINE
XE_NOALIAS
uint32_t GetTiledAddressUpperBound3D(uint32_t right, uint32_t bottom,
uint32_t back, uint32_t pitch,
uint32_t height,

View File

@ -280,8 +280,12 @@ void GetTextureTotalSize(xenos::DataDimension dimension,
// bytes_per_block_log2 is log2_floor according to how Direct3D 9 calculates it,
// but k_32_32_32 textures are never tiled anyway likely.
XE_NOINLINE
XE_NOALIAS
int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch,
uint32_t bytes_per_block_log2);
XE_NOINLINE
XE_NOALIAS
int32_t GetTiledOffset3D(int32_t x, int32_t y, int32_t z, uint32_t pitch,
uint32_t height, uint32_t bytes_per_block_log2);
// Because (0, 0, 0) within each 32x32x4-block tile is stored in memory first,
@ -308,9 +312,13 @@ inline uint32_t GetTiledAddressLowerBound3D(uint32_t left, uint32_t top,
// Supporting the right > pitch and bottom > height (in tiles) cases also, for
// estimation how far addresses can actually go even potentially beyond the
// subresource stride.
XE_NOINLINE
XE_NOALIAS
uint32_t GetTiledAddressUpperBound2D(uint32_t right, uint32_t bottom,
uint32_t pitch,
uint32_t bytes_per_block_log2);
XE_NOINLINE
XE_NOALIAS
uint32_t GetTiledAddressUpperBound3D(uint32_t right, uint32_t bottom,
uint32_t back, uint32_t pitch,
uint32_t height,

View File

@ -125,8 +125,8 @@ float Float7e3To32(uint32_t f10) {
// Based on CFloat24 from d3dref9.dll and the 6e4 code from:
// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
// 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2).
uint32_t Float32To20e4(float f32, bool round_to_nearest_even) {
XE_NOALIAS
uint32_t Float32To20e4(float f32, bool round_to_nearest_even) noexcept {
if (!(f32 > 0.0f)) {
// Positive only, and not -0 or NaN.
return 0;
@ -150,8 +150,8 @@ uint32_t Float32To20e4(float f32, bool round_to_nearest_even) {
}
return (f32u32 >> 3) & 0xFFFFFF;
}
float Float20e4To32(uint32_t f24) {
XE_NOALIAS
float Float20e4To32(uint32_t f24) noexcept {
f24 &= 0xFFFFFF;
if (!f24) {
return 0.0f;

View File

@ -421,10 +421,12 @@ float Float7e3To32(uint32_t f10);
// floating-point number.
// Converts an IEEE-754 32-bit floating-point number to Xenos floating-point
// depth, rounding to the nearest even or towards zero.
uint32_t Float32To20e4(float f32, bool round_to_nearest_even);
XE_NOALIAS
uint32_t Float32To20e4(float f32, bool round_to_nearest_even) noexcept;
// Converts Xenos floating-point depth in bits 0:23 (not clamping) to an
// IEEE-754 32-bit floating-point number.
float Float20e4To32(uint32_t f24);
XE_NOALIAS
float Float20e4To32(uint32_t f24) noexcept;
// Converts 24-bit unorm depth in the value (not clamping) to an IEEE-754 32-bit
// floating-point number.
constexpr float UNorm24To32(uint32_t n24) {
@ -1045,9 +1047,9 @@ inline uint16_t GpuSwap(uint16_t value, Endian endianness) {
return value;
}
}
XE_NOINLINE
XE_FORCEINLINE
XE_NOALIAS
static uint32_t GpuSwap(uint32_t value, Endian endianness) {
static uint32_t GpuSwapInline(uint32_t value, Endian endianness) {
switch (endianness) {
default:
case Endian::kNone:
@ -1065,6 +1067,11 @@ static uint32_t GpuSwap(uint32_t value, Endian endianness) {
return ((value >> 16) & 0xFFFF) | (value << 16);
}
}
XE_NOINLINE
XE_NOALIAS
static uint32_t GpuSwap(uint32_t value, Endian endianness) {
return GpuSwapInline(value, endianness);
}
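
Splitting GpuSwap into a force-inlined core plus a non-inlined wrapper lets hot loops (such as the index conversion in primitive_processor.h above) inline the swap while every other call site keeps sharing one out-of-line copy. A generic sketch of the pattern with GCC/Clang attribute spellings (MSVC would use __forceinline and __declspec(noinline)):

#include <cstdint>

// Force-inlined core for hot paths; shows only the 16-in-32 case.
__attribute__((always_inline)) inline uint32_t Swap16In32Inline(uint32_t v) {
  return ((v >> 16) & 0xFFFFu) | (v << 16);
}

// Out-of-line wrapper for everything else: one shared copy in the binary.
__attribute__((noinline)) uint32_t Swap16In32(uint32_t v) {
  return Swap16In32Inline(v);
}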
inline float GpuSwap(float value, Endian endianness) {
union {

View File

@ -137,8 +137,8 @@ X_INPUT_VIBRATION InputSystem::ModifyVibrationLevel(
modified_vibration.right_motor_speed = 0;
return modified_vibration;
}
std::unique_lock<xe_unlikely_mutex> InputSystem::lock() {
return std::unique_lock<xe_unlikely_mutex>{lock_};
std::unique_lock<xe_mutex> InputSystem::lock() {
return std::unique_lock<xe_mutex>{lock_};
}
} // namespace hid
} // namespace xe

View File

@ -48,7 +48,7 @@ class InputSystem {
void UpdateUsedSlot(uint8_t slot, bool connected);
uint8_t GetConnectedSlots() const { return connected_slot; }
std::unique_lock<xe_unlikely_mutex> lock();
std::unique_lock<xe_mutex> lock();
private:
xe::ui::Window* window_ = nullptr;
@ -57,7 +57,7 @@ class InputSystem {
X_INPUT_VIBRATION ModifyVibrationLevel(X_INPUT_VIBRATION* vibration);
uint8_t connected_slot = 0b0001;
xe_unlikely_mutex lock_;
xe_mutex lock_;
};
} // namespace hid

View File

@ -911,11 +911,17 @@ dword_result_t NtSignalAndWaitForSingleObjectEx_entry(dword_t signal_handle,
DECLARE_XBOXKRNL_EXPORT3(NtSignalAndWaitForSingleObjectEx, kThreading,
kImplemented, kBlocking, kHighFrequency);
static void PrefetchForCAS(const void* value) {
if (amd64::GetFeatureFlags() & amd64::kX64EmitPrefetchW) {
swcache::PrefetchW(value);
}
}
uint32_t xeKeKfAcquireSpinLock(uint32_t* lock) {
// XELOGD(
// "KfAcquireSpinLock({:08X})",
// lock_ptr);
PrefetchForCAS(lock);
// Lock.
while (!xe::atomic_cas(0, 1, lock)) {
// Spin!
@ -956,6 +962,7 @@ DECLARE_XBOXKRNL_EXPORT2(KfReleaseSpinLock, kThreading, kImplemented,
void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) {
// Lock.
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
PrefetchForCAS(lock);
while (!xe::atomic_cas(0, 1, lock)) {
#if XE_ARCH_AMD64 == 1
// todo: this is just a nop if they don't have SMT, which is not great
@ -973,6 +980,7 @@ DECLARE_XBOXKRNL_EXPORT3(KeAcquireSpinLockAtRaisedIrql, kThreading,
dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) {
// Lock.
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
PrefetchForCAS(lock);
if (!xe::atomic_cas(0, 1, lock)) {
return 0;
}
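
PrefetchForCAS issues a write-intent prefetch on the lock word before spinning on the compare-and-swap, so the first CAS attempt is less likely to pay a read-to-ownership upgrade. A rough portable sketch of the same idea using GCC/Clang's __builtin_prefetch (which can lower to prefetchw where available) and std::atomic:

#include <atomic>
#include <cstdint>

void SpinAcquire(std::atomic<uint32_t>* lock) {
  __builtin_prefetch(lock, /*rw=*/1, /*locality=*/3);  // request line for write
  uint32_t expected = 0;
  while (!lock->compare_exchange_weak(expected, 1u, std::memory_order_acquire)) {
    expected = 0;  // compare_exchange_weak rewrites expected on failure
  }
}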

View File

@ -763,7 +763,8 @@ void XThread::SetActiveCpu(uint8_t cpu_index) {
thread_->set_affinity_mask(uint64_t(1) << cpu_index);
}
} else {
XELOGW("Too few processor cores - scheduling will be wonky");
// There's no good reason to log this; we don't perfectly emulate the 360's
// scheduler in any way.
// XELOGW("Too few processor cores - scheduling will be wonky");
}
}

View File

@ -713,6 +713,8 @@ void BaseHeap::Initialize(Memory* memory, uint8_t* membase, HeapType heap_type,
heap_base_ = heap_base;
heap_size_ = heap_size;
page_size_ = page_size;
xenia_assert(xe::is_pow2(page_size_));
page_size_shift_ = xe::log2_floor(page_size_);
host_address_offset_ = host_address_offset;
page_table_.resize(heap_size / page_size);
unreserved_page_count_ = uint32_t(page_table_.size());
@ -1234,14 +1236,14 @@ bool BaseHeap::Protect(uint32_t address, uint32_t size, uint32_t protect,
// fails and returns without modifying the access protection of any pages in
// the specified region."
uint32_t start_page_number = (address - heap_base_) / page_size_;
uint32_t start_page_number = (address - heap_base_) >> page_size_shift_;
if (start_page_number >= page_table_.size()) {
XELOGE("BaseHeap::Protect failed due to out-of-bounds base address {:08X}",
address);
return false;
}
uint32_t end_page_number =
uint32_t((uint64_t(address) + size - 1 - heap_base_) / page_size_);
uint32_t((uint64_t(address) + size - 1 - heap_base_) >> page_size_shift_);
if (end_page_number >= page_table_.size()) {
XELOGE(
"BaseHeap::Protect failed due to out-of-bounds range ({:08X} bytes "
@ -1268,17 +1270,21 @@ bool BaseHeap::Protect(uint32_t address, uint32_t size, uint32_t protect,
return false;
}
}
uint32_t xe_page_size = static_cast<uint32_t>(xe::memory::page_size());
uint32_t page_size_mask = xe_page_size - 1;
// Attempt host change (hopefully won't fail).
// We can only do this if our size matches system page granularity.
uint32_t page_count = end_page_number - start_page_number + 1;
if (page_size_ == xe::memory::page_size() ||
(((page_count * page_size_) % xe::memory::page_size() == 0) &&
((start_page_number * page_size_) % xe::memory::page_size() == 0))) {
if (page_size_ == xe_page_size ||
((((page_count << page_size_shift_) & page_size_mask) == 0) &&
(((start_page_number << page_size_shift_) & page_size_mask) == 0))) {
memory::PageAccess old_protect_access;
if (!xe::memory::Protect(TranslateRelative(start_page_number * page_size_),
page_count * page_size_, ToPageAccess(protect),
old_protect ? &old_protect_access : nullptr)) {
if (!xe::memory::Protect(
TranslateRelative(start_page_number << page_size_shift_),
page_count << page_size_shift_, ToPageAccess(protect),
old_protect ? &old_protect_access : nullptr)) {
XELOGE("BaseHeap::Protect failed due to host VirtualProtect failure");
return false;
}
@ -1303,7 +1309,7 @@ bool BaseHeap::Protect(uint32_t address, uint32_t size, uint32_t protect,
bool BaseHeap::QueryRegionInfo(uint32_t base_address,
HeapAllocationInfo* out_info) {
uint32_t start_page_number = (base_address - heap_base_) / page_size_;
uint32_t start_page_number = (base_address - heap_base_) >> page_size_shift_;
if (start_page_number > page_table_.size()) {
XELOGE("BaseHeap::QueryRegionInfo base page out of range");
return false;
@ -1321,9 +1327,10 @@ bool BaseHeap::QueryRegionInfo(uint32_t base_address,
if (start_page_entry.state) {
// Committed/reserved region.
out_info->allocation_base =
heap_base_ + start_page_entry.base_address * page_size_;
heap_base_ + (start_page_entry.base_address << page_size_shift_);
out_info->allocation_protect = start_page_entry.allocation_protect;
out_info->allocation_size = start_page_entry.region_page_count * page_size_;
out_info->allocation_size = start_page_entry.region_page_count
<< page_size_shift_;
out_info->state = start_page_entry.state;
out_info->protect = start_page_entry.current_protect;
@ -1358,7 +1365,7 @@ bool BaseHeap::QueryRegionInfo(uint32_t base_address,
}
bool BaseHeap::QuerySize(uint32_t address, uint32_t* out_size) {
uint32_t page_number = (address - heap_base_) / page_size_;
uint32_t page_number = (address - heap_base_) >> page_size_shift_;
if (page_number > page_table_.size()) {
XELOGE("BaseHeap::QuerySize base page out of range");
*out_size = 0;
@ -1366,12 +1373,12 @@ bool BaseHeap::QuerySize(uint32_t address, uint32_t* out_size) {
}
auto global_lock = global_critical_region_.Acquire();
auto page_entry = page_table_[page_number];
*out_size = (page_entry.region_page_count * page_size_);
*out_size = (page_entry.region_page_count << page_size_shift_);
return true;
}
bool BaseHeap::QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size) {
uint32_t page_number = (*in_out_address - heap_base_) / page_size_;
uint32_t page_number = (*in_out_address - heap_base_) >> page_size_shift_;
if (page_number > page_table_.size()) {
XELOGE("BaseHeap::QuerySize base page out of range");
*out_size = 0;
@ -1379,13 +1386,13 @@ bool BaseHeap::QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size) {
}
auto global_lock = global_critical_region_.Acquire();
auto page_entry = page_table_[page_number];
*in_out_address = (page_entry.base_address * page_size_);
*out_size = (page_entry.region_page_count * page_size_);
*in_out_address = (page_entry.base_address << page_size_shift_);
*out_size = (page_entry.region_page_count << page_size_shift_);
return true;
}
bool BaseHeap::QueryProtect(uint32_t address, uint32_t* out_protect) {
uint32_t page_number = (address - heap_base_) / page_size_;
uint32_t page_number = (address - heap_base_) >> page_size_shift_;
if (page_number > page_table_.size()) {
XELOGE("BaseHeap::QueryProtect base page out of range");
*out_protect = 0;
@ -1403,8 +1410,8 @@ xe::memory::PageAccess BaseHeap::QueryRangeAccess(uint32_t low_address,
(high_address - heap_base_) >= heap_size_) {
return xe::memory::PageAccess::kNoAccess;
}
uint32_t low_page_number = (low_address - heap_base_) / page_size_;
uint32_t high_page_number = (high_address - heap_base_) / page_size_;
uint32_t low_page_number = (low_address - heap_base_) >> page_size_shift_;
uint32_t high_page_number = (high_address - heap_base_) >> page_size_shift_;
uint32_t protect = kMemoryProtectRead | kMemoryProtectWrite;
{
auto global_lock = global_critical_region_.Acquire();
@ -1446,6 +1453,8 @@ void PhysicalHeap::Initialize(Memory* memory, uint8_t* membase,
page_size, host_address_offset);
parent_heap_ = parent_heap;
system_page_size_ = uint32_t(xe::memory::page_size());
xenia_assert(xe::is_pow2(system_page_size_));
system_page_shift_ = xe::log2_floor(system_page_size_);
system_page_count_ =
(size_t(heap_size_) + host_address_offset + (system_page_size_ - 1)) /
@ -1665,10 +1674,11 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
}
uint32_t system_page_first =
(heap_relative_address + host_address_offset()) / system_page_size_;
(heap_relative_address + host_address_offset()) >> system_page_shift_;
swcache::PrefetchL1(&system_page_flags_[system_page_first >> 6]);
uint32_t system_page_last =
(heap_relative_address + length - 1 + host_address_offset()) /
system_page_size_;
(heap_relative_address + length - 1 + host_address_offset()) >>
system_page_shift_;
system_page_last = std::min(system_page_last, system_page_count_ - 1);
assert_true(system_page_first <= system_page_last);
@ -1677,10 +1687,40 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
xe::memory::PageAccess protect_access =
enable_data_providers ? xe::memory::PageAccess::kNoAccess
: xe::memory::PageAccess::kReadOnly;
auto global_lock = global_critical_region_.Acquire();
if (enable_invalidation_notifications) {
EnableAccessCallbacksInner<true>(system_page_first, system_page_last,
protect_access);
} else {
EnableAccessCallbacksInner<false>(system_page_first, system_page_last,
protect_access);
}
}
template <bool enable_invalidation_notifications>
XE_NOINLINE void PhysicalHeap::EnableAccessCallbacksInner(
const uint32_t system_page_first, const uint32_t system_page_last,
xe::memory::PageAccess protect_access) XE_RESTRICT {
uint8_t* protect_base = membase_ + heap_base_;
uint32_t protect_system_page_first = UINT32_MAX;
auto global_lock = global_critical_region_.Acquire();
for (uint32_t i = system_page_first; i <= system_page_last; ++i) {
SystemPageFlagsBlock* XE_RESTRICT sys_page_flags = system_page_flags_.data();
PageEntry* XE_RESTRICT page_table_ptr = page_table_.data();
// chrispy: a lot of time is spent in this loop, and i think some of the work
// may be avoidable and repetitive. Profiling shows quite a bit of time spent
// in this loop, but very little spent actually calling Protect.
uint32_t i = system_page_first;
uint32_t first_guest_page = SystemPagenumToGuestPagenum(system_page_first);
uint32_t last_guest_page = SystemPagenumToGuestPagenum(system_page_last);
uint32_t guest_one =
SystemPagenumToGuestPagenum(1);
uint32_t system_one = GuestPagenumToSystemPagenum(1);
for (; i <= system_page_last; ++i) {
// Check if need to enable callbacks for the page and raise its protection.
//
// If enabling invalidation notifications:
@ -1702,12 +1742,19 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
//
// Enabling data providers doesn't need to be deferred - providers will be
// polled for the last time without releasing the lock.
SystemPageFlagsBlock& page_flags_block = system_page_flags_[i >> 6];
SystemPageFlagsBlock& page_flags_block = sys_page_flags[i >> 6];
#if XE_ARCH_AMD64 == 1
// x86 modulus shift
uint64_t page_flags_bit = uint64_t(1) << i;
#else
uint64_t page_flags_bit = uint64_t(1) << (i & 63);
uint32_t guest_page_number =
xe::sat_sub(i * system_page_size_, host_address_offset()) / page_size_;
#endif
uint32_t guest_page_number = SystemPagenumToGuestPagenum(i);
//swcache::PrefetchL1(&page_table_ptr[guest_page_number + 8]);
xe::memory::PageAccess current_page_access =
ToPageAccess(page_table_[guest_page_number].current_protect);
ToPageAccess(page_table_ptr[guest_page_number].current_protect);
bool protect_system_page = false;
// Don't do anything with inaccessible pages - don't protect, don't enable
// callbacks - because real access violations are needed there. And don't
@ -1715,7 +1762,7 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
// reason.
if (current_page_access != xe::memory::PageAccess::kNoAccess) {
// TODO(Triang3l): Enable data providers.
if (enable_invalidation_notifications) {
if constexpr (enable_invalidation_notifications) {
if (current_page_access != xe::memory::PageAccess::kReadOnly &&
(page_flags_block.notify_on_invalidation & page_flags_bit) == 0) {
// TODO(Triang3l): Check if data providers are already enabled.
@ -1733,21 +1780,22 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
} else {
if (protect_system_page_first != UINT32_MAX) {
xe::memory::Protect(
protect_base + protect_system_page_first * system_page_size_,
(i - protect_system_page_first) * system_page_size_,
protect_base + (protect_system_page_first << system_page_shift_),
(i - protect_system_page_first) << system_page_shift_,
protect_access);
protect_system_page_first = UINT32_MAX;
}
}
}
if (protect_system_page_first != UINT32_MAX) {
xe::memory::Protect(
protect_base + protect_system_page_first * system_page_size_,
(system_page_last + 1 - protect_system_page_first) * system_page_size_,
protect_base + (protect_system_page_first << system_page_shift_),
(system_page_last + 1 - protect_system_page_first)
<< system_page_shift_,
protect_access);
}
}
bool PhysicalHeap::TriggerCallbacks(
global_unique_lock_type global_lock_locked_once, uint32_t virtual_address,
uint32_t length, bool is_write, bool unwatch_exact_range, bool unprotect) {
@ -1774,10 +1822,10 @@ bool PhysicalHeap::TriggerCallbacks(
}
uint32_t system_page_first =
(heap_relative_address + host_address_offset()) / system_page_size_;
(heap_relative_address + host_address_offset()) >> system_page_shift_;
uint32_t system_page_last =
(heap_relative_address + length - 1 + host_address_offset()) /
system_page_size_;
(heap_relative_address + length - 1 + host_address_offset()) >>
system_page_shift_;
system_page_last = std::min(system_page_last, system_page_count_ - 1);
assert_true(system_page_first <= system_page_last);
uint32_t block_index_first = system_page_first >> 6;
@ -1810,11 +1858,11 @@ bool PhysicalHeap::TriggerCallbacks(
}
uint32_t physical_address_offset = GetPhysicalAddress(heap_base_);
uint32_t physical_address_start =
xe::sat_sub(system_page_first * system_page_size_,
xe::sat_sub(system_page_first << system_page_shift_,
host_address_offset()) +
physical_address_offset;
uint32_t physical_length = std::min(
xe::sat_sub(system_page_last * system_page_size_ + system_page_size_,
xe::sat_sub((system_page_last << system_page_shift_) + system_page_size_,
host_address_offset()) +
physical_address_offset - physical_address_start,
heap_size_ - (physical_address_start - physical_address_offset));
@ -1858,8 +1906,8 @@ bool PhysicalHeap::TriggerCallbacks(
unwatch_first += host_address_offset();
unwatch_last += host_address_offset();
assert_true(unwatch_first <= unwatch_last);
system_page_first = unwatch_first / system_page_size_;
system_page_last = unwatch_last / system_page_size_;
system_page_first = unwatch_first >> system_page_shift_;
system_page_last = unwatch_last >> system_page_shift_;
block_index_first = system_page_first >> 6;
block_index_last = system_page_last >> 6;
}
@ -1874,8 +1922,8 @@ bool PhysicalHeap::TriggerCallbacks(
(uint64_t(1) << (i & 63))) != 0;
if (unprotect_page) {
uint32_t guest_page_number =
xe::sat_sub(i * system_page_size_, host_address_offset()) /
page_size_;
xe::sat_sub(i << system_page_shift_, host_address_offset()) >>
page_size_shift_;
if (ToPageAccess(page_table_[guest_page_number].current_protect) !=
xe::memory::PageAccess::kReadWrite) {
unprotect_page = false;
@ -1888,8 +1936,9 @@ bool PhysicalHeap::TriggerCallbacks(
} else {
if (unprotect_system_page_first != UINT32_MAX) {
xe::memory::Protect(
protect_base + unprotect_system_page_first * system_page_size_,
(i - unprotect_system_page_first) * system_page_size_,
protect_base +
(unprotect_system_page_first << system_page_shift_),
(i - unprotect_system_page_first) << system_page_shift_,
xe::memory::PageAccess::kReadWrite);
unprotect_system_page_first = UINT32_MAX;
}
@ -1897,9 +1946,9 @@ bool PhysicalHeap::TriggerCallbacks(
}
if (unprotect_system_page_first != UINT32_MAX) {
xe::memory::Protect(
protect_base + unprotect_system_page_first * system_page_size_,
(system_page_last + 1 - unprotect_system_page_first) *
system_page_size_,
protect_base + (unprotect_system_page_first << system_page_shift_),
(system_page_last + 1 - unprotect_system_page_first)
<< system_page_shift_,
xe::memory::PageAccess::kReadWrite);
}
}
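
The heap code above replaces every divide/multiply by the page size with shifts by a cached page_size_shift_, which is valid because the page size is asserted to be a power of two. A small self-contained sketch of that arithmetic:

#include <cassert>
#include <cstdint>

struct PageMath {
  uint32_t page_size;
  uint32_t page_shift;  // log2(page_size)

  explicit PageMath(uint32_t size) : page_size(size), page_shift(0) {
    assert(size != 0 && (size & (size - 1)) == 0);  // must be a power of two
    while ((1u << page_shift) < size) ++page_shift;
  }
  uint32_t PageNumber(uint32_t offset) const { return offset >> page_shift; }
  uint32_t PageBase(uint32_t page) const { return page << page_shift; }
  uint32_t PageOffset(uint32_t offset) const { return offset & (page_size - 1); }
};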

View File

@ -216,6 +216,7 @@ class BaseHeap {
uint32_t heap_base_;
uint32_t heap_size_;
uint32_t page_size_;
uint32_t page_size_shift_;
uint32_t host_address_offset_;
uint32_t unreserved_page_count_;
xe::global_critical_region global_critical_region_;
@ -270,18 +271,36 @@ class PhysicalHeap : public BaseHeap {
void EnableAccessCallbacks(uint32_t physical_address, uint32_t length,
bool enable_invalidation_notifications,
bool enable_data_providers);
template <bool enable_invalidation_notifications>
XE_NOINLINE void EnableAccessCallbacksInner(
const uint32_t system_page_first, const uint32_t system_page_last,
xe::memory::PageAccess protect_access) XE_RESTRICT;
// Returns true if any page in the range was watched.
bool TriggerCallbacks(global_unique_lock_type global_lock_locked_once,
uint32_t virtual_address, uint32_t length, bool is_write,
bool unwatch_exact_range, bool unprotect = true);
uint32_t virtual_address, uint32_t length,
bool is_write, bool unwatch_exact_range,
bool unprotect = true);
uint32_t GetPhysicalAddress(uint32_t address) const;
uint32_t SystemPagenumToGuestPagenum(uint32_t num) const {
return ((num << system_page_shift_) - host_address_offset()) >> page_size_shift_;
}
uint32_t GuestPagenumToSystemPagenum(uint32_t num) {
num <<= page_size_shift_;
num += host_address_offset();
num >>= system_page_shift_;
return num;
}
protected:
VirtualHeap* parent_heap_;
uint32_t system_page_size_;
uint32_t system_page_count_;
uint32_t system_page_shift_;
uint32_t padding1_;
struct SystemPageFlagsBlock {
// Whether writing to each page should result trigger invalidation
@ -458,9 +477,9 @@ class Memory {
// TODO(Triang3l): Implement data providers - this is why locking depth of 1
// will be required in the future.
bool TriggerPhysicalMemoryCallbacks(
global_unique_lock_type global_lock_locked_once,
uint32_t virtual_address, uint32_t length, bool is_write,
bool unwatch_exact_range, bool unprotect = true);
global_unique_lock_type global_lock_locked_once, uint32_t virtual_address,
uint32_t length, bool is_write, bool unwatch_exact_range,
bool unprotect = true);
// Allocates virtual memory from the 'system' heap.
// System memory is kept separate from game memory but is still accessible
@ -509,10 +528,10 @@ class Memory {
const void* host_address);
bool AccessViolationCallback(global_unique_lock_type global_lock_locked_once,
void* host_address, bool is_write);
void* host_address, bool is_write);
static bool AccessViolationCallbackThunk(
global_unique_lock_type global_lock_locked_once,
void* context, void* host_address, bool is_write);
global_unique_lock_type global_lock_locked_once, void* context,
void* host_address, bool is_write);
std::filesystem::path file_name_;
uint32_t system_page_size_ = 0;