Merge pull request #75 from chrisps/canary_experimental
misc stuff again
This commit is contained in:
commit a29a7436e0
@ -20,6 +20,8 @@ namespace apu {
namespace conversion {

#if XE_ARCH_AMD64

#if 0
inline void sequential_6_BE_to_interleaved_6_LE(float* output,
                                                const float* input,
                                                size_t ch_sample_count) {

@ -41,7 +43,44 @@ inline void sequential_6_BE_to_interleaved_6_LE(float* output,
    out[sample * 6 + 5] = sample2;
  }
}
#else
XE_NOINLINE
static void _generic_sequential_6_BE_to_interleaved_6_LE(
    float* XE_RESTRICT output, const float* XE_RESTRICT input,
    unsigned ch_sample_count) {
  for (unsigned sample = 0; sample < ch_sample_count; sample++) {
    for (unsigned channel = 0; channel < 6; channel++) {
      unsigned int value = *reinterpret_cast<const unsigned int*>(
          &input[channel * ch_sample_count + sample]);

      *reinterpret_cast<unsigned int*>(&output[sample * 6 + channel]) =
          xe::byte_swap(value);
    }
  }
}
XE_NOINLINE
static void _movbe_sequential_6_BE_to_interleaved_6_LE(
    float* XE_RESTRICT output, const float* XE_RESTRICT input,
    unsigned ch_sample_count) {
  for (unsigned sample = 0; sample < ch_sample_count; sample++) {
    for (unsigned channel = 0; channel < 6; channel++) {
      *reinterpret_cast<unsigned int*>(&output[sample * 6 + channel]) =
          _load_be_u32(reinterpret_cast<const unsigned int*>(
              &input[channel * ch_sample_count + sample]));
    }
  }
}

inline static void sequential_6_BE_to_interleaved_6_LE(
    float* output, const float* input, unsigned ch_sample_count) {
  if (amd64::GetFeatureFlags() & amd64::kX64EmitMovbe) {
    _movbe_sequential_6_BE_to_interleaved_6_LE(output, input, ch_sample_count);
  } else {
    _generic_sequential_6_BE_to_interleaved_6_LE(output, input,
                                                 ch_sample_count);
  }
}
#endif
inline void sequential_6_BE_to_interleaved_2_LE(float* output,
                                                const float* input,
                                                size_t ch_sample_count) {
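The AMD64 path above now picks at runtime, via amd64::GetFeatureFlags(), between a plain byte-swap loop and a loop built on a big-endian load intrinsic when the CPU reports MOVBE. For reference, a minimal portable sketch of the same 6-channel big-endian-planar to little-endian-interleaved transform, using standard byte-swap builtins in place of the project's _load_be_u32 / XE_RESTRICT / xe::byte_swap wrappers (the function name and layout assumptions here are illustrative only):

#include <cstdint>
#include <cstring>
#if defined(_MSC_VER)
#include <cstdlib>  // _byteswap_ulong
#endif

static inline uint32_t bswap32_portable(uint32_t v) {
#if defined(_MSC_VER)
  return _byteswap_ulong(v);
#else
  return __builtin_bswap32(v);
#endif
}

// input: 6 planes of ch_sample_count big-endian floats, one plane per channel.
// output: ch_sample_count interleaved frames of 6 little-endian floats.
void sequential_6_BE_to_interleaved_6_LE_sketch(float* output,
                                                const float* input,
                                                unsigned ch_sample_count) {
  for (unsigned sample = 0; sample < ch_sample_count; ++sample) {
    for (unsigned channel = 0; channel < 6; ++channel) {
      uint32_t bits;
      // memcpy sidesteps strict-aliasing concerns; it compiles to one load.
      std::memcpy(&bits, &input[channel * ch_sample_count + sample],
                  sizeof(bits));
      bits = bswap32_portable(bits);
      std::memcpy(&output[sample * 6 + channel], &bits, sizeof(bits));
    }
  }
}

On MOVBE-capable CPUs a compiler may fuse the load and the swap into a single movbe on its own; the committed code instead makes that choice explicitly with the feature-flag dispatch shown above.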
@ -335,7 +335,8 @@ ICommandVar* define_cmdvar(const char* name, T* default_value,
|
|||
|
||||
#define DEFINE_uint64(name, default_value, description, category) \
|
||||
DEFINE_CVar(name, default_value, description, category, false, uint64_t)
|
||||
|
||||
#define DEFINE_int64(name, default_value, description, category) \
|
||||
DEFINE_CVar(name, default_value, description, category, false, int64_t)
|
||||
#define DEFINE_double(name, default_value, description, category) \
|
||||
DEFINE_CVar(name, default_value, description, category, false, double)
|
||||
|
||||
|
@ -383,7 +384,7 @@ ICommandVar* define_cmdvar(const char* name, T* default_value,
|
|||
#define DECLARE_uint32(name) DECLARE_CVar(name, uint32_t)
|
||||
|
||||
#define DECLARE_uint64(name) DECLARE_CVar(name, uint64_t)
|
||||
|
||||
#define DECLARE_int64(name) DECLARE_CVar(name, int64_t)
|
||||
#define DECLARE_double(name) DECLARE_CVar(name, double)
|
||||
|
||||
#define DECLARE_string(name) DECLARE_CVar(name, std::string)
|
||||
|
|
|
@ -26,7 +26,7 @@ check this and release the mutex one way to do this is by using FlsAlloc and
PFLS_CALLBACK_FUNCTION, which gets called with the fiber local data when a
thread exits
*/
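The comment sketches an idea rather than shipped code: use fiber-local storage so a thread that exits while still holding the global mutex can be detected and the critical section released. A rough sketch of that approach using only the Win32 FLS API; g_fls_index, OnGlobalMutexThreadExit and the bookkeeping helpers are hypothetical names, not part of this commit:

#include <windows.h>

static DWORD g_fls_index = FLS_OUT_OF_INDEXES;
static CRITICAL_SECTION g_global_cs;  // assumed initialized at startup

// The FLS callback runs when a thread (or fiber) that set a non-null value
// for the slot exits, letting us release a lock it still owns.
static VOID WINAPI OnGlobalMutexThreadExit(PVOID fls_data) {
  if (fls_data) {
    LeaveCriticalSection(&g_global_cs);
  }
}

static void InitGlobalMutexCleanup() {
  g_fls_index = FlsAlloc(OnGlobalMutexThreadExit);
}

// Record whether the calling thread currently owns the mutex.
static void NoteLockAcquired() {
  FlsSetValue(g_fls_index, reinterpret_cast<PVOID>(1));
}
static void NoteLockReleased() { FlsSetValue(g_fls_index, nullptr); }

lock()/unlock() below would then call NoteLockAcquired()/NoteLockReleased() alongside the recursion-depth bookkeeping.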
thread_local unsigned global_mutex_depth = 0;

static CRITICAL_SECTION* global_critical_section(xe_global_mutex* mutex) {
  return reinterpret_cast<CRITICAL_SECTION*>(mutex);
}

@ -38,29 +38,16 @@ xe_global_mutex::xe_global_mutex() {
xe_global_mutex ::~xe_global_mutex() {
  DeleteCriticalSection(global_critical_section(this));
}

void xe_global_mutex::lock() {
  if (global_mutex_depth) {
  } else {
    EnterCriticalSection(global_critical_section(this));
  }
  global_mutex_depth++;
  EnterCriticalSection(global_critical_section(this));
}
void xe_global_mutex::unlock() {
  if (--global_mutex_depth == 0) {
    LeaveCriticalSection(global_critical_section(this));
  }
  LeaveCriticalSection(global_critical_section(this));
}
bool xe_global_mutex::try_lock() {
  if (global_mutex_depth) {
    ++global_mutex_depth;
    return true;
  } else {
    BOOL success = TryEnterCriticalSection(global_critical_section(this));
    if (success) {
      ++global_mutex_depth;
    }
    return success;
  }
  BOOL success = TryEnterCriticalSection(global_critical_section(this));
  return success;
}

CRITICAL_SECTION* fast_crit(xe_fast_mutex* mutex) {
@ -116,15 +116,15 @@
|
|||
#define XE_LIKELY(...) (!!(__VA_ARGS__))
|
||||
#define XE_UNLIKELY(...) (!!(__VA_ARGS__))
|
||||
#define XE_MSVC_ASSUME(...) __assume(__VA_ARGS__)
|
||||
#define XE_NOALIAS __declspec(noalias)
|
||||
#define XE_NOALIAS __declspec(noalias)
|
||||
#elif XE_COMPILER_HAS_GNU_EXTENSIONS == 1
|
||||
#define XE_FORCEINLINE __attribute__((always_inline))
|
||||
#define XE_NOINLINE __attribute__((noinline))
|
||||
#define XE_COLD __attribute__((cold))
|
||||
#define XE_LIKELY(...) __builtin_expect(!!(__VA_ARGS__), true)
|
||||
#define XE_UNLIKELY(...) __builtin_expect(!!(__VA_ARGS__), false)
|
||||
#define XE_NOALIAS
|
||||
//cant do unevaluated assume
|
||||
#define XE_NOALIAS
|
||||
// cant do unevaluated assume
|
||||
#define XE_MSVC_ASSUME(...) static_cast<void>(0)
|
||||
#else
|
||||
#define XE_FORCEINLINE inline
|
||||
|
@ -137,7 +137,13 @@
|
|||
#define XE_MSVC_ASSUME(...) static_cast<void>(0)
|
||||
|
||||
#endif
|
||||
|
||||
#if XE_COMPILER_HAS_MSVC_EXTENSIONS == 1
|
||||
#define XE_MSVC_OPTIMIZE_SMALL() __pragma(optimize("s", on))
|
||||
#define XE_MSVC_OPTIMIZE_REVERT() __pragma(optimize("", on))
|
||||
#else
|
||||
#define XE_MSVC_OPTIMIZE_SMALL()
|
||||
#define XE_MSVC_OPTIMIZE_REVERT()
|
||||
#endif
|
||||
#if XE_COMPILER_HAS_GNU_EXTENSIONS == 1
|
||||
#define XE_LIKELY_IF(...) if (XE_LIKELY(__VA_ARGS__))
|
||||
#define XE_UNLIKELY_IF(...) if (XE_UNLIKELY(__VA_ARGS__))
|
||||
|
@ -180,7 +186,7 @@ const char kPathSeparator = '/';
|
|||
const char kGuestPathSeparator = '\\';
|
||||
|
||||
} // namespace xe
|
||||
#if XE_ARCH_AMD64==1
|
||||
#if XE_ARCH_AMD64 == 1
|
||||
#include "platform_amd64.h"
|
||||
#endif
|
||||
#endif // XENIA_BASE_PLATFORM_H_
|
||||
|
|
|
@ -7,13 +7,12 @@
|
|||
******************************************************************************
|
||||
*/
|
||||
|
||||
|
||||
#include "xenia/base/cvar.h"
|
||||
#include "xenia/base/platform.h"
|
||||
|
||||
#include "third_party/xbyak/xbyak/xbyak.h"
|
||||
#include "third_party/xbyak/xbyak/xbyak_util.h"
|
||||
DEFINE_int32(x64_extension_mask, -1,
|
||||
DEFINE_int64(x64_extension_mask, -1LL,
|
||||
"Allow the detection and utilization of specific instruction set "
|
||||
"features.\n"
|
||||
" 0 = x86_64 + AVX1\n"
|
||||
|
@ -33,79 +32,92 @@ DEFINE_int32(x64_extension_mask, -1,
|
|||
"x64");
|
||||
namespace xe {
|
||||
namespace amd64 {
|
||||
static uint32_t g_feature_flags = 0U;
|
||||
static uint64_t g_feature_flags = 0U;
|
||||
static bool g_did_initialize_feature_flags = false;
|
||||
uint32_t GetFeatureFlags() {
|
||||
xenia_assert(g_did_initialize_feature_flags);
|
||||
return g_feature_flags;
|
||||
uint64_t GetFeatureFlags() {
|
||||
xenia_assert(g_did_initialize_feature_flags);
|
||||
return g_feature_flags;
|
||||
}
|
||||
XE_COLD
|
||||
XE_NOINLINE
|
||||
void InitFeatureFlags() {
|
||||
uint32_t feature_flags_ = 0U;
|
||||
|
||||
Xbyak::util::Cpu cpu_;
|
||||
uint64_t feature_flags_ = 0U;
|
||||
{
|
||||
Xbyak::util::Cpu cpu_;
|
||||
#define TEST_EMIT_FEATURE(emit, ext) \
|
||||
if ((cvars::x64_extension_mask & emit) == emit) { \
|
||||
feature_flags_ |= (cpu_.has(ext) ? emit : 0); \
|
||||
}
|
||||
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2);
|
||||
TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA);
|
||||
TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
|
||||
TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
|
||||
TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
|
||||
TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
|
||||
TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
|
||||
TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2);
|
||||
TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA);
|
||||
TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
|
||||
TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
|
||||
TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
|
||||
TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
|
||||
TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
|
||||
TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
|
||||
#undef TEST_EMIT_FEATURE
|
||||
/*
|
||||
fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
|
||||
latest version of xbyak
|
||||
*/
|
||||
unsigned int data[4];
|
||||
Xbyak::util::Cpu::getCpuid(0x80000001, data);
|
||||
unsigned amd_flags = data[2];
|
||||
if (amd_flags & (1U << 5)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
|
||||
feature_flags_ |= kX64EmitLZCNT;
|
||||
}
|
||||
}
|
||||
// todo: although not reported by cpuid, zen 1 and zen+ also have fma4
|
||||
if (amd_flags & (1U << 16)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
|
||||
feature_flags_ |= kX64EmitFMA4;
|
||||
}
|
||||
}
|
||||
if (amd_flags & (1U << 21)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
|
||||
feature_flags_ |= kX64EmitTBM;
|
||||
}
|
||||
}
|
||||
if (amd_flags & (1U << 11)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
|
||||
feature_flags_ |= kX64EmitXOP;
|
||||
}
|
||||
}
|
||||
if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
|
||||
bool is_zennish = cpu_.displayFamily >= 0x17;
|
||||
/*
|
||||
chrispy: according to agner's tables, all amd architectures that
|
||||
we support (ones with avx) have the same timings for
|
||||
jrcxz/loop/loope/loopne as for other jmps
|
||||
*/
|
||||
feature_flags_ |= kX64FastJrcx;
|
||||
feature_flags_ |= kX64FastLoop;
|
||||
if (is_zennish) {
|
||||
// ik that i heard somewhere that this is the case for zen, but i need to
|
||||
// verify. cant find my original source for that.
|
||||
// todo: ask agner?
|
||||
feature_flags_ |= kX64FlagsIndependentVars;
|
||||
fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
|
||||
latest version of xbyak
|
||||
*/
|
||||
unsigned int data[4];
|
||||
Xbyak::util::Cpu::getCpuid(0x80000001, data);
|
||||
unsigned amd_flags = data[2];
|
||||
if (amd_flags & (1U << 5)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
|
||||
feature_flags_ |= kX64EmitLZCNT;
|
||||
}
|
||||
}
|
||||
// todo: although not reported by cpuid, zen 1 and zen+ also have fma4
|
||||
if (amd_flags & (1U << 16)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
|
||||
feature_flags_ |= kX64EmitFMA4;
|
||||
}
|
||||
}
|
||||
if (amd_flags & (1U << 21)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
|
||||
feature_flags_ |= kX64EmitTBM;
|
||||
}
|
||||
}
|
||||
if (amd_flags & (1U << 11)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
|
||||
feature_flags_ |= kX64EmitXOP;
|
||||
}
|
||||
}
|
||||
if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
|
||||
bool is_zennish = cpu_.displayFamily >= 0x17;
|
||||
/*
|
||||
chrispy: according to agner's tables, all amd architectures
|
||||
that we support (ones with avx) have the same timings for
|
||||
jrcxz/loop/loope/loopne as for other jmps
|
||||
*/
|
||||
feature_flags_ |= kX64FastJrcx;
|
||||
feature_flags_ |= kX64FastLoop;
|
||||
if (is_zennish) {
|
||||
// ik that i heard somewhere that this is the case for zen, but i need
|
||||
// to verify. cant find my original source for that. todo: ask agner?
|
||||
feature_flags_ |= kX64FlagsIndependentVars;
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
unsigned int data[4];
|
||||
memset(data, 0, sizeof(data));
|
||||
// intel extended features
|
||||
Xbyak::util::Cpu::getCpuidEx(7, 0, data);
|
||||
if ((data[2] & (1 << 28)) &&
|
||||
(cvars::x64_extension_mask & kX64EmitMovdir64M)) {
|
||||
feature_flags_ |= kX64EmitMovdir64M;
|
||||
}
|
||||
if ((data[1] & (1 << 9)) && (cvars::x64_extension_mask & kX64FastRepMovs)) {
|
||||
feature_flags_ |= kX64FastRepMovs;
|
||||
}
|
||||
}
|
||||
g_feature_flags = feature_flags_;
|
||||
|
|
|
@ -13,7 +13,7 @@

namespace xe {
namespace amd64 {
enum X64FeatureFlags {
enum X64FeatureFlags : uint64_t {
  kX64EmitAVX2 = 1 << 0,
  kX64EmitFMA = 1 << 1,
  kX64EmitLZCNT = 1 << 2,  // this is actually ABM and includes popcount

@ -44,14 +44,13 @@ enum X64FeatureFlags {
  // instructions, and FX users need the boost
  kX64EmitFMA4 = 1 << 17,  // todo: also use on zen1?
  kX64EmitTBM = 1 << 18,
  // kX64XMMRegisterMergeOptimization = 1 << 19, //section 2.11.5, amd family
  // 17h/19h optimization manuals. allows us to save 1 byte on certain xmm
  // instructions by using the legacy sse version if we recently cleared the
  // high 128 bits of the
  kX64EmitMovdir64M = 1 << 19,
  kX64FastRepMovs = 1 << 20

};

XE_NOALIAS
uint32_t GetFeatureFlags();
uint64_t GetFeatureFlags();
XE_COLD
void InitFeatureFlags();
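Widening the enum's underlying type to uint64_t (and GetFeatureFlags() to match) leaves headroom for feature bits past bit 31. A small illustrative reminder of the shift rule that matters once that headroom is used; the names here are examples, not flags from this commit:

#include <cstdint>

enum ExampleFeatureFlags : uint64_t {
  kExampleOldStyle = 1 << 20,    // fine: still fits in a signed int shift
  kExampleHighBit = 1ull << 40,  // needs an unsigned 64-bit literal, otherwise
                                 // the shift happens in 32-bit int and overflows
};

static_assert(sizeof(ExampleFeatureFlags) == sizeof(uint64_t),
              "the flag set is carried in a 64-bit mask");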
@ -299,6 +299,12 @@ class Event : public WaitHandle {
  // the nonsignaled state after releasing the appropriate number of waiting
  // threads.
  virtual void Pulse() = 0;
#if XE_PLATFORM_WIN32 == 1
  // SetEvent, but if there is a waiter we immediately transfer execution to it
  virtual void SetBoostPriority() = 0;
#else
  void SetBoostPriority() { Set(); }
#endif
};

// Models a Win32-like semaphore object.
@ -39,6 +39,8 @@ XE_NTDLL_IMPORT(NtWaitForSingleObject, cls_NtWaitForSingleObject,
|
|||
NtWaitForSingleObjectPointer);
|
||||
|
||||
XE_NTDLL_IMPORT(NtSetEvent, cls_NtSetEvent, NtSetEventPointer);
|
||||
XE_NTDLL_IMPORT(NtSetEventBoostPriority, cls_NtSetEventBoostPriority,
|
||||
NtSetEventBoostPriorityPointer);
|
||||
// difference between NtClearEvent and NtResetEvent is that NtResetEvent returns
|
||||
// the events state prior to the call, but we dont need that. might need to
|
||||
// check whether one or the other is faster in the kernel though yeah, just
|
||||
|
@ -53,6 +55,7 @@ XE_NTDLL_IMPORT(NtReleaseSemaphore, cls_NtReleaseSemaphore,
|
|||
|
||||
XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
|
||||
NtDelayExecutionPointer);
|
||||
|
||||
namespace xe {
|
||||
namespace threading {
|
||||
|
||||
|
@ -137,7 +140,7 @@ void MaybeYield() {
|
|||
#endif
|
||||
#endif
|
||||
// memorybarrier is really not necessary here...
|
||||
MemoryBarrier();
|
||||
// MemoryBarrier();
|
||||
}
|
||||
|
||||
void SyncMemory() { MemoryBarrier(); }
|
||||
|
@ -288,11 +291,19 @@ class Win32Event : public Win32Handle<Event> {
|
|||
void Set() override { NtSetEventPointer.invoke(handle_, nullptr); }
|
||||
void Reset() override { NtClearEventPointer.invoke(handle_); }
|
||||
void Pulse() override { NtPulseEventPointer.invoke(handle_, nullptr); }
|
||||
void SetBoostPriority() override {
|
||||
// no previous state for boostpriority
|
||||
NtSetEventBoostPriorityPointer.invoke(handle_);
|
||||
}
|
||||
#else
|
||||
void Set() override { SetEvent(handle_); }
|
||||
void Reset() override { ResetEvent(handle_); }
|
||||
void Pulse() override { PulseEvent(handle_); }
|
||||
|
||||
void SetBoostPriority() override {
|
||||
// no win32 version of boostpriority
|
||||
SetEvent(handle_);
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
|
|
|
@ -23,7 +23,7 @@
|
|||
#define XE_X64_PROFILER_AVAILABLE 1
|
||||
#endif
|
||||
|
||||
DECLARE_int32(x64_extension_mask);
|
||||
DECLARE_int64(x64_extension_mask);
|
||||
|
||||
namespace xe {
|
||||
class Exception;
|
||||
|
|
|
@ -103,74 +103,9 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
|
|||
"FAQ for system requirements at https://xenia.jp");
|
||||
return;
|
||||
}
|
||||
#if 1
|
||||
feature_flags_ = amd64::GetFeatureFlags();
|
||||
#else
|
||||
#define TEST_EMIT_FEATURE(emit, ext) \
|
||||
if ((cvars::x64_extension_mask & emit) == emit) { \
|
||||
feature_flags_ |= (cpu_.has(ext) ? emit : 0); \
|
||||
}
|
||||
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2);
|
||||
TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA);
|
||||
TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
|
||||
TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
|
||||
TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
|
||||
TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
|
||||
TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
|
||||
TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
|
||||
TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
|
||||
#undef TEST_EMIT_FEATURE
|
||||
/*
|
||||
fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
|
||||
latest version of xbyak
|
||||
*/
|
||||
unsigned int data[4];
|
||||
Xbyak::util::Cpu::getCpuid(0x80000001, data);
|
||||
unsigned amd_flags = data[2];
|
||||
if (amd_flags & (1U << 5)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
|
||||
feature_flags_ |= kX64EmitLZCNT;
|
||||
}
|
||||
}
|
||||
// todo: although not reported by cpuid, zen 1 and zen+ also have fma4
|
||||
if (amd_flags & (1U << 16)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
|
||||
feature_flags_ |= kX64EmitFMA4;
|
||||
}
|
||||
}
|
||||
if (amd_flags & (1U << 21)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
|
||||
feature_flags_ |= kX64EmitTBM;
|
||||
}
|
||||
}
|
||||
if (amd_flags & (1U << 11)) {
|
||||
if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
|
||||
feature_flags_ |= kX64EmitXOP;
|
||||
XELOGCPU("Cpu support XOP!\n\n");
|
||||
}
|
||||
}
|
||||
if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
|
||||
bool is_zennish = cpu_.displayFamily >= 0x17;
|
||||
/*
|
||||
chrispy: according to agner's tables, all amd architectures that
|
||||
we support (ones with avx) have the same timings for
|
||||
jrcxz/loop/loope/loopne as for other jmps
|
||||
*/
|
||||
feature_flags_ |= kX64FastJrcx;
|
||||
feature_flags_ |= kX64FastLoop;
|
||||
if (is_zennish) {
|
||||
// ik that i heard somewhere that this is the case for zen, but i need to
|
||||
// verify. cant find my original source for that.
|
||||
// todo: ask agner?
|
||||
feature_flags_ |= kX64FlagsIndependentVars;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
feature_flags_ = amd64::GetFeatureFlags();
|
||||
|
||||
may_use_membase32_as_zero_reg_ =
|
||||
static_cast<uint32_t>(reinterpret_cast<uintptr_t>(
|
||||
processor()->memory()->virtual_membase())) == 0;
|
||||
|
|
|
@ -299,7 +299,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
|
|||
void* FindWordConstantOffset(unsigned wordvalue);
|
||||
void* FindDwordConstantOffset(unsigned bytevalue);
|
||||
void* FindQwordConstantOffset(uint64_t bytevalue);
|
||||
bool IsFeatureEnabled(uint32_t feature_flag) const {
|
||||
bool IsFeatureEnabled(uint64_t feature_flag) const {
|
||||
return (feature_flags_ & feature_flag) == feature_flag;
|
||||
}
|
||||
|
||||
|
@ -395,7 +395,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
|
|||
XbyakAllocator* allocator_ = nullptr;
|
||||
XexModule* guest_module_ = nullptr;
|
||||
Xbyak::util::Cpu cpu_;
|
||||
uint32_t feature_flags_ = 0;
|
||||
uint64_t feature_flags_ = 0;
|
||||
uint32_t current_guest_function_ = 0;
|
||||
Xbyak::Label* epilog_label_ = nullptr;
|
||||
|
||||
|
|
|
@ -39,7 +39,7 @@
|
|||
#include "xenia/cpu/backend/x64/x64_stack_layout.h"
|
||||
#include "xenia/cpu/hir/hir_builder.h"
|
||||
#include "xenia/cpu/processor.h"
|
||||
|
||||
XE_MSVC_OPTIMIZE_SMALL()
|
||||
DEFINE_bool(use_fast_dot_product, false,
|
||||
"Experimental optimization, much shorter sequence on dot products, "
|
||||
"treating inf as overflow instead of using mcxsr"
|
||||
|
|
|
@ -19,16 +19,19 @@ EntryTable::EntryTable() = default;
|
|||
|
||||
EntryTable::~EntryTable() {
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
for (auto it : map_) {
|
||||
Entry* entry = it.second;
|
||||
for (auto it : map_.Values()) {
|
||||
Entry* entry = it;
|
||||
delete entry;
|
||||
}
|
||||
}
|
||||
|
||||
Entry* EntryTable::Get(uint32_t address) {
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
const auto& it = map_.find(address);
|
||||
Entry* entry = it != map_.end() ? it->second : nullptr;
|
||||
uint32_t idx = map_.IndexForKey(address);
|
||||
if (idx == map_.size() || *map_.KeyAt(idx) != address) {
|
||||
return nullptr;
|
||||
}
|
||||
Entry* entry = *map_.ValueAt(idx);
|
||||
if (entry) {
|
||||
// TODO(benvanik): wait if needed?
|
||||
if (entry->status != Entry::STATUS_READY) {
|
||||
|
@ -43,8 +46,12 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) {
|
|||
// https://github.com/facebook/folly/blob/master/folly/AtomicHashMap.h
|
||||
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
const auto& it = map_.find(address);
|
||||
Entry* entry = it != map_.end() ? it->second : nullptr;
|
||||
|
||||
uint32_t idx = map_.IndexForKey(address);
|
||||
|
||||
Entry* entry = idx != map_.size() && *map_.KeyAt(idx) == address
|
||||
? *map_.ValueAt(idx)
|
||||
: nullptr;
|
||||
Entry::Status status;
|
||||
if (entry) {
|
||||
// If we aren't ready yet spin and wait.
|
||||
|
@ -65,7 +72,8 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) {
|
|||
entry->end_address = 0;
|
||||
entry->status = Entry::STATUS_COMPILING;
|
||||
entry->function = 0;
|
||||
map_[address] = entry;
|
||||
map_.InsertAt(address, entry, idx);
|
||||
// map_[address] = entry;
|
||||
status = Entry::STATUS_NEW;
|
||||
}
|
||||
global_lock.unlock();
|
||||
|
@ -75,18 +83,18 @@ Entry::Status EntryTable::GetOrCreate(uint32_t address, Entry** out_entry) {
|
|||
|
||||
void EntryTable::Delete(uint32_t address) {
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
const auto itr = map_.find(address);
|
||||
|
||||
if (itr != map_.cend()) {
|
||||
map_.erase(itr);
|
||||
// doesnt this leak memory by not deleting the entry?
|
||||
uint32_t idx = map_.IndexForKey(address);
|
||||
if (idx != map_.size() && *map_.KeyAt(idx) == address) {
|
||||
map_.EraseAt(idx);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<Function*> EntryTable::FindWithAddress(uint32_t address) {
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
std::vector<Function*> fns;
|
||||
for (auto& it : map_) {
|
||||
Entry* entry = it.second;
|
||||
for (auto& it : map_.Values()) {
|
||||
Entry* entry = it;
|
||||
if (address >= entry->address && address <= entry->end_address) {
|
||||
if (entry->status == Entry::STATUS_READY) {
|
||||
fns.push_back(entry->function);
|
||||
|
@ -95,6 +103,5 @@ std::vector<Function*> EntryTable::FindWithAddress(uint32_t address) {
|
|||
}
|
||||
return fns;
|
||||
}
|
||||
|
||||
} // namespace cpu
|
||||
} // namespace xe
|
||||
|
|
|
@ -14,7 +14,7 @@
#include <vector>

#include "xenia/base/mutex.h"

#include "xenia/base/split_map.h"
namespace xe {
namespace cpu {

@ -48,7 +48,8 @@ class EntryTable {
 private:
  xe::global_critical_region global_critical_region_;
  // TODO(benvanik): replace with a better data structure.
  std::unordered_map<uint32_t, Entry*> map_;
  xe::split_map<uint32_t, Entry*> map_;
  // std::unordered_map<uint32_t, Entry*> map_;
};

}  // namespace cpu
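EntryTable now stores its entries in xe::split_map and looks them up through IndexForKey / KeyAt / ValueAt / InsertAt / EraseAt, which suggests keys and values kept in two parallel, key-sorted arrays searched by binary search. The actual container is defined in xenia/base/split_map.h; the sketch below only illustrates that assumed shape, not the project's implementation:

#include <algorithm>
#include <cstdint>
#include <vector>

template <typename K, typename V>
class split_map_sketch {
 public:
  uint32_t size() const { return uint32_t(keys_.size()); }
  // First index whose key is >= key; may equal size(). Callers verify the key
  // before touching the value, as the EntryTable diff does.
  uint32_t IndexForKey(const K& key) const {
    return uint32_t(std::lower_bound(keys_.begin(), keys_.end(), key) -
                    keys_.begin());
  }
  const K* KeyAt(uint32_t idx) const { return &keys_[idx]; }
  V* ValueAt(uint32_t idx) { return &values_[idx]; }
  void InsertAt(const K& key, const V& value, uint32_t idx) {
    keys_.insert(keys_.begin() + idx, key);
    values_.insert(values_.begin() + idx, value);
  }
  void EraseAt(uint32_t idx) {
    keys_.erase(keys_.begin() + idx);
    values_.erase(values_.begin() + idx);
  }
  std::vector<V>& Values() { return values_; }

 private:
  std::vector<K> keys_;    // sorted
  std::vector<V> values_;  // values_[i] belongs to keys_[i]
};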
@ -334,7 +334,7 @@ void CommandProcessor::EnableReadPointerWriteBack(uint32_t ptr,
|
|||
|
||||
void CommandProcessor::UpdateWritePointer(uint32_t value) {
|
||||
write_ptr_index_ = value;
|
||||
write_ptr_index_event_->Set();
|
||||
write_ptr_index_event_->SetBoostPriority();
|
||||
}
|
||||
void CommandProcessor::HandleSpecialRegisterWrite(uint32_t index,
|
||||
uint32_t value) {
|
||||
|
@ -665,6 +665,11 @@ uint32_t CommandProcessor::ExecutePrimaryBuffer(uint32_t read_index,
|
|||
|
||||
reader_.set_read_offset(read_index * sizeof(uint32_t));
|
||||
reader_.set_write_offset(write_index * sizeof(uint32_t));
|
||||
// prefetch the wraparound range
|
||||
// it likely is already in L3 cache, but in a zen system it may be another
|
||||
// chiplets l3
|
||||
reader_.BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
|
||||
GetCurrentRingReadCount());
|
||||
do {
|
||||
if (!ExecutePacket()) {
|
||||
// This probably should be fatal - but we're going to continue anyways.
|
||||
|
|
File diff suppressed because it is too large
@ -45,7 +45,10 @@
|
|||
namespace xe {
|
||||
namespace gpu {
|
||||
namespace d3d12 {
|
||||
|
||||
struct MemExportRange {
|
||||
uint32_t base_address_dwords;
|
||||
uint32_t size_dwords;
|
||||
};
|
||||
class D3D12CommandProcessor final : public CommandProcessor {
|
||||
public:
|
||||
#include "../pm4_command_processor_declare.h"
|
||||
|
@ -287,8 +290,21 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
|||
bool IssueDraw(xenos::PrimitiveType primitive_type, uint32_t index_count,
|
||||
IndexBufferInfo* index_buffer_info,
|
||||
bool major_mode_explicit) override;
|
||||
XE_COLD
|
||||
XE_NOINLINE
|
||||
bool HandleMemexportGuestDMA(ID3D12Resource*& scratch_index_buffer,
|
||||
D3D12_INDEX_BUFFER_VIEW& index_buffer_view,
|
||||
uint32_t guest_index_base,
|
||||
bool& retflag);
|
||||
XE_NOINLINE
|
||||
XE_COLD
|
||||
bool GatherMemexportRangesAndMakeResident(bool& retflag);
|
||||
XE_NOINLINE
|
||||
XE_COLD
|
||||
void HandleMemexportDrawOrdering_AndReadback();
|
||||
bool IssueCopy() override;
|
||||
|
||||
XE_NOINLINE
|
||||
bool IssueCopy_ReadbackResolvePath();
|
||||
void InitializeTrace() override;
|
||||
|
||||
private:
|
||||
|
@ -363,6 +379,8 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
|||
};
|
||||
// Gets the indices of optional root parameters. Returns the total parameter
|
||||
// count.
|
||||
XE_NOINLINE
|
||||
XE_COLD
|
||||
static uint32_t GetRootBindfulExtraParameterIndices(
|
||||
const DxbcShader* vertex_shader, const DxbcShader* pixel_shader,
|
||||
RootBindfulExtraParameterIndices& indices_out);
|
||||
|
@ -437,6 +455,18 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
|||
bool UpdateBindings(const D3D12Shader* vertex_shader,
|
||||
const D3D12Shader* pixel_shader,
|
||||
ID3D12RootSignature* root_signature);
|
||||
XE_COLD
|
||||
XE_NOINLINE
|
||||
void UpdateBindings_UpdateRootBindful();
|
||||
XE_NOINLINE
|
||||
XE_COLD
|
||||
bool UpdateBindings_BindfulPath(
|
||||
const size_t texture_layout_uid_vertex,
|
||||
const std::vector<xe::gpu::DxbcShader::TextureBinding>& textures_vertex,
|
||||
const size_t texture_layout_uid_pixel,
|
||||
const std::vector<xe::gpu::DxbcShader::TextureBinding>* textures_pixel,
|
||||
const size_t sampler_count_vertex, const size_t sampler_count_pixel,
|
||||
bool& retflag);
|
||||
|
||||
// Returns dword count for one element for a memexport format, or 0 if it's
|
||||
// not supported by the D3D12 command processor (if it's smaller that 1 dword,
|
||||
|
@ -743,6 +773,9 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
|||
|
||||
draw_util::GetViewportInfoArgs previous_viewport_info_args_;
|
||||
draw_util::ViewportInfo previous_viewport_info_;
|
||||
// scratch memexport data
|
||||
MemExportRange memexport_ranges_[512];
|
||||
uint32_t memexport_range_count_ = 0;
|
||||
};
|
||||
|
||||
} // namespace d3d12
|
||||
|
|
|
@ -266,22 +266,9 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list,
|
|||
|
||||
void* DeferredCommandList::WriteCommand(Command command,
|
||||
size_t arguments_size_bytes) {
|
||||
|
||||
size_t arguments_size_elements =
|
||||
round_up(arguments_size_bytes, sizeof(uintmax_t), false);
|
||||
|
||||
//(arguments_size_bytes + sizeof(uintmax_t) - 1) / sizeof(uintmax_t);
|
||||
#if 0
|
||||
size_t offset = command_stream_.size();
|
||||
command_stream_.resize(offset + kCommandHeaderSizeElements +
|
||||
arguments_size_elements);
|
||||
CommandHeader& header =
|
||||
*reinterpret_cast<CommandHeader*>(command_stream_.data() + offset);
|
||||
header.command = command;
|
||||
header.arguments_size_elements = uint32_t(arguments_size_elements);
|
||||
return command_stream_.data() + (offset + kCommandHeaderSizeElements);
|
||||
#else
|
||||
|
||||
size_t offset = command_stream_.size();
|
||||
constexpr size_t kCommandHeaderSizeBytes =
|
||||
kCommandHeaderSizeElements * sizeof(uintmax_t);
|
||||
|
@ -290,9 +277,9 @@ void* DeferredCommandList::WriteCommand(Command command,
|
|||
CommandHeader& header =
|
||||
*reinterpret_cast<CommandHeader*>(command_stream_.data() + offset);
|
||||
header.command = command;
|
||||
header.arguments_size_elements = uint32_t(arguments_size_elements) / sizeof(uintmax_t);
|
||||
header.arguments_size_elements =
|
||||
uint32_t(arguments_size_elements) / sizeof(uintmax_t);
|
||||
return command_stream_.data() + (offset + kCommandHeaderSizeBytes);
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace d3d12
|
||||
|
|
|
@ -183,7 +183,7 @@ void PipelineCache::Shutdown() {
|
|||
// creating them.
|
||||
if (!creation_threads_.empty()) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
creation_threads_shutdown_from_ = 0;
|
||||
}
|
||||
creation_request_cond_.notify_all();
|
||||
|
@ -681,7 +681,7 @@ void PipelineCache::InitializeShaderStorage(
|
|||
if (!creation_threads_.empty()) {
|
||||
// Submit the pipeline for creation to any available thread.
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
creation_queue_.push_back(new_pipeline);
|
||||
}
|
||||
creation_request_cond_.notify_one();
|
||||
|
@ -695,7 +695,7 @@ void PipelineCache::InitializeShaderStorage(
|
|||
CreateQueuedPipelinesOnProcessorThread();
|
||||
if (creation_threads_.size() > creation_thread_original_count) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
creation_threads_shutdown_from_ = creation_thread_original_count;
|
||||
// Assuming the queue is empty because of
|
||||
// CreateQueuedPipelinesOnProcessorThread.
|
||||
|
@ -708,7 +708,7 @@ void PipelineCache::InitializeShaderStorage(
|
|||
bool await_creation_completion_event;
|
||||
{
|
||||
// Cleanup so additional threads can be created later again.
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
creation_threads_shutdown_from_ = SIZE_MAX;
|
||||
// If the invocation is blocking, all the shader storage
|
||||
// initialization is expected to be done before proceeding, to avoid
|
||||
|
@ -813,7 +813,7 @@ void PipelineCache::EndSubmission() {
|
|||
// Await creation of all queued pipelines.
|
||||
bool await_creation_completion_event;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
// Assuming the creation queue is already empty (because the processor
|
||||
// thread also worked on creating the leftover pipelines), so only check
|
||||
// if there are threads with pipelines currently being created.
|
||||
|
@ -834,7 +834,7 @@ bool PipelineCache::IsCreatingPipelines() {
|
|||
if (creation_threads_.empty()) {
|
||||
return false;
|
||||
}
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
return !creation_queue_.empty() || creation_threads_busy_ != 0;
|
||||
}
|
||||
|
||||
|
@ -1076,7 +1076,7 @@ bool PipelineCache::ConfigurePipeline(
|
|||
if (!creation_threads_.empty()) {
|
||||
// Submit the pipeline for creation to any available thread.
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
creation_queue_.push_back(new_pipeline);
|
||||
}
|
||||
creation_request_cond_.notify_one();
|
||||
|
@ -3314,7 +3314,7 @@ void PipelineCache::CreationThread(size_t thread_index) {
|
|||
// Check if need to shut down or set the completion event and dequeue the
|
||||
// pipeline if there is any.
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(creation_request_lock_);
|
||||
std::unique_lock<xe_mutex> lock(creation_request_lock_);
|
||||
if (thread_index >= creation_threads_shutdown_from_ ||
|
||||
creation_queue_.empty()) {
|
||||
if (creation_completion_set_event_ && creation_threads_busy_ == 0) {
|
||||
|
@ -3345,7 +3345,7 @@ void PipelineCache::CreationThread(size_t thread_index) {
|
|||
// completion event if needed (at the next iteration, or in some other
|
||||
// thread).
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
--creation_threads_busy_;
|
||||
}
|
||||
}
|
||||
|
@ -3356,7 +3356,7 @@ void PipelineCache::CreateQueuedPipelinesOnProcessorThread() {
|
|||
while (true) {
|
||||
Pipeline* pipeline_to_create;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(creation_request_lock_);
|
||||
std::lock_guard<xe_mutex> lock(creation_request_lock_);
|
||||
if (creation_queue_.empty()) {
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -403,8 +403,8 @@ class PipelineCache {
  // Pipeline creation threads.
  void CreationThread(size_t thread_index);
  void CreateQueuedPipelinesOnProcessorThread();
  std::mutex creation_request_lock_;
  std::condition_variable creation_request_cond_;
  xe_mutex creation_request_lock_;
  std::condition_variable_any creation_request_cond_;
  // Protected with creation_request_lock_, notify_one creation_request_cond_
  // when set.
  std::deque<Pipeline*> creation_queue_;
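Replacing std::mutex with xe_mutex is what forces creation_request_cond_ to become std::condition_variable_any: the plain std::condition_variable only accepts std::unique_lock<std::mutex>, while the _any variant works with any BasicLockable type. A minimal sketch of the pairing, with a stand-in mutex type in place of the project's xe_mutex:

#include <condition_variable>
#include <deque>
#include <mutex>

struct CustomMutex {  // stand-in for xe_mutex: any lock()/unlock() type works
  void lock() { inner.lock(); }
  void unlock() { inner.unlock(); }
  std::mutex inner;
};

static CustomMutex queue_lock;
static std::condition_variable_any queue_cond;
static std::deque<int> work_queue;

void Push(int item) {
  {
    std::lock_guard<CustomMutex> lock(queue_lock);
    work_queue.push_back(item);
  }
  queue_cond.notify_one();  // notify outside the lock, as the pipeline cache does
}

int PopBlocking() {
  std::unique_lock<CustomMutex> lock(queue_lock);
  queue_cond.wait(lock, [] { return !work_queue.empty(); });
  int item = work_queue.front();
  work_queue.pop_front();
  return item;
}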
@ -650,7 +650,8 @@ uint32_t GetNormalizedColorMask(const RegisterFile& regs,
|
|||
}
|
||||
return normalized_color_mask;
|
||||
}
|
||||
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
xenos::CopySampleSelect SanitizeCopySampleSelect(
|
||||
xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples,
|
||||
bool is_depth) {
|
||||
|
@ -737,7 +738,7 @@ const ResolveCopyShaderInfo
|
|||
{"Resolve Copy Full 64bpp", true, 2, 4, 5, 3},
|
||||
{"Resolve Copy Full 128bpp", true, 2, 4, 4, 3},
|
||||
};
|
||||
|
||||
XE_MSVC_OPTIMIZE_SMALL()
|
||||
bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
|
||||
TraceWriter& trace_writer, uint32_t draw_resolution_scale_x,
|
||||
uint32_t draw_resolution_scale_y,
|
||||
|
@ -869,7 +870,8 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
|
|||
y1 = y0 + int32_t(xenos::kMaxResolveSize);
|
||||
}
|
||||
// fails in forza horizon 1
|
||||
assert_true(x0 < x1 && y0 < y1);
|
||||
//x0 is 0, x1 is 0x100, y0 is 0x100, y1 is 0x100
|
||||
assert_true(x0 <= x1 && y0 <= y1);
|
||||
if (x0 >= x1 || y0 >= y1) {
|
||||
XELOGE("Resolve region is empty");
|
||||
return false;
|
||||
|
@ -1108,7 +1110,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
|
|||
info_out.rb_depth_clear = regs[XE_GPU_REG_RB_DEPTH_CLEAR].u32;
|
||||
info_out.rb_color_clear = regs[XE_GPU_REG_RB_COLOR_CLEAR].u32;
|
||||
info_out.rb_color_clear_lo = regs[XE_GPU_REG_RB_COLOR_CLEAR_LO].u32;
|
||||
|
||||
#if 0
|
||||
XELOGD(
|
||||
"Resolve: {},{} <= x,y < {},{}, {} -> {} at 0x{:08X} (potentially "
|
||||
"modified memory range 0x{:08X} to 0x{:08X})",
|
||||
|
@ -1119,10 +1121,10 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
|
|||
xenos::ColorRenderTargetFormat(color_edram_info.format)),
|
||||
FormatInfo::GetName(dest_format), rb_copy_dest_base, copy_dest_extent_start,
|
||||
copy_dest_extent_end);
|
||||
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
XE_MSVC_OPTIMIZE_REVERT()
|
||||
ResolveCopyShaderIndex ResolveInfo::GetCopyShader(
|
||||
uint32_t draw_resolution_scale_x, uint32_t draw_resolution_scale_y,
|
||||
ResolveCopyShaderConstants& constants_out, uint32_t& group_count_x_out,
|
||||
|
|
|
@ -475,6 +475,8 @@ inline uint32_t GetD3D10SampleIndexForGuest2xMSAA(
|
|||
|
||||
// To avoid passing values that the shader won't understand (even though
|
||||
// Direct3D 9 shouldn't pass them anyway).
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
xenos::CopySampleSelect SanitizeCopySampleSelect(
|
||||
xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples,
|
||||
bool is_depth);
|
||||
|
|
|
@ -14,6 +14,11 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr,
  new (&reader_)
      RingBuffer(memory_->TranslatePhysical(ptr), count * sizeof(uint32_t));
  reader_.set_write_offset(count * sizeof(uint32_t));
  // prefetch the wraparound range
  // it likely is already in L3 cache, but in a zen system it may be another
  // chiplets l3
  reader_.BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
      COMMAND_PROCESSOR::GetCurrentRingReadCount());
  do {
    if (COMMAND_PROCESSOR::ExecutePacket()) {
      continue;
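The comment states the intent: the readable span of the ring can wrap past the end of the buffer, and on multi-chiplet Zen parts the wrapped portion may only be resident in another CCD's L3, so it is prefetched up front. RingBuffer::BeginPrefetchedRead and the swcache tags are project wrappers; a generic sketch of prefetching a possibly-wrapped byte range looks roughly like this:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <xmmintrin.h>

constexpr size_t kCacheLine = 64;

// Prefetch a contiguous span one cache line at a time into an L2-ish level.
static void PrefetchSpan(const uint8_t* base, size_t length) {
  for (size_t offset = 0; offset < length; offset += kCacheLine) {
    _mm_prefetch(reinterpret_cast<const char*>(base + offset), _MM_HINT_T1);
  }
}

void PrefetchRingRange(const uint8_t* buffer, size_t capacity,
                       size_t read_offset, size_t readable_bytes) {
  size_t first = std::min(readable_bytes, capacity - read_offset);
  PrefetchSpan(buffer + read_offset, first);
  if (readable_bytes > first) {
    // The wrapped part starts back at the beginning of the ring.
    PrefetchSpan(buffer, readable_bytes - first);
  }
}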
@ -30,11 +35,6 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr,
|
|||
}
|
||||
|
||||
bool COMMAND_PROCESSOR::ExecutePacket() {
|
||||
// prefetch the wraparound range
|
||||
// it likely is already in L3 cache, but in a zen system it may be another
|
||||
// chiplets l3
|
||||
reader_.BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
|
||||
COMMAND_PROCESSOR::GetCurrentRingReadCount());
|
||||
const uint32_t packet = reader_.ReadAndSwap<uint32_t>();
|
||||
const uint32_t packet_type = packet >> 30;
|
||||
|
||||
|
@ -495,7 +495,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM(
|
|||
} else {
|
||||
xe::threading::Sleep(std::chrono::milliseconds(wait / 0x100));
|
||||
}
|
||||
xe::threading::SyncMemory();
|
||||
// xe::threading::SyncMemory();
|
||||
ReturnFromWait();
|
||||
|
||||
if (!worker_running_) {
|
||||
|
@ -599,27 +599,28 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_COND_WRITE(
    value = register_file_->values[poll_reg_addr].u32;
  }
  bool matched = false;
  value &= mask;
  switch (wait_info & 0x7) {
    case 0x0:  // Never.
      matched = false;
      break;
    case 0x1:  // Less than reference.
      matched = (value & mask) < ref;
      matched = value < ref;
      break;
    case 0x2:  // Less than or equal to reference.
      matched = (value & mask) <= ref;
      matched = value <= ref;
      break;
    case 0x3:  // Equal to reference.
      matched = (value & mask) == ref;
      matched = value == ref;
      break;
    case 0x4:  // Not equal to reference.
      matched = (value & mask) != ref;
      matched = value != ref;
      break;
    case 0x5:  // Greater than or equal to reference.
      matched = (value & mask) >= ref;
      matched = value >= ref;
      break;
    case 0x6:  // Greater than reference.
      matched = (value & mask) > ref;
      matched = value > ref;
      break;
    case 0x7:  // Always
      matched = true;
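The rewrite hoists the masking out of the switch: applying value &= mask once up front is equivalent to masking inside every arm, because value is never used unmasked afterwards. A quick illustrative check of that equivalence over all eight comparator encodings used above:

#include <cassert>
#include <cstdint>

static bool CompareMaskedEach(uint32_t value, uint32_t mask, uint32_t ref,
                              uint32_t op) {
  switch (op & 0x7) {
    case 0x0: return false;
    case 0x1: return (value & mask) < ref;
    case 0x2: return (value & mask) <= ref;
    case 0x3: return (value & mask) == ref;
    case 0x4: return (value & mask) != ref;
    case 0x5: return (value & mask) >= ref;
    case 0x6: return (value & mask) > ref;
    default:  return true;
  }
}

static bool CompareMaskedOnce(uint32_t value, uint32_t mask, uint32_t ref,
                              uint32_t op) {
  value &= mask;  // mask applied exactly once, as in the new code
  switch (op & 0x7) {
    case 0x0: return false;
    case 0x1: return value < ref;
    case 0x2: return value <= ref;
    case 0x3: return value == ref;
    case 0x4: return value != ref;
    case 0x5: return value >= ref;
    case 0x6: return value > ref;
    default:  return true;
  }
}

void CheckCondWriteEquivalence() {
  for (uint32_t op = 0; op < 8; ++op) {
    assert(CompareMaskedEach(0xDEADBEEF, 0x00FF00FF, 0x00AD00EF, op) ==
           CompareMaskedOnce(0xDEADBEEF, 0x00FF00FF, 0x00AD00EF, op));
  }
}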
@ -1064,7 +1065,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_IM_LOAD_IMMEDIATE(
|
|||
assert_true(count - 2 >= size_dwords);
|
||||
auto shader = COMMAND_PROCESSOR::LoadShader(
|
||||
shader_type, uint32_t(reader_.read_ptr()),
|
||||
reinterpret_cast<uint32_t*>(reader_.read_ptr()), size_dwords);
|
||||
reinterpret_cast<uint32_t*>(reader_.read_ptr()), size_dwords);
|
||||
switch (shader_type) {
|
||||
case xenos::ShaderType::kVertex:
|
||||
active_vertex_shader_ = shader;
|
||||
|
|
|
@ -430,7 +430,7 @@ class PrimitiveProcessor {
|
|||
--count;
|
||||
uint32_t index = *(source++) & low_bits_mask_guest_endian;
|
||||
*(dest++) = index != reset_index_guest_endian
|
||||
? xenos::GpuSwap(index, HostSwap)
|
||||
? xenos::GpuSwapInline(index, HostSwap)
|
||||
: UINT32_MAX;
|
||||
}
|
||||
if (count >= kSimdVectorU32Elements) {
|
||||
|
@ -442,10 +442,10 @@ class PrimitiveProcessor {
|
|||
__m128i host_swap_shuffle;
|
||||
if constexpr (HostSwap != xenos::Endian::kNone) {
|
||||
host_swap_shuffle = _mm_set_epi32(
|
||||
int32_t(xenos::GpuSwap(uint32_t(0x0F0E0D0C), HostSwap)),
|
||||
int32_t(xenos::GpuSwap(uint32_t(0x0B0A0908), HostSwap)),
|
||||
int32_t(xenos::GpuSwap(uint32_t(0x07060504), HostSwap)),
|
||||
int32_t(xenos::GpuSwap(uint32_t(0x03020100), HostSwap)));
|
||||
int32_t(xenos::GpuSwapInline(uint32_t(0x0F0E0D0C), HostSwap)),
|
||||
int32_t(xenos::GpuSwapInline(uint32_t(0x0B0A0908), HostSwap)),
|
||||
int32_t(xenos::GpuSwapInline(uint32_t(0x07060504), HostSwap)),
|
||||
int32_t(xenos::GpuSwapInline(uint32_t(0x03020100), HostSwap)));
|
||||
}
|
||||
#endif // XE_ARCH_AMD64
|
||||
while (count >= kSimdVectorU32Elements) {
|
||||
|
@ -490,7 +490,7 @@ class PrimitiveProcessor {
|
|||
while (count--) {
|
||||
uint32_t index = *(source++) & low_bits_mask_guest_endian;
|
||||
*(dest++) = index != reset_index_guest_endian
|
||||
? xenos::GpuSwap(index, HostSwap)
|
||||
? xenos::GpuSwapInline(index, HostSwap)
|
||||
: UINT32_MAX;
|
||||
}
|
||||
}
|
||||
|
@ -510,19 +510,19 @@ class PrimitiveProcessor {
|
|||
};
|
||||
struct To24Swapping8In16IndexTransform {
|
||||
uint32_t operator()(uint32_t index) const {
|
||||
return xenos::GpuSwap(index, xenos::Endian::k8in16) &
|
||||
return xenos::GpuSwapInline(index, xenos::Endian::k8in16) &
|
||||
xenos::kVertexIndexMask;
|
||||
}
|
||||
};
|
||||
struct To24Swapping8In32IndexTransform {
|
||||
uint32_t operator()(uint32_t index) const {
|
||||
return xenos::GpuSwap(index, xenos::Endian::k8in32) &
|
||||
return xenos::GpuSwapInline(index, xenos::Endian::k8in32) &
|
||||
xenos::kVertexIndexMask;
|
||||
}
|
||||
};
|
||||
struct To24Swapping16In32IndexTransform {
|
||||
uint32_t operator()(uint32_t index) const {
|
||||
return xenos::GpuSwap(index, xenos::Endian::k16in32) &
|
||||
return xenos::GpuSwapInline(index, xenos::Endian::k16in32) &
|
||||
xenos::kVertexIndexMask;
|
||||
}
|
||||
};
|
||||
|
|
|
@ -388,6 +388,7 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length,
|
|||
|
||||
bool any_data_resolved = false;
|
||||
uint32_t block_first = page_first >> 6;
|
||||
swcache::PrefetchL1(&system_page_flags_[block_first]);
|
||||
uint32_t block_last = page_last >> 6;
|
||||
uint32_t range_start = UINT32_MAX;
|
||||
|
||||
|
|
|
@ -464,7 +464,8 @@ TextureGuestLayout GetGuestTextureLayout(
|
|||
|
||||
return layout;
|
||||
}
|
||||
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch,
|
||||
uint32_t bytes_per_block_log2) {
|
||||
// https://github.com/gildor2/UModel/blob/de8fbd3bc922427ea056b7340202dcdcc19ccff5/Unreal/UnTexture.cpp#L489
|
||||
|
@ -481,7 +482,8 @@ int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch,
|
|||
return ((offset & ~0x1FF) << 3) + ((y & 16) << 7) + ((offset & 0x1C0) << 2) +
|
||||
(((((y & 8) >> 2) + (x >> 3)) & 3) << 6) + (offset & 0x3F);
|
||||
}
|
||||
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
int32_t GetTiledOffset3D(int32_t x, int32_t y, int32_t z, uint32_t pitch,
|
||||
uint32_t height, uint32_t bytes_per_block_log2) {
|
||||
// Reconstructed from disassembly of XGRAPHICS::TileVolume.
|
||||
|
@ -509,7 +511,8 @@ int32_t GetTiledOffset3D(int32_t x, int32_t y, int32_t z, uint32_t pitch,
|
|||
address += offset2 & 63;
|
||||
return address;
|
||||
}
|
||||
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
uint32_t GetTiledAddressUpperBound2D(uint32_t right, uint32_t bottom,
|
||||
uint32_t pitch,
|
||||
uint32_t bytes_per_block_log2) {
|
||||
|
@ -538,7 +541,8 @@ uint32_t GetTiledAddressUpperBound2D(uint32_t right, uint32_t bottom,
|
|||
}
|
||||
return upper_bound;
|
||||
}
|
||||
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
uint32_t GetTiledAddressUpperBound3D(uint32_t right, uint32_t bottom,
|
||||
uint32_t back, uint32_t pitch,
|
||||
uint32_t height,
|
||||
|
|
|
@ -280,8 +280,12 @@ void GetTextureTotalSize(xenos::DataDimension dimension,
|
|||
// bytes_per_block_log2 is log2_floor according to how Direct3D 9 calculates it,
|
||||
// but k_32_32_32 textures are never tiled anyway likely.
|
||||
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
int32_t GetTiledOffset2D(int32_t x, int32_t y, uint32_t pitch,
|
||||
uint32_t bytes_per_block_log2);
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
int32_t GetTiledOffset3D(int32_t x, int32_t y, int32_t z, uint32_t pitch,
|
||||
uint32_t height, uint32_t bytes_per_block_log2);
|
||||
// Because (0, 0, 0) within each 32x32x4-block tile is stored in memory first,
|
||||
|
@ -308,9 +312,13 @@ inline uint32_t GetTiledAddressLowerBound3D(uint32_t left, uint32_t top,
|
|||
// Supporting the right > pitch and bottom > height (in tiles) cases also, for
|
||||
// estimation how far addresses can actually go even potentially beyond the
|
||||
// subresource stride.
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
uint32_t GetTiledAddressUpperBound2D(uint32_t right, uint32_t bottom,
|
||||
uint32_t pitch,
|
||||
uint32_t bytes_per_block_log2);
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
uint32_t GetTiledAddressUpperBound3D(uint32_t right, uint32_t bottom,
|
||||
uint32_t back, uint32_t pitch,
|
||||
uint32_t height,
|
||||
|
|
|
@ -125,8 +125,8 @@ float Float7e3To32(uint32_t f10) {
|
|||
// Based on CFloat24 from d3dref9.dll and the 6e4 code from:
|
||||
// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
|
||||
// 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2).
|
||||
|
||||
uint32_t Float32To20e4(float f32, bool round_to_nearest_even) {
|
||||
XE_NOALIAS
|
||||
uint32_t Float32To20e4(float f32, bool round_to_nearest_even) noexcept {
|
||||
if (!(f32 > 0.0f)) {
|
||||
// Positive only, and not -0 or NaN.
|
||||
return 0;
|
||||
|
@ -150,8 +150,8 @@ uint32_t Float32To20e4(float f32, bool round_to_nearest_even) {
|
|||
}
|
||||
return (f32u32 >> 3) & 0xFFFFFF;
|
||||
}
|
||||
|
||||
float Float20e4To32(uint32_t f24) {
|
||||
XE_NOALIAS
|
||||
float Float20e4To32(uint32_t f24) noexcept {
|
||||
f24 &= 0xFFFFFF;
|
||||
if (!f24) {
|
||||
return 0.0f;
|
||||
|
|
|
@ -421,10 +421,12 @@ float Float7e3To32(uint32_t f10);
|
|||
// floating-point number.
|
||||
// Converts an IEEE-754 32-bit floating-point number to Xenos floating-point
|
||||
// depth, rounding to the nearest even or towards zero.
|
||||
uint32_t Float32To20e4(float f32, bool round_to_nearest_even);
|
||||
XE_NOALIAS
|
||||
uint32_t Float32To20e4(float f32, bool round_to_nearest_even) noexcept;
|
||||
// Converts Xenos floating-point depth in bits 0:23 (not clamping) to an
|
||||
// IEEE-754 32-bit floating-point number.
|
||||
float Float20e4To32(uint32_t f24);
|
||||
XE_NOALIAS
|
||||
float Float20e4To32(uint32_t f24) noexcept;
|
||||
// Converts 24-bit unorm depth in the value (not clamping) to an IEEE-754 32-bit
|
||||
// floating-point number.
|
||||
constexpr float UNorm24To32(uint32_t n24) {
|
||||
|
@ -1045,9 +1047,9 @@ inline uint16_t GpuSwap(uint16_t value, Endian endianness) {
      return value;
  }
}
XE_NOINLINE
XE_FORCEINLINE
XE_NOALIAS
static uint32_t GpuSwap(uint32_t value, Endian endianness) {
static uint32_t GpuSwapInline(uint32_t value, Endian endianness) {
  switch (endianness) {
    default:
    case Endian::kNone:

@ -1065,6 +1067,11 @@ static uint32_t GpuSwap(uint32_t value, Endian endianness) {
      return ((value >> 16) & 0xFFFF) | (value << 16);
  }
}
XE_NOINLINE
XE_NOALIAS
static uint32_t GpuSwap(uint32_t value, Endian endianness) {
  return GpuSwapInline(value, endianness);
}

inline float GpuSwap(float value, Endian endianness) {
  union {
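The change keeps a force-inlined body (GpuSwapInline) for hot callers such as the index-buffer converters in the primitive processor, while the original out-of-line GpuSwap name becomes a thin XE_NOINLINE wrapper so infrequent callers do not pay code size for the full switch. The same idiom in generic form, with portable attribute spellings standing in for the XE_* macros:

#include <cstdint>

#if defined(_MSC_VER)
#define FORCEINLINE_SKETCH __forceinline
#define NOINLINE_SKETCH __declspec(noinline)
#else
#define FORCEINLINE_SKETCH inline __attribute__((always_inline))
#define NOINLINE_SKETCH __attribute__((noinline))
#endif

// Hot path: the body is inlined into tight loops.
FORCEINLINE_SKETCH uint32_t SwapWordsInline(uint32_t value) {
  return (value >> 16) | (value << 16);
}

// Cold path: a single out-of-line copy for infrequent call sites, keeping
// their code size and instruction-cache footprint small.
NOINLINE_SKETCH uint32_t SwapWords(uint32_t value) {
  return SwapWordsInline(value);
}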
@ -137,8 +137,8 @@ X_INPUT_VIBRATION InputSystem::ModifyVibrationLevel(
|
|||
modified_vibration.right_motor_speed = 0;
|
||||
return modified_vibration;
|
||||
}
|
||||
std::unique_lock<xe_unlikely_mutex> InputSystem::lock() {
|
||||
return std::unique_lock<xe_unlikely_mutex>{lock_};
|
||||
std::unique_lock<xe_mutex> InputSystem::lock() {
|
||||
return std::unique_lock<xe_mutex>{lock_};
|
||||
}
|
||||
} // namespace hid
|
||||
} // namespace xe
|
||||
|
|
|
@ -48,7 +48,7 @@ class InputSystem {
|
|||
void UpdateUsedSlot(uint8_t slot, bool connected);
|
||||
uint8_t GetConnectedSlots() const { return connected_slot; }
|
||||
|
||||
std::unique_lock<xe_unlikely_mutex> lock();
|
||||
std::unique_lock<xe_mutex> lock();
|
||||
|
||||
private:
|
||||
xe::ui::Window* window_ = nullptr;
|
||||
|
@ -57,7 +57,7 @@ class InputSystem {
|
|||
|
||||
X_INPUT_VIBRATION ModifyVibrationLevel(X_INPUT_VIBRATION* vibration);
|
||||
uint8_t connected_slot = 0b0001;
|
||||
xe_unlikely_mutex lock_;
|
||||
xe_mutex lock_;
|
||||
};
|
||||
|
||||
} // namespace hid
|
||||
|
|
|
@ -911,11 +911,17 @@ dword_result_t NtSignalAndWaitForSingleObjectEx_entry(dword_t signal_handle,
DECLARE_XBOXKRNL_EXPORT3(NtSignalAndWaitForSingleObjectEx, kThreading,
                         kImplemented, kBlocking, kHighFrequency);

static void PrefetchForCAS(const void* value) {
  if (amd64::GetFeatureFlags() & amd64::kX64EmitPrefetchW) {
    swcache::PrefetchW(value);
  }
}

uint32_t xeKeKfAcquireSpinLock(uint32_t* lock) {
  // XELOGD(
  //     "KfAcquireSpinLock({:08X})",
  //     lock_ptr);

  PrefetchForCAS(lock);
  // Lock.
  while (!xe::atomic_cas(0, 1, lock)) {
    // Spin!
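PrefetchForCAS issues a write-intent prefetch (PREFETCHW) only when the CPU advertises it, so the lock's cache line arrives in an exclusive state and the following compare-and-swap avoids a separate read-for-ownership transition. swcache::PrefetchW and xe::atomic_cas are project wrappers; a generic sketch of the same pattern:

#include <atomic>
#include <cstdint>

void SpinAcquire(std::atomic<uint32_t>* lock) {
#if defined(__GNUC__) || defined(__clang__)
  // rw=1 asks for the line with intent to write (PREFETCHW where available).
  __builtin_prefetch(lock, /*rw=*/1, /*locality=*/3);
#endif
  uint32_t expected = 0;
  while (!lock->compare_exchange_weak(expected, 1, std::memory_order_acquire,
                                      std::memory_order_relaxed)) {
    expected = 0;  // compare_exchange_weak rewrites expected on failure
  }
}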
@ -956,6 +962,7 @@ DECLARE_XBOXKRNL_EXPORT2(KfReleaseSpinLock, kThreading, kImplemented,
|
|||
void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) {
|
||||
// Lock.
|
||||
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
|
||||
PrefetchForCAS(lock);
|
||||
while (!xe::atomic_cas(0, 1, lock)) {
|
||||
#if XE_ARCH_AMD64 == 1
|
||||
// todo: this is just a nop if they don't have SMT, which is not great
|
||||
|
@ -973,6 +980,7 @@ DECLARE_XBOXKRNL_EXPORT3(KeAcquireSpinLockAtRaisedIrql, kThreading,
|
|||
dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) {
|
||||
// Lock.
|
||||
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
|
||||
PrefetchForCAS(lock);
|
||||
if (!xe::atomic_cas(0, 1, lock)) {
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -763,7 +763,8 @@ void XThread::SetActiveCpu(uint8_t cpu_index) {
|
|||
thread_->set_affinity_mask(uint64_t(1) << cpu_index);
|
||||
}
|
||||
} else {
|
||||
XELOGW("Too few processor cores - scheduling will be wonky");
|
||||
//there no good reason why we need to log this... we don't perfectly emulate the 360's scheduler in any way
|
||||
// XELOGW("Too few processor cores - scheduling will be wonky");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -713,6 +713,8 @@ void BaseHeap::Initialize(Memory* memory, uint8_t* membase, HeapType heap_type,
|
|||
heap_base_ = heap_base;
|
||||
heap_size_ = heap_size;
|
||||
page_size_ = page_size;
|
||||
xenia_assert(xe::is_pow2(page_size_));
|
||||
page_size_shift_ = xe::log2_floor(page_size_);
|
||||
host_address_offset_ = host_address_offset;
|
||||
page_table_.resize(heap_size / page_size);
|
||||
unreserved_page_count_ = uint32_t(page_table_.size());
|
||||
|
@ -1234,14 +1236,14 @@ bool BaseHeap::Protect(uint32_t address, uint32_t size, uint32_t protect,
|
|||
// fails and returns without modifying the access protection of any pages in
|
||||
// the specified region."
|
||||
|
||||
uint32_t start_page_number = (address - heap_base_) / page_size_;
|
||||
uint32_t start_page_number = (address - heap_base_) >> page_size_shift_;
|
||||
if (start_page_number >= page_table_.size()) {
|
||||
XELOGE("BaseHeap::Protect failed due to out-of-bounds base address {:08X}",
|
||||
address);
|
||||
return false;
|
||||
}
|
||||
uint32_t end_page_number =
|
||||
uint32_t((uint64_t(address) + size - 1 - heap_base_) / page_size_);
|
||||
uint32_t((uint64_t(address) + size - 1 - heap_base_) >> page_size_shift_);
|
||||
if (end_page_number >= page_table_.size()) {
|
||||
XELOGE(
|
||||
"BaseHeap::Protect failed due to out-of-bounds range ({:08X} bytes "
|
||||
|
@ -1268,17 +1270,21 @@ bool BaseHeap::Protect(uint32_t address, uint32_t size, uint32_t protect,
      return false;
    }
  }
  uint32_t xe_page_size = static_cast<uint32_t>(xe::memory::page_size());

  uint32_t page_size_mask = xe_page_size - 1;

  // Attempt host change (hopefully won't fail).
  // We can only do this if our size matches system page granularity.
  uint32_t page_count = end_page_number - start_page_number + 1;
  if (page_size_ == xe::memory::page_size() ||
      (((page_count * page_size_) % xe::memory::page_size() == 0) &&
       ((start_page_number * page_size_) % xe::memory::page_size() == 0))) {
  if (page_size_ == xe_page_size ||
      ((((page_count << page_size_shift_) & page_size_mask) == 0) &&
       (((start_page_number << page_size_shift_) & page_size_mask) == 0))) {
    memory::PageAccess old_protect_access;
    if (!xe::memory::Protect(TranslateRelative(start_page_number * page_size_),
                             page_count * page_size_, ToPageAccess(protect),
                             old_protect ? &old_protect_access : nullptr)) {
    if (!xe::memory::Protect(
            TranslateRelative(start_page_number << page_size_shift_),
            page_count << page_size_shift_, ToPageAccess(protect),
            old_protect ? &old_protect_access : nullptr)) {
      XELOGE("BaseHeap::Protect failed due to host VirtualProtect failure");
      return false;
    }
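These hunks replace multiplies, divides and modulo by the page size with shifts and masks derived once via log2_floor, which is valid because the heap now asserts that page_size_ is a power of two (the xe::is_pow2 check added in Initialize). The identities being relied on, as a small self-checking sketch:

#include <cassert>
#include <cstdint>

// For a power-of-two size: x / size == x >> shift, x * size == x << shift,
// and x % size == x & (size - 1), where shift == log2(size).
void PageMathEquivalence(uint32_t x, uint32_t page_size) {
  assert(page_size && (page_size & (page_size - 1)) == 0);  // power of two
  uint32_t shift = 0;
  while ((1u << shift) < page_size) ++shift;  // log2 of a power-of-two value
  uint32_t mask = page_size - 1;

  assert(x / page_size == x >> shift);
  assert(x % page_size == (x & mask));
  assert(uint32_t(x * page_size) == uint32_t(x << shift));
}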
@ -1303,7 +1309,7 @@ bool BaseHeap::Protect(uint32_t address, uint32_t size, uint32_t protect,
|
|||
|
||||
bool BaseHeap::QueryRegionInfo(uint32_t base_address,
|
||||
HeapAllocationInfo* out_info) {
|
||||
uint32_t start_page_number = (base_address - heap_base_) / page_size_;
|
||||
uint32_t start_page_number = (base_address - heap_base_) >> page_size_shift_;
|
||||
if (start_page_number > page_table_.size()) {
|
||||
XELOGE("BaseHeap::QueryRegionInfo base page out of range");
|
||||
return false;
|
||||
|
@ -1321,9 +1327,10 @@ bool BaseHeap::QueryRegionInfo(uint32_t base_address,
|
|||
if (start_page_entry.state) {
|
||||
// Committed/reserved region.
|
||||
out_info->allocation_base =
|
||||
heap_base_ + start_page_entry.base_address * page_size_;
|
||||
heap_base_ + (start_page_entry.base_address << page_size_shift_);
|
||||
out_info->allocation_protect = start_page_entry.allocation_protect;
|
||||
out_info->allocation_size = start_page_entry.region_page_count * page_size_;
|
||||
out_info->allocation_size = start_page_entry.region_page_count
|
||||
<< page_size_shift_;
|
||||
out_info->state = start_page_entry.state;
|
||||
out_info->protect = start_page_entry.current_protect;
|
||||
|
||||
|
@ -1358,7 +1365,7 @@ bool BaseHeap::QueryRegionInfo(uint32_t base_address,
|
|||
}
|
||||
|
||||
bool BaseHeap::QuerySize(uint32_t address, uint32_t* out_size) {
|
||||
uint32_t page_number = (address - heap_base_) / page_size_;
|
||||
uint32_t page_number = (address - heap_base_) >> page_size_shift_;
|
||||
if (page_number > page_table_.size()) {
|
||||
XELOGE("BaseHeap::QuerySize base page out of range");
|
||||
*out_size = 0;
|
||||
|
@ -1366,12 +1373,12 @@ bool BaseHeap::QuerySize(uint32_t address, uint32_t* out_size) {
|
|||
}
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
auto page_entry = page_table_[page_number];
|
||||
*out_size = (page_entry.region_page_count * page_size_);
|
||||
*out_size = (page_entry.region_page_count << page_size_shift_);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool BaseHeap::QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size) {
|
||||
uint32_t page_number = (*in_out_address - heap_base_) / page_size_;
|
||||
uint32_t page_number = (*in_out_address - heap_base_) >> page_size_shift_;
|
||||
if (page_number > page_table_.size()) {
|
||||
XELOGE("BaseHeap::QuerySize base page out of range");
|
||||
*out_size = 0;
|
||||
|
@ -1379,13 +1386,13 @@ bool BaseHeap::QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size) {
|
|||
}
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
auto page_entry = page_table_[page_number];
|
||||
*in_out_address = (page_entry.base_address * page_size_);
|
||||
*out_size = (page_entry.region_page_count * page_size_);
|
||||
*in_out_address = (page_entry.base_address << page_size_shift_);
|
||||
*out_size = (page_entry.region_page_count << page_size_shift_);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool BaseHeap::QueryProtect(uint32_t address, uint32_t* out_protect) {
|
||||
uint32_t page_number = (address - heap_base_) / page_size_;
|
||||
uint32_t page_number = (address - heap_base_) >> page_size_shift_;
|
||||
if (page_number > page_table_.size()) {
|
||||
XELOGE("BaseHeap::QueryProtect base page out of range");
|
||||
*out_protect = 0;
|
||||
|
@ -1403,8 +1410,8 @@ xe::memory::PageAccess BaseHeap::QueryRangeAccess(uint32_t low_address,
|
|||
(high_address - heap_base_) >= heap_size_) {
|
||||
return xe::memory::PageAccess::kNoAccess;
|
||||
}
|
||||
uint32_t low_page_number = (low_address - heap_base_) / page_size_;
|
||||
uint32_t high_page_number = (high_address - heap_base_) / page_size_;
|
||||
uint32_t low_page_number = (low_address - heap_base_) >> page_size_shift_;
|
||||
uint32_t high_page_number = (high_address - heap_base_) >> page_size_shift_;
|
||||
uint32_t protect = kMemoryProtectRead | kMemoryProtectWrite;
|
||||
{
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
|
@ -1446,6 +1453,8 @@ void PhysicalHeap::Initialize(Memory* memory, uint8_t* membase,
|
|||
page_size, host_address_offset);
|
||||
parent_heap_ = parent_heap;
|
||||
system_page_size_ = uint32_t(xe::memory::page_size());
|
||||
xenia_assert(xe::is_pow2(system_page_size_));
|
||||
system_page_shift_ = xe::log2_floor(system_page_size_);
|
||||
|
||||
system_page_count_ =
|
||||
(size_t(heap_size_) + host_address_offset + (system_page_size_ - 1)) /
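The Initialize change asserts that the host page size is a power of two and derives system_page_shift_ from it once, so later per-page math can use shifts instead of divisions. A short sketch of deriving such a shift, using std::countr_zero as a stand-in for xe::log2_floor (the size below is assumed):

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  uint32_t system_page_size = 0x1000;  // assumed host page size (4 KiB)
  assert(std::has_single_bit(system_page_size));  // analogue of xe::is_pow2
  uint32_t system_page_shift = std::countr_zero(system_page_size);  // 12
  assert((uint32_t(1) << system_page_shift) == system_page_size);
  return 0;
}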

@@ -1665,10 +1674,11 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
  }

  uint32_t system_page_first =
      (heap_relative_address + host_address_offset()) / system_page_size_;
      (heap_relative_address + host_address_offset()) >> system_page_shift_;
  swcache::PrefetchL1(&system_page_flags_[system_page_first >> 6]);
  uint32_t system_page_last =
      (heap_relative_address + length - 1 + host_address_offset()) /
      system_page_size_;
      (heap_relative_address + length - 1 + host_address_offset()) >>
      system_page_shift_;
  system_page_last = std::min(system_page_last, system_page_count_ - 1);
  assert_true(system_page_first <= system_page_last);

@@ -1677,10 +1687,40 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
  xe::memory::PageAccess protect_access =
      enable_data_providers ? xe::memory::PageAccess::kNoAccess
                            : xe::memory::PageAccess::kReadOnly;

  auto global_lock = global_critical_region_.Acquire();
  if (enable_invalidation_notifications) {
    EnableAccessCallbacksInner<true>(system_page_first, system_page_last,
                                     protect_access);
  } else {
    EnableAccessCallbacksInner<false>(system_page_first, system_page_last,
                                      protect_access);
  }
}
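The dispatch above converts the runtime enable_invalidation_notifications flag into a template parameter once, outside the hot loop, so the specialized loop body can test it with if constexpr and the untaken branch disappears from that specialization. A minimal sketch of the pattern, with hypothetical names and made-up per-page work:

#include <cstdint>

template <bool kNotify>
uint32_t ProcessPagesInner(uint32_t first, uint32_t last) {
  uint32_t notified = 0;
  for (uint32_t i = first; i <= last; ++i) {
    if constexpr (kNotify) {
      ++notified;  // stand-in for the notification bookkeeping
    }
    // ... per-page work shared by both specializations ...
  }
  return notified;
}

uint32_t ProcessPages(uint32_t first, uint32_t last, bool notify) {
  // One runtime branch here instead of one per iteration inside the loop.
  return notify ? ProcessPagesInner<true>(first, last)
                : ProcessPagesInner<false>(first, last);
}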

template <bool enable_invalidation_notifications>
XE_NOINLINE void PhysicalHeap::EnableAccessCallbacksInner(
    const uint32_t system_page_first, const uint32_t system_page_last,
    xe::memory::PageAccess protect_access) XE_RESTRICT {
  uint8_t* protect_base = membase_ + heap_base_;
  uint32_t protect_system_page_first = UINT32_MAX;
  auto global_lock = global_critical_region_.Acquire();
  for (uint32_t i = system_page_first; i <= system_page_last; ++i) {

  SystemPageFlagsBlock* XE_RESTRICT sys_page_flags = system_page_flags_.data();
  PageEntry* XE_RESTRICT page_table_ptr = page_table_.data();

  // chrispy: a lot of time is spent in this loop, and i think some of the work
  // may be avoidable and repetitive profiling shows quite a bit of time spent
  // in this loop, but very little spent actually calling Protect
  uint32_t i = system_page_first;

  uint32_t first_guest_page = SystemPagenumToGuestPagenum(system_page_first);
  uint32_t last_guest_page = SystemPagenumToGuestPagenum(system_page_last);

  uint32_t guest_one = SystemPagenumToGuestPagenum(1);

  uint32_t system_one = GuestPagenumToSystemPagenum(1);
  for (; i <= system_page_last; ++i) {
    // Check if need to enable callbacks for the page and raise its protection.
    //
    // If enabling invalidation notifications:

@@ -1702,12 +1742,19 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
    //
    // Enabling data providers doesn't need to be deferred - providers will be
    // polled for the last time without releasing the lock.
    SystemPageFlagsBlock& page_flags_block = system_page_flags_[i >> 6];
    SystemPageFlagsBlock& page_flags_block = sys_page_flags[i >> 6];

#if XE_ARCH_AMD64 == 1
    // x86 modulus shift
    uint64_t page_flags_bit = uint64_t(1) << i;
#else
    uint64_t page_flags_bit = uint64_t(1) << (i & 63);
    uint32_t guest_page_number =
        xe::sat_sub(i * system_page_size_, host_address_offset()) / page_size_;
#endif

    uint32_t guest_page_number = SystemPagenumToGuestPagenum(i);
    //swcache::PrefetchL1(&page_table_ptr[guest_page_number + 8]);
    xe::memory::PageAccess current_page_access =
        ToPageAccess(page_table_[guest_page_number].current_protect);
        ToPageAccess(page_table_ptr[guest_page_number].current_protect);
    bool protect_system_page = false;
    // Don't do anything with inaccessible pages - don't protect, don't enable
    // callbacks - because real access violations are needed there. And don't

@@ -1715,7 +1762,7 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
    // reason.
    if (current_page_access != xe::memory::PageAccess::kNoAccess) {
      // TODO(Triang3l): Enable data providers.
      if (enable_invalidation_notifications) {
      if constexpr (enable_invalidation_notifications) {
        if (current_page_access != xe::memory::PageAccess::kReadOnly &&
            (page_flags_block.notify_on_invalidation & page_flags_bit) == 0) {
          // TODO(Triang3l): Check if data providers are already enabled.

@@ -1733,21 +1780,22 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
      } else {
        if (protect_system_page_first != UINT32_MAX) {
          xe::memory::Protect(
              protect_base + protect_system_page_first * system_page_size_,
              (i - protect_system_page_first) * system_page_size_,
              protect_base + (protect_system_page_first << system_page_shift_),
              (i - protect_system_page_first) << system_page_shift_,
              protect_access);
          protect_system_page_first = UINT32_MAX;
        }
      }
    }

  if (protect_system_page_first != UINT32_MAX) {
    xe::memory::Protect(
        protect_base + protect_system_page_first * system_page_size_,
        (system_page_last + 1 - protect_system_page_first) * system_page_size_,
        protect_base + (protect_system_page_first << system_page_shift_),
        (system_page_last + 1 - protect_system_page_first)
            << system_page_shift_,
        protect_access);
  }
}

bool PhysicalHeap::TriggerCallbacks(
    global_unique_lock_type global_lock_locked_once, uint32_t virtual_address,
    uint32_t length, bool is_write, bool unwatch_exact_range, bool unprotect) {

@@ -1774,10 +1822,10 @@ bool PhysicalHeap::TriggerCallbacks(
  }

  uint32_t system_page_first =
      (heap_relative_address + host_address_offset()) / system_page_size_;
      (heap_relative_address + host_address_offset()) >> system_page_shift_;
  uint32_t system_page_last =
      (heap_relative_address + length - 1 + host_address_offset()) /
      system_page_size_;
      (heap_relative_address + length - 1 + host_address_offset()) >>
      system_page_shift_;
  system_page_last = std::min(system_page_last, system_page_count_ - 1);
  assert_true(system_page_first <= system_page_last);
  uint32_t block_index_first = system_page_first >> 6;
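The watch bookkeeping keeps one bit per system page, packed into 64-bit blocks, which is where the recurring i >> 6 block index and i & 63 bit index (and the block_index_first/last values above) come from; the AMD64-only branch earlier omits the & 63 on the assumption that the hardware shift already masks the count to six bits. A self-contained sketch of that addressing, with hypothetical names:

#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical per-page flag storage in 64-bit blocks, mirroring the
// "page >> 6" block index and "page & 63" bit index used by the watch code.
struct PageBitmap {
  std::vector<uint64_t> blocks;
  explicit PageBitmap(uint32_t page_count) : blocks((page_count + 63) / 64) {}
  void set(uint32_t page) { blocks[page >> 6] |= uint64_t(1) << (page & 63); }
  bool test(uint32_t page) const {
    return (blocks[page >> 6] & (uint64_t(1) << (page & 63))) != 0;
  }
};

int main() {
  PageBitmap watched(1024);
  watched.set(70);  // block 1, bit 6
  assert(watched.test(70) && !watched.test(71));
  return 0;
}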

@@ -1810,11 +1858,11 @@ bool PhysicalHeap::TriggerCallbacks(
  }
  uint32_t physical_address_offset = GetPhysicalAddress(heap_base_);
  uint32_t physical_address_start =
      xe::sat_sub(system_page_first * system_page_size_,
      xe::sat_sub(system_page_first << system_page_shift_,
                  host_address_offset()) +
      physical_address_offset;
  uint32_t physical_length = std::min(
      xe::sat_sub(system_page_last * system_page_size_ + system_page_size_,
      xe::sat_sub((system_page_last << system_page_shift_) + system_page_size_,
                  host_address_offset()) +
          physical_address_offset - physical_address_start,
      heap_size_ - (physical_address_start - physical_address_offset));

@@ -1858,8 +1906,8 @@ bool PhysicalHeap::TriggerCallbacks(
    unwatch_first += host_address_offset();
    unwatch_last += host_address_offset();
    assert_true(unwatch_first <= unwatch_last);
    system_page_first = unwatch_first / system_page_size_;
    system_page_last = unwatch_last / system_page_size_;
    system_page_first = unwatch_first >> system_page_shift_;
    system_page_last = unwatch_last >> system_page_shift_;
    block_index_first = system_page_first >> 6;
    block_index_last = system_page_last >> 6;
  }

@@ -1874,8 +1922,8 @@ bool PhysicalHeap::TriggerCallbacks(
                          (uint64_t(1) << (i & 63))) != 0;
    if (unprotect_page) {
      uint32_t guest_page_number =
          xe::sat_sub(i * system_page_size_, host_address_offset()) /
          page_size_;
          xe::sat_sub(i << system_page_shift_, host_address_offset()) >>
          page_size_shift_;
      if (ToPageAccess(page_table_[guest_page_number].current_protect) !=
          xe::memory::PageAccess::kReadWrite) {
        unprotect_page = false;

@@ -1888,8 +1936,9 @@ bool PhysicalHeap::TriggerCallbacks(
    } else {
      if (unprotect_system_page_first != UINT32_MAX) {
        xe::memory::Protect(
            protect_base + unprotect_system_page_first * system_page_size_,
            (i - unprotect_system_page_first) * system_page_size_,
            protect_base +
                (unprotect_system_page_first << system_page_shift_),
            (i - unprotect_system_page_first) << system_page_shift_,
            xe::memory::PageAccess::kReadWrite);
        unprotect_system_page_first = UINT32_MAX;
      }

@@ -1897,9 +1946,9 @@ bool PhysicalHeap::TriggerCallbacks(
  }
  if (unprotect_system_page_first != UINT32_MAX) {
    xe::memory::Protect(
        protect_base + unprotect_system_page_first * system_page_size_,
        (system_page_last + 1 - unprotect_system_page_first) *
            system_page_size_,
        protect_base + (unprotect_system_page_first << system_page_shift_),
        (system_page_last + 1 - unprotect_system_page_first)
            << system_page_shift_,
        xe::memory::PageAccess::kReadWrite);
  }
}
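Both protection loops above coalesce consecutive pages that need the same change into runs, tracking the first page of the open run and using UINT32_MAX as the no-open-run sentinel, so one host protection call covers a whole run instead of a single page. A simplified sketch of that pattern, with ApplyProtect standing in for xe::memory::Protect:

#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in for the host protection call; in the heap code this is
// xe::memory::Protect on a byte range derived from the page numbers.
void ApplyProtect(uint32_t first_page, uint32_t page_count) {
  std::printf("protect pages [%u, %u)\n", first_page, first_page + page_count);
}

void ProtectMarkedPages(const std::vector<bool>& needs_protect) {
  const uint32_t kNoRun = UINT32_MAX;
  const uint32_t page_count = uint32_t(needs_protect.size());
  uint32_t run_first = kNoRun;
  for (uint32_t i = 0; i < page_count; ++i) {
    if (needs_protect[i]) {
      if (run_first == kNoRun) run_first = i;  // open a new run
    } else if (run_first != kNoRun) {
      ApplyProtect(run_first, i - run_first);  // flush the finished run
      run_first = kNoRun;
    }
  }
  if (run_first != kNoRun) {
    ApplyProtect(run_first, page_count - run_first);  // flush the trailing run
  }
}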

@@ -216,6 +216,7 @@ class BaseHeap {
  uint32_t heap_base_;
  uint32_t heap_size_;
  uint32_t page_size_;
  uint32_t page_size_shift_;
  uint32_t host_address_offset_;
  uint32_t unreserved_page_count_;
  xe::global_critical_region global_critical_region_;

@@ -270,18 +271,36 @@ class PhysicalHeap : public BaseHeap {
  void EnableAccessCallbacks(uint32_t physical_address, uint32_t length,
                             bool enable_invalidation_notifications,
                             bool enable_data_providers);
  template <bool enable_invalidation_notifications>
  XE_NOINLINE void EnableAccessCallbacksInner(
      const uint32_t system_page_first, const uint32_t system_page_last,
      xe::memory::PageAccess protect_access) XE_RESTRICT;

  // Returns true if any page in the range was watched.
  bool TriggerCallbacks(global_unique_lock_type global_lock_locked_once,
                        uint32_t virtual_address, uint32_t length, bool is_write,
                        bool unwatch_exact_range, bool unprotect = true);
                        uint32_t virtual_address, uint32_t length,
                        bool is_write, bool unwatch_exact_range,
                        bool unprotect = true);

  uint32_t GetPhysicalAddress(uint32_t address) const;

  uint32_t SystemPagenumToGuestPagenum(uint32_t num) const {
    return ((num << system_page_shift_) - host_address_offset()) >>
           page_size_shift_;
  }

  uint32_t GuestPagenumToSystemPagenum(uint32_t num) {
    num <<= page_size_shift_;
    num += host_address_offset();
    num >>= system_page_shift_;
    return num;
  }
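The two helpers translate between host (system) page numbers and guest page numbers by going through byte offsets: shift up by one page shift, adjust by host_address_offset(), then shift down by the other. A standalone restatement with assumed sizes (4 KiB host pages, 64 KiB guest pages, zero offset), not the emulator's own types:

#include <cassert>
#include <cstdint>

constexpr uint32_t kSystemPageShift = 12;   // assumed 4 KiB host pages
constexpr uint32_t kGuestPageShift = 16;    // assumed 64 KiB guest pages
constexpr uint32_t kHostAddressOffset = 0;  // assumed zero offset

constexpr uint32_t SystemToGuestPage(uint32_t sys_page) {
  return ((sys_page << kSystemPageShift) - kHostAddressOffset) >>
         kGuestPageShift;
}
constexpr uint32_t GuestToSystemPage(uint32_t guest_page) {
  return ((guest_page << kGuestPageShift) + kHostAddressOffset) >>
         kSystemPageShift;
}

int main() {
  // Sixteen host pages per guest page: host pages 16..31 map to guest page 1.
  static_assert(SystemToGuestPage(16) == 1 && SystemToGuestPage(31) == 1);
  static_assert(GuestToSystemPage(1) == 16);
  assert(SystemToGuestPage(GuestToSystemPage(7)) == 7);
  return 0;
}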
 protected:
  VirtualHeap* parent_heap_;

  uint32_t system_page_size_;
  uint32_t system_page_count_;
  uint32_t system_page_shift_;
  uint32_t padding1_;

  struct SystemPageFlagsBlock {
    // Whether writing to each page should result trigger invalidation

@@ -458,9 +477,9 @@ class Memory {
  // TODO(Triang3l): Implement data providers - this is why locking depth of 1
  // will be required in the future.
  bool TriggerPhysicalMemoryCallbacks(
      global_unique_lock_type global_lock_locked_once,
      uint32_t virtual_address, uint32_t length, bool is_write,
      bool unwatch_exact_range, bool unprotect = true);
      global_unique_lock_type global_lock_locked_once, uint32_t virtual_address,
      uint32_t length, bool is_write, bool unwatch_exact_range,
      bool unprotect = true);

  // Allocates virtual memory from the 'system' heap.
  // System memory is kept separate from game memory but is still accessible

@@ -509,10 +528,10 @@ class Memory {
                               const void* host_address);

  bool AccessViolationCallback(global_unique_lock_type global_lock_locked_once,
                               void* host_address, bool is_write);
                               void* host_address, bool is_write);
  static bool AccessViolationCallbackThunk(
      global_unique_lock_type global_lock_locked_once,
      void* context, void* host_address, bool is_write);
      global_unique_lock_type global_lock_locked_once, void* context,
      void* host_address, bool is_write);

  std::filesystem::path file_name_;
  uint32_t system_page_size_ = 0;