Merge pull request #61 from chrisps/canary_experimental

Performance improvements, kernel fixes, CPU accuracy improvements
This commit is contained in:
Radosław Gliński 2022-08-21 09:31:09 +02:00 committed by GitHub
commit 0b013fdc6b
70 changed files with 1744 additions and 541 deletions

View File

@ -979,7 +979,12 @@ void EmulatorWindow::ToggleDisplayConfigDialog() {
}
void EmulatorWindow::ToggleControllerVibration() {
emulator()->input_system()->ToggleVibration();
auto input_sys = emulator()->input_system();
if (input_sys) {
auto input_lock = input_sys->lock();
input_sys->ToggleVibration();
}
}
void EmulatorWindow::ShowCompatibility() {

View File

@ -91,7 +91,7 @@ int XmaContext::Setup(uint32_t id, Memory* memory, uint32_t guest_ptr) {
}
bool XmaContext::Work() {
std::lock_guard<std::mutex> lock(lock_);
std::lock_guard<xe_mutex> lock(lock_);
if (!is_allocated() || !is_enabled()) {
return false;
}
@ -106,7 +106,7 @@ bool XmaContext::Work() {
}
void XmaContext::Enable() {
std::lock_guard<std::mutex> lock(lock_);
std::lock_guard<xe_mutex> lock(lock_);
auto context_ptr = memory()->TranslateVirtual(guest_ptr());
XMA_CONTEXT_DATA data(context_ptr);
@ -134,7 +134,7 @@ bool XmaContext::Block(bool poll) {
}
void XmaContext::Clear() {
std::lock_guard<std::mutex> lock(lock_);
std::lock_guard<xe_mutex> lock(lock_);
XELOGAPU("XmaContext: reset context {}", id());
auto context_ptr = memory()->TranslateVirtual(guest_ptr());
@ -151,14 +151,14 @@ void XmaContext::Clear() {
}
void XmaContext::Disable() {
std::lock_guard<std::mutex> lock(lock_);
std::lock_guard<xe_mutex> lock(lock_);
XELOGAPU("XmaContext: disabling context {}", id());
set_is_enabled(false);
}
void XmaContext::Release() {
// Lock it in case the decoder thread is working on it now.
std::lock_guard<std::mutex> lock(lock_);
std::lock_guard<xe_mutex> lock(lock_);
assert_true(is_allocated_ == true);
set_is_allocated(false);

View File

@ -200,7 +200,7 @@ class XmaContext {
uint32_t id_ = 0;
uint32_t guest_ptr_ = 0;
std::mutex lock_;
xe_mutex lock_;
bool is_allocated_ = false;
bool is_enabled_ = false;
// bool is_dirty_ = true;

View File

@ -15,6 +15,13 @@
#include "xenia/base/assert.h"
#include "xenia/base/math.h"
#include "xenia/base/mutex.h"
#if defined(_WIN32)
#include "xenia/base/platform_win.h"
#endif
DEFINE_bool(clock_no_scaling, false,
"Disable scaling code. Time management and locking is bypassed. "
@ -42,8 +49,13 @@ std::pair<uint64_t, uint64_t> guest_tick_ratio_ = std::make_pair(1, 1);
uint64_t last_guest_tick_count_ = 0;
// Last sampled host tick count.
uint64_t last_host_tick_count_ = Clock::QueryHostTickCount();
using tick_mutex_type = xe_unlikely_mutex;
// Mutex to ensure last_host_tick_count_ and last_guest_tick_count_ are in sync
std::mutex tick_mutex_;
// std::mutex tick_mutex_;
static tick_mutex_type tick_mutex_;
void RecomputeGuestTickScalar() {
// Create a rational number with numerator (first) and denominator (second)
@ -61,7 +73,7 @@ void RecomputeGuestTickScalar() {
// Keep this a rational calculation and reduce the fraction
reduce_fraction(frac);
std::lock_guard<std::mutex> lock(tick_mutex_);
std::lock_guard<tick_mutex_type> lock(tick_mutex_);
guest_tick_ratio_ = frac;
}
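For reference, a minimal sketch of what reduce_fraction presumably does (a gcd-based reduction; the helper name and exact behavior here are assumptions, not the actual xe implementation):

// minimal sketch, assuming reduce_fraction divides both terms by their gcd
#include <cstdint>
#include <numeric>
#include <utility>
static void reduce_fraction_sketch(std::pair<uint64_t, uint64_t>& frac) {
  // std::gcd(0, n) == n, so a zero term leaves the pair unchanged below
  const uint64_t divisor = std::gcd(frac.first, frac.second);
  if (divisor > 1) {
    frac.first /= divisor;
    frac.second /= divisor;
  }
}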
@ -75,7 +87,7 @@ uint64_t UpdateGuestClock() {
return host_tick_count * guest_tick_ratio_.first / guest_tick_ratio_.second;
}
std::unique_lock<std::mutex> lock(tick_mutex_, std::defer_lock);
std::unique_lock<tick_mutex_type> lock(tick_mutex_, std::defer_lock);
if (lock.try_lock()) {
// Translate host tick count to guest tick count.
uint64_t host_tick_delta = host_tick_count > last_host_tick_count_
@ -107,7 +119,6 @@ inline uint64_t QueryGuestSystemTimeOffset() {
return guest_tick_count * numerator / denominator;
}
uint64_t Clock::QueryHostTickFrequency() {
#if XE_CLOCK_RAW_AVAILABLE
if (cvars::clock_source_raw) {
@ -137,7 +148,7 @@ void Clock::set_guest_time_scalar(double scalar) {
}
std::pair<uint64_t, uint64_t> Clock::guest_tick_ratio() {
std::lock_guard<std::mutex> lock(tick_mutex_);
std::lock_guard<tick_mutex_type> lock(tick_mutex_);
return guest_tick_ratio_;
}
@ -159,6 +170,7 @@ uint64_t Clock::QueryGuestTickCount() {
return guest_tick_count;
}
uint64_t* Clock::GetGuestTickCountPointer() { return &last_guest_tick_count_; }
uint64_t Clock::QueryGuestSystemTime() {
if (cvars::clock_no_scaling) {
return Clock::QueryHostSystemTime();

View File

@ -33,11 +33,15 @@ class Clock {
// Either from platform supplied time source or from hardware directly.
static uint64_t host_tick_frequency_platform();
#if XE_CLOCK_RAW_AVAILABLE
XE_NOINLINE
static uint64_t host_tick_frequency_raw();
#endif
// Host tick count. Generally QueryHostTickCount() should be used.
static uint64_t host_tick_count_platform();
#if XE_CLOCK_RAW_AVAILABLE
// chrispy: the way MSVC was ordering the branches caused rdtsc to be
// speculatively executed each time the branch history was lost
XE_NOINLINE
static uint64_t host_tick_count_raw();
#endif
@ -70,6 +74,8 @@ class Clock {
// Queries the current guest tick count, accounting for frequency adjustment
// and scaling.
static uint64_t QueryGuestTickCount();
static uint64_t* GetGuestTickCountPointer();
// Queries the guest time, in FILETIME format, accounting for scaling.
static uint64_t QueryGuestSystemTime();
// Queries the milliseconds since the guest began, accounting for scaling.

View File

@ -12,7 +12,17 @@
#include "xenia/base/platform_win.h"
namespace xe {
#if XE_USE_KUSER_SHARED == 1
uint64_t Clock::host_tick_frequency_platform() { return 10000000ULL; }
uint64_t Clock::host_tick_count_platform() {
return *reinterpret_cast<volatile uint64_t*>(GetKUserSharedSystemTime());
}
uint64_t Clock::QueryHostSystemTime() {
return *reinterpret_cast<volatile uint64_t*>(GetKUserSharedSystemTime());
}
#else
uint64_t Clock::host_tick_frequency_platform() {
LARGE_INTEGER frequency;
QueryPerformanceFrequency(&frequency);
@ -27,13 +37,13 @@ uint64_t Clock::host_tick_count_platform() {
}
return time;
}
uint64_t Clock::QueryHostSystemTime() {
FILETIME t;
GetSystemTimeAsFileTime(&t);
return (uint64_t(t.dwHighDateTime) << 32) | t.dwLowDateTime;
}
#endif
uint64_t Clock::QueryHostUptimeMillis() {
return host_tick_count_platform() * 1000 / host_tick_frequency_platform();
}

View File

@ -41,10 +41,14 @@
"\n" \
"Set the cvar 'clock_source_raw' to 'false'.");
namespace xe {
// Getting the TSC frequency can be a bit tricky. This method seems to only
// work on Intel. There is no easy way to get the frequency outside of ring0
// on AMD, so we fail gracefully if it is not possible.
XE_NOINLINE
uint64_t Clock::host_tick_frequency_raw() {
uint32_t eax, ebx, ecx, edx;
@ -71,6 +75,8 @@ uint64_t Clock::host_tick_frequency_raw() {
return 0;
}
if (max_cpuid >= 0x15) {
// 15H Get TSC/Crystal ratio and Crystal Hz.
xe_cpu_cpuid(0x15, eax, ebx, ecx, edx);
@ -92,10 +98,11 @@ uint64_t Clock::host_tick_frequency_raw() {
return cpu_base_freq;
}
CLOCK_FATAL("The clock frequency could not be determined.");
return 0;
}
XE_NOINLINE
uint64_t Clock::host_tick_count_raw() { return xe_cpu_rdtsc(); }
} // namespace xe
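For reference, the leaf 0x15 arithmetic the code above relies on reduces to crystal_hz * numerator / denominator; a hedged sketch, assuming all three CPUID outputs are reported (which not every CPU does):

// sketch of the CPUID.15H math per Intel's documentation:
//   eax = denominator of the TSC / crystal clock ratio
//   ebx = numerator of the TSC / crystal clock ratio
//   ecx = crystal clock frequency in Hz (may be 0 on some parts)
static uint64_t tsc_hz_from_cpuid_15(uint32_t eax, uint32_t ebx, uint32_t ecx) {
  if (!eax || !ebx || !ecx) {
    return 0;  // caller must fall back to another source (e.g. leaf 0x16)
  }
  return static_cast<uint64_t>(ecx) * ebx / eax;
}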

View File

@ -19,7 +19,7 @@ namespace xe {
// TODO(Triang3l): Set the default depending on the actual subsystem. Currently
// it inhibits message boxes.
static bool has_console_attached_ = true;
static bool has_console_attached_ = false;
bool has_console_attached() { return has_console_attached_; }

View File

@ -78,17 +78,25 @@ std::pair<char*, size_t> GetThreadBuffer();
void AppendLogLine(LogLevel log_level, const char prefix_char, size_t written);
} // namespace internal
// Appends a line to the log with {fmt}-style formatting.
template <typename... Args>
void AppendLogLineFormat(LogLevel log_level, const char prefix_char,
XE_NOINLINE XE_COLD static void AppendLogLineFormat_Impl(LogLevel log_level,
const char prefix_char,
const char* format,
const Args&... args) {
auto target = internal::GetThreadBuffer();
auto result = fmt::format_to_n(target.first, target.second, format, args...);
internal::AppendLogLine(log_level, prefix_char, result.size);
}
// Appends a line to the log with {fmt}-style formatting.
//chrispy: inline the initial check, outline the append. the append should happen rarely for end users
template <typename... Args>
XE_FORCEINLINE static void AppendLogLineFormat(LogLevel log_level, const char prefix_char,
const char* format, const Args&... args) {
if (!internal::ShouldLog(log_level)) {
return;
}
auto target = internal::GetThreadBuffer();
auto result = fmt::format_to_n(target.first, target.second, format, args...);
internal::AppendLogLine(log_level, prefix_char, result.size);
AppendLogLineFormat_Impl(log_level, prefix_char, format, args...);
}
// Appends a line to the log.
@ -98,18 +106,19 @@ void AppendLogLine(LogLevel log_level, const char prefix_char,
} // namespace logging
// Logs a fatal error and aborts the program.
void FatalError(const std::string_view str);
[[noreturn]] void FatalError(const std::string_view str);
} // namespace xe
#if XE_OPTION_ENABLE_LOGGING
template <typename... Args>
void XELOGE(const char* format, const Args&... args) {
XE_COLD void XELOGE(const char* format, const Args&... args) {
xe::logging::AppendLogLineFormat(xe::LogLevel::Error, '!', format, args...);
}
template <typename... Args>
XE_COLD
void XELOGW(const char* format, const Args&... args) {
xe::logging::AppendLogLineFormat(xe::LogLevel::Warning, 'w', format, args...);
}
@ -131,12 +140,12 @@ void XELOGCPU(const char* format, const Args&... args) {
template <typename... Args>
void XELOGAPU(const char* format, const Args&... args) {
xe::logging::AppendLogLineFormat(xe::LogLevel::Info, 'A', format, args...);
xe::logging::AppendLogLineFormat(xe::LogLevel::Debug, 'A', format, args...);
}
template <typename... Args>
void XELOGGPU(const char* format, const Args&... args) {
xe::logging::AppendLogLineFormat(xe::LogLevel::Info, 'G', format, args...);
xe::logging::AppendLogLineFormat(xe::LogLevel::Debug, 'G', format, args...);
}
template <typename... Args>

View File

@ -376,6 +376,29 @@ template <int N>
int64_t m128_i64(const __m128& v) {
return m128_i64<N>(_mm_castps_pd(v));
}
/*
std::min/max for floats has handling for NaNs: if either argument is NaN, the first argument is returned.
minss/maxss are different: if either operand is NaN, the second operand to the instruction is returned.
this is problematic because we have no assurances from the compiler on the argument ordering,
so only use these in places where NaN handling is not needed
*/
static float xe_minf(float x, float y) {
return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(x), _mm_set_ss(y)));
}
static float xe_maxf(float x, float y) {
return _mm_cvtss_f32(_mm_max_ss(_mm_set_ss(x), _mm_set_ss(y)));
}
static float xe_rcpf(float den) {
return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den)));
}
#else
static float xe_minf(float x, float y) { return std::min<float>(x, y); }
static float xe_maxf(float x, float y) { return std::max<float>(x, y); }
static float xe_rcpf(float den) { return 1.0f / den; }
#endif
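A tiny illustration of the asymmetry described in the comment above (purely demonstrative, not part of the tree; assumes the header is included so xe::xe_minf is visible):

#include <algorithm>
#include <cassert>
#include <cmath>
#include <limits>
// demonstrates why xe_minf must not be used where NaN handling matters
static void min_nan_demo() {
  const float nan = std::numeric_limits<float>::quiet_NaN();
  assert(std::min(1.0f, nan) == 1.0f);         // std::min returns its first argument
  assert(std::isnan(xe::xe_minf(1.0f, nan)));  // minss returns the second operand (NaN)
}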
// Similar to the C++ implementation of XMConvertFloatToHalf and

View File

@ -467,6 +467,259 @@ constexpr inline fourcc_t make_fourcc(const std::string_view fourcc) {
return make_fourcc(fourcc[0], fourcc[1], fourcc[2], fourcc[3]);
}
// chrispy: todo: use for the command stream vector; resize happens a ton and
// has to call memset
template <size_t sz>
class FixedVMemVector {
static_assert((sz & 65535) == 0,
"Always give fixed_vmem_vector a size divisible by 65536 to "
"avoid wasting memory on windows");
uint8_t* data_;
size_t nbytes_;
public:
FixedVMemVector()
: data_((uint8_t*)memory::AllocFixed(
nullptr, sz, memory::AllocationType::kReserveCommit,
memory::PageAccess::kReadWrite)),
nbytes_(0) {}
~FixedVMemVector() {
if (data_) {
memory::DeallocFixed(data_, sz, memory::DeallocationType::kRelease);
data_ = nullptr;
}
nbytes_ = 0;
}
uint8_t* data() const { return data_; }
size_t size() const { return nbytes_; }
void resize(size_t newsize) {
nbytes_ = newsize;
xenia_assert(newsize < sz);
}
size_t alloc() const { return sz; }
void clear() {
resize(0); // todo:maybe zero out
}
void reserve(size_t size) { xenia_assert(size < sz); }
};
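A hypothetical usage sketch (the 64 KiB size and the "staging" name are illustrative; note that resize() only bumps the byte count, it never zeroes memory):

// hypothetical usage of FixedVMemVector; memory stays committed across clear()
xe::FixedVMemVector<65536> staging;
staging.resize(4096);                            // no memset, just sets nbytes_
std::memset(staging.data(), 0, staging.size());  // caller zeroes if it needs to (<cstring>)
staging.clear();                                 // size back to 0, pages remain committed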
// software prefetches/cache operations
namespace swcache {
/*
    warning: prefetchw's current behavior is not consistent across msvc and
    clang. for clang it will only compile to prefetchw if the target
    architecture supports it; msvc, however, will unconditionally compile it to
    prefetchw! so prefetchw support is still in progress.

    only use these if you're absolutely certain you know what you're doing;
    you can easily tank performance through misuse. CPUs have excellent
    automatic prefetchers that can predict patterns, but in situations where
    memory accesses are highly unpredictable and follow no pattern you can make
    use of these.

    another scenario where they can be handy is when crossing page boundaries,
    as many automatic prefetchers do not allow their streams to cross pages (no
    idea what this means for huge pages).

    I believe software prefetches do not kick off an automatic prefetcher
    stream, so you can't just prefetch one line of the data you're about to
    access and be fine, you need to go all the way.

    prefetchnta is implementation dependent, and that makes its use a bit
    limited. for intel cpus, i believe it only prefetches the line into one way
    of the L3. for amd cpus, it marks the line as requiring immediate eviction:
    the next time an entry is needed in the set it resides in, it will be
    evicted. ms does dumb shit in memcpy, like looping over the contents of the
    source buffer and doing prefetchnta on them, likely evicting some of the
    data they just prefetched by the end of the buffer, and probably disturbing
    data that was already in the cache.

    another warning for these: they bypass what i think is called "critical
    word load"; the data will always become available starting from the very
    beginning of the line instead of from the piece that is needed.

    the L1I cache is not prefetchable, however likely all cpus can fulfill
    requests for the L1I from L2, so prefetchL2 on instructions should be fine.

    todo: clwb, clflush
*/
#if XE_COMPILER_HAS_GNU_EXTENSIONS == 1
XE_FORCEINLINE
static void PrefetchW(const void* addr) { __builtin_prefetch(addr, 1, 0); }
XE_FORCEINLINE
static void PrefetchNTA(const void* addr) { __builtin_prefetch(addr, 0, 0); }
XE_FORCEINLINE
static void PrefetchL3(const void* addr) { __builtin_prefetch(addr, 0, 1); }
XE_FORCEINLINE
static void PrefetchL2(const void* addr) { __builtin_prefetch(addr, 0, 2); }
XE_FORCEINLINE
static void PrefetchL1(const void* addr) { __builtin_prefetch(addr, 0, 3); }
#elif XE_ARCH_AMD64 == 1 && XE_COMPILER_MSVC == 1
XE_FORCEINLINE
static void PrefetchW(const void* addr) { _m_prefetchw(addr); }
XE_FORCEINLINE
static void PrefetchNTA(const void* addr) {
_mm_prefetch((const char*)addr, _MM_HINT_NTA);
}
XE_FORCEINLINE
static void PrefetchL3(const void* addr) {
_mm_prefetch((const char*)addr, _MM_HINT_T2);
}
XE_FORCEINLINE
static void PrefetchL2(const void* addr) {
_mm_prefetch((const char*)addr, _MM_HINT_T1);
}
XE_FORCEINLINE
static void PrefetchL1(const void* addr) {
_mm_prefetch((const char*)addr, _MM_HINT_T0);
}
#else
XE_FORCEINLINE
static void PrefetchW(const void* addr) {}
XE_FORCEINLINE
static void PrefetchNTA(const void* addr) {}
XE_FORCEINLINE
static void PrefetchL3(const void* addr) {}
XE_FORCEINLINE
static void PrefetchL2(const void* addr) {}
XE_FORCEINLINE
static void PrefetchL1(const void* addr) {}
#endif
enum class PrefetchTag { Write, Nontemporal, Level3, Level2, Level1 };
template <PrefetchTag tag>
static void Prefetch(const void* addr) {
static_assert(false, "Unknown tag");
}
template <>
static void Prefetch<PrefetchTag::Write>(const void* addr) {
PrefetchW(addr);
}
template <>
static void Prefetch<PrefetchTag::Nontemporal>(const void* addr) {
PrefetchNTA(addr);
}
template <>
static void Prefetch<PrefetchTag::Level3>(const void* addr) {
PrefetchL3(addr);
}
template <>
static void Prefetch<PrefetchTag::Level2>(const void* addr) {
PrefetchL2(addr);
}
template <>
static void Prefetch<PrefetchTag::Level1>(const void* addr) {
PrefetchL1(addr);
}
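A hedged usage sketch of the wrappers above; the element type, index array, callback, and the lookahead distance of 8 are all made up for illustration:

// hypothetical: prefetch the element we will need a few iterations from now,
// useful when the index pattern defeats the hardware prefetchers
template <typename T, typename Fn>
static void process_gathered(const T* table, const uint32_t* indices,
                             size_t count, Fn process) {
  constexpr size_t kDistance = 8;  // tuning knob, illustrative only
  for (size_t i = 0; i < count; ++i) {
    if (i + kDistance < count) {
      swcache::Prefetch<swcache::PrefetchTag::Level1>(
          &table[indices[i + kDistance]]);
    }
    process(table[indices[i]]);
  }
}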
// todo: does aarch64 have streaming stores/loads?
/*
    non-temporal stores/loads

    the stores allow cacheable memory to behave like write-combining memory.
    on the first nt store to a line, an intermediate buffer will be allocated
    by the cpu for the stores that come after. once the entire contents of the
    line have been written, the intermediate buffer will be transmitted to
    memory.

    the written line will not be cached, and if it is in the cache it will be
    invalidated from all levels of the hierarchy.

    in this case the cpu does not have to read the line from memory when we
    first write to it if it is not anywhere in the cache, so these stores use
    half the memory bandwidth.

    non-temporal loads are... loads, but they don't use the cache. you need to
    manually insert memory barriers (_ReadWriteBarrier, ReadBarrier, etc; do
    not use any barriers that generate actual code) if on msvc to prevent it
    from moving the load of the data to just before the use of the data
    (immediately requiring the memory to be available = big stall).
*/
#if XE_COMPILER_MSVC == 1 && XE_COMPILER_CLANG_CL == 0
#define XE_MSVC_REORDER_BARRIER _ReadWriteBarrier
#else
// if the compiler actually has pipelining for instructions we don't need a
// barrier
#define XE_MSVC_REORDER_BARRIER() static_cast<void>(0)
#endif
#if XE_ARCH_AMD64 == 1
XE_FORCEINLINE
static void WriteLineNT(void* destination, const void* source) {
assert((reinterpret_cast<uintptr_t>(destination) & 63ULL) == 0);
__m256i low = _mm256_loadu_si256((const __m256i*)source);
__m256i high = _mm256_loadu_si256(&((const __m256i*)source)[1]);
XE_MSVC_REORDER_BARRIER();
_mm256_stream_si256((__m256i*)destination, low);
_mm256_stream_si256(&((__m256i*)destination)[1], high);
}
XE_FORCEINLINE
static void ReadLineNT(void* destination, const void* source) {
assert((reinterpret_cast<uintptr_t>(source) & 63ULL) == 0);
__m256i low = _mm256_stream_load_si256((const __m256i*)source);
__m256i high = _mm256_stream_load_si256(&((const __m256i*)source)[1]);
XE_MSVC_REORDER_BARRIER();
_mm256_storeu_si256((__m256i*)destination, low);
_mm256_storeu_si256(&((__m256i*)destination)[1], high);
}
XE_FORCEINLINE
static void WriteFence() { _mm_sfence(); }
XE_FORCEINLINE
static void ReadFence() { _mm_lfence(); }
XE_FORCEINLINE
static void ReadWriteFence() { _mm_mfence(); }
#else
XE_FORCEINLINE
static void WriteLineNT(void* destination, const void* source) {
assert((reinterpret_cast<uintptr_t>(destination) & 63ULL) == 0);
memcpy(destination, source, 64);
}
XE_FORCEINLINE
static void ReadLineNT(void* destination, const void* source) {
assert((reinterpret_cast<uintptr_t>(source) & 63ULL) == 0);
memcpy(destination, source, 64);
}
XE_FORCEINLINE
static void WriteFence() {}
XE_FORCEINLINE
static void ReadFence() {}
XE_FORCEINLINE
static void ReadWriteFence() {}
#endif
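A minimal sketch using the streaming helpers above to push a buffer out while bypassing the cache on the destination; the function name is invented, and it assumes dst is 64-byte aligned and size is a multiple of 64, as the helpers require:

// hypothetical bulk copy built on WriteLineNT; not from the tree
static void stream_copy(void* dst, const void* src, size_t size) {
  auto d = static_cast<uint8_t*>(dst);
  auto s = static_cast<const uint8_t*>(src);
  for (size_t i = 0; i < size; i += 64) {
    swcache::WriteLineNT(d + i, s + i);  // source may be unaligned, dest must not be
  }
  swcache::WriteFence();  // make the streamed stores globally visible first
}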
} // namespace swcache
} // namespace xe
#endif // XENIA_BASE_MEMORY_H_

View File

@ -8,11 +8,79 @@
*/
#include "xenia/base/mutex.h"
#if XE_PLATFORM_WIN32 == 1
#include "xenia/base/platform_win.h"
#endif
namespace xe {
#if XE_PLATFORM_WIN32 == 1 && XE_ENABLE_FAST_WIN32_MUTEX == 1
// the default spincount for EnterCriticalSection on Windows is insane:
// 0x20007D0i64 (33556432 times!!). when a lock is highly contended, performance
// degrades sharply on some processors. todo: perhaps we should have a set of
// optional jobs that processors can do instead of spinning, for instance
// sorting a list so we have better locality later, or something
#define XE_CRIT_SPINCOUNT 128
/*
    chrispy: todo: if a thread exits before releasing the global mutex we need
    to detect this and release the mutex. one way to do this is by using
    FlsAlloc and PFLS_CALLBACK_FUNCTION, which gets called with the fiber-local
    data when a thread exits (a rough sketch follows the xe_global_mutex member
    definitions below)
*/
thread_local unsigned global_mutex_depth = 0;
static CRITICAL_SECTION* global_critical_section(xe_global_mutex* mutex) {
return reinterpret_cast<CRITICAL_SECTION*>(mutex);
}
xe_global_mutex::xe_global_mutex() {
InitializeCriticalSectionEx(global_critical_section(this), XE_CRIT_SPINCOUNT,
CRITICAL_SECTION_NO_DEBUG_INFO);
}
xe_global_mutex::~xe_global_mutex() {
DeleteCriticalSection(global_critical_section(this));
}
void xe_global_mutex::lock() {
if (global_mutex_depth) {
} else {
EnterCriticalSection(global_critical_section(this));
}
global_mutex_depth++;
}
void xe_global_mutex::unlock() {
if (--global_mutex_depth == 0) {
LeaveCriticalSection(global_critical_section(this));
}
}
bool xe_global_mutex::try_lock() {
if (global_mutex_depth) {
++global_mutex_depth;
return true;
} else {
BOOL success = TryEnterCriticalSection(global_critical_section(this));
if (success) {
++global_mutex_depth;
}
return success;
}
}
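The FlsAlloc idea from the todo above might look roughly like the following; this is entirely a sketch of the suggestion, not something the commit implements, and the slot/callback names are invented:

// hypothetical sketch of the FlsAlloc todo: release the global critical
// section if a thread exits while still holding it
static DWORD g_global_mutex_fls_slot = FLS_OUT_OF_INDEXES;
static void NTAPI OnHoldingThreadExit(void* data) {
  // data is the CRITICAL_SECTION* stashed at lock time (depth bookkeeping omitted)
  if (data) {
    LeaveCriticalSection(static_cast<CRITICAL_SECTION*>(data));
  }
}
// at startup:              g_global_mutex_fls_slot = FlsAlloc(OnHoldingThreadExit);
// in lock(), depth 0 -> 1: FlsSetValue(g_global_mutex_fls_slot, global_critical_section(this));
// in unlock(), depth 1 -> 0: FlsSetValue(g_global_mutex_fls_slot, nullptr);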
CRITICAL_SECTION* fast_crit(xe_fast_mutex* mutex) {
return reinterpret_cast<CRITICAL_SECTION*>(mutex);
}
xe_fast_mutex::xe_fast_mutex() {
InitializeCriticalSectionEx(fast_crit(this), XE_CRIT_SPINCOUNT,
CRITICAL_SECTION_NO_DEBUG_INFO);
}
xe_fast_mutex::~xe_fast_mutex() { DeleteCriticalSection(fast_crit(this)); }
void xe_fast_mutex::lock() { EnterCriticalSection(fast_crit(this)); }
void xe_fast_mutex::unlock() { LeaveCriticalSection(fast_crit(this)); }
bool xe_fast_mutex::try_lock() {
return TryEnterCriticalSection(fast_crit(this));
}
#endif
// chrispy: moved this out of body of function to eliminate the initialization
// guards
static std::recursive_mutex global_mutex;
std::recursive_mutex& global_critical_region::mutex() { return global_mutex; }
static global_mutex_type global_mutex;
global_mutex_type& global_critical_region::mutex() { return global_mutex; }
} // namespace xe

View File

@ -9,11 +9,90 @@
#ifndef XENIA_BASE_MUTEX_H_
#define XENIA_BASE_MUTEX_H_
#include <mutex>
#include "platform.h"
#define XE_ENABLE_FAST_WIN32_MUTEX 1
namespace xe {
#if XE_PLATFORM_WIN32 == 1 && XE_ENABLE_FAST_WIN32_MUTEX == 1
/*
    must conform to BasicLockable
    (https://en.cppreference.com/w/cpp/named_req/BasicLockable) as well as
    Lockable (https://en.cppreference.com/w/cpp/named_req/Lockable)

    this emulates a recursive mutex, except with far less overhead
*/
class alignas(4096) xe_global_mutex {
char detail[64];
public:
xe_global_mutex();
~xe_global_mutex();
void lock();
void unlock();
bool try_lock();
};
using global_mutex_type = xe_global_mutex;
class alignas(64) xe_fast_mutex {
char detail[64];
public:
xe_fast_mutex();
~xe_fast_mutex();
void lock();
void unlock();
bool try_lock();
};
// a mutex that is extremely unlikely to ever be locked
// use for race conditions that have extremely remote odds of happening
class xe_unlikely_mutex {
std::atomic<uint32_t> mut;
bool _tryget() {
uint32_t lock_expected = 0;
return mut.compare_exchange_strong(lock_expected, 1);
}
public:
xe_unlikely_mutex() : mut(0) {}
~xe_unlikely_mutex() { mut = 0; }
void lock() {
uint32_t lock_expected = 0;
if (XE_LIKELY(_tryget())) {
return;
} else {
do {
// chrispy: warning, if no SMT, mm_pause does nothing...
#if XE_ARCH_AMD64 == 1
_mm_pause();
#endif
} while (!_tryget());
}
}
void unlock() { mut.exchange(0); }
bool try_lock() { return _tryget(); }
};
using xe_mutex = xe_fast_mutex;
#else
using global_mutex_type = std::recursive_mutex;
using xe_mutex = std::mutex;
using xe_unlikely_mutex = std::mutex;
#endif
struct null_mutex {
public:
static void lock() {}
static void unlock() {}
static bool try_lock() { return true; }
};
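null_mutex satisfies the Lockable requirements with no-ops, so it can be dropped into generic code that takes its mutex type as a parameter; a hypothetical use, not taken from the tree:

// hypothetical: the guard over null_mutex compiles down to nothing
template <typename MutexType>
static void do_work_locked(MutexType& mutex) {
  std::lock_guard<MutexType> guard(mutex);  // lock()/unlock() are empty for xe::null_mutex
  // ... shared-state work ...
}
// usage: xe::null_mutex no_lock; do_work_locked(no_lock);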
using global_unique_lock_type = std::unique_lock<global_mutex_type>;
// The global critical region mutex singleton.
// This must guard any operation that may suspend threads or be sensitive to
// being suspended such as global table locks and such.
@ -54,30 +133,30 @@ namespace xe {
// };
class global_critical_region {
public:
static std::recursive_mutex& mutex();
static global_mutex_type& mutex();
// Acquires a lock on the global critical section.
// Use this when keeping an instance is not possible. Otherwise, prefer
// to keep an instance of global_critical_region near the members requiring
// it to keep things readable.
static std::unique_lock<std::recursive_mutex> AcquireDirect() {
return std::unique_lock<std::recursive_mutex>(mutex());
static global_unique_lock_type AcquireDirect() {
return global_unique_lock_type(mutex());
}
// Acquires a lock on the global critical section.
inline std::unique_lock<std::recursive_mutex> Acquire() {
return std::unique_lock<std::recursive_mutex>(mutex());
inline global_unique_lock_type Acquire() {
return global_unique_lock_type(mutex());
}
// Acquires a deferred lock on the global critical section.
inline std::unique_lock<std::recursive_mutex> AcquireDeferred() {
return std::unique_lock<std::recursive_mutex>(mutex(), std::defer_lock);
inline global_unique_lock_type AcquireDeferred() {
return global_unique_lock_type(mutex(), std::defer_lock);
}
// Tries to acquire a lock on the glboal critical section.
// Check owns_lock() to see if the lock was successfully acquired.
inline std::unique_lock<std::recursive_mutex> TryAcquire() {
return std::unique_lock<std::recursive_mutex>(mutex(), std::try_to_lock);
inline global_unique_lock_type TryAcquire() {
return global_unique_lock_type(mutex(), std::try_to_lock);
}
};

View File

@ -122,6 +122,7 @@
#define XE_COLD __attribute__((cold))
#define XE_LIKELY(...) __builtin_expect(!!(__VA_ARGS__), true)
#define XE_UNLIKELY(...) __builtin_expect(!!(__VA_ARGS__), false)
#else
#define XE_FORCEINLINE inline
#define XE_NOINLINE
@ -129,6 +130,24 @@
#define XE_LIKELY(...) (!!(__VA_ARGS__))
#define XE_UNLIKELY(...) (!!(__VA_ARGS__))
#endif
// only use __restrict on MSVC; for clang/gcc we can use -fstrict-aliasing, which
// acts like __restrict across the board. todo: __restrict is part of the type
// system, so we might actually still have to emit it on clang and gcc
#if XE_COMPILER_CLANG_CL == 0 && XE_COMPILER_MSVC == 1
#define XE_RESTRICT __restrict
#else
#define XE_RESTRICT
#endif
#if XE_ARCH_AMD64 == 1
#define XE_HOST_CACHE_LINE_SIZE 64
#elif XE_ARCH_ARM64 == 1
#define XE_HOST_CACHE_LINE_SIZE 64
#else
#error unknown cache line size for unknown architecture!
#endif
namespace xe {

View File

@ -34,31 +34,48 @@
#undef DeleteFile
#undef GetFirstChild
#define XE_USE_NTDLL_FUNCTIONS 1
#if XE_USE_NTDLL_FUNCTIONS==1
#define XE_USE_NTDLL_FUNCTIONS 1
//chrispy: disabling this for now, more research needs to be done imo, although it does work very well on my machine
//
#define XE_USE_KUSER_SHARED 0
#if XE_USE_NTDLL_FUNCTIONS == 1
/*
ntdll versions of functions often skip through a lot of extra garbage in KernelBase
ntdll versions of functions often skip through a lot of extra garbage in
KernelBase
*/
#define XE_NTDLL_IMPORT(name, cls, clsvar) \
static class cls { \
public: \
FARPROC fn;\
cls() : fn(nullptr) {\
auto ntdll = GetModuleHandleA("ntdll.dll");\
if (ntdll) { \
fn = GetProcAddress(ntdll, #name );\
}\
} \
template <typename TRet = void, typename... TArgs> \
inline TRet invoke(TArgs... args) {\
return reinterpret_cast<NTSYSAPI TRet(NTAPI*)(TArgs...)>(fn)(args...);\
}\
inline operator bool() const {\
return fn!=nullptr;\
}\
#define XE_NTDLL_IMPORT(name, cls, clsvar) \
static class cls { \
public: \
FARPROC fn; \
cls() : fn(nullptr) { \
auto ntdll = GetModuleHandleA("ntdll.dll"); \
if (ntdll) { \
fn = GetProcAddress(ntdll, #name); \
} \
} \
template <typename TRet = void, typename... TArgs> \
inline TRet invoke(TArgs... args) { \
return reinterpret_cast<NTSYSAPI TRet(NTAPI*)(TArgs...)>(fn)(args...); \
} \
inline operator bool() const { return fn != nullptr; } \
} clsvar
#else
#define XE_NTDLL_IMPORT(name, cls, clsvar) static constexpr bool clsvar = false
#endif
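A hypothetical use of the macro; NtYieldExecution is just an example import chosen because it takes no arguments, not necessarily one the tree binds:

// hypothetical: bind ntdll!NtYieldExecution and call through the cached pointer
XE_NTDLL_IMPORT(NtYieldExecution, cls_NtYieldExecution, NtYieldExecutionPointer);
static void YieldViaNtdll() {
  if (NtYieldExecutionPointer) {                 // fn resolved from ntdll.dll
    NtYieldExecutionPointer.invoke<NTSTATUS>();  // NTSTATUS return, no args
  }
}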
#if XE_USE_KUSER_SHARED==1
// KUSER_SHARED
struct __declspec(align(4)) _KSYSTEM_TIME {
unsigned int LowPart;
int High1Time;
int High2Time;
};
static constexpr size_t KUSER_SHARED_SYSTEMTIME_OFFSET = 0x14;
static unsigned char* KUserShared() { return (unsigned char*)0x7FFE0000ULL; }
static volatile _KSYSTEM_TIME* GetKUserSharedSystemTime() {
return reinterpret_cast<volatile _KSYSTEM_TIME*>(
KUserShared() + KUSER_SHARED_SYSTEMTIME_OFFSET);
}
#endif
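For reference, the documented way to read a _KSYSTEM_TIME consistently (needed on 32-bit hosts, where the 64-bit volatile read used in clock_win.cc is not a single load) is to loop until High1Time and High2Time agree; a sketch, with an invented function name:

// sketch of the High1Time/High2Time consistency loop for _KSYSTEM_TIME
static uint64_t ReadKUserSystemTimeConsistent() {
  volatile _KSYSTEM_TIME* t = GetKUserSharedSystemTime();
  unsigned int low;
  int high;
  do {
    high = t->High1Time;
    low = t->LowPart;
  } while (high != t->High2Time);  // retry if the kernel updated mid-read
  return (static_cast<uint64_t>(static_cast<unsigned int>(high)) << 32) | low;
}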
#endif // XENIA_BASE_PLATFORM_WIN_H_

View File

@ -8,46 +8,52 @@
*/
#include "xenia/base/ring_buffer.h"
#include <algorithm>
#include <cstring>
namespace xe {
RingBuffer::RingBuffer(uint8_t* buffer, size_t capacity)
: buffer_(buffer), capacity_(capacity) {}
: buffer_(buffer),
capacity_(static_cast<ring_size_t>(capacity)),
read_offset_(0),
write_offset_(0) {}
void RingBuffer::AdvanceRead(size_t count) {
void RingBuffer::AdvanceRead(size_t _count) {
ring_size_t count = static_cast<ring_size_t>(_count);
if (read_offset_ + count < capacity_) {
read_offset_ += count;
} else {
size_t left_half = capacity_ - read_offset_;
size_t right_half = count - left_half;
ring_size_t left_half = capacity_ - read_offset_;
ring_size_t right_half = count - left_half;
read_offset_ = right_half;
}
}
void RingBuffer::AdvanceWrite(size_t count) {
void RingBuffer::AdvanceWrite(size_t _count) {
ring_size_t count = static_cast<ring_size_t>(_count);
if (write_offset_ + count < capacity_) {
write_offset_ += count;
} else {
size_t left_half = capacity_ - write_offset_;
size_t right_half = count - left_half;
ring_size_t left_half = capacity_ - write_offset_;
ring_size_t right_half = count - left_half;
write_offset_ = right_half;
}
}
RingBuffer::ReadRange RingBuffer::BeginRead(size_t count) {
count = std::min(count, capacity_);
RingBuffer::ReadRange RingBuffer::BeginRead(size_t _count) {
ring_size_t count =
std::min<ring_size_t>(static_cast<ring_size_t>(_count), capacity_);
if (!count) {
return {0};
}
if (read_offset_ + count < capacity_) {
return {buffer_ + read_offset_, count, nullptr, 0};
return {buffer_ + read_offset_, nullptr, count, 0};
} else {
size_t left_half = capacity_ - read_offset_;
size_t right_half = count - left_half;
return {buffer_ + read_offset_, left_half, buffer_, right_half};
ring_size_t left_half = capacity_ - read_offset_;
ring_size_t right_half = count - left_half;
return {buffer_ + read_offset_, buffer_, left_half, right_half};
}
}
@ -59,7 +65,8 @@ void RingBuffer::EndRead(ReadRange read_range) {
}
}
size_t RingBuffer::Read(uint8_t* buffer, size_t count) {
size_t RingBuffer::Read(uint8_t* buffer, size_t _count) {
ring_size_t count = static_cast<ring_size_t>(_count);
count = std::min(count, capacity_);
if (!count) {
return 0;
@ -69,7 +76,7 @@ size_t RingBuffer::Read(uint8_t* buffer, size_t count) {
if (read_offset_ < write_offset_) {
assert_true(read_offset_ + count <= write_offset_);
} else if (read_offset_ + count >= capacity_) {
size_t left_half = capacity_ - read_offset_;
ring_size_t left_half = capacity_ - read_offset_;
assert_true(count - left_half <= write_offset_);
}
@ -77,8 +84,8 @@ size_t RingBuffer::Read(uint8_t* buffer, size_t count) {
std::memcpy(buffer, buffer_ + read_offset_, count);
read_offset_ += count;
} else {
size_t left_half = capacity_ - read_offset_;
size_t right_half = count - left_half;
ring_size_t left_half = capacity_ - read_offset_;
ring_size_t right_half = count - left_half;
std::memcpy(buffer, buffer_ + read_offset_, left_half);
std::memcpy(buffer + left_half, buffer_, right_half);
read_offset_ = right_half;
@ -87,7 +94,8 @@ size_t RingBuffer::Read(uint8_t* buffer, size_t count) {
return count;
}
size_t RingBuffer::Write(const uint8_t* buffer, size_t count) {
size_t RingBuffer::Write(const uint8_t* buffer, size_t _count) {
ring_size_t count = static_cast<ring_size_t>(_count);
count = std::min(count, capacity_);
if (!count) {
return 0;
@ -105,8 +113,8 @@ size_t RingBuffer::Write(const uint8_t* buffer, size_t count) {
std::memcpy(buffer_ + write_offset_, buffer, count);
write_offset_ += count;
} else {
size_t left_half = capacity_ - write_offset_;
size_t right_half = count - left_half;
ring_size_t left_half = capacity_ - write_offset_;
ring_size_t right_half = count - left_half;
std::memcpy(buffer_ + write_offset_, buffer, left_half);
std::memcpy(buffer_, buffer + left_half, right_half);
write_offset_ = right_half;

View File

@ -17,6 +17,8 @@
#include "xenia/base/assert.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
namespace xe {
/*
@ -39,18 +41,24 @@ namespace xe {
that the registers no longer need the rex prefix, shrinking the generated
code a bit.. like i said, every bit helps in this class
*/
using ring_size_t = uint32_t;
class RingBuffer {
public:
RingBuffer(uint8_t* buffer, size_t capacity);
uint8_t* buffer() const { return buffer_; }
size_t capacity() const { return capacity_; }
ring_size_t capacity() const { return capacity_; }
bool empty() const { return read_offset_ == write_offset_; }
size_t read_offset() const { return read_offset_; }
uintptr_t read_ptr() const { return uintptr_t(buffer_) + read_offset_; }
ring_size_t read_offset() const { return read_offset_; }
uintptr_t read_ptr() const {
return uintptr_t(buffer_) + static_cast<uintptr_t>(read_offset_);
}
// todo: offset / capacity_ is probably always 1 when it's over; just check
// and subtract instead
void set_read_offset(size_t offset) { read_offset_ = offset % capacity_; }
size_t read_count() const {
ring_size_t read_count() const {
// chrispy: these branches are unpredictable
#if 0
if (read_offset_ == write_offset_) {
@ -61,14 +69,14 @@ class RingBuffer {
return (capacity_ - read_offset_) + write_offset_;
}
#else
size_t read_offs = read_offset_;
size_t write_offs = write_offset_;
size_t cap = capacity_;
ring_size_t read_offs = read_offset_;
ring_size_t write_offs = write_offset_;
ring_size_t cap = capacity_;
size_t offset_delta = write_offs - read_offs;
size_t wrap_read_count = (cap - read_offs) + write_offs;
ring_size_t offset_delta = write_offs - read_offs;
ring_size_t wrap_read_count = (cap - read_offs) + write_offs;
size_t comparison_value = read_offs <= write_offs;
ring_size_t comparison_value = read_offs <= write_offs;
#if 0
size_t selector =
static_cast<size_t>(-static_cast<ptrdiff_t>(comparison_value));
@ -89,10 +97,12 @@ class RingBuffer {
#endif
}
size_t write_offset() const { return write_offset_; }
ring_size_t write_offset() const { return write_offset_; }
uintptr_t write_ptr() const { return uintptr_t(buffer_) + write_offset_; }
void set_write_offset(size_t offset) { write_offset_ = offset % capacity_; }
size_t write_count() const {
void set_write_offset(size_t offset) {
write_offset_ = static_cast<ring_size_t>(offset) % capacity_;
}
ring_size_t write_count() const {
if (read_offset_ == write_offset_) {
return capacity_;
} else if (write_offset_ < read_offset_) {
@ -107,13 +117,35 @@ class RingBuffer {
struct ReadRange {
const uint8_t* first;
size_t first_length;
const uint8_t* second;
size_t second_length;
ring_size_t first_length;
ring_size_t second_length;
};
ReadRange BeginRead(size_t count);
void EndRead(ReadRange read_range);
/*
BeginRead, but if there is a second Range it will prefetch all lines of it
this does not prefetch the first range, because software prefetching can do that faster than we can
*/
template <swcache::PrefetchTag tag>
XE_FORCEINLINE ReadRange BeginPrefetchedRead(size_t count) {
ReadRange range = BeginRead(count);
if (range.second) {
ring_size_t numlines =
xe::align<ring_size_t>(range.second_length, XE_HOST_CACHE_LINE_SIZE) /
XE_HOST_CACHE_LINE_SIZE;
//chrispy: maybe unroll?
for (ring_size_t i = 0; i < numlines; ++i) {
swcache::Prefetch<tag>(range.second + (i * XE_HOST_CACHE_LINE_SIZE));
}
}
return range;
}
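A hedged usage sketch; rb, consume, bytes_needed and the tag choice are placeholders, not code from the tree:

// hypothetical: warm the wrapped tail of the ring before touching it
auto range = rb.BeginPrefetchedRead<swcache::PrefetchTag::Level1>(bytes_needed);
consume(range.first, range.first_length);
if (range.second) {
  consume(range.second, range.second_length);  // tail was prefetched above
}
rb.EndRead(range);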
size_t Read(uint8_t* buffer, size_t count);
template <typename T>
size_t Read(T* buffer, size_t count) {
@ -156,29 +188,29 @@ class RingBuffer {
private:
uint8_t* buffer_ = nullptr;
size_t capacity_ = 0;
size_t read_offset_ = 0;
size_t write_offset_ = 0;
ring_size_t capacity_ = 0;
ring_size_t read_offset_ = 0;
ring_size_t write_offset_ = 0;
};
template <>
inline uint32_t RingBuffer::ReadAndSwap<uint32_t>() {
size_t read_offset = this->read_offset_;
ring_size_t read_offset = this->read_offset_;
xenia_assert(this->capacity_ >= 4);
size_t next_read_offset = read_offset + 4;
#if 0
ring_size_t next_read_offset = read_offset + 4;
#if 0
size_t zerotest = next_read_offset - this->capacity_;
// unpredictable branch, use bit arith instead
// todo: it would be faster to use lzcnt, but we need to figure out if all
// machines we support support it
next_read_offset &= -static_cast<ptrdiff_t>(!!zerotest);
#else
#else
if (XE_UNLIKELY(next_read_offset == this->capacity_)) {
next_read_offset = 0;
//todo: maybe prefetch next? or should that happen much earlier?
// todo: maybe prefetch next? or should that happen much earlier?
}
#endif
#endif
this->read_offset_ = next_read_offset;
unsigned int ring_value = *(uint32_t*)&this->buffer_[read_offset];
return xe::byte_swap(ring_value);

View File

@ -148,6 +148,7 @@ bool SetTlsValue(TlsHandle handle, uintptr_t value);
// be kept short or else all timers will be impacted. This is a simplified
// wrapper around QueueTimerRecurring which automatically cancels the timer on
// destruction.
//only used by XboxkrnlModule::XboxkrnlModule
class HighResolutionTimer {
HighResolutionTimer(std::chrono::milliseconds interval,
std::function<void()> callback) {

View File

@ -36,7 +36,7 @@ using WaitItem = TimerQueueWaitItem;
edit: actually had to change it back; when i was testing, it only worked because i fixed disruptorplus' code to compile (it gives wrong args to condition_variable::wait_until), but it now builds
*/
using WaitStrat = dp::spin_wait_strategy; //dp::blocking_wait_strategy;
using WaitStrat = dp::blocking_wait_strategy;
class TimerQueue {
public:
@ -205,7 +205,7 @@ void TimerQueueWaitItem::Disarm() {
spinner.spin_once();
}
}
//unused
std::weak_ptr<WaitItem> QueueTimerOnce(std::function<void(void*)> callback,
void* userdata,
WaitItem::clock::time_point due) {
@ -213,7 +213,7 @@ std::weak_ptr<WaitItem> QueueTimerOnce(std::function<void(void*)> callback,
std::make_shared<WaitItem>(std::move(callback), userdata, &timer_queue_,
due, WaitItem::clock::duration::zero()));
}
// only used by HighResolutionTimer
std::weak_ptr<WaitItem> QueueTimerRecurring(
std::function<void(void*)> callback, void* userdata,
WaitItem::clock::time_point due, WaitItem::clock::duration interval) {

View File

@ -10,7 +10,7 @@
#include "xenia/cpu/backend/x64/x64_backend.h"
#include <stddef.h>
#include <algorithm>
#include "third_party/capstone/include/capstone/capstone.h"
#include "third_party/capstone/include/capstone/x86.h"
@ -50,6 +50,9 @@ DEFINE_bool(record_mmio_access_exceptions, true,
"for them. This info can then be used on a subsequent run to "
"instruct the recompiler to emit checks",
"CPU");
#if XE_X64_PROFILER_AVAILABLE == 1
DECLARE_bool(instrument_call_times);
#endif
namespace xe {
namespace cpu {
@ -96,6 +99,68 @@ static void ForwardMMIOAccessForRecording(void* context, void* hostaddr) {
reinterpret_cast<X64Backend*>(context)
->RecordMMIOExceptionForGuestInstruction(hostaddr);
}
#if XE_X64_PROFILER_AVAILABLE == 1
// todo: better way of passing to atexit. maybe do in destructor instead?
// nope, destructor is never called
static GuestProfilerData* backend_profiler_data = nullptr;
static uint64_t nanosecond_lifetime_start = 0;
static void WriteGuestProfilerData() {
if (cvars::instrument_call_times) {
uint64_t end = Clock::QueryHostSystemTime();
uint64_t total = end - nanosecond_lifetime_start;
double totaltime_divisor = static_cast<double>(total);
FILE* output_file = nullptr;
std::vector<std::pair<uint32_t, uint64_t>> unsorted_profile{};
for (auto&& entry : *backend_profiler_data) {
if (entry.second) { // skip times of 0
unsorted_profile.emplace_back(entry.first, entry.second);
}
}
std::sort(unsorted_profile.begin(), unsorted_profile.end(),
[](auto& x, auto& y) { return x.second < y.second; });
fopen_s(&output_file, "profile_times.txt", "w");
FILE* idapy_file = nullptr;
fopen_s(&idapy_file, "profile_print_times.py", "w");
for (auto&& sorted_entry : unsorted_profile) {
// double time_in_seconds =
// static_cast<double>(sorted_entry.second) / 10000000.0;
double time_in_milliseconds =
static_cast<double>(sorted_entry.second) / (10000000.0 / 1000.0);
double slice = static_cast<double>(sorted_entry.second) /
static_cast<double>(totaltime_divisor);
fprintf(output_file,
"%X took %.20f milliseconds, totaltime slice percentage %.20f \n",
sorted_entry.first, time_in_milliseconds, slice);
fprintf(idapy_file,
"print(get_name(0x%X) + ' took %.20f ms, %.20f percent')\n",
sorted_entry.first, time_in_milliseconds, slice);
}
fclose(output_file);
fclose(idapy_file);
}
}
static void GuestProfilerUpdateThreadProc() {
nanosecond_lifetime_start = Clock::QueryHostSystemTime();
do {
xe::threading::Sleep(std::chrono::seconds(30));
WriteGuestProfilerData();
} while (true);
}
static std::unique_ptr<xe::threading::Thread> g_profiler_update_thread{};
#endif
bool X64Backend::Initialize(Processor* processor) {
if (!Backend::Initialize(processor)) {
@ -159,6 +224,21 @@ bool X64Backend::Initialize(Processor* processor) {
processor->memory()->SetMMIOExceptionRecordingCallback(
ForwardMMIOAccessForRecording, (void*)this);
#if XE_X64_PROFILER_AVAILABLE == 1
if (cvars::instrument_call_times) {
backend_profiler_data = &profiler_data_;
xe::threading::Thread::CreationParameters slimparams;
slimparams.create_suspended = false;
slimparams.initial_priority = xe::threading::ThreadPriority::kLowest;
slimparams.stack_size = 65536 * 4;
g_profiler_update_thread = std::move(xe::threading::Thread::Create(
slimparams, GuestProfilerUpdateThreadProc));
}
#endif
return true;
}
@ -734,6 +814,7 @@ void X64Backend::InitializeBackendContext(void* ctx) {
bctx->flags = 0;
// https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
bctx->Ox1000 = 0x1000;
bctx->guest_tick_count = Clock::GetGuestTickCountPointer();
}
const uint32_t mxcsr_table[8] = {
0x1F80, 0x7F80, 0x5F80, 0x3F80, 0x9F80, 0xFF80, 0xDF80, 0xBF80,
@ -747,6 +828,23 @@ void X64Backend::SetGuestRoundingMode(void* ctx, unsigned int mode) {
bctx->mxcsr_fpu = mxcsr_table[control];
((ppc::PPCContext*)ctx)->fpscr.bits.rn = control;
}
#if XE_X64_PROFILER_AVAILABLE == 1
uint64_t* X64Backend::GetProfilerRecordForFunction(uint32_t guest_address) {
// who knows, we might want to compile different versions of a function one
// day
auto entry = profiler_data_.find(guest_address);
if (entry != profiler_data_.end()) {
return &entry->second;
} else {
profiler_data_[guest_address] = 0;
return &profiler_data_[guest_address];
}
}
#endif
} // namespace x64
} // namespace backend
} // namespace cpu

View File

@ -15,6 +15,14 @@
#include "xenia/base/cvar.h"
#include "xenia/cpu/backend/backend.h"
#if XE_PLATFORM_WIN32 == 1
// we use KUSER_SHARED's systemtime field, which is at a fixed address and is
// obviously windows specific, to get the start/end time for a function. using
// rdtsc would be too slow and would skew the results by consuming extra cpu
// time, so we have lower time precision but better overall accuracy
#define XE_X64_PROFILER_AVAILABLE 1
#endif
DECLARE_int32(x64_extension_mask);
namespace xe {
@ -24,6 +32,8 @@ namespace xe {
namespace cpu {
namespace backend {
namespace x64 {
// mapping of guest function addresses to the total time spent in the func, in
// 100ns FILETIME units
using GuestProfilerData = std::map<uint32_t, uint64_t>;
class X64CodeCache;
@ -37,8 +47,10 @@ typedef void (*ResolveFunctionThunk)();
// negatively index the membase reg)
struct X64BackendContext {
void* ResolveFunction_Ptr; // cached pointer to resolvefunction
unsigned int mxcsr_fpu; // currently, the way we implement rounding mode
// affects both vmx and the fpu
uint64_t* guest_tick_count;
unsigned int mxcsr_fpu; // currently, the way we implement rounding mode
// affects both vmx and the fpu
unsigned int mxcsr_vmx;
unsigned int flags; // bit 0 = 0 if mxcsr is fpu, else it is vmx
unsigned int Ox1000; // constant 0x1000 so we can shrink each tail emitted
@ -93,7 +105,9 @@ class X64Backend : public Backend {
virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) override;
void RecordMMIOExceptionForGuestInstruction(void* host_address);
#if XE_X64_PROFILER_AVAILABLE == 1
uint64_t* GetProfilerRecordForFunction(uint32_t guest_address);
#endif
private:
static bool ExceptionCallbackThunk(Exception* ex, void* data);
bool ExceptionCallback(Exception* ex);
@ -106,6 +120,10 @@ class X64Backend : public Backend {
HostToGuestThunk host_to_guest_thunk_;
GuestToHostThunk guest_to_host_thunk_;
ResolveFunctionThunk resolve_function_thunk_;
#if XE_X64_PROFILER_AVAILABLE == 1
GuestProfilerData profiler_data_;
#endif
};
} // namespace x64

View File

@ -57,6 +57,12 @@ DEFINE_bool(enable_incorrect_roundingmode_behavior, false,
"code. The workaround may cause reduced CPU performance but is a "
"more accurate emulation",
"x64");
#if XE_X64_PROFILER_AVAILABLE == 1
DEFINE_bool(instrument_call_times, false,
"Compute time taken for functions, for profiling guest code",
"x64");
#endif
namespace xe {
namespace cpu {
namespace backend {
@ -120,28 +126,37 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
*/
unsigned int data[4];
Xbyak::util::Cpu::getCpuid(0x80000001, data);
if (data[2] & (1U << 5)) {
unsigned amd_flags = data[2];
if (amd_flags & (1U << 5)) {
if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
feature_flags_ |= kX64EmitLZCNT;
}
}
// todo: although not reported by cpuid, zen 1 and zen+ also have fma4
if (amd_flags & (1U << 16)) {
if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
feature_flags_ |= kX64EmitFMA4;
}
}
if (amd_flags & (1U << 21)) {
if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
feature_flags_ |= kX64EmitTBM;
}
}
if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
bool is_zennish = cpu_.displayFamily >= 0x17;
/*
chrispy: according to agner's tables, all amd architectures that
we support (ones with avx) have the same timings for
jrcxz/loop/loope/loopne as for other jmps
*/
feature_flags_ |= kX64FastJrcx;
feature_flags_ |= kX64FastLoop;
if (is_zennish) {
// i recall hearing somewhere that this is the case for zen, but i need to
// verify; can't find my original source for that.
// todo: ask agner?
feature_flags_ |= kX64FlagsIndependentVars;
feature_flags_ |= kX64FastJrcx;
if (cpu_.displayFamily > 0x17) {
feature_flags_ |= kX64FastLoop;
} else if (cpu_.displayFamily == 0x17 && cpu_.displayModel >= 0x31) {
feature_flags_ |= kX64FastLoop;
} // todo: figure out at which model zen+ became zen2; this is just the
  // model for my cpu, which is ripper90
}
}
may_use_membase32_as_zero_reg_ =
@ -157,6 +172,7 @@ bool X64Emitter::Emit(GuestFunction* function, HIRBuilder* builder,
std::vector<SourceMapEntry>* out_source_map) {
SCOPE_profile_cpu_f("cpu");
guest_module_ = dynamic_cast<XexModule*>(function->module());
current_guest_function_ = function->address();
// Reset.
debug_info_ = debug_info;
debug_info_flags_ = debug_info_flags;
@ -286,10 +302,19 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
* chrispy: removed this, it serves no purpose
mov(qword[rsp + StackLayout::GUEST_CTX_HOME], GetContextReg());
*/
mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rcx);
mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], rax); // 0
#if XE_X64_PROFILER_AVAILABLE == 1
if (cvars::instrument_call_times) {
mov(rdx, 0x7ffe0014); // load pointer to kusershared systemtime
mov(rdx, qword[rdx]);
mov(qword[rsp + StackLayout::GUEST_PROFILER_START],
rdx); // save time for end of function
}
#endif
// Safe now to do some tracing.
if (debug_info_flags_ & DebugInfoFlags::kDebugInfoTraceFunctions) {
// We require 32-bit addresses.
@ -363,6 +388,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
mov(GetContextReg(), qword[rsp + StackLayout::GUEST_CTX_HOME]);
*/
code_offsets.epilog = getSize();
EmitProfilerEpilogue();
add(rsp, (uint32_t)stack_size);
ret();
@ -391,6 +417,27 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
return true;
}
// don't use rax, we do this in tail call handling
void X64Emitter::EmitProfilerEpilogue() {
#if XE_X64_PROFILER_AVAILABLE == 1
if (cvars::instrument_call_times) {
uint64_t* profiler_entry =
backend()->GetProfilerRecordForFunction(current_guest_function_);
mov(ecx, 0x7ffe0014);
mov(rdx, qword[rcx]);
mov(rbx, (uintptr_t)profiler_entry);
sub(rdx, qword[rsp + StackLayout::GUEST_PROFILER_START]);
// atomically add our time to the profiler entry
// this could be atomic-free if we had per-thread profile counts and, on a
// thread's exit, we locked and summed them into the global counts, which
// would make this a few cycles less intrusive, but it's good enough for now
// actually... let's just try without atomics lol
// lock();
add(qword[rbx], rdx);
}
#endif
}
void X64Emitter::MarkSourceOffset(const Instr* i) {
auto entry = source_map_arena_.Alloc<SourceMapEntry>();
@ -558,7 +605,7 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
if (instr->flags & hir::CALL_TAIL) {
// Since we skip the prolog we need to mark the return here.
EmitTraceUserCallReturn();
EmitProfilerEpilogue();
// Pass the callers return address over.
mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
@ -602,7 +649,7 @@ void X64Emitter::CallIndirect(const hir::Instr* instr,
if (instr->flags & hir::CALL_TAIL) {
// Since we skip the prolog we need to mark the return here.
EmitTraceUserCallReturn();
EmitProfilerEpilogue();
// Pass the callers return address over.
mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
@ -952,7 +999,34 @@ static const vec128_t xmm_consts[] = {
/*XMMVSRShlByteshuf*/
v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80),
// XMMVSRMask
vec128b(1)};
vec128b(1),
    /* XMMF16UnpackLCPI2 */
    vec128i(0x38000000),
    /* XMMF16UnpackLCPI3 */
    vec128q(0x7fe000007fe000ULL),
    /* XMMF16PackLCPI0 */
    vec128i(0x8000000),
    /* XMMF16PackLCPI2 */
    vec128i(0x47ffe000),
    /* XMMF16PackLCPI3 */
    vec128i(0xc7800000),
    /* XMMF16PackLCPI4 */
    vec128i(0xf7fdfff),
    /* XMMF16PackLCPI5 */
    vec128i(0x7fff),
    /* XMMF16PackLCPI6 */
    vec128i(0x8000)};
void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
for (auto& vec : xmm_consts) {

View File

@ -159,7 +159,15 @@ enum XmmConst {
XMMThreeFloatMask, // for clearing the fourth float prior to DOT_PRODUCT_3
XMMXenosF16ExtRangeStart,
XMMVSRShlByteshuf,
XMMVSRMask
XMMVSRMask,
XMMF16UnpackLCPI2,  // 0x38000000, 1/32768
XMMF16UnpackLCPI3,  // 0x7fe000007fe000
XMMF16PackLCPI0,
XMMF16PackLCPI2,
XMMF16PackLCPI3,
XMMF16PackLCPI4,
XMMF16PackLCPI5,
XMMF16PackLCPI6
};
// X64Backend specific Instr->runtime_flags
enum : uint32_t {
@ -177,7 +185,7 @@ class XbyakAllocator : public Xbyak::Allocator {
enum X64EmitterFeatureFlags {
kX64EmitAVX2 = 1 << 0,
kX64EmitFMA = 1 << 1,
kX64EmitLZCNT = 1 << 2,
kX64EmitLZCNT = 1 << 2, // this is actually ABM and includes popcount
kX64EmitBMI1 = 1 << 3,
kX64EmitBMI2 = 1 << 4,
kX64EmitF16C = 1 << 5,
@ -201,7 +209,11 @@ enum X64EmitterFeatureFlags {
// inc/dec) do not introduce false dependencies on EFLAGS
// because the individual flags are treated as different vars by
// the processor. (this applies to zen)
kX64EmitPrefetchW = 1 << 16
kX64EmitPrefetchW = 1 << 16,
kX64EmitXOP = 1 << 17, // chrispy: xop maps really well to many vmx
// instructions, and FX users need the boost
kX64EmitFMA4 = 1 << 18, // todo: also use on zen1?
kX64EmitTBM = 1 << 19
};
class ResolvableGuestCall {
public:
@ -337,6 +349,8 @@ class X64Emitter : public Xbyak::CodeGenerator {
XexModule* GuestModule() { return guest_module_; }
void EmitProfilerEpilogue();
protected:
void* Emplace(const EmitFunctionInfo& func_info,
GuestFunction* function = nullptr);
@ -352,7 +366,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
XexModule* guest_module_ = nullptr;
Xbyak::util::Cpu cpu_;
uint32_t feature_flags_ = 0;
uint32_t current_guest_function_ = 0;
Xbyak::Label* epilog_label_ = nullptr;
hir::Instr* current_instr_ = nullptr;

View File

@ -19,10 +19,6 @@
#include "xenia/base/cvar.h"
#include "xenia/cpu/backend/x64/x64_stack_layout.h"
DEFINE_bool(use_extended_range_half, true,
"Emulate extended range half-precision, may be slower on games "
"that use it heavily",
"CPU");
namespace xe {
namespace cpu {
namespace backend {
@ -1982,6 +1978,137 @@ struct PERMUTE_V128
};
EMITTER_OPCODE_TABLE(OPCODE_PERMUTE, PERMUTE_I32, PERMUTE_V128);
#define LCPI(name, quad1) const __m128i name = _mm_set1_epi32(quad1)
// xmm0 is pre-cast to int, but contains float
// chrispy: todo: make available to gpu code
static __m128i xenos_float4_to_float16_x4(__m128i xmm0) {
LCPI(LCPI0_0, 2147483647);
LCPI(LCPI0_1, 1207951360);
LCPI(LCPI0_2, 134217728);
LCPI(LCPI0_3, 3347054592);
LCPI(LCPI0_4, 260038655);
LCPI(LCPI0_5, 32767);
LCPI(LCPI0_6, 4294934528);
__m128i xmm1 = _mm_and_si128(xmm0, LCPI0_0);
__m128i xmm2 = LCPI0_1;
__m128i xmm3 = _mm_add_epi32(xmm0, LCPI0_2);
xmm2 = _mm_cmpgt_epi32(xmm2, xmm1);
xmm3 = _mm_srli_epi32(xmm3, 13);
xmm1 = _mm_add_epi32(xmm1, LCPI0_3);
__m128i xmm4 = _mm_min_epu32(xmm1, LCPI0_4);
xmm1 = _mm_cmpeq_epi32(xmm1, xmm4);
xmm4 = LCPI0_5;
xmm3 = _mm_and_si128(xmm3, xmm4);
xmm1 = _mm_and_si128(xmm1, xmm3);
xmm1 = _mm_castps_si128(_mm_blendv_ps(
_mm_castsi128_ps(xmm4), _mm_castsi128_ps(xmm1), _mm_castsi128_ps(xmm2)));
xmm0 = _mm_srli_epi32(xmm0, 16);
xmm0 = _mm_and_si128(xmm0, LCPI0_6);
xmm0 = _mm_or_si128(xmm1, xmm0);
xmm0 = _mm_packus_epi32(xmm0, _mm_setzero_si128());
return xmm0;
}
// returns floats, uncasted
// chrispy: todo, make this available to gpu code?
static __m128i xenos_halves_to_floats(__m128i xmm0) {
LCPI(LCPI3_0, 0x1f);
LCPI(LCPI3_1, 0x80000000);
LCPI(LCPI3_2, 0x38000000);
LCPI(LCPI3_3, 0x7fe000);
__m128i xmm1, xmm2, xmm3, xmm4;
xmm1 = _mm_cvtepu16_epi32(xmm0);
xmm2 = _mm_srli_epi32(xmm1, 10);
xmm2 = _mm_and_si128(xmm2, LCPI3_0);
xmm0 = _mm_cvtepi16_epi32(xmm0);
xmm0 = _mm_and_si128(xmm0, LCPI3_1);
xmm3 = _mm_setzero_si128();
xmm4 = _mm_slli_epi32(xmm2, 23);
xmm4 = _mm_add_epi32(xmm4, LCPI3_2);
xmm2 = _mm_cmpeq_epi32(xmm2, xmm3);
xmm1 = _mm_slli_epi32(xmm1, 13);
xmm1 = _mm_and_si128(xmm1, LCPI3_3);
xmm3 = _mm_andnot_si128(xmm2, xmm4);
xmm1 = _mm_andnot_si128(xmm2, xmm1);
xmm0 = _mm_or_si128(xmm1, xmm0);
xmm0 = _mm_or_si128(xmm0, xmm3);
return xmm0;
}
#undef LCPI
template <typename Inst>
static void emit_fast_f16_unpack(X64Emitter& e, const Inst& i,
XmmConst initial_shuffle) {
auto src1 = i.src1;
e.vpshufb(i.dest, src1, e.GetXmmConstPtr(initial_shuffle));
e.vpmovsxwd(e.xmm1, i.dest);
e.vpsrld(e.xmm2, e.xmm1, 10);
e.vpmovsxwd(e.xmm0, i.dest);
e.vpand(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMSignMaskPS));
e.vpand(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMPermuteByteMask));
e.vpslld(e.xmm3, e.xmm2, 23);
e.vpaddd(e.xmm3, e.xmm3, e.GetXmmConstPtr(XMMF16UnpackLCPI2));
e.vpcmpeqd(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMZero));
e.vpslld(e.xmm1, e.xmm1, 13);
e.vpandn(e.xmm1, e.xmm2, e.xmm1);
e.vpandn(e.xmm2, e.xmm2, e.xmm3);
e.vpand(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMF16UnpackLCPI3));
e.vpor(e.xmm0, e.xmm1, e.xmm0);
e.vpor(i.dest, e.xmm0, e.xmm2);
}
template <typename Inst>
static void emit_fast_f16_pack(X64Emitter& e, const Inst& i,
XmmConst final_shuffle) {
e.vpaddd(e.xmm1, i.src1, e.GetXmmConstPtr(XMMF16PackLCPI0));
e.vpand(e.xmm2, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS));
e.vmovdqa(e.xmm3, e.GetXmmConstPtr(XMMF16PackLCPI2));
e.vpcmpgtd(e.xmm3, e.xmm3, e.xmm2);
e.vpsrld(e.xmm1, e.xmm1, 13);
e.vpaddd(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMF16PackLCPI3));
e.vpminud(e.xmm0, e.xmm2, e.GetXmmConstPtr(XMMF16PackLCPI4));
e.vpcmpeqd(e.xmm2, e.xmm2, e.xmm0);
e.vmovdqa(e.xmm0, e.GetXmmConstPtr(XMMF16PackLCPI5));
e.vpand(e.xmm1, e.xmm1, e.xmm0);
e.vpand(e.xmm1, e.xmm2, e.xmm1);
e.vpxor(e.xmm2, e.xmm2, e.xmm2);
e.vblendvps(e.xmm1, e.xmm0, e.xmm1, e.xmm3);
e.vpsrld(e.xmm0, i.src1, 16);
e.vpand(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMF16PackLCPI6));
e.vorps(e.xmm0, e.xmm1, e.xmm0);
e.vpackusdw(i.dest, e.xmm0, e.xmm2);
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(final_shuffle));
}
// ============================================================================
// OPCODE_SWIZZLE
// ============================================================================
@ -2081,14 +2208,9 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
alignas(16) uint16_t b[8];
_mm_store_ps(a, src1);
std::memset(b, 0, sizeof(b));
if (!cvars::use_extended_range_half) {
for (int i = 0; i < 2; i++) {
b[7 - i] = half_float::detail::float2half<std::round_toward_zero>(a[i]);
}
} else {
for (int i = 0; i < 2; i++) {
b[7 - i] = float_to_xenos_half(a[i]);
}
for (int i = 0; i < 2; i++) {
b[7 - i] = float_to_xenos_half(a[i]);
}
return _mm_load_si128(reinterpret_cast<__m128i*>(b));
@ -2098,70 +2220,26 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
// http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx
// dest = [(src1.x | src1.y), 0, 0, 0]
if (e.IsFeatureEnabled(kX64EmitF16C) && !cvars::use_extended_range_half) {
Xmm src;
if (i.src1.is_constant) {
src = i.dest;
e.LoadConstantXmm(src, i.src1.constant());
} else {
src = i.src1;
}
// 0|0|0|0|W|Z|Y|X
e.vcvtps2ph(i.dest, src, 0b00000011);
// Shuffle to X|Y|0|0|0|0|0|0
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackFLOAT16_2));
if (i.src1.is_constant) {
e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant()));
} else {
if (i.src1.is_constant) {
e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant()));
} else {
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
}
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_2));
e.vmovaps(i.dest, e.xmm0);
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
}
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_2));
e.vmovaps(i.dest, e.xmm0);
}
static __m128i EmulateFLOAT16_4(void*, __m128 src1) {
alignas(16) float a[4];
alignas(16) uint16_t b[8];
_mm_store_ps(a, src1);
std::memset(b, 0, sizeof(b));
if (!cvars::use_extended_range_half) {
for (int i = 0; i < 4; i++) {
b[7 - (i ^ 2)] =
half_float::detail::float2half<std::round_toward_zero>(a[i]);
}
} else {
for (int i = 0; i < 4; i++) {
b[7 - (i ^ 2)] = float_to_xenos_half(a[i]);
}
}
return _mm_load_si128(reinterpret_cast<__m128i*>(b));
}
static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) {
assert_true(i.src2.value->IsConstantZero());
// dest = [(src1.z | src1.w), (src1.x | src1.y), 0, 0]
if (e.IsFeatureEnabled(kX64EmitF16C) && !cvars::use_extended_range_half) {
Xmm src;
if (i.src1.is_constant) {
src = i.dest;
e.LoadConstantXmm(src, i.src1.constant());
} else {
src = i.src1;
}
// 0|0|0|0|W|Z|Y|X
e.vcvtps2ph(i.dest, src, 0b00000011);
// Shuffle to Z|W|X|Y|0|0|0|0
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackFLOAT16_4));
if (!i.src1.is_constant) {
emit_fast_f16_pack(e, i, XMMPackFLOAT16_4);
} else {
if (i.src1.is_constant) {
e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant()));
} else {
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
vec128_t result = vec128b(0);
for (unsigned idx = 0; idx < 4; ++idx) {
result.u16[(7 - (idx ^ 2))] =
float_to_xenos_half(i.src1.constant().f32[idx]);
}
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_4));
e.vmovaps(i.dest, e.xmm0);
e.LoadConstantXmm(i.dest, result);
}
}
static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) {
@ -2508,15 +2586,10 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
alignas(16) float b[4];
_mm_store_si128(reinterpret_cast<__m128i*>(a), src1);
if (!cvars::use_extended_range_half) {
for (int i = 0; i < 2; i++) {
b[i] = half_float::detail::half2float(a[VEC128_W(6 + i)]);
}
} else {
for (int i = 0; i < 2; i++) {
b[i] = xenos_half_to_float(a[VEC128_W(6 + i)]);
}
for (int i = 0; i < 2; i++) {
b[i] = xenos_half_to_float(a[VEC128_W(6 + i)]);
}
// Constants, or something
b[2] = 0.f;
b[3] = 1.f;
@ -2536,74 +2609,28 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
// Also zero out the high end.
// TODO(benvanik): special case constant unpacks that just get 0/1/etc.
if (e.IsFeatureEnabled(kX64EmitF16C) &&
!cvars::use_extended_range_half) { // todo: can use cvtph and bit logic
// to implement
Xmm src;
if (i.src1.is_constant) {
src = i.dest;
e.LoadConstantXmm(src, i.src1.constant());
} else {
src = i.src1;
}
// sx = src.iw >> 16;
// sy = src.iw & 0xFFFF;
// dest = { XMConvertHalfToFloat(sx),
// XMConvertHalfToFloat(sy),
// 0.0,
// 1.0 };
// Shuffle to 0|0|0|0|0|0|Y|X
e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackFLOAT16_2));
e.vcvtph2ps(i.dest, i.dest);
e.vpshufd(i.dest, i.dest, 0b10100100);
e.vpor(i.dest, e.GetXmmConstPtr(XMM0001));
if (i.src1.is_constant) {
e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant()));
} else {
if (i.src1.is_constant) {
e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant()));
} else {
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
}
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_2));
e.vmovaps(i.dest, e.xmm0);
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
}
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_2));
e.vmovaps(i.dest, e.xmm0);
}
static __m128 EmulateFLOAT16_4(void*, __m128i src1) {
alignas(16) uint16_t a[8];
alignas(16) float b[4];
_mm_store_si128(reinterpret_cast<__m128i*>(a), src1);
if (!cvars::use_extended_range_half) {
for (int i = 0; i < 4; i++) {
b[i] = half_float::detail::half2float(a[VEC128_W(4 + i)]);
}
} else {
for (int i = 0; i < 4; i++) {
b[i] = xenos_half_to_float(a[VEC128_W(4 + i)]);
}
}
return _mm_load_ps(b);
}
static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) {
// src = [(dest.x | dest.y), (dest.z | dest.w), 0, 0]
if (e.IsFeatureEnabled(kX64EmitF16C) && !cvars::use_extended_range_half) {
Xmm src;
if (i.src1.is_constant) {
src = i.dest;
e.LoadConstantXmm(src, i.src1.constant());
} else {
src = i.src1;
if (i.src1.is_constant) {
vec128_t result{};
for (int idx = 0; idx < 4; ++idx) {
result.f32[idx] =
xenos_half_to_float(i.src1.constant().u16[VEC128_W(4 + idx)]);
}
// Shuffle to 0|0|0|0|W|Z|Y|X
e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackFLOAT16_4));
e.vcvtph2ps(i.dest, i.dest);
e.LoadConstantXmm(i.dest, result);
} else {
if (i.src1.is_constant) {
e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant()));
} else {
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
}
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_4));
e.vmovaps(i.dest, e.xmm0);
emit_fast_f16_unpack(e, i, XMMUnpackFLOAT16_4);
}
}
static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) {

View File

@ -50,6 +50,10 @@ DEFINE_bool(no_round_to_single, false,
"Not for users, breaks games. Skip rounding double values to "
"single precision and back",
"CPU");
DEFINE_bool(
inline_loadclock, false,
"Directly read cached guest clock without calling the LoadClock method (it gets repeatedly updated by calls from other threads)",
"CPU");
namespace xe {
namespace cpu {
namespace backend {
@ -475,33 +479,39 @@ EMITTER_OPCODE_TABLE(OPCODE_ROUND, ROUND_F32, ROUND_F64, ROUND_V128);
// ============================================================================
struct LOAD_CLOCK : Sequence<LOAD_CLOCK, I<OPCODE_LOAD_CLOCK, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// When scaling is disabled and the raw clock source is selected, the code
// in the Clock class is actually just forwarding tick counts after one
// simple multiply and division. In that case we rather bake the scaling in
// here to cut extra function calls with CPU cache misses and stack frame
// overhead.
if (cvars::clock_no_scaling && cvars::clock_source_raw) {
auto ratio = Clock::guest_tick_ratio();
// The 360 CPU is an in-order CPU, AMD64 usually isn't. Without
// mfence/lfence magic the rdtsc instruction can be executed sooner or
// later in the cache window. Since its resolution, however, is much higher
// than that of the 360's mftb instruction, this can safely be ignored.
// Read time stamp in edx (high part) and eax (low part).
e.rdtsc();
// Make it a 64 bit number in rax.
e.shl(e.rdx, 32);
e.or_(e.rax, e.rdx);
// Apply tick frequency scaling.
e.mov(e.rcx, ratio.first);
e.mul(e.rcx);
// We actually now have a 128 bit number in rdx:rax.
e.mov(e.rcx, ratio.second);
e.div(e.rcx);
e.mov(i.dest, e.rax);
if (cvars::inline_loadclock) {
e.mov(e.rcx,
e.GetBackendCtxPtr(offsetof(X64BackendContext, guest_tick_count)));
e.mov(i.dest, e.qword[e.rcx]);
} else {
e.CallNative(LoadClock);
e.mov(i.dest, e.rax);
// When scaling is disabled and the raw clock source is selected, the code
// in the Clock class is actually just forwarding tick counts after one
// simple multiply and division. In that case we rather bake the scaling
// in here to cut extra function calls with CPU cache misses and stack
// frame overhead.
if (cvars::clock_no_scaling && cvars::clock_source_raw) {
auto ratio = Clock::guest_tick_ratio();
// The 360 CPU is an in-order CPU, AMD64 usually isn't. Without
// mfence/lfence magic the rdtsc instruction can be executed sooner or
// later in the cache window. Since its resolution, however, is much
// higher than that of the 360's mftb instruction, this can safely be ignored.
// Read time stamp in edx (high part) and eax (low part).
e.rdtsc();
// Make it a 64 bit number in rax.
e.shl(e.rdx, 32);
e.or_(e.rax, e.rdx);
// Apply tick frequency scaling.
e.mov(e.rcx, ratio.first);
e.mul(e.rcx);
// We actually now have a 128 bit number in rdx:rax.
e.mov(e.rcx, ratio.second);
e.div(e.rcx);
e.mov(i.dest, e.rax);
} else {
e.CallNative(LoadClock);
e.mov(i.dest, e.rax);
}
}
}
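For reference, the rdtsc path above bakes in the same rational conversion the Clock class performs: guest_ticks = host_ticks * ratio.first / ratio.second, with the rdx:rax pair acting as a 128-bit intermediate so the multiply cannot overflow. A standalone sketch (using unsigned __int128 where the emitted code uses mul/div); the inline_loadclock path instead just reads the cached guest_tick_count field out of the backend context.

#include <cstdint>
#include <utility>

// Sketch of the host->guest tick scaling done inline when
// clock_no_scaling && clock_source_raw; ratio corresponds to
// Clock::guest_tick_ratio().
static uint64_t HostToGuestTicks(uint64_t host_ticks,
                                 std::pair<uint64_t, uint64_t> ratio) {
  unsigned __int128 wide =
      static_cast<unsigned __int128>(host_ticks) * ratio.first;  // mul -> rdx:rax
  return static_cast<uint64_t>(wide / ratio.second);             // div rcx
}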
static uint64_t LoadClock(void* raw_context) {
@ -539,10 +549,12 @@ struct MAX_F64 : Sequence<MAX_F64, I<OPCODE_MAX, F64Op, F64Op, F64Op>> {
struct MAX_V128 : Sequence<MAX_V128, I<OPCODE_MAX, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Vmx);
EmitCommutativeBinaryXmmOp(e, i,
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
e.vmaxps(dest, src1, src2);
});
// if given 0 and -0, return +0! (the opposite of vminfp, which returns -0)
auto src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
auto src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
e.vmaxps(e.xmm2, src1, src2);
e.vmaxps(e.xmm3, src2, src1);
e.vandps(i.dest, e.xmm2, e.xmm3);
}
};
EMITTER_OPCODE_TABLE(OPCODE_MAX, MAX_F32, MAX_F64, MAX_V128);
@ -597,10 +609,11 @@ struct MIN_F64 : Sequence<MIN_F64, I<OPCODE_MIN, F64Op, F64Op, F64Op>> {
struct MIN_V128 : Sequence<MIN_V128, I<OPCODE_MIN, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Vmx);
EmitCommutativeBinaryXmmOp(e, i,
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
e.vminps(dest, src1, src2);
});
auto src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
auto src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
e.vminps(e.xmm2, src1, src2);
e.vminps(e.xmm3, src2, src1);
e.vorps(i.dest, e.xmm2, e.xmm3);
}
};
EMITTER_OPCODE_TABLE(OPCODE_MIN, MIN_I8, MIN_I16, MIN_I32, MIN_I64, MIN_F32,
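A quick illustration of why both operand orders are computed and then combined with AND (max) or OR (min): maxps/minps return the second operand when the inputs compare equal, and +0.0 compares equal to -0.0, so a single instruction would leak the wrong zero sign. Standalone sketch, not emulator code:

#include <cmath>
#include <cstdio>
#include <immintrin.h>

int main() {
  __m128 pz = _mm_set1_ps(+0.0f);
  __m128 nz = _mm_set1_ps(-0.0f);
  // Each order returns its second operand for equal inputs; AND clears the
  // sign bit (vmaxfp wants +0), OR sets it (vminfp wants -0).
  __m128 vmax = _mm_and_ps(_mm_max_ps(pz, nz), _mm_max_ps(nz, pz));
  __m128 vmin = _mm_or_ps(_mm_min_ps(pz, nz), _mm_min_ps(nz, pz));
  float max_out, min_out;
  _mm_store_ss(&max_out, vmax);
  _mm_store_ss(&min_out, vmin);
  std::printf("max(+0,-0) signbit=%d, min(+0,-0) signbit=%d\n",
              std::signbit(max_out), std::signbit(min_out));  // prints 0 and 1
  return 0;
}

For unequal, non-NaN inputs both orders return the same value, so the extra AND/OR changes nothing there.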
@ -768,6 +781,7 @@ struct SELECT_V128_V128
} else if (mayblend == PermittedBlend::Ps) {
e.vblendvps(i.dest, src2, src3, src1);
} else {
// ideally we would have an XOP path here...
// src1 ? src2 : src3;
e.vpandn(e.xmm3, src1, src2);
e.vpand(i.dest, src1, src3);
@ -1932,6 +1946,53 @@ struct MUL_ADD_V128
};
EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128);
struct NEGATED_MUL_ADD_F64
: Sequence<NEGATED_MUL_ADD_F64,
I<OPCODE_NEGATED_MUL_ADD, F64Op, F64Op, F64Op, F64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Fpu);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
if (e.IsFeatureEnabled(kX64EmitFMA)) {
// todo: this is garbage
e.vmovapd(e.xmm3, src1);
e.vfmadd213sd(e.xmm3, src2, src3);
e.vxorpd(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPD));
} else {
// todo: might need to use x87 in this case...
e.vmulsd(e.xmm3, src1, src2);
e.vaddsd(i.dest, e.xmm3, src3);
e.vxorpd(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPD));
}
}
};
struct NEGATED_MUL_ADD_V128
: Sequence<NEGATED_MUL_ADD_V128,
I<OPCODE_NEGATED_MUL_ADD, V128Op, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Vmx);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
if (e.IsFeatureEnabled(kX64EmitFMA)) {
// todo: this is garbage
e.vmovaps(e.xmm3, src1);
e.vfmadd213ps(e.xmm3, src2, src3);
e.vxorps(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPS));
} else {
// todo: might need to use x87 in this case...
e.vmulps(e.xmm3, src1, src2);
e.vaddps(i.dest, e.xmm3, src3);
e.vxorps(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPS));
}
}
};
EMITTER_OPCODE_TABLE(OPCODE_NEGATED_MUL_ADD, NEGATED_MUL_ADD_F64,
NEGATED_MUL_ADD_V128);
// ============================================================================
// OPCODE_MUL_SUB
// ============================================================================
@ -1944,12 +2005,7 @@ EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128);
// - 132 -> $1 = $1 * $3 - $2
// - 213 -> $1 = $2 * $1 - $3
// - 231 -> $1 = $2 * $3 - $1
struct MUL_SUB_F32
: Sequence<MUL_SUB_F32, I<OPCODE_MUL_SUB, F32Op, F32Op, F32Op, F32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
assert_impossible_sequence(MUL_SUB_F32);
}
};
struct MUL_SUB_F64
: Sequence<MUL_SUB_F64, I<OPCODE_MUL_SUB, F64Op, F64Op, F64Op, F64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
@ -1991,7 +2047,54 @@ struct MUL_SUB_V128
}
}
};
EMITTER_OPCODE_TABLE(OPCODE_MUL_SUB, MUL_SUB_F32, MUL_SUB_F64, MUL_SUB_V128);
EMITTER_OPCODE_TABLE(OPCODE_MUL_SUB, MUL_SUB_F64, MUL_SUB_V128);
struct NEGATED_MUL_SUB_F64
: Sequence<NEGATED_MUL_SUB_F64,
I<OPCODE_NEGATED_MUL_SUB, F64Op, F64Op, F64Op, F64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Fpu);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
if (e.IsFeatureEnabled(kX64EmitFMA)) {
// todo: this is garbage
e.vmovapd(e.xmm3, src1);
e.vfmsub213sd(e.xmm3, src2, src3);
e.vxorpd(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPD));
} else {
// todo: might need to use x87 in this case...
e.vmulsd(e.xmm3, src1, src2);
e.vsubsd(i.dest, e.xmm3, src3);
e.vxorpd(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPD));
}
}
};
struct NEGATED_MUL_SUB_V128
: Sequence<NEGATED_MUL_SUB_V128,
I<OPCODE_NEGATED_MUL_SUB, V128Op, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Vmx);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
if (e.IsFeatureEnabled(kX64EmitFMA)) {
// todo: this is garbage
e.vmovaps(e.xmm3, src1);
e.vfmsub213ps(e.xmm3, src2, src3);
e.vxorps(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPS));
} else {
// todo: might need to use x87 in this case...
e.vmulps(e.xmm3, src1, src2);
e.vsubps(i.dest, e.xmm3, src3);
e.vxorps(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPS));
}
}
};
EMITTER_OPCODE_TABLE(OPCODE_NEGATED_MUL_SUB, NEGATED_MUL_SUB_F64,
NEGATED_MUL_SUB_V128);
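Reference semantics for the two new opcodes, matching the // -((1 * 2) + 3) and // -((1 * 2) - 3) comments added to hir_builder.h below. This is a scalar sketch only: the FMA path above rounds the multiply-add once before negating, while the vmul/vadd fallback rounds twice, which is presumably what the x87 todo is about.

// Scalar reference for OPCODE_NEGATED_MUL_ADD / OPCODE_NEGATED_MUL_SUB
// (illustrative only).
static double NegatedMulAddRef(double a, double b, double c) {
  return -((a * b) + c);
}
static double NegatedMulSubRef(double a, double b, double c) {
  return -((a * b) - c);
}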
// ============================================================================
// OPCODE_NEG
@ -2264,7 +2367,7 @@ struct DOT_PRODUCT_3_V128
e.ChangeMxcsrMode(MXCSRMode::Vmx);
// todo: add fast_dot_product path that just checks for infinity instead of
// using mxcsr
auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH64];
auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH];
// this is going to hurt a bit...
/*
@ -2380,7 +2483,7 @@ struct DOT_PRODUCT_4_V128
e.ChangeMxcsrMode(MXCSRMode::Vmx);
// todo: add fast_dot_product path that just checks for infinity instead of
// using mxcsr
auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH64];
auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH];
bool is_lensqr = i.instr->src1.value == i.instr->src2.value;
@ -3162,9 +3265,9 @@ struct SET_ROUNDING_MODE_I32
// backends dont have to worry about it
if (i.src1.is_constant) {
e.mov(e.eax, mxcsr_table[i.src1.constant()]);
e.mov(e.dword[e.rsp + StackLayout::GUEST_SCRATCH64], e.eax);
e.mov(e.dword[e.rsp + StackLayout::GUEST_SCRATCH], e.eax);
e.mov(e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_fpu)), e.eax);
e.vldmxcsr(e.dword[e.rsp + StackLayout::GUEST_SCRATCH64]);
e.vldmxcsr(e.dword[e.rsp + StackLayout::GUEST_SCRATCH]);
} else {
e.mov(e.ecx, i.src1);

View File

@ -123,7 +123,10 @@ class StackLayout {
*/
static const size_t GUEST_STACK_SIZE = 104;
// was GUEST_CTX_HOME; can't be removed because that'd throw stack alignment off. Instead, it can be used as a temporary in sequences
static const size_t GUEST_SCRATCH64 = 80;
static const size_t GUEST_SCRATCH = 0;
// when profiling is on, this stores the nanosecond time at the start of the function
static const size_t GUEST_PROFILER_START = 80;
static const size_t GUEST_RET_ADDR = 88;
static const size_t GUEST_CALL_RET_ADDR = 96;
};

View File

@ -600,6 +600,9 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
break;
case OPCODE_MAX:
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
if (should_skip_because_of_float) {
break;
}
v->set_from(i->src1.value);
v->Max(i->src2.value);
i->Remove();

View File

@ -1636,15 +1636,7 @@ Value* HIRBuilder::Div(Value* value1, Value* value2,
Value* HIRBuilder::MulAdd(Value* value1, Value* value2, Value* value3) {
ASSERT_TYPES_EQUAL(value1, value2);
ASSERT_TYPES_EQUAL(value1, value3);
#if 0
bool c1 = value1->IsConstant();
bool c2 = value2->IsConstant();
if (c1 && c2) {
Value* dest = CloneValue(value1);
dest->Mul(value2);
return Add(dest, value3);
}
#endif
Instr* i = AppendInstr(OPCODE_MUL_ADD_info, 0, AllocValue(value1->type));
i->set_src1(value1);
i->set_src2(value2);
@ -1655,15 +1647,7 @@ Value* HIRBuilder::MulAdd(Value* value1, Value* value2, Value* value3) {
Value* HIRBuilder::MulSub(Value* value1, Value* value2, Value* value3) {
ASSERT_TYPES_EQUAL(value1, value2);
ASSERT_TYPES_EQUAL(value1, value3);
#if 0
bool c1 = value1->IsConstant();
bool c2 = value2->IsConstant();
if (c1 && c2) {
Value* dest = CloneValue(value1);
dest->Mul(value2);
return Sub(dest, value3);
}
#endif
Instr* i = AppendInstr(OPCODE_MUL_SUB_info, 0, AllocValue(value1->type));
i->set_src1(value1);
i->set_src2(value2);
@ -1671,6 +1655,30 @@ Value* HIRBuilder::MulSub(Value* value1, Value* value2, Value* value3) {
return i->dest;
}
Value* HIRBuilder::NegatedMulAdd(Value* value1, Value* value2, Value* value3) {
ASSERT_TYPES_EQUAL(value1, value2);
ASSERT_TYPES_EQUAL(value1, value3);
Instr* i =
AppendInstr(OPCODE_NEGATED_MUL_ADD_info, 0, AllocValue(value1->type));
i->set_src1(value1);
i->set_src2(value2);
i->set_src3(value3);
return i->dest;
}
Value* HIRBuilder::NegatedMulSub(Value* value1, Value* value2, Value* value3) {
ASSERT_TYPES_EQUAL(value1, value2);
ASSERT_TYPES_EQUAL(value1, value3);
Instr* i =
AppendInstr(OPCODE_NEGATED_MUL_SUB_info, 0, AllocValue(value1->type));
i->set_src1(value1);
i->set_src2(value2);
i->set_src3(value3);
return i->dest;
}
Value* HIRBuilder::Neg(Value* value) {
Instr* i = AppendInstr(OPCODE_NEG_info, 0, AllocValue(value->type));
i->set_src1(value);

View File

@ -214,6 +214,10 @@ class HIRBuilder {
Value* Div(Value* value1, Value* value2, uint32_t arithmetic_flags = 0);
Value* MulAdd(Value* value1, Value* value2, Value* value3); // (1 * 2) + 3
Value* MulSub(Value* value1, Value* value2, Value* value3); // (1 * 2) - 3
Value* NegatedMulAdd(Value* value1, Value* value2,
Value* value3); // -((1 * 2) + 3)
Value* NegatedMulSub(Value* value1, Value* value2,
Value* value3); // -((1 * 2) - 3)
Value* Neg(Value* value);
Value* Abs(Value* value);
Value* Sqrt(Value* value);
@ -265,6 +269,7 @@ class HIRBuilder {
Value* AtomicAdd(Value* address, Value* value);
Value* AtomicSub(Value* address, Value* value);
void SetNJM(Value* value);
protected:
void DumpValue(StringBuffer* str, Value* value);
void DumpOp(StringBuffer* str, OpcodeSignatureType sig_type, Instr::Op* op);

View File

@ -208,6 +208,12 @@ enum Opcode {
OPCODE_STORE_OFFSET,
OPCODE_LOAD,
OPCODE_STORE,
// chrispy: todo: implement, our current codegen for the unaligned loads is
// very bad
OPCODE_LVLX,
OPCODE_LVRX,
OPCODE_STVLX,
OPCODE_STVRX,
OPCODE_MEMSET,
OPCODE_CACHE_CONTROL,
OPCODE_MEMORY_BARRIER,
@ -244,7 +250,9 @@ enum Opcode {
OPCODE_MUL_HI, // TODO(benvanik): remove this and add INT128 type.
OPCODE_DIV,
OPCODE_MUL_ADD,
OPCODE_NEGATED_MUL_ADD,
OPCODE_MUL_SUB,
OPCODE_NEGATED_MUL_SUB,
OPCODE_NEG,
OPCODE_ABS,
OPCODE_SQRT,
@ -284,7 +292,8 @@ enum Opcode {
OPCODE_TO_SINGLE, // i could not find a decent name to assign to this opcode,
// as we already have OPCODE_ROUND. round double to float (
// ppc "single" fpu instruction result rounding behavior )
OPCODE_SET_NJM,
OPCODE_SET_NJM,
__OPCODE_MAX_VALUE, // Keep at end.
};

View File

@ -464,6 +464,19 @@ DEFINE_OPCODE(
OPCODE_SIG_V_V_V_V,
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
DEFINE_OPCODE(
OPCODE_NEGATED_MUL_ADD,
"negated_mul_add",
OPCODE_SIG_V_V_V_V,
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
DEFINE_OPCODE(
OPCODE_NEGATED_MUL_SUB,
"negated_mul_sub",
OPCODE_SIG_V_V_V_V,
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
DEFINE_OPCODE(
OPCODE_NEG,
"neg",
@ -692,4 +705,5 @@ DEFINE_OPCODE(
"set_njm",
OPCODE_SIG_X_V,
0
)
)

View File

@ -613,10 +613,12 @@ class Value {
// returns true if every single use is as an operand to a single instruction
// (add var2, var1, var1)
bool AllUsesByOneInsn() const;
//the maybe is here because this includes vec128, which is untyped data that can be treated as float or int depending on the context
// the maybe is here because this includes vec128, which is untyped data that
// can be treated as float or int depending on the context
bool MaybeFloaty() const {
return type == FLOAT32_TYPE || type == FLOAT64_TYPE || type == VEC128_TYPE;
}
private:
static bool CompareInt8(Opcode opcode, Value* a, Value* b);
static bool CompareInt16(Opcode opcode, Value* a, Value* b);

View File

@ -48,7 +48,7 @@ class MMIOHandler {
typedef uint32_t (*HostToGuestVirtual)(const void* context,
const void* host_address);
typedef bool (*AccessViolationCallback)(
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
global_unique_lock_type global_lock_locked_once, //not passed by reference with const like the others?
void* context, void* host_address, bool is_write);
// access_violation_callback is called with global_critical_region locked once

View File

@ -14,8 +14,8 @@
#include <mutex>
#include <string>
#include "xenia/base/mutex.h"
#include "xenia/base/vec128.h"
namespace xe {
namespace cpu {
class Processor;
@ -390,9 +390,11 @@ typedef struct alignas(64) PPCContext_s {
// These are split to make it easier to do DCE on unused stores.
uint64_t cr() const;
void set_cr(uint64_t value);
// todo: remove, saturation should be represented by a vector
uint8_t vscr_sat;
uint32_t vrsave;
// uint32_t get_fprf() {
// return fpscr.value & 0x000F8000;
// }
@ -405,7 +407,7 @@ typedef struct alignas(64) PPCContext_s {
// Global interrupt lock, held while interrupts are disabled or interrupts are
// executing. This is shared among all threads and comes from the processor.
std::recursive_mutex* global_mutex;
global_mutex_type* global_mutex;
// Used to shuttle data into externs. Contents volatile.
uint64_t scratch;

View File

@ -1197,7 +1197,7 @@ int InstrEmit_vnmsubfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb,
Value* b = f.VectorDenormFlush(f.LoadVR(vb));
Value* c = f.VectorDenormFlush(f.LoadVR(vc));
Value* v = f.Neg(f.MulSub(a, c, b));
Value* v = f.NegatedMulSub(a, c, b);
f.StoreVR(vd, v);
return 0;
}

View File

@ -16,6 +16,12 @@
#include "xenia/cpu/ppc/ppc_hir_builder.h"
#include <stddef.h>
// chrispy: added this, we can have simpler control flow and do dce on the
// inputs
DEFINE_bool(ignore_trap_instructions, true,
"Generate no code for powerpc trap instructions, can result in "
"better performance in games that aggressively check with trap.",
"CPU");
namespace xe {
namespace cpu {
@ -449,6 +455,9 @@ constexpr uint32_t TRAP_SLT = 1 << 4, TRAP_SGT = 1 << 3, TRAP_EQ = 1 << 2,
int InstrEmit_trap(PPCHIRBuilder& f, const InstrData& i, Value* va, Value* vb,
uint32_t TO) {
if (cvars::ignore_trap_instructions) {
return 0;
}
// if (a < b) & TO[0] then TRAP
// if (a > b) & TO[1] then TRAP
// if (a = b) & TO[2] then TRAP
@ -521,6 +530,9 @@ int InstrEmit_trap(PPCHIRBuilder& f, const InstrData& i, Value* va, Value* vb,
}
int InstrEmit_td(PPCHIRBuilder& f, const InstrData& i) {
if (cvars::ignore_trap_instructions) {
return 0;
}
// a <- (RA)
// b <- (RB)
// if (a < b) & TO[0] then TRAP
@ -534,6 +546,9 @@ int InstrEmit_td(PPCHIRBuilder& f, const InstrData& i) {
}
int InstrEmit_tdi(PPCHIRBuilder& f, const InstrData& i) {
if (cvars::ignore_trap_instructions) {
return 0;
}
// a <- (RA)
// if (a < EXTS(SI)) & TO[0] then TRAP
// if (a > EXTS(SI)) & TO[1] then TRAP
@ -546,6 +561,9 @@ int InstrEmit_tdi(PPCHIRBuilder& f, const InstrData& i) {
}
int InstrEmit_tw(PPCHIRBuilder& f, const InstrData& i) {
if (cvars::ignore_trap_instructions) {
return 0;
}
// a <- EXTS((RA)[32:63])
// b <- EXTS((RB)[32:63])
// if (a < b) & TO[0] then TRAP
@ -561,6 +579,9 @@ int InstrEmit_tw(PPCHIRBuilder& f, const InstrData& i) {
}
int InstrEmit_twi(PPCHIRBuilder& f, const InstrData& i) {
if (cvars::ignore_trap_instructions) {
return 0;
}
// a <- EXTS((RA)[32:63])
// if (a < EXTS(SI)) & TO[0] then TRAP
// if (a > EXTS(SI)) & TO[1] then TRAP
@ -645,7 +666,9 @@ int InstrEmit_mfspr(PPCHIRBuilder& f, const InstrData& i) {
break;
case 256:
// VRSAVE
v = f.LoadZeroInt64();
v = f.ZeroExtend(f.LoadContext(offsetof(PPCContext, vrsave), INT32_TYPE),
INT64_TYPE);
break;
case 268:
// TB
@ -749,6 +772,8 @@ int InstrEmit_mtspr(PPCHIRBuilder& f, const InstrData& i) {
f.StoreCTR(rt);
break;
case 256:
f.StoreContext(offsetof(PPCContext, vrsave), f.Truncate(rt, INT32_TYPE));
// VRSAVE
break;
default:
@ -768,6 +793,7 @@ int InstrEmit_mfmsr(PPCHIRBuilder& f, const InstrData& i) {
// bit 48 = EE; interrupt enabled
// bit 62 = RI; recoverable interrupt
// return 8000h if unlocked (interrupts enabled), else 0
#if 0
f.MemoryBarrier();
if (cvars::disable_global_lock || true) {
f.StoreGPR(i.X.RT, f.LoadConstantUint64(0));
@ -777,63 +803,23 @@ int InstrEmit_mfmsr(PPCHIRBuilder& f, const InstrData& i) {
f.StoreGPR(i.X.RT,
f.LoadContext(offsetof(PPCContext, scratch), INT64_TYPE));
}
#else
f.StoreGPR(i.X.RT, f.LoadConstantUint64(0));
#endif
return 0;
}
int InstrEmit_mtmsr(PPCHIRBuilder& f, const InstrData& i) {
if (i.X.RA & 0x01) {
// L = 1
// iff storing from r13
f.MemoryBarrier();
f.StoreContext(
offsetof(PPCContext, scratch),
f.ZeroExtend(f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE), INT64_TYPE));
#if 0
if (i.X.RT == 13) {
// iff storing from r13 we are taking a lock (disable interrupts).
if (!cvars::disable_global_lock) {
f.CallExtern(f.builtins()->enter_global_lock);
}
} else {
// Otherwise we are restoring interrupts (probably).
if (!cvars::disable_global_lock) {
f.CallExtern(f.builtins()->leave_global_lock);
}
}
#endif
return 0;
} else {
// L = 0
XEINSTRNOTIMPLEMENTED();
return 1;
}
f.StoreContext(
offsetof(PPCContext, scratch),
f.ZeroExtend(f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE), INT64_TYPE));
return 0;
}
int InstrEmit_mtmsrd(PPCHIRBuilder& f, const InstrData& i) {
if (i.X.RA & 0x01) {
// L = 1
f.MemoryBarrier();
f.StoreContext(offsetof(PPCContext, scratch),
f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE));
#if 0
if (i.X.RT == 13) {
// iff storing from r13 we are taking a lock (disable interrupts).
if (!cvars::disable_global_lock) {
f.CallExtern(f.builtins()->enter_global_lock);
}
} else {
// Otherwise we are restoring interrupts (probably).
if (!cvars::disable_global_lock) {
f.CallExtern(f.builtins()->leave_global_lock);
}
}
#endif
return 0;
} else {
// L = 0
XEINSTRNOTIMPLEMENTED();
return 1;
}
f.StoreContext(offsetof(PPCContext, scratch),
f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE));
return 0;
}
void RegisterEmitCategoryControl() {

View File

@ -195,8 +195,8 @@ int InstrEmit_fmsubsx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] + frB)
Value* v = f.Neg(
f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
Value* v = f.NegatedMulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
f.LoadFPR(i.A.FRB));
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
@ -204,8 +204,8 @@ int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] + frB)
Value* v = f.Neg(
f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
Value* v = f.NegatedMulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
f.LoadFPR(i.A.FRB));
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
@ -214,8 +214,8 @@ int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fnmsubx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] - frB)
Value* v = f.Neg(
f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
Value* v = f.NegatedMulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
f.LoadFPR(i.A.FRB));
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
@ -223,8 +223,8 @@ int InstrEmit_fnmsubx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fnmsubsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] - frB)
Value* v = f.Neg(
f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
Value* v = f.NegatedMulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
f.LoadFPR(i.A.FRB));
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);

View File

@ -834,6 +834,7 @@ int InstrEmit_stdcx(PPCHIRBuilder& f, const InstrData& i) {
// Issue memory barrier for when we go out of lock and want others to see our
// updates.
f.MemoryBarrier();
return 0;

View File

@ -77,7 +77,8 @@ ThreadState::ThreadState(Processor* processor, uint32_t thread_id,
// Allocate with 64b alignment.
context_ = reinterpret_cast<ppc::PPCContext*>(AllocateContext()); // memory::AlignedAlloc<ppc::PPCContext>(64);
context_ = reinterpret_cast<ppc::PPCContext*>(
AllocateContext());
processor->backend()->InitializeBackendContext(context_);
assert_true(((uint64_t)context_ & 0x3F) == 0);
std::memset(context_, 0, sizeof(ppc::PPCContext));
@ -93,6 +94,7 @@ ThreadState::ThreadState(Processor* processor, uint32_t thread_id,
// Set initial registers.
context_->r[1] = stack_base;
context_->r[13] = pcr_address;
// fixme: VSCR must be set here!
}
ThreadState::~ThreadState() {
@ -105,7 +107,7 @@ ThreadState::~ThreadState() {
if (context_) {
FreeContext(reinterpret_cast<void*>(context_));
}
// memory::AlignedFree(context_);
// memory::AlignedFree(context_);
}
void ThreadState::Bind(ThreadState* thread_state) {

View File

@ -29,10 +29,20 @@
#include "xenia/kernel/kernel_state.h"
#include "xenia/kernel/user_module.h"
#if defined(NDEBUG)
static constexpr bool should_log_unknown_reg_writes() { return false; }
#else
DEFINE_bool(log_unknown_register_writes, false,
"Log writes to unknown registers from "
"CommandProcessor::WriteRegister. Has significant performance hit.",
"GPU");
static bool should_log_unknown_reg_writes() {
return cvars::log_unknown_register_writes;
}
#endif
namespace xe {
namespace gpu {
@ -465,7 +475,7 @@ void CommandProcessor::HandleSpecialRegisterWrite(uint32_t index,
}
}
void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
if (XE_UNLIKELY(cvars::log_unknown_register_writes)) {
if (should_log_unknown_reg_writes()) {
// chrispy: rearrange check order, place set after checks
if (XE_UNLIKELY(!register_file_->IsValidRegister(index))) {
XELOGW("GPU: Write to unknown register ({:04X} = {:08X})", index, value);
@ -493,15 +503,45 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
// very unlikely. these ORS here are meant to be bitwise ors, so that we do
// not do branching evaluation of the conditions (we will almost always take
// all of the branches)
if (XE_UNLIKELY(
(index - XE_GPU_REG_SCRATCH_REG0 < 8) |
(index == XE_GPU_REG_COHER_STATUS_HOST) |
((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
(XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX)))) {
unsigned expr = (index - XE_GPU_REG_SCRATCH_REG0 < 8) |
(index == XE_GPU_REG_COHER_STATUS_HOST) |
((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
(XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX));
// chrispy: reordered for msvc branch probability (assumes if is taken and
// else is not)
if (XE_LIKELY(expr == 0)) {
} else {
HandleSpecialRegisterWrite(index, value);
}
}
void CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
uint32_t* base,
uint32_t num_registers) {
for (uint32_t i = 0; i < num_registers; ++i) {
uint32_t data = xe::load_and_swap<uint32_t>(base + i);
this->WriteRegister(start_index + i, data);
}
}
void CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
uint32_t base,
uint32_t num_registers) {
for (uint32_t i = 0; i < num_registers; ++i) {
uint32_t data = ring->ReadAndSwap<uint32_t>();
WriteRegister(base + i, data);
}
}
void CommandProcessor::WriteOneRegisterFromRing(xe::RingBuffer* ring,
uint32_t base,
uint32_t num_times) {
for (uint32_t m = 0; m < num_times; m++) {
uint32_t reg_data = ring->ReadAndSwap<uint32_t>();
uint32_t target_index = base;
WriteRegister(target_index, reg_data);
}
}
void CommandProcessor::MakeCoherent() {
SCOPE_profile_cpu_f("gpu");
@ -623,15 +663,20 @@ void CommandProcessor::ExecutePacket(uint32_t ptr, uint32_t count) {
}
bool CommandProcessor::ExecutePacket(RingBuffer* reader) {
// prefetch the wraparound range
// it is likely already in the L3 cache, but on a Zen system it may be in
// another chiplet's L3
reader->BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
reader->read_count());
const uint32_t packet = reader->ReadAndSwap<uint32_t>();
const uint32_t packet_type = packet >> 30;
if (packet == 0 || packet == 0x0BADF00D) {
if (XE_UNLIKELY(packet == 0 || packet == 0x0BADF00D)) {
trace_writer_.WritePacketStart(uint32_t(reader->read_ptr() - 4), 1);
trace_writer_.WritePacketEnd();
return true;
}
if (packet == 0xCDCDCDCD) {
if (XE_UNLIKELY(packet == 0xCDCDCDCD)) {
XELOGW("GPU packet is CDCDCDCD - probably read uninitialized memory!");
}
@ -667,10 +712,10 @@ bool CommandProcessor::ExecutePacketType0(RingBuffer* reader, uint32_t packet) {
uint32_t base_index = (packet & 0x7FFF);
uint32_t write_one_reg = (packet >> 15) & 0x1;
for (uint32_t m = 0; m < count; m++) {
uint32_t reg_data = reader->ReadAndSwap<uint32_t>();
uint32_t target_index = write_one_reg ? base_index : base_index + m;
WriteRegister(target_index, reg_data);
if (write_one_reg) {
WriteOneRegisterFromRing(reader, base_index, count);
} else {
WriteRegisterRangeFromRing(reader, base_index, count);
}
trace_writer_.WritePacketEnd();
@ -934,7 +979,7 @@ bool CommandProcessor::ExecutePacketType3_XE_SWAP(RingBuffer* reader,
uint32_t count) {
SCOPE_profile_cpu_f("gpu");
XELOGI("XE_SWAP");
XELOGD("XE_SWAP");
Profiler::Flip();
@ -1467,10 +1512,9 @@ bool CommandProcessor::ExecutePacketType3_SET_CONSTANT(RingBuffer* reader,
reader->AdvanceRead((count - 1) * sizeof(uint32_t));
return true;
}
for (uint32_t n = 0; n < count - 1; n++, index++) {
uint32_t data = reader->ReadAndSwap<uint32_t>();
WriteRegister(index, data);
}
WriteRegisterRangeFromRing(reader, index, count - 1);
return true;
}
@ -1479,10 +1523,9 @@ bool CommandProcessor::ExecutePacketType3_SET_CONSTANT2(RingBuffer* reader,
uint32_t count) {
uint32_t offset_type = reader->ReadAndSwap<uint32_t>();
uint32_t index = offset_type & 0xFFFF;
for (uint32_t n = 0; n < count - 1; n++, index++) {
uint32_t data = reader->ReadAndSwap<uint32_t>();
WriteRegister(index, data);
}
WriteRegisterRangeFromRing(reader, index, count - 1);
return true;
}
@ -1517,12 +1560,12 @@ bool CommandProcessor::ExecutePacketType3_LOAD_ALU_CONSTANT(RingBuffer* reader,
assert_always();
return true;
}
trace_writer_.WriteMemoryRead(CpuToGpu(address), size_dwords * 4);
for (uint32_t n = 0; n < size_dwords; n++, index++) {
uint32_t data = xe::load_and_swap<uint32_t>(
memory_->TranslatePhysical(address + n * 4));
WriteRegister(index, data);
}
WriteRegistersFromMem(index, (uint32_t*)memory_->TranslatePhysical(address),
size_dwords);
return true;
}
@ -1530,10 +1573,9 @@ bool CommandProcessor::ExecutePacketType3_SET_SHADER_CONSTANTS(
RingBuffer* reader, uint32_t packet, uint32_t count) {
uint32_t offset_type = reader->ReadAndSwap<uint32_t>();
uint32_t index = offset_type & 0xFFFF;
for (uint32_t n = 0; n < count - 1; n++, index++) {
uint32_t data = reader->ReadAndSwap<uint32_t>();
WriteRegister(index, data);
}
WriteRegisterRangeFromRing(reader, index, count - 1);
return true;
}

View File

@ -153,8 +153,24 @@ class CommandProcessor {
// rarely needed, most register writes have no special logic here
XE_NOINLINE
void HandleSpecialRegisterWrite(uint32_t index, uint32_t value);
XE_FORCEINLINE
virtual void WriteRegister(uint32_t index, uint32_t value);
// mem has big-endian register values
XE_FORCEINLINE
virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
uint32_t num_registers);
XE_FORCEINLINE
virtual void WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base,
uint32_t num_registers);
XE_FORCEINLINE
virtual void WriteOneRegisterFromRing(
xe::RingBuffer* ring, uint32_t base,
uint32_t
num_times); // repeatedly write a value to one register, presumably a
// register with special handling for writes
const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table() const {
return gamma_ramp_256_entry_table_;
}

View File

@ -1710,7 +1710,60 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
}
}
}
void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
uint32_t* base,
uint32_t num_registers) {
for (uint32_t i = 0; i < num_registers; ++i) {
uint32_t data = xe::load_and_swap<uint32_t>(base + i);
D3D12CommandProcessor::WriteRegister(start_index + i, data);
}
}
void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
uint32_t base,
uint32_t num_registers) {
// we already brought it into L2 earlier
RingBuffer::ReadRange range =
ring->BeginPrefetchedRead<swcache::PrefetchTag::Level1>(num_registers *
sizeof(uint32_t));
uint32_t num_regs_firstrange =
static_cast<uint32_t>(range.first_length / sizeof(uint32_t));
D3D12CommandProcessor::WriteRegistersFromMem(
base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
num_regs_firstrange);
if (range.second) {
D3D12CommandProcessor::WriteRegistersFromMem(
base + num_regs_firstrange,
reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.second)),
num_registers - num_regs_firstrange);
}
ring->EndRead(range);
}
void D3D12CommandProcessor::WriteOneRegisterFromRing(xe::RingBuffer* ring,
uint32_t base,
uint32_t num_times) {
auto read = ring->BeginPrefetchedRead<swcache::PrefetchTag::Level1>(
num_times * sizeof(uint32_t));
uint32_t first_length = read.first_length / sizeof(uint32_t);
for (uint32_t i = 0; i < first_length; ++i) {
D3D12CommandProcessor::WriteRegister(
base, xe::load_and_swap<uint32_t>(read.first + (sizeof(uint32_t) * i)));
}
if (read.second) {
uint32_t second_length = read.second_length / sizeof(uint32_t);
for (uint32_t i = 0; i < second_length; ++i) {
D3D12CommandProcessor::WriteRegister(
base,
xe::load_and_swap<uint32_t>(read.second + (sizeof(uint32_t) * i)));
}
}
ring->EndRead(read);
}
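The first/second handling above is the usual two-span pattern for a ring buffer read that may wrap past the end of the backing store. A minimal standalone sketch of the same idea; the struct here is illustrative, not the actual xe::RingBuffer::ReadRange definition.

#include <cstddef>
#include <cstdint>

struct ExampleReadRange {
  const uint8_t* first;   // bytes up to the wrap point
  size_t first_length;
  const uint8_t* second;  // nullptr when the read did not wrap
  size_t second_length;
};

template <typename Fn>
void ForEachDword(const ExampleReadRange& range, Fn&& fn) {
  for (size_t i = 0; i < range.first_length / sizeof(uint32_t); ++i) {
    fn(range.first + i * sizeof(uint32_t));
  }
  if (range.second) {
    for (size_t i = 0; i < range.second_length / sizeof(uint32_t); ++i) {
      fn(range.second + i * sizeof(uint32_t));
    }
  }
}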
void D3D12CommandProcessor::OnGammaRamp256EntryTableValueWritten() {
gamma_ramp_256_entry_table_up_to_date_ = false;
}

View File

@ -42,7 +42,7 @@ namespace xe {
namespace gpu {
namespace d3d12 {
class D3D12CommandProcessor : public CommandProcessor {
class D3D12CommandProcessor final : public CommandProcessor {
public:
explicit D3D12CommandProcessor(D3D12GraphicsSystem* graphics_system,
kernel::KernelState* kernel_state);
@ -203,9 +203,17 @@ class D3D12CommandProcessor : public CommandProcessor {
protected:
bool SetupContext() override;
void ShutdownContext() override;
XE_FORCEINLINE
void WriteRegister(uint32_t index, uint32_t value) override;
XE_FORCEINLINE
virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
uint32_t num_registers) override;
XE_FORCEINLINE
virtual void WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base,
uint32_t num_registers) override;
XE_FORCEINLINE
virtual void WriteOneRegisterFromRing(xe::RingBuffer* ring, uint32_t base,
uint32_t num_times) override;
void OnGammaRamp256EntryTableValueWritten() override;
void OnGammaRampPWLValueWritten() override;

View File

@ -406,14 +406,16 @@ bool D3D12SharedMemory::AllocateSparseHostGpuMemoryRange(
}
bool D3D12SharedMemory::UploadRanges(
const std::vector<std::pair<uint32_t, uint32_t>>& upload_page_ranges) {
if (upload_page_ranges.empty()) {
const std::pair<uint32_t, uint32_t>* upload_page_ranges, unsigned num_upload_page_ranges) {
if (!num_upload_page_ranges) {
return true;
}
CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_DEST);
command_processor_.SubmitBarriers();
auto& command_list = command_processor_.GetDeferredCommandList();
for (auto upload_range : upload_page_ranges) {
//for (auto upload_range : upload_page_ranges) {
for (unsigned int i = 0; i < num_upload_page_ranges; ++i) {
auto& upload_range = upload_page_ranges[i];
uint32_t upload_range_start = upload_range.first;
uint32_t upload_range_length = upload_range.second;
trace_writer_.WriteMemoryRead(upload_range_start << page_size_log2(),

View File

@ -91,8 +91,8 @@ class D3D12SharedMemory : public SharedMemory {
bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations,
uint32_t length_allocations) override;
bool UploadRanges(const std::vector<std::pair<uint32_t, uint32_t>>&
upload_page_ranges) override;
bool UploadRanges(const std::pair<uint32_t, uint32_t>*
upload_page_ranges, unsigned num_ranges) override;
private:
D3D12CommandProcessor& command_processor_;

View File

@ -31,8 +31,8 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list,
#if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
SCOPE_profile_cpu_f("gpu");
#endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
const uintmax_t* stream = command_stream_.data();
size_t stream_remaining = command_stream_.size();
const uintmax_t* stream = (const uintmax_t*)command_stream_.data();
size_t stream_remaining = command_stream_.size() / sizeof(uintmax_t);
ID3D12PipelineState* current_pipeline_state = nullptr;
while (stream_remaining != 0) {
const CommandHeader& header =
@ -266,8 +266,12 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list,
void* DeferredCommandList::WriteCommand(Command command,
size_t arguments_size_bytes) {
size_t arguments_size_elements =
(arguments_size_bytes + sizeof(uintmax_t) - 1) / sizeof(uintmax_t);
      round_up(arguments_size_bytes, sizeof(uintmax_t), false);
  //(arguments_size_bytes + sizeof(uintmax_t) - 1) / sizeof(uintmax_t);
  // note: with round_up this local is a byte count (rounded to uintmax_t
  // alignment); it is converted back to element units for the header below
#if 0
size_t offset = command_stream_.size();
command_stream_.resize(offset + kCommandHeaderSizeElements +
arguments_size_elements);
@ -276,6 +280,19 @@ void* DeferredCommandList::WriteCommand(Command command,
header.command = command;
header.arguments_size_elements = uint32_t(arguments_size_elements);
return command_stream_.data() + (offset + kCommandHeaderSizeElements);
#else
size_t offset = command_stream_.size();
constexpr size_t kCommandHeaderSizeBytes =
kCommandHeaderSizeElements * sizeof(uintmax_t);
command_stream_.resize(offset + kCommandHeaderSizeBytes +
arguments_size_elements);
CommandHeader& header =
*reinterpret_cast<CommandHeader*>(command_stream_.data() + offset);
header.command = command;
header.arguments_size_elements = uint32_t(arguments_size_elements) / sizeof(uintmax_t);
return command_stream_.data() + (offset + kCommandHeaderSizeBytes);
#endif
}
} // namespace d3d12

View File

@ -19,7 +19,7 @@
#include "xenia/base/literals.h"
#include "xenia/base/math.h"
#include "xenia/ui/d3d12/d3d12_api.h"
#include "xenia/base/memory.h"
namespace xe {
namespace gpu {
namespace d3d12 {
@ -30,8 +30,12 @@ class D3D12CommandProcessor;
class DeferredCommandList {
public:
static constexpr size_t MAX_SIZEOF_COMMANDLIST = 65536 * 128;  // 8 MiB
/*
chrispy: upped from 1_MiB to MAX_SIZEOF_COMMANDLIST; m:durandal hits frequent resizes in large open maps
*/
DeferredCommandList(const D3D12CommandProcessor& command_processor,
size_t initial_size_bytes = 1_MiB);
size_t initial_size_bytes = MAX_SIZEOF_COMMANDLIST);
void Reset();
void Execute(ID3D12GraphicsCommandList* command_list,
@ -562,7 +566,8 @@ class DeferredCommandList {
const D3D12CommandProcessor& command_processor_;
// uintmax_t to ensure uint64_t and pointer alignment of all structures.
std::vector<uintmax_t> command_stream_;
//std::vector<uintmax_t> command_stream_;
FixedVMemVector<MAX_SIZEOF_COMMANDLIST> command_stream_;
};
} // namespace d3d12

View File

@ -868,7 +868,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
xenos::kMaxResolveSize);
y1 = y0 + int32_t(xenos::kMaxResolveSize);
}
// fails in Forza Horizon 1
assert_true(x0 < x1 && y0 < y1);
if (x0 >= x1 || y0 >= y1) {
XELOGE("Resolve region is empty");

View File

@ -883,7 +883,7 @@ class PrimitiveProcessor {
// Must be called in a global critical region.
void UpdateCacheBucketsNonEmptyL2(
uint32_t bucket_index_div_64,
[[maybe_unused]] const std::unique_lock<std::recursive_mutex>&
[[maybe_unused]] const global_unique_lock_type&
global_lock) {
uint64_t& cache_buckets_non_empty_l2_ref =
cache_buckets_non_empty_l2_[bucket_index_div_64 >> 6];

View File

@ -320,8 +320,7 @@ void Shader::GatherVertexFetchInformation(
for (auto& vertex_binding : vertex_bindings_) {
if (vertex_binding.fetch_constant == op.fetch_constant_index()) {
// It may not hold that all strides are equal, but I hope it does.
assert_true(!fetch_instr.attributes.stride ||
vertex_binding.stride_words == fetch_instr.attributes.stride);
vertex_binding.attributes.push_back({});
attrib = &vertex_binding.attributes.back();
break;

View File

@ -14,6 +14,7 @@
#include "xenia/base/assert.h"
#include "xenia/base/bit_range.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/base/profiling.h"
@ -344,7 +345,7 @@ void SharedMemory::UnlinkWatchRange(WatchRange* range) {
range->next_free = watch_range_first_free_;
watch_range_first_free_ = range;
}
// todo: optimize, an enormous amount of cpu time (1.34%) is spent here.
bool SharedMemory::RequestRange(uint32_t start, uint32_t length,
bool* any_data_resolved_out) {
if (!length) {
@ -364,14 +365,20 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length,
return false;
}
unsigned int current_upload_range = 0;
uint32_t page_first = start >> page_size_log2_;
uint32_t page_last = (start + length - 1) >> page_size_log2_;
upload_ranges_.clear();
std::pair<uint32_t, uint32_t>* uploads =
reinterpret_cast<std::pair<uint32_t, uint32_t>*>(upload_ranges_.data());
bool any_data_resolved = false;
uint32_t block_first = page_first >> 6;
uint32_t block_last = page_last >> 6;
uint32_t range_start = UINT32_MAX;
{
auto global_lock = global_critical_region_.Acquire();
for (uint32_t i = block_first; i <= block_last; ++i) {
@ -412,8 +419,13 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length,
if (!xe::bit_scan_forward(block_valid_from_start, &block_page)) {
break;
}
upload_ranges_.push_back(
std::make_pair(range_start, (i << 6) + block_page - range_start));
if (current_upload_range + 1 >= MAX_UPLOAD_RANGES) {
xe::FatalError(
"Hit max upload ranges in shared_memory.cc, tell a dev to "
"raise the limit!");
}
uploads[current_upload_range++] =
std::make_pair(range_start, (i << 6) + block_page - range_start);
// In the next iteration within this block, consider this range valid
// since it has been queued for upload.
block_valid |= (uint64_t(1) << block_page) - 1;
@ -423,17 +435,17 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length,
}
}
if (range_start != UINT32_MAX) {
upload_ranges_.push_back(
std::make_pair(range_start, page_last + 1 - range_start));
uploads[current_upload_range++] =
(std::make_pair(range_start, page_last + 1 - range_start));
}
if (any_data_resolved_out) {
*any_data_resolved_out = any_data_resolved;
}
if (upload_ranges_.empty()) {
if (!current_upload_range) {
return true;
}
return UploadRanges(upload_ranges_);
return UploadRanges(uploads, current_upload_range);
}
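For context, the loop above walks the 64-page validity blocks with bit scans and turns every maximal run of invalid pages into a (first_page, page_count) pair handed to UploadRanges. The same idea written naively over a plain bitmask (1 = valid), without the resolve tracking, locking, or the fixed-capacity output array used by the real code:

#include <cstdint>
#include <utility>
#include <vector>

std::vector<std::pair<uint32_t, uint32_t>> CollectInvalidRuns(
    const std::vector<uint64_t>& valid_bits, uint32_t page_first,
    uint32_t page_last) {
  std::vector<std::pair<uint32_t, uint32_t>> runs;
  uint32_t run_start = UINT32_MAX;
  for (uint32_t page = page_first; page <= page_last; ++page) {
    bool valid = ((valid_bits[page >> 6] >> (page & 63)) & 1) != 0;
    if (!valid) {
      if (run_start == UINT32_MAX) {
        run_start = page;  // open a new run of pages that need uploading
      }
    } else if (run_start != UINT32_MAX) {
      runs.emplace_back(run_start, page - run_start);
      run_start = UINT32_MAX;
    }
  }
  if (run_start != UINT32_MAX) {
    runs.emplace_back(run_start, page_last + 1 - run_start);
  }
  return runs;
}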
std::pair<uint32_t, uint32_t> SharedMemory::MemoryInvalidationCallbackThunk(

View File

@ -35,7 +35,7 @@ class SharedMemory {
virtual void SetSystemPageBlocksValidWithGpuDataWritten();
typedef void (*GlobalWatchCallback)(
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
const global_unique_lock_type& global_lock, void* context,
uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
typedef void* GlobalWatchHandle;
// Registers a callback invoked when something is invalidated in the GPU
@ -49,9 +49,9 @@ class SharedMemory {
GlobalWatchHandle RegisterGlobalWatch(GlobalWatchCallback callback,
void* callback_context);
void UnregisterGlobalWatch(GlobalWatchHandle handle);
typedef void (*WatchCallback)(
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
void* data, uint64_t argument, bool invalidated_by_gpu);
typedef void (*WatchCallback)(const global_unique_lock_type& global_lock,
void* context, void* data, uint64_t argument,
bool invalidated_by_gpu);
typedef void* WatchHandle;
// Registers a callback invoked when the specified memory range is invalidated
// in the GPU memory copy by the CPU or (if triggered explicitly - such as by
@ -140,7 +140,8 @@ class SharedMemory {
// ascending address order, so front and back can be used to determine the
// overall bounds of pages to be uploaded.
virtual bool UploadRanges(
const std::vector<std::pair<uint32_t, uint32_t>>& upload_page_ranges) = 0;
const std::pair<uint32_t, uint32_t>* upload_page_ranges,
unsigned num_upload_ranges) = 0;
const std::vector<std::pair<uint32_t, uint32_t>>& trace_download_ranges() {
return trace_download_ranges_;
@ -174,10 +175,13 @@ class SharedMemory {
void* memory_invalidation_callback_handle_ = nullptr;
void* memory_data_provider_handle_ = nullptr;
static constexpr unsigned int MAX_UPLOAD_RANGES = 65536;
// Ranges that need to be uploaded, generated by GetRangesToUpload (a
// persistently allocated vector).
std::vector<std::pair<uint32_t, uint32_t>> upload_ranges_;
// std::vector<std::pair<uint32_t, uint32_t>> upload_ranges_;
FixedVMemVector<MAX_UPLOAD_RANGES * sizeof(std::pair<uint32_t, uint32_t>)>
upload_ranges_;
// GPU-written memory downloading for traces. <Start address, length>.
std::vector<std::pair<uint32_t, uint32_t>> trace_download_ranges_;

View File

@ -507,7 +507,7 @@ TextureCache::Texture::~Texture() {
}
void TextureCache::Texture::MakeUpToDateAndWatch(
const std::unique_lock<std::recursive_mutex>& global_lock) {
const global_unique_lock_type& global_lock) {
SharedMemory& shared_memory = texture_cache().shared_memory();
if (base_outdated_) {
assert_not_zero(GetGuestBaseSize());
@ -552,7 +552,7 @@ void TextureCache::Texture::MarkAsUsed() {
}
void TextureCache::Texture::WatchCallback(
[[maybe_unused]] const std::unique_lock<std::recursive_mutex>& global_lock,
[[maybe_unused]] const global_unique_lock_type& global_lock,
bool is_mip) {
if (is_mip) {
assert_not_zero(GetGuestMipsSize());
@ -565,8 +565,8 @@ void TextureCache::Texture::WatchCallback(
}
}
void TextureCache::WatchCallback(
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
void TextureCache::WatchCallback(const global_unique_lock_type& global_lock,
void* context,
void* data, uint64_t argument, bool invalidated_by_gpu) {
Texture& texture = *static_cast<Texture*>(context);
texture.WatchCallback(global_lock, argument != 0);
@ -902,7 +902,7 @@ bool TextureCache::IsRangeScaledResolved(uint32_t start_unscaled,
}
void TextureCache::ScaledResolveGlobalWatchCallbackThunk(
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
const global_unique_lock_type& global_lock, void* context,
uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu) {
TextureCache* texture_cache = reinterpret_cast<TextureCache*>(context);
texture_cache->ScaledResolveGlobalWatchCallback(
@ -910,7 +910,7 @@ void TextureCache::ScaledResolveGlobalWatchCallbackThunk(
}
void TextureCache::ScaledResolveGlobalWatchCallback(
const std::unique_lock<std::recursive_mutex>& global_lock,
const global_unique_lock_type& global_lock,
uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu) {
assert_true(IsDrawResolutionScaled());
if (invalidated_by_gpu) {

View File

@ -230,19 +230,15 @@ class TextureCache {
}
bool IsResolved() const { return base_resolved_ || mips_resolved_; }
bool base_outdated(
const std::unique_lock<std::recursive_mutex>& global_lock) const {
bool base_outdated(const global_unique_lock_type& global_lock) const {
return base_outdated_;
}
bool mips_outdated(
const std::unique_lock<std::recursive_mutex>& global_lock) const {
bool mips_outdated(const global_unique_lock_type& global_lock) const {
return mips_outdated_;
}
void MakeUpToDateAndWatch(
const std::unique_lock<std::recursive_mutex>& global_lock);
void MakeUpToDateAndWatch(const global_unique_lock_type& global_lock);
void WatchCallback(
const std::unique_lock<std::recursive_mutex>& global_lock, bool is_mip);
void WatchCallback(const global_unique_lock_type& global_lock, bool is_mip);
// For LRU caching - updates the last usage frame and moves the texture to
// the end of the usage queue. Must be called any time the texture is
@ -579,8 +575,8 @@ class TextureCache {
void UpdateTexturesTotalHostMemoryUsage(uint64_t add, uint64_t subtract);
// Shared memory callback for texture data invalidation.
static void WatchCallback(
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
static void WatchCallback(const global_unique_lock_type& global_lock,
void* context,
void* data, uint64_t argument, bool invalidated_by_gpu);
// Checks if there are any pages that contain scaled resolve data within the
@ -589,10 +585,10 @@ class TextureCache {
// Global shared memory invalidation callback for invalidating scaled resolved
// texture data.
static void ScaledResolveGlobalWatchCallbackThunk(
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
const global_unique_lock_type& global_lock, void* context,
uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
void ScaledResolveGlobalWatchCallback(
const std::unique_lock<std::recursive_mutex>& global_lock,
const global_unique_lock_type& global_lock,
uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
const RegisterFile& register_file_;

View File

@ -1157,7 +1157,14 @@ void VulkanCommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
}
}
}
void VulkanCommandProcessor::WriteRegistersFromMem(uint32_t start_index,
uint32_t* base,
uint32_t num_registers) {
for (uint32_t i = 0; i < num_registers; ++i) {
uint32_t data = xe::load_and_swap<uint32_t>(base + i);
VulkanCommandProcessor::WriteRegister(start_index + i, data);
}
}
void VulkanCommandProcessor::SparseBindBuffer(
VkBuffer buffer, uint32_t bind_count, const VkSparseMemoryBind* binds,
VkPipelineStageFlags wait_stage_mask) {

View File

@ -45,7 +45,7 @@ namespace xe {
namespace gpu {
namespace vulkan {
class VulkanCommandProcessor : public CommandProcessor {
class VulkanCommandProcessor final : public CommandProcessor {
public:
// Single-descriptor layouts for use within a single frame.
enum class SingleTransientDescriptorLayout {
@ -259,8 +259,11 @@ class VulkanCommandProcessor : public CommandProcessor {
protected:
bool SetupContext() override;
void ShutdownContext() override;
XE_FORCEINLINE
void WriteRegister(uint32_t index, uint32_t value) override;
XE_FORCEINLINE
virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
uint32_t num_registers) override;
void OnGammaRamp256EntryTableValueWritten() override;
void OnGammaRampPWLValueWritten() override;

View File

@ -376,18 +376,21 @@ bool VulkanSharedMemory::AllocateSparseHostGpuMemoryRange(
}
bool VulkanSharedMemory::UploadRanges(
const std::vector<std::pair<uint32_t, uint32_t>>& upload_page_ranges) {
if (upload_page_ranges.empty()) {
const std::pair<uint32_t, uint32_t>* upload_page_ranges,
unsigned num_upload_ranges) {
if (!num_upload_ranges) {
return true;
}
auto& range_front = upload_page_ranges[0];
auto& range_back = upload_page_ranges[num_upload_ranges - 1];
// upload_page_ranges are sorted, use them to determine the range for the
// ordering barrier.
Use(Usage::kTransferDestination,
std::make_pair(
upload_page_ranges.front().first << page_size_log2(),
(upload_page_ranges.back().first + upload_page_ranges.back().second -
upload_page_ranges.front().first)
<< page_size_log2()));
std::make_pair(range_front.first << page_size_log2(),
(range_back.first + range_back.second - range_front.first)
<< page_size_log2()));
command_processor_.SubmitBarriers(true);
DeferredCommandBuffer& command_buffer =
command_processor_.deferred_command_buffer();
@ -395,9 +398,11 @@ bool VulkanSharedMemory::UploadRanges(
bool successful = true;
upload_regions_.clear();
VkBuffer upload_buffer_previous = VK_NULL_HANDLE;
for (auto upload_range : upload_page_ranges) {
uint32_t upload_range_start = upload_range.first;
uint32_t upload_range_length = upload_range.second;
// for (auto upload_range : upload_page_ranges) {
for (unsigned int i = 0; i < num_upload_ranges; ++i) {
uint32_t upload_range_start = upload_page_ranges[i].first;
uint32_t upload_range_length = upload_page_ranges[i].second;
trace_writer_.WriteMemoryRead(upload_range_start << page_size_log2(),
upload_range_length << page_size_log2());
while (upload_range_length) {

View File

@ -62,8 +62,8 @@ class VulkanSharedMemory : public SharedMemory {
bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations,
uint32_t length_allocations) override;
bool UploadRanges(const std::vector<std::pair<uint32_t, uint32_t>>&
upload_page_ranges) override;
bool UploadRanges(const std::pair<uint32_t, uint32_t>*
upload_page_ranges, unsigned num_ranges) override;
private:
void GetUsageMasks(Usage usage, VkPipelineStageFlags& stage_mask,

View File

@@ -137,6 +137,8 @@ X_INPUT_VIBRATION InputSystem::ModifyVibrationLevel(
modified_vibration.right_motor_speed = 0;
return modified_vibration;
}
std::unique_lock<xe_unlikely_mutex> InputSystem::lock() {
return std::unique_lock<xe_unlikely_mutex>{lock_};
}
} // namespace hid
} // namespace xe
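InputSystem::lock() returns the std::unique_lock by value, so a caller holds the input mutex for exactly as long as the returned guard stays in scope; that is the pattern the XAM input exports below rely on. A small sketch of the idea, with std::mutex standing in for xe_unlikely_mutex and Guarded standing in for InputSystem:

#include <mutex>

class Guarded {
 public:
  std::unique_lock<std::mutex> lock() {
    return std::unique_lock<std::mutex>{mutex_};  // moved out, still locked
  }
  int value() const { return value_; }

 private:
  std::mutex mutex_;
  int value_ = 0;
};

void Example(Guarded& g) {
  auto held = g.lock();  // mutex acquired here
  (void)g.value();       // safe to touch guarded state while 'held' lives
}                        // unique_lock releases the mutex here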

View File

@@ -12,7 +12,7 @@
#include <memory>
#include <vector>
#include "xenia/base/mutex.h"
#include "xenia/hid/input.h"
#include "xenia/hid/input_driver.h"
#include "xenia/xbox.h"
@@ -48,6 +48,8 @@ class InputSystem {
void UpdateUsedSlot(uint8_t slot, bool connected);
uint8_t GetConnectedSlots() const { return connected_slot; }
std::unique_lock<xe_unlikely_mutex> lock();
private:
xe::ui::Window* window_ = nullptr;
@@ -55,6 +57,7 @@ class InputSystem {
X_INPUT_VIBRATION ModifyVibrationLevel(X_INPUT_VIBRATION* vibration);
uint8_t connected_slot = 0b0001;
xe_unlikely_mutex lock_;
};
} // namespace hid

View File

@@ -14,9 +14,9 @@
#include <xinput.h> // NOLINT(build/include_order)
#include "xenia/base/clock.h"
#include "xenia/base/logging.h"
#include "xenia/hid/hid_flags.h"
namespace xe {
namespace hid {
namespace xinput {
@@ -81,13 +81,39 @@ X_STATUS XInputInputDriver::Setup() {
}
return X_STATUS_SUCCESS;
}
constexpr uint64_t SKIP_INVALID_CONTROLLER_TIME = 1100;  // milliseconds
static uint64_t last_invalid_time[4];
static DWORD should_skip(uint32_t user_index) {
uint64_t time = last_invalid_time[user_index];
if (time) {
uint64_t deltatime = xe::Clock::QueryHostUptimeMillis() - time;
if (deltatime < SKIP_INVALID_CONTROLLER_TIME) {
return ERROR_DEVICE_NOT_CONNECTED;
}
last_invalid_time[user_index] = 0;
}
return 0;
}
static void set_skip(uint32_t user_index) {
last_invalid_time[user_index] = xe::Clock::QueryHostUptimeMillis();
}
X_RESULT XInputInputDriver::GetCapabilities(uint32_t user_index, uint32_t flags,
X_INPUT_CAPABILITIES* out_caps) {
DWORD skipper = should_skip(user_index);
if (skipper) {
return skipper;
}
XINPUT_CAPABILITIES native_caps;
auto xigc = (decltype(&XInputGetCapabilities))XInputGetCapabilities_;
DWORD result = xigc(user_index, flags, &native_caps);
if (result) {
if (result == ERROR_DEVICE_NOT_CONNECTED) {
set_skip(user_index);
}
return result;
}
@@ -110,10 +136,18 @@ X_RESULT XInputInputDriver::GetCapabilities(uint32_t user_index, uint32_t flags,
X_RESULT XInputInputDriver::GetState(uint32_t user_index,
X_INPUT_STATE* out_state) {
DWORD skipper = should_skip(user_index);
if (skipper) {
return skipper;
}
XINPUT_STATE native_state;
auto xigs = (decltype(&XInputGetState))XInputGetState_;
DWORD result = xigs(user_index, &native_state);
if (result) {
if (result == ERROR_DEVICE_NOT_CONNECTED) {
set_skip(user_index);
}
return result;
}
@@ -131,11 +165,18 @@ X_RESULT XInputInputDriver::GetState(uint32_t user_index,
X_RESULT XInputInputDriver::SetState(uint32_t user_index,
X_INPUT_VIBRATION* vibration) {
DWORD skipper = should_skip(user_index);
if (skipper) {
return skipper;
}
XINPUT_VIBRATION native_vibration;
native_vibration.wLeftMotorSpeed = vibration->left_motor_speed;
native_vibration.wRightMotorSpeed = vibration->right_motor_speed;
auto xiss = (decltype(&XInputSetState))XInputSetState_;
DWORD result = xiss(user_index, &native_vibration);
if (result == ERROR_DEVICE_NOT_CONNECTED) {
set_skip(user_index);
}
return result;
}
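The should_skip/set_skip pair above rate-limits XInput polling: once a slot reports ERROR_DEVICE_NOT_CONNECTED, further GetCapabilities/GetState/SetState calls for that slot are short-circuited for SKIP_INVALID_CONTROLLER_TIME (1100 ms), because querying an absent pad through XInput is comparatively expensive. A standalone sketch of the same backoff, using std::chrono::steady_clock as a stand-in for Clock::QueryHostUptimeMillis (class and method names here are illustrative, not the emulator's API):

#include <array>
#include <chrono>
#include <cstdint>

// Per-slot disconnect backoff: skip polling a slot for a fixed window after
// it last reported "not connected".
class ControllerBackoff {
 public:
  using Clock = std::chrono::steady_clock;

  explicit ControllerBackoff(std::chrono::milliseconds window)
      : window_(window) {}

  // True if the slot failed recently and the poll should be skipped.
  bool ShouldSkip(uint32_t slot) {
    Clock::time_point& t = last_failure_.at(slot);
    if (t == Clock::time_point{}) {
      return false;  // never failed, or window already cleared
    }
    if (Clock::now() - t < window_) {
      return true;
    }
    t = Clock::time_point{};  // window expired; probe the slot again
    return false;
  }

  // Record a disconnect so subsequent polls are skipped for the window.
  void RecordFailure(uint32_t slot) { last_failure_.at(slot) = Clock::now(); }

 private:
  std::array<Clock::time_point, 4> last_failure_{};
  std::chrono::milliseconds window_;
};

// Usage sketch:
//   ControllerBackoff backoff{std::chrono::milliseconds(1100)};
//   if (backoff.ShouldSkip(user_index)) return ERROR_DEVICE_NOT_CONNECTED;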

View File

@@ -948,7 +948,11 @@ bool KernelState::Restore(ByteStream* stream) {
}
uint8_t KernelState::GetConnectedUsers() const {
return emulator_->input_system()->GetConnectedSlots();
auto input_sys = emulator_->input_system();
auto lock = input_sys->lock();
return input_sys->GetConnectedSlots();
}
void KernelState::UpdateUsedUserProfiles() {

View File

@@ -58,6 +58,7 @@ dword_result_t XamInputGetCapabilities_entry(
}
auto input_system = kernel_state()->emulator()->input_system();
auto lock = input_system->lock();
return input_system->GetCapabilities(actual_user_index, flags, caps);
}
DECLARE_XAM_EXPORT1(XamInputGetCapabilities, kInput, kSketchy);
@@ -81,6 +82,7 @@ dword_result_t XamInputGetCapabilitiesEx_entry(
}
auto input_system = kernel_state()->emulator()->input_system();
auto lock = input_system->lock();
return input_system->GetCapabilities(actual_user_index, flags, caps);
}
DECLARE_XAM_EXPORT1(XamInputGetCapabilitiesEx, kInput, kSketchy);
@@ -88,6 +90,13 @@ DECLARE_XAM_EXPORT1(XamInputGetCapabilitiesEx, kInput, kSketchy);
// https://msdn.microsoft.com/en-us/library/windows/desktop/microsoft.directx_sdk.reference.xinputgetstate(v=vs.85).aspx
dword_result_t XamInputGetState_entry(dword_t user_index, dword_t flags,
pointer_t<X_INPUT_STATE> input_state) {
if (input_state) {
    memset((void*)input_state.host_address(), 0, sizeof(X_INPUT_STATE));
}
if (user_index >= 4) {
return X_ERROR_DEVICE_NOT_CONNECTED;
}
// Games call this with a NULL state ptr, probably as a query.
if ((flags & 0xFF) && (flags & XINPUT_FLAG_GAMEPAD) == 0) {
@@ -96,12 +105,14 @@ dword_result_t XamInputGetState_entry(dword_t user_index, dword_t flags,
}
uint32_t actual_user_index = user_index;
  // chrispy: change this, the logic is not right
if ((actual_user_index & 0xFF) == 0xFF || (flags & XINPUT_FLAG_ANY_USER)) {
// Always pin user to 0.
actual_user_index = 0;
}
auto input_system = kernel_state()->emulator()->input_system();
auto lock = input_system->lock();
return input_system->GetState(user_index, input_state);
}
DECLARE_XAM_EXPORT2(XamInputGetState, kInput, kImplemented, kHighFrequency);
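XamInputGetState now follows a defensive pattern: zero the caller's X_INPUT_STATE up front so a failed call never leaves stale data, reject user_index values of 4 and above, and hold the input-system lock across the driver call. A condensed sketch of that ordering (GetStateGuarded, SampleState, and the constant name are illustrative stand-ins, not the XAM API):

#include <cstdint>
#include <cstring>

struct SampleState {
  uint32_t packet_number;
  uint32_t buttons;
};

constexpr uint32_t kErrorDeviceNotConnected = 1167;  // ERROR_DEVICE_NOT_CONNECTED

// Clear the output first, validate the index, then (under the real lock)
// forward to the driver.
uint32_t GetStateGuarded(uint32_t user_index, SampleState* out_state) {
  if (out_state) {
    std::memset(out_state, 0, sizeof(*out_state));  // never return stale data
  }
  if (user_index >= 4) {
    return kErrorDeviceNotConnected;  // only four controller slots exist
  }
  // ... auto lock = input_system->lock(); forward to GetState ...
  return 0;
}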
@@ -109,6 +120,9 @@ DECLARE_XAM_EXPORT2(XamInputGetState, kInput, kImplemented, kHighFrequency);
// https://msdn.microsoft.com/en-us/library/windows/desktop/microsoft.directx_sdk.reference.xinputsetstate(v=vs.85).aspx
dword_result_t XamInputSetState_entry(dword_t user_index, dword_t unk,
pointer_t<X_INPUT_VIBRATION> vibration) {
if (user_index >= 4) {
return X_E_DEVICE_NOT_CONNECTED;
}
if (!vibration) {
return X_ERROR_BAD_ARGUMENTS;
}
@@ -120,6 +134,7 @@ dword_result_t XamInputSetState_entry(dword_t user_index, dword_t unk,
}
auto input_system = kernel_state()->emulator()->input_system();
auto lock = input_system->lock();
return input_system->SetState(user_index, vibration);
}
DECLARE_XAM_EXPORT1(XamInputSetState, kInput, kImplemented);
@@ -147,6 +162,7 @@ dword_result_t XamInputGetKeystroke_entry(
}
auto input_system = kernel_state()->emulator()->input_system();
auto lock = input_system->lock();
return input_system->GetKeystroke(user_index, flags, keystroke);
}
DECLARE_XAM_EXPORT1(XamInputGetKeystroke, kInput, kImplemented);
@@ -166,14 +182,15 @@ dword_result_t XamInputGetKeystrokeEx_entry(
uint32_t user_index = *user_index_ptr;
auto input_system = kernel_state()->emulator()->input_system();
auto lock = input_system->lock();
if ((user_index & 0xFF) == 0xFF) {
// Always pin user to 0.
user_index = 0;
}
if (flags & XINPUT_FLAG_ANY_USER) {
    // That flag means we should iterate over every connected controller and check which ones have a pending request.
    // That flag means we should iterate over every connected controller and
    // check which ones have a pending request.
auto result = X_ERROR_DEVICE_NOT_CONNECTED;
for (uint32_t i = 0; i < 4; i++) {
auto result = input_system->GetKeystroke(i, flags, keystroke);
@@ -188,6 +205,7 @@ dword_result_t XamInputGetKeystrokeEx_entry(
}
auto result = input_system->GetKeystroke(user_index, flags, keystroke);
if (XSUCCEEDED(result)) {
*user_index_ptr = keystroke->user_index;
}
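When XINPUT_FLAG_ANY_USER is set, XamInputGetKeystrokeEx walks all four controller slots and returns the first one with a keystroke queued, falling back to a device-not-connected result when none respond. A sketch of that scan-all-slots idea (PollAnyUser and poll_slot are hypothetical stand-ins for the InputSystem::GetKeystroke loop, not the real entry point):

#include <cstdint>

// Poll every slot and stop at the first one reporting success (0).
template <typename PollFn>
uint32_t PollAnyUser(PollFn&& poll_slot, uint32_t not_connected_error) {
  uint32_t result = not_connected_error;
  for (uint32_t slot = 0; slot < 4; ++slot) {
    uint32_t slot_result = poll_slot(slot);
    if (slot_result == 0) {  // success: keystroke found on this slot
      return slot_result;
    }
    result = slot_result;  // otherwise remember the last error
  }
  return result;
}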
@@ -202,7 +220,8 @@ X_HRESULT_result_t XamUserGetDeviceContext_entry(dword_t user_index,
// If this function fails they assume zero, so let's fail AND
// set zero just to be safe.
*out_ptr = 0;
if (kernel_state()->IsUserSignedIn(user_index) || (user_index & 0xFF) == 0xFF) {
if (kernel_state()->IsUserSignedIn(user_index) ||
(user_index & 0xFF) == 0xFF) {
*out_ptr = (uint32_t)user_index;
return X_E_SUCCESS;
} else {

View File

@@ -121,7 +121,8 @@ dword_result_t NtAllocateVirtualMemory_entry(lpdword_t base_addr_ptr,
? -int32_t(region_size_ptr.value())
: region_size_ptr.value();
adjusted_size = xe::round_up(adjusted_size, adjusted_base ? page_size : 64 * 1024);
adjusted_size =
xe::round_up(adjusted_size, adjusted_base ? page_size : 64 * 1024);
// Allocate.
uint32_t allocation_type = 0;
@@ -295,10 +296,19 @@ struct X_MEMORY_BASIC_INFORMATION {
be<uint32_t> protect;
be<uint32_t> type;
};
// chrispy: added region_type (guessed name?); haven't seen any value except 0 used
dword_result_t NtQueryVirtualMemory_entry(
dword_t base_address,
pointer_t<X_MEMORY_BASIC_INFORMATION> memory_basic_information_ptr) {
pointer_t<X_MEMORY_BASIC_INFORMATION> memory_basic_information_ptr,
dword_t region_type) {
switch (region_type) {
case 0:
case 1:
case 2:
break;
default:
return X_STATUS_INVALID_PARAMETER;
}
auto heap = kernel_state()->memory()->LookupHeap(base_address);
HeapAllocationInfo alloc_info;
if (heap == nullptr || !heap->QueryRegionInfo(base_address, &alloc_info)) {
@@ -373,8 +383,9 @@ dword_result_t MmAllocatePhysicalMemoryEx_entry(
// min_addr_range/max_addr_range are bounds in physical memory, not virtual.
uint32_t heap_base = heap->heap_base();
uint32_t heap_physical_address_offset = heap->GetPhysicalAddress(heap_base);
  // TODO(Gliniak): Games like 545108B4 compare min_addr_range with the value returned.
  // The 0x1000 offset causes it to go below that minimal range and makes it go haywire.
  // TODO(Gliniak): Games like 545108B4 compare min_addr_range with the value
  // returned. The 0x1000 offset causes it to go below that minimal range and
  // makes it go haywire.
if (min_addr_range && max_addr_range) {
heap_physical_address_offset = 0;
}

View File

@@ -53,21 +53,20 @@ DECLARE_XBOXKRNL_EXPORT1(RtlCompareMemory, kMemory, kImplemented);
// https://msdn.microsoft.com/en-us/library/ff552123
dword_result_t RtlCompareMemoryUlong_entry(lpvoid_t source, dword_t length,
dword_t pattern) {
// Return 0 if source/length not aligned
if (source.guest_address() % 4 || length % 4) {
return 0;
}
uint32_t num_compared_bytes = 0;
uint32_t n = 0;
for (uint32_t i = 0; i < (length / 4); i++) {
// FIXME: This assumes as_array returns xe::be
uint32_t val = source.as_array<uint32_t>()[i];
if (val == pattern) {
n++;
uint32_t swapped_pattern = xe::byte_swap(pattern.value());
char* host_source = (char*)source.host_address();
for (uint32_t aligned_length = length & 0xFFFFFFFCU; aligned_length;
num_compared_bytes += 4) {
if (*(uint32_t*)(host_source + num_compared_bytes) != swapped_pattern) {
break;
}
aligned_length = aligned_length - 4;
}
return n;
return num_compared_bytes;
}
DECLARE_XBOXKRNL_EXPORT1(RtlCompareMemoryUlong, kMemory, kImplemented);
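The rewritten RtlCompareMemoryUlong_entry scans the buffer in 32-bit steps and returns the number of bytes (not ULONGs) that matched the pattern before the first mismatch, byte-swapping the guest-supplied pattern once so the comparison can run directly against host memory. A standalone sketch of those semantics (compare_memory_ulong is an illustrative stand-in and takes the pattern already in host byte order):

#include <cstdint>
#include <cstring>

// Count how many leading bytes of 'source' match the 32-bit 'pattern',
// stepping four bytes at a time and ignoring any unaligned tail.
uint32_t compare_memory_ulong(const void* source, uint32_t length,
                              uint32_t pattern) {
  const uint8_t* bytes = static_cast<const uint8_t*>(source);
  uint32_t matched = 0;
  for (uint32_t remaining = length & ~3u; remaining; remaining -= 4) {
    uint32_t value;
    std::memcpy(&value, bytes + matched, sizeof(value));  // alignment-safe load
    if (value != pattern) {
      break;
    }
    matched += 4;
  }
  return matched;
}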
@@ -85,23 +84,61 @@ void RtlFillMemoryUlong_entry(lpvoid_t destination, dword_t length,
}
DECLARE_XBOXKRNL_EXPORT1(RtlFillMemoryUlong, kMemory, kImplemented);
dword_result_t RtlUpperChar_entry(dword_t in) {
char c = in & 0xFF;
if (c >= 'a' && c <= 'z') {
return c ^ 0x20;
}
static constexpr const unsigned char rtl_lower_table[256] = {
0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB,
0xC, 0xD, 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23,
0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B,
0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73,
0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B,
0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 0x80, 0x81, 0x82, 0x83,
0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B,
0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3,
0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB,
0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xD7,
0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3,
0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB,
0xFC, 0xFD, 0xFE, 0xFF};
return c;
static constexpr const unsigned char rtl_upper_table[256] = {
0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB,
0xC, 0xD, 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23,
0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B,
0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53,
0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B,
0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
0x58, 0x59, 0x5A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 0x80, 0x81, 0x82, 0x83,
0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B,
0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3,
0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB,
0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xC0, 0xC1, 0xC2, 0xC3,
0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xF7, 0xD8, 0xD9, 0xDA, 0xDB,
0xDC, 0xDD, 0xDE, 0x3F};
dword_result_t RtlUpperChar_entry(dword_t in) {
return rtl_upper_table[in & 0xff];
}
DECLARE_XBOXKRNL_EXPORT1(RtlUpperChar, kNone, kImplemented);
dword_result_t RtlLowerChar_entry(dword_t in) {
char c = in & 0xFF;
if (c >= 'A' && c <= 'Z') {
return c ^ 0x20;
}
return c;
return rtl_lower_table[in & 0xff];
}
DECLARE_XBOXKRNL_EXPORT1(RtlLowerChar, kNone, kImplemented);
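RtlUpperChar and RtlLowerChar now index full 256-entry tables instead of the old ASCII-only branches, which also covers the Latin-1 accented range (for example 0xE0-0xFE folding to 0xC0-0xDE and back, with 0xD7 and 0xF7 left untouched). For the plain ASCII subset the table must agree with the branch it replaced; a small spot-check of that property (SpotCheckUpperTable is illustrative and assumes the default "C" locale for std::toupper):

#include <cassert>
#include <cctype>

// Verify a 256-entry upper-case table against std::toupper for the ASCII
// range; for 0x00-0x7F the table behaves exactly like the old branch.
void SpotCheckUpperTable(const unsigned char (&upper_table)[256]) {
  for (int c = 0; c < 128; ++c) {
    assert(upper_table[c] == static_cast<unsigned char>(std::toupper(c)));
  }
}

// Usage sketch: SpotCheckUpperTable(rtl_upper_table);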

View File

@@ -473,7 +473,8 @@ void VdSwap_entry(
dwords[i] = xenos::MakePacketType2();
}
}
DECLARE_XBOXKRNL_EXPORT2(VdSwap, kVideo, kImplemented, kImportant);
DECLARE_XBOXKRNL_EXPORT3(VdSwap, kVideo, kImplemented, kHighFrequency,
kImportant);
void RegisterVideoExports(xe::cpu::ExportResolver* export_resolver,
KernelState* kernel_state) {

View File

@@ -465,7 +465,7 @@ cpu::MMIORange* Memory::LookupVirtualMappedRange(uint32_t virtual_address) {
}
bool Memory::AccessViolationCallback(
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
global_unique_lock_type global_lock_locked_once,
void* host_address, bool is_write) {
// Access via physical_membase_ is special, when need to bypass everything
// (for instance, for a data provider to actually write the data) so only
@@ -493,14 +493,14 @@ bool Memory::AccessViolationCallback(
}
bool Memory::AccessViolationCallbackThunk(
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
global_unique_lock_type global_lock_locked_once,
void* context, void* host_address, bool is_write) {
return reinterpret_cast<Memory*>(context)->AccessViolationCallback(
std::move(global_lock_locked_once), host_address, is_write);
}
bool Memory::TriggerPhysicalMemoryCallbacks(
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
global_unique_lock_type global_lock_locked_once,
uint32_t virtual_address, uint32_t length, bool is_write,
bool unwatch_exact_range, bool unprotect) {
BaseHeap* heap = LookupHeap(virtual_address);
@@ -1711,7 +1711,7 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
}
bool PhysicalHeap::TriggerCallbacks(
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
global_unique_lock_type global_lock_locked_once,
uint32_t virtual_address, uint32_t length, bool is_write,
bool unwatch_exact_range, bool unprotect) {
// TODO(Triang3l): Support read watches.
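These access-violation and watch callbacks now spell their parameter as global_unique_lock_type and continue to take the guard by value, so ownership of the already-held global lock moves into the callee, which can release it early if it needs to. A sketch of that hand-off pattern (the alias below is a stand-in built on std::recursive_mutex, not the project's actual global mutex type):

#include <mutex>
#include <utility>

// Stand-in alias; the real global_unique_lock_type wraps the emulator's
// global critical-region mutex.
using sketch_global_lock = std::unique_lock<std::recursive_mutex>;

bool HandleFault(sketch_global_lock global_lock_locked_once,
                 void* host_address, bool is_write) {
  (void)host_address;
  (void)is_write;
  // The callee owns the guard: it may hold it for the whole handler or
  // unlock early before doing slow work.
  global_lock_locked_once.unlock();
  return true;
}

bool HandleFaultThunk(sketch_global_lock lock, void* context,
                      void* host_address, bool is_write) {
  (void)context;
  // unique_lock is move-only, so ownership is passed along explicitly.
  return HandleFault(std::move(lock), host_address, is_write);
}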

View File

@@ -271,8 +271,7 @@ class PhysicalHeap : public BaseHeap {
bool enable_invalidation_notifications,
bool enable_data_providers);
// Returns true if any page in the range was watched.
bool TriggerCallbacks(
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
bool TriggerCallbacks(global_unique_lock_type global_lock_locked_once,
uint32_t virtual_address, uint32_t length, bool is_write,
bool unwatch_exact_range, bool unprotect = true);
@@ -459,7 +458,7 @@ class Memory {
// TODO(Triang3l): Implement data providers - this is why locking depth of 1
// will be required in the future.
bool TriggerPhysicalMemoryCallbacks(
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
global_unique_lock_type global_lock_locked_once,
uint32_t virtual_address, uint32_t length, bool is_write,
bool unwatch_exact_range, bool unprotect = true);
@@ -508,11 +507,10 @@ class Memory {
static uint32_t HostToGuestVirtualThunk(const void* context,
const void* host_address);
bool AccessViolationCallback(
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
bool AccessViolationCallback(global_unique_lock_type global_lock_locked_once,
void* host_address, bool is_write);
static bool AccessViolationCallbackThunk(
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
global_unique_lock_type global_lock_locked_once,
void* context, void* host_address, bool is_write);
std::filesystem::path file_name_;

third_party/FFmpeg vendored

@@ -1 +1 @@
Subproject commit 15ece0882e8d5875051ff5b73c5a8326f7cee9f5
Subproject commit a437fe6d8efef17c8ad33d39f5815032e7adf5d7

@@ -1 +1 @@
Subproject commit 5787e9cb7551c8c79cf9ce14f7be860dc907e9a4
Subproject commit 302b6e03e829c6d6a70415f10d818a5088cb6ccf